diff options
72 files changed, 2150 insertions, 4720 deletions
diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index 8c66349f316b2..c89a803cbc20d 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -161,6 +161,9 @@ static inline int srmmu_pte_none(pte_t pte) static inline int srmmu_pte_present(pte_t pte) { return ((pte_val(pte) & SRMMU_ET_MASK) == SRMMU_ET_PTE); } +static inline int srmmu_pte_read(pte_t pte) +{ return !(pte_val(pte) & SRMMU_NOREAD); } + static inline void srmmu_pte_clear(pte_t *ptep) { srmmu_set_pte(ptep, __pte(0)); } @@ -2166,6 +2169,7 @@ void __init ld_mmu_srmmu(void) BTFIXUPSET_CALL(pte_present, srmmu_pte_present, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pte_clear, srmmu_pte_clear, BTFIXUPCALL_SWAPO0G0); + BTFIXUPSET_CALL(pte_read, srmmu_pte_read, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pmd_bad, srmmu_pmd_bad, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pmd_present, srmmu_pmd_present, BTFIXUPCALL_NORM); @@ -2196,7 +2200,6 @@ void __init ld_mmu_srmmu(void) BTFIXUPSET_CALL(free_pgd_fast, srmmu_free_pgd_fast, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(get_pgd_fast, srmmu_get_pgd_fast, BTFIXUPCALL_NORM); - BTFIXUPSET_HALF(pte_readi, SRMMU_NOREAD); BTFIXUPSET_HALF(pte_writei, SRMMU_WRITE); BTFIXUPSET_HALF(pte_dirtyi, SRMMU_DIRTY); BTFIXUPSET_HALF(pte_youngi, SRMMU_REF); diff --git a/arch/sparc/mm/sun4c.c b/arch/sparc/mm/sun4c.c index 03342120f1f85..1d560390e2821 100644 --- a/arch/sparc/mm/sun4c.c +++ b/arch/sparc/mm/sun4c.c @@ -1746,6 +1746,11 @@ static int sun4c_pte_present(pte_t pte) } static void sun4c_pte_clear(pte_t *ptep) { *ptep = __pte(0); } +static int sun4c_pte_read(pte_t pte) +{ + return (pte_val(pte) & _SUN4C_PAGE_READ); +} + static int sun4c_pmd_bad(pmd_t pmd) { return (((pmd_val(pmd) & ~PAGE_MASK) != PGD_TABLE) || @@ -2199,6 +2204,7 @@ void __init ld_mmu_sun4c(void) BTFIXUPSET_CALL(pte_present, sun4c_pte_present, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pte_clear, sun4c_pte_clear, BTFIXUPCALL_STG0O0); + BTFIXUPSET_CALL(pte_read, sun4c_pte_read, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pmd_bad, sun4c_pmd_bad, 
BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pmd_present, sun4c_pmd_present, BTFIXUPCALL_NORM); @@ -2225,7 +2231,6 @@ void __init ld_mmu_sun4c(void) BTFIXUPSET_CALL(free_pgd_fast, sun4c_free_pgd_fast, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(get_pgd_fast, sun4c_get_pgd_fast, BTFIXUPCALL_NORM); - BTFIXUPSET_HALF(pte_readi, _SUN4C_PAGE_READ); BTFIXUPSET_HALF(pte_writei, _SUN4C_PAGE_WRITE); BTFIXUPSET_HALF(pte_dirtyi, _SUN4C_PAGE_MODIFIED); BTFIXUPSET_HALF(pte_youngi, _SUN4C_PAGE_ACCESSED); diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index b31687f3e7214..46a2436c9600c 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -16,6 +16,33 @@ config TIME_INTERPOLATION bool default y +choice + prompt "Kernel page size" + default SPARC64_PAGE_SIZE_8KB + +config SPARC64_PAGE_SIZE_8KB + bool "8KB" + help + This lets you select the page size of the kernel. + + 8KB and 64KB work quite well, since Sparc ELF sections + provide for up to 64KB alignment. + + Therefore, 512KB and 4MB are for expert hackers only. + + If you don't know what to do, choose 8KB. 
+ +config SPARC64_PAGE_SIZE_64KB + bool "64KB" + +config SPARC64_PAGE_SIZE_512KB + bool "512KB" + +config SPARC64_PAGE_SIZE_4MB + bool "4MB" + +endchoice + source "init/Kconfig" config SYSVIPC_COMPAT @@ -198,9 +225,11 @@ config HUGETLB_PAGE_SIZE_4MB bool "4MB" config HUGETLB_PAGE_SIZE_512K + depends on !SPARC64_PAGE_SIZE_4MB bool "512K" config HUGETLB_PAGE_SIZE_64K + depends on !SPARC64_PAGE_SIZE_4MB && !SPARC64_PAGE_SIZE_512K bool "64K" endchoice diff --git a/arch/sparc64/Makefile b/arch/sparc64/Makefile index 61724880f20d3..43fe382da0789 100644 --- a/arch/sparc64/Makefile +++ b/arch/sparc64/Makefile @@ -41,10 +41,10 @@ endif ifneq ($(NEW_GCC),y) CFLAGS := $(CFLAGS) -pipe -mno-fpu -mtune=ultrasparc -mmedlow \ - -ffixed-g4 -fcall-used-g5 -fcall-used-g7 -Wno-sign-compare + -ffixed-g4 -ffixed-g5 -fcall-used-g7 -Wno-sign-compare else CFLAGS := $(CFLAGS) -m64 -pipe -mno-fpu -mcpu=ultrasparc -mcmodel=medlow \ - -ffixed-g4 -fcall-used-g5 -fcall-used-g7 -Wno-sign-compare \ + -ffixed-g4 -ffixed-g5 -fcall-used-g7 -Wno-sign-compare \ $(CC_UNDECL) AFLAGS += -m64 -mcpu=ultrasparc $(CC_UNDECL) endif diff --git a/arch/sparc64/kernel/cpu.c b/arch/sparc64/kernel/cpu.c index 9043e2e03a1f4..48756958116b5 100644 --- a/arch/sparc64/kernel/cpu.c +++ b/arch/sparc64/kernel/cpu.c @@ -38,6 +38,7 @@ struct cpu_fp_info linux_sparc_fpu[] = { { 0x3e, 0x14, 0, "UltraSparc III integrated FPU"}, { 0x3e, 0x15, 0, "UltraSparc III+ integrated FPU"}, { 0x3e, 0x16, 0, "UltraSparc IIIi integrated FPU"}, + { 0x3e, 0x18, 0, "UltraSparc IV integrated FPU"}, }; #define NSPARCFPU (sizeof(linux_sparc_fpu)/sizeof(struct cpu_fp_info)) @@ -51,6 +52,7 @@ struct cpu_iu_info linux_sparc_chips[] = { { 0x3e, 0x14, "TI UltraSparc III (Cheetah)"}, { 0x3e, 0x15, "TI UltraSparc III+ (Cheetah+)"}, { 0x3e, 0x16, "TI UltraSparc IIIi (Jalapeno)"}, + { 0x3e, 0x18, "TI UltraSparc IV (Jaguar)"}, }; #define NSPARCCHIPS (sizeof(linux_sparc_chips)/sizeof(struct cpu_iu_info)) diff --git a/arch/sparc64/kernel/dtlb_backend.S 
b/arch/sparc64/kernel/dtlb_backend.S index e6bc4a26aeb9b..b73a3c8587704 100644 --- a/arch/sparc64/kernel/dtlb_backend.S +++ b/arch/sparc64/kernel/dtlb_backend.S @@ -7,60 +7,143 @@ */ #include <asm/pgtable.h> -#include <asm/mmu_context.h> +#include <asm/mmu.h> #if PAGE_SHIFT == 13 -#define FILL_VALID_SZ_BITS1(r1) \ - sllx %g2, 62, r1 -#define FILL_VALID_SZ_BITS2(r1) -#define FILL_VALID_SZ_BITS_NOP nop +#define SZ_BITS _PAGE_SZ8K #elif PAGE_SHIFT == 16 -#define FILL_VALID_SZ_BITS1(r1) \ - or %g0, 5, r1 -#define FILL_VALID_SZ_BITS2(r1) \ - sllx r1, 61, r1 -#define FILL_VALID_SZ_BITS_NOP -#else -#error unsupported PAGE_SIZE -#endif /* PAGE_SHIFT */ +#define SZ_BITS _PAGE_SZ64K +#elif PAGE_SHIFT == 19 +#define SZ_BITS _PAGE_SZ512K +#elif PAGE_SHIFT == 22 +#define SZ_BITS _PAGE_SZ4M +#endif + +#define VALID_SZ_BITS (_PAGE_VALID | SZ_BITS) #define VPTE_BITS (_PAGE_CP | _PAGE_CV | _PAGE_P ) #define VPTE_SHIFT (PAGE_SHIFT - 3) -#define TLB_PMD_SHIFT (PAGE_SHIFT - 3 + 3) -#define TLB_PGD_SHIFT (PMD_BITS + PAGE_SHIFT - 3 + 3) -#define TLB_PMD_MASK (((1 << PMD_BITS) - 1) << 1) -#define TLB_PGD_MASK (((1 << (VA_BITS - PAGE_SHIFT - (PAGE_SHIFT - 3) - PMD_BITS)) - 1) << 2) /* Ways we can get here: * * 1) Nucleus loads and stores to/from PA-->VA direct mappings at tl>1. * 2) Nucleus loads and stores to/from user/kernel window save areas. * 3) VPTE misses from dtlb_base and itlb_base. + * + * We need to extract out the PMD and PGDIR indexes from the + * linear virtual page table access address. The PTE index + * is at the bottom, but we are not concerned with it. Bits + * 0 to 2 are clear since each PTE is 8 bytes in size. Each + * PMD and PGDIR entry are 4 bytes in size. Thus, this + * address looks something like: + * + * |---------------------------------------------------------------| + * | ... 
| PGDIR index | PMD index | PTE index | | + * |---------------------------------------------------------------| + * 63 F E D C B A 3 2 0 <- bit nr + * + * The variable bits above are defined as: + * A --> 3 + (PAGE_SHIFT - log2(8)) + * --> 3 + (PAGE_SHIFT - 3) - 1 + * (ie. this is "bit 3" + PAGE_SIZE - size of PTE entry in bits - 1) + * B --> A + 1 + * C --> B + (PAGE_SHIFT - log2(4)) + * --> B + (PAGE_SHIFT - 2) - 1 + * (ie. this is "bit B" + PAGE_SIZE - size of PMD entry in bits - 1) + * D --> C + 1 + * E --> D + (PAGE_SHIFT - log2(4)) + * --> D + (PAGE_SHIFT - 2) - 1 + * (ie. this is "bit D" + PAGE_SIZE - size of PGDIR entry in bits - 1) + * F --> E + 1 + * + * (Note how "B" always evaluates to PAGE_SHIFT, all the other constants + * cancel out.) + * + * For 8K PAGE_SIZE (thus, PAGE_SHIFT of 13) the bit numbers are: + * A --> 12 + * B --> 13 + * C --> 23 + * D --> 24 + * E --> 34 + * F --> 35 + * + * For 64K PAGE_SIZE (thus, PAGE_SHIFT of 16) the bit numbers are: + * A --> 15 + * B --> 16 + * C --> 29 + * D --> 30 + * E --> 43 + * F --> 44 + * + * Because bits both above and below each PGDIR and PMD index need to + * be masked out, and the index can be as long as 14 bits (when using a + * 64K PAGE_SIZE, and thus a PAGE_SHIFT of 16), we need 3 instructions + * to extract each index out. + * + * Shifts do not pair very well on UltraSPARC-I, II, IIi, and IIe, so + * we try to avoid using them for the entire operation. We could setup + * a mask anywhere from bit 31 down to bit 10 using the sethi instruction. + * + * We need a mask covering bits B --> C and one covering D --> E. + * For 8K PAGE_SIZE these masks are 0x00ffe000 and 0x7ff000000. + * For 64K PAGE_SIZE these masks are 0x3fff0000 and 0xfffc0000000. + * The second in each set cannot be loaded with a single sethi + * instruction, because the upper bits are past bit 32. We would + * need to use a sethi + a shift. + * + * For the time being, we use 2 shifts and a simple "and" mask. 
+ * We shift left to clear the bits above the index, we shift down + * to clear the bits below the index (sans the log2(4 or 8) bits) + * and a mask to clear the log2(4 or 8) bits. We therefore need + * to define 4 shift counts, all of which are relative to PAGE_SHIFT. + * + * Although unsupportable for other reasons, this does mean that + * 512K and 4MB page sizes would be generally supported by the + * kernel. (ELF binaries would break with > 64K PAGE_SIZE since + * the sections are only aligned that strongly). + * + * The operations performed for extraction are thus: + * + * ((X << FOO_SHIFT_LEFT) >> FOO_SHIFT_RIGHT) & ~0x3 + * */ +#define A (3 + (PAGE_SHIFT - 3) - 1) +#define B (A + 1) +#define C (B + (PAGE_SHIFT - 2) - 1) +#define D (C + 1) +#define E (D + (PAGE_SHIFT - 2) - 1) +#define F (E + 1) + +#define PMD_SHIFT_LEFT (64 - D) +#define PMD_SHIFT_RIGHT (64 - (D - B) - 2) +#define PGDIR_SHIFT_LEFT (64 - F) +#define PGDIR_SHIFT_RIGHT (64 - (F - D) - 2) +#define LOW_MASK_BITS 0x3 + /* TLB1 ** ICACHE line 1: tl1 DTLB and quick VPTE miss */ ldxa [%g1 + %g1] ASI_DMMU, %g4 ! Get TAG_ACCESS add %g3, %g3, %g5 ! Compute VPTE base cmp %g4, %g5 ! VPTE miss? bgeu,pt %xcc, 1f ! Continue here - andcc %g4, TAG_CONTEXT_BITS, %g5 ! From Nucleus? (for tl0 miss) - ba,pt %xcc, from_tl1_trap ! Fall to tl0 miss - rdpr %tl, %g5 ! For tl0 miss TL==3 test + andcc %g4, TAG_CONTEXT_BITS, %g5 ! tl0 miss Nucleus test + ba,a,pt %xcc, from_tl1_trap ! Fall to tl0 miss 1: sllx %g6, VPTE_SHIFT, %g4 ! Position TAG_ACCESS + or %g4, %g5, %g4 ! Prepare TAG_ACCESS /* TLB1 ** ICACHE line 2: Quick VPTE miss */ - or %g4, %g5, %g4 ! Prepare TAG_ACCESS mov TSB_REG, %g1 ! Grab TSB reg ldxa [%g1] ASI_DMMU, %g5 ! Doing PGD caching? - srlx %g6, (TLB_PMD_SHIFT - 1), %g1 ! Position PMD offset + sllx %g6, PMD_SHIFT_LEFT, %g1 ! Position PMD offset be,pn %xcc, sparc64_vpte_nucleus ! Is it from Nucleus? - and %g1, TLB_PMD_MASK, %g1 ! Mask PMD offset bits + srlx %g1, PMD_SHIFT_RIGHT, %g1 ! 
Mask PMD offset bits brnz,pt %g5, sparc64_vpte_continue ! Yep, go like smoke - add %g1, %g1, %g1 ! Position PMD offset some more + andn %g1, LOW_MASK_BITS, %g1 ! Final PMD mask + sllx %g6, PGDIR_SHIFT_LEFT, %g5 ! Position PGD offset /* TLB1 ** ICACHE line 3: Quick VPTE miss */ - srlx %g6, (TLB_PGD_SHIFT - 2), %g5 ! Position PGD offset - and %g5, TLB_PGD_MASK, %g5 ! Mask PGD offset + srlx %g5, PGDIR_SHIFT_RIGHT, %g5 ! Mask PGD offset bits + andn %g5, LOW_MASK_BITS, %g5 ! Final PGD mask lduwa [%g7 + %g5] ASI_PHYS_USE_EC, %g5! Load PGD brz,pn %g5, vpte_noent ! Valid? sparc64_kpte_continue: @@ -71,23 +154,28 @@ sparc64_vpte_continue: brz,pn %g5, vpte_noent ! Valid? /* TLB1 ** ICACHE line 4: Quick VPTE miss */ - FILL_VALID_SZ_BITS1(%g1) ! Put _PAGE_VALID into %g1 - FILL_VALID_SZ_BITS2(%g1) ! Put _PAGE_VALID into %g1 + mov (VALID_SZ_BITS >> 61), %g1 ! upper vpte into %g1 + sllx %g1, 61, %g1 ! finish calc or %g5, VPTE_BITS, %g5 ! Prepare VPTE data or %g5, %g1, %g5 ! ... mov TLB_SFSR, %g1 ! Restore %g1 value stxa %g5, [%g0] ASI_DTLB_DATA_IN ! Load VPTE into TLB stxa %g4, [%g1 + %g1] ASI_DMMU ! Restore previous TAG_ACCESS retry ! 
Load PTE once again - FILL_VALID_SZ_BITS_NOP +#undef SZ_BITS +#undef VALID_SZ_BITS #undef VPTE_SHIFT -#undef TLB_PMD_SHIFT -#undef TLB_PGD_SHIFT #undef VPTE_BITS -#undef TLB_PMD_MASK -#undef TLB_PGD_MASK -#undef FILL_VALID_SZ_BITS1 -#undef FILL_VALID_SZ_BITS2 -#undef FILL_VALID_SZ_BITS_NOP +#undef A +#undef B +#undef C +#undef D +#undef E +#undef F +#undef PMD_SHIFT_LEFT +#undef PMD_SHIFT_RIGHT +#undef PGDIR_SHIFT_LEFT +#undef PGDIR_SHIFT_RIGHT +#undef LOW_MASK_BITS diff --git a/arch/sparc64/kernel/dtlb_base.S b/arch/sparc64/kernel/dtlb_base.S index 294fb44aeb2c9..ded2fed23fcc5 100644 --- a/arch/sparc64/kernel/dtlb_base.S +++ b/arch/sparc64/kernel/dtlb_base.S @@ -7,7 +7,7 @@ */ #include <asm/pgtable.h> -#include <asm/mmu_context.h> +#include <asm/mmu.h> /* %g1 TLB_SFSR (%g1 + %g1 == TLB_TAG_ACCESS) * %g2 (KERN_HIGHBITS | KERN_LOWBITS) @@ -68,8 +68,8 @@ /* DTLB ** ICACHE line 1: Quick user TLB misses */ ldxa [%g1 + %g1] ASI_DMMU, %g4 ! Get TAG_ACCESS andcc %g4, TAG_CONTEXT_BITS, %g0 ! From Nucleus? - mov 1, %g5 ! For TL==3 test from_tl1_trap: + rdpr %tl, %g5 ! For TL==3 test CREATE_VPTE_OFFSET1(%g4, %g6) ! Create VPTE offset be,pn %xcc, 3f ! Yep, special processing CREATE_VPTE_OFFSET2(%g4, %g6) ! Create VPTE offset diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S index c4b705d0e00ca..a47f2d0b1a29b 100644 --- a/arch/sparc64/kernel/entry.S +++ b/arch/sparc64/kernel/entry.S @@ -38,97 +38,150 @@ * range (note that this is only possible for instruction miss, data misses to * obp range do not use vpte). If so, go back directly to the faulting address. * This is because we want to read the tpc, otherwise we have no way of knowing - * the 8k aligned faulting address if we are using >8k kernel pagesize. This also - * ensures no vpte range addresses are dropped into tlb while obp is executing - * (see inherit_locked_prom_mappings() rant). + * the 8k aligned faulting address if we are using >8k kernel pagesize. 
This + * also ensures no vpte range addresses are dropped into tlb while obp is + * executing (see inherit_locked_prom_mappings() rant). */ sparc64_vpte_nucleus: + /* Load 0xf0000000, which is LOW_OBP_ADDRESS. */ mov 0xf, %g5 - sllx %g5, 28, %g5 ! Load 0xf0000000 - cmp %g4, %g5 ! Is addr >= LOW_OBP_ADDRESS? + sllx %g5, 28, %g5 + + /* Is addr >= LOW_OBP_ADDRESS? */ + cmp %g4, %g5 blu,pn %xcc, sparc64_vpte_patchme1 mov 0x1, %g5 - sllx %g5, 32, %g5 ! Load 0x100000000 - cmp %g4, %g5 ! Is addr < HI_OBP_ADDRESS? + + /* Load 0x100000000, which is HI_OBP_ADDRESS. */ + sllx %g5, 32, %g5 + + /* Is addr < HI_OBP_ADDRESS? */ + cmp %g4, %g5 blu,pn %xcc, obp_iaddr_patch nop + + /* These two instructions are patched by paging_init(). */ sparc64_vpte_patchme1: - sethi %hi(0), %g5 ! This has to be patched + sethi %hi(0), %g5 sparc64_vpte_patchme2: - or %g5, %lo(0), %g5 ! This is patched too - ba,pt %xcc, sparc64_kpte_continue ! Part of dtlb_backend - add %g1, %g1, %g1 ! Finish PMD offset adjustment + or %g5, %lo(0), %g5 + + /* With kernel PGD in %g5, branch back into dtlb_backend. */ + ba,pt %xcc, sparc64_kpte_continue + andn %g1, 0x3, %g1 /* Finish PMD offset adjustment. */ vpte_noent: - mov TLB_SFSR, %g1 ! Restore %g1 value - stxa %g4, [%g1 + %g1] ASI_DMMU ! Restore previous TAG_ACCESS - done ! Slick trick + /* Restore previous TAG_ACCESS, %g5 is zero, and we will + * skip over the trap instruction so that the top level + * TLB miss handler will think this %g5 value is just an + * invalid PTE, thus branching to full fault processing. + */ + mov TLB_SFSR, %g1 + stxa %g4, [%g1 + %g1] ASI_DMMU + done .globl obp_iaddr_patch - .globl obp_daddr_patch - obp_iaddr_patch: - sethi %hi(0), %g5 ! This and following is patched - or %g5, %lo(0), %g5 ! g5 now holds obp pmd base physaddr - wrpr %g0, 1, %tl ! Behave as if we are at TL0 - rdpr %tpc, %g4 ! Find original faulting iaddr - srlx %g4, 13, %g4 ! Throw out context bits - sllx %g4, 13, %g4 ! g4 has vpn + ctx0 now - mov TLB_SFSR, %g1 ! 
Restore %g1 value - stxa %g4, [%g1 + %g1] ASI_IMMU ! Restore previous TAG_ACCESS - srlx %g4, 23, %g6 ! Find pmd number - and %g6, 0x7ff, %g6 ! Find pmd number - sllx %g6, 2, %g6 ! Find pmd offset - lduwa [%g5 + %g6] ASI_PHYS_USE_EC, %g5! Load pmd, ie pagetable physaddr - brz,pn %g5, longpath ! Kill the PROM ? :-) - sllx %g5, 11, %g5 ! Shift into place - srlx %g4, 13, %g6 ! find pte number in pagetable - and %g6, 0x3ff, %g6 ! find pte number in pagetable - sllx %g6, 3, %g6 ! find pte offset in pagetable - ldxa [%g5 + %g6] ASI_PHYS_USE_EC, %g5! Load pte - brgez,pn %g5, longpath ! Kill the PROM ? :-) + /* These two instructions patched by inherit_prom_mappings(). */ + sethi %hi(0), %g5 + or %g5, %lo(0), %g5 + + /* Behave as if we are at TL0. */ + wrpr %g0, 1, %tl + rdpr %tpc, %g4 /* Find original faulting iaddr */ + srlx %g4, 13, %g4 /* Throw out context bits */ + sllx %g4, 13, %g4 /* g4 has vpn + ctx0 now */ + + /* Restore previous TAG_ACCESS. */ + mov TLB_SFSR, %g1 + stxa %g4, [%g1 + %g1] ASI_IMMU + + /* Get PMD offset. */ + srlx %g4, 23, %g6 + and %g6, 0x7ff, %g6 + sllx %g6, 2, %g6 + + /* Load PMD, is it valid? */ + lduwa [%g5 + %g6] ASI_PHYS_USE_EC, %g5 + brz,pn %g5, longpath + sllx %g5, 11, %g5 + + /* Get PTE offset. */ + srlx %g4, 13, %g6 + and %g6, 0x3ff, %g6 + sllx %g6, 3, %g6 + + /* Load PTE. */ + ldxa [%g5 + %g6] ASI_PHYS_USE_EC, %g5 + brgez,pn %g5, longpath nop - stxa %g5, [%g0] ASI_ITLB_DATA_IN ! put into tlb - retry ! go back to original fault + /* TLB load and return from trap. */ + stxa %g5, [%g0] ASI_ITLB_DATA_IN + retry + + .globl obp_daddr_patch obp_daddr_patch: - sethi %hi(0), %g5 ! This and following is patched - or %g5, %lo(0), %g5 ! g5 now holds obp pmd base physaddr - srlx %g4, 23, %g6 ! Find pmd number - and %g6, 0x7ff, %g6 ! Find pmd number - sllx %g6, 2, %g6 ! Find pmd offset - lduwa [%g5 + %g6] ASI_PHYS_USE_EC, %g5! Load pmd, ie pagetable physaddr + /* These two instructions patched by inherit_prom_mappings(). 
*/ + sethi %hi(0), %g5 + or %g5, %lo(0), %g5 + + /* Get PMD offset. */ + srlx %g4, 23, %g6 + and %g6, 0x7ff, %g6 + sllx %g6, 2, %g6 + + /* Load PMD, is it valid? */ + lduwa [%g5 + %g6] ASI_PHYS_USE_EC, %g5 brz,pn %g5, longpath - sllx %g5, 11, %g5 ! Shift into place - srlx %g4, 13, %g6 ! find pte number in pagetable - and %g6, 0x3ff, %g6 ! find pte number in pagetable - sllx %g6, 3, %g6 ! find pte offset in pagetable - ldxa [%g5 + %g6] ASI_PHYS_USE_EC, %g5! Load pte + sllx %g5, 11, %g5 + + /* Get PTE offset. */ + srlx %g4, 13, %g6 + and %g6, 0x3ff, %g6 + sllx %g6, 3, %g6 + + /* Load PTE. */ + ldxa [%g5 + %g6] ASI_PHYS_USE_EC, %g5 brgez,pn %g5, longpath nop - stxa %g5, [%g0] ASI_DTLB_DATA_IN ! put into tlb + + /* TLB load and return from trap. */ + stxa %g5, [%g0] ASI_DTLB_DATA_IN retry /* - * On a first level data miss, check whether this is to the OBP range (note that - * such accesses can be made by prom, as well as by kernel using prom_getproperty - * on "address"), and if so, do not use vpte access ... rather, use information - * saved during inherit_prom_mappings() using 8k pagesize. + * On a first level data miss, check whether this is to the OBP range (note + * that such accesses can be made by prom, as well as by kernel using + * prom_getproperty on "address"), and if so, do not use vpte access ... + * rather, use information saved during inherit_prom_mappings() using 8k + * pagesize. */ kvmap: + /* Load 0xf0000000, which is LOW_OBP_ADDRESS. */ mov 0xf, %g5 - sllx %g5, 28, %g5 ! Load 0xf0000000 - cmp %g4, %g5 ! Is addr >= LOW_OBP_ADDRESS? + sllx %g5, 28, %g5 + + /* Is addr >= LOW_OBP_ADDRESS? */ + cmp %g4, %g5 blu,pn %xcc, vmalloc_addr mov 0x1, %g5 - sllx %g5, 32, %g5 ! Load 0x100000000 - cmp %g4, %g5 ! Is addr < HI_OBP_ADDRESS? + + /* Load 0x100000000, which is HI_OBP_ADDRESS. */ + sllx %g5, 32, %g5 + + /* Is addr < HI_OBP_ADDRESS? */ + cmp %g4, %g5 blu,pn %xcc, obp_daddr_patch nop -vmalloc_addr: ! vmalloc addr accessed - ldxa [%g3 + %g6] ASI_N, %g5 ! 
Yep, load k-vpte - brgez,pn %g5, longpath ! Valid, load into TLB + +vmalloc_addr: + /* If we get here, a vmalloc addr accessed, load kernel VPTE. */ + ldxa [%g3 + %g6] ASI_N, %g5 + brgez,pn %g5, longpath nop + + /* PTE is valid, load into TLB and return from trap. */ stxa %g5, [%g0] ASI_DTLB_DATA_IN ! Reload TLB retry @@ -199,9 +252,11 @@ do_fpdis: faddd %f0, %f2, %f4 fmuld %f0, %f2, %f6 ldxa [%g3] ASI_DMMU, %g5 - add %g6, TI_FPREGS + 0xc0, %g2 - stxa %g0, [%g3] ASI_DMMU +cplus_fptrap_insn_1: + sethi %hi(0), %g2 + stxa %g2, [%g3] ASI_DMMU membar #Sync + add %g6, TI_FPREGS + 0xc0, %g2 faddd %f0, %f2, %f8 fmuld %f0, %f2, %f10 ldda [%g1] ASI_BLK_S, %f32 ! grrr, where is ASI_BLK_NUCLEUS 8-( @@ -225,7 +280,9 @@ do_fpdis: fzero %f34 ldxa [%g3] ASI_DMMU, %g5 add %g6, TI_FPREGS, %g1 - stxa %g0, [%g3] ASI_DMMU +cplus_fptrap_insn_2: + sethi %hi(0), %g2 + stxa %g2, [%g3] ASI_DMMU membar #Sync add %g6, TI_FPREGS + 0x40, %g2 faddd %f32, %f34, %f36 @@ -249,9 +306,11 @@ do_fpdis: 3: mov SECONDARY_CONTEXT, %g3 add %g6, TI_FPREGS, %g1 ldxa [%g3] ASI_DMMU, %g5 - mov 0x40, %g2 - stxa %g0, [%g3] ASI_DMMU +cplus_fptrap_insn_3: + sethi %hi(0), %g2 + stxa %g2, [%g3] ASI_DMMU membar #Sync + mov 0x40, %g2 ldda [%g1] ASI_BLK_S, %f0 ! grrr, where is ASI_BLK_NUCLEUS 8-( ldda [%g1 + %g2] ASI_BLK_S, %f16 add %g1, 0x80, %g1 @@ -412,10 +471,12 @@ do_fptrap_after_fsr: rd %gsr, %g3 stx %g3, [%g6 + TI_GSR] mov SECONDARY_CONTEXT, %g3 - add %g6, TI_FPREGS, %g2 ldxa [%g3] ASI_DMMU, %g5 - stxa %g0, [%g3] ASI_DMMU +cplus_fptrap_insn_4: + sethi %hi(0), %g2 + stxa %g2, [%g3] ASI_DMMU membar #Sync + add %g6, TI_FPREGS, %g2 andcc %g1, FPRS_DL, %g0 be,pn %icc, 4f mov 0x40, %g3 @@ -433,6 +494,33 @@ do_fptrap_after_fsr: ba,pt %xcc, etrap wr %g0, 0, %fprs +cplus_fptrap_1: + sethi %hi(CTX_CHEETAH_PLUS_CTX0), %g2 + + .globl cheetah_plus_patch_fpdis +cheetah_plus_patch_fpdis: + /* We configure the dTLB512_0 for 4MB pages and the + * dTLB512_1 for 8K pages when in context zero. 
+ */ + sethi %hi(cplus_fptrap_1), %o0 + lduw [%o0 + %lo(cplus_fptrap_1)], %o1 + + set cplus_fptrap_insn_1, %o2 + stw %o1, [%o2] + flush %o2 + set cplus_fptrap_insn_2, %o2 + stw %o1, [%o2] + flush %o2 + set cplus_fptrap_insn_3, %o2 + stw %o1, [%o2] + flush %o2 + set cplus_fptrap_insn_4, %o2 + stw %o1, [%o2] + flush %o2 + + retl + nop + /* The registers for cross calls will be: * * DATA 0: [low 32-bits] Address of function to call, jmp to this @@ -1642,7 +1730,7 @@ ret_from_syscall: andn %o7, _TIF_NEWCHILD, %l0 stx %l0, [%g6 + TI_FLAGS] call schedule_tail - mov %g5, %o0 + mov %g7, %o0 andcc %l0, _TIF_PERFCTR, %g0 be,pt %icc, 1f nop diff --git a/arch/sparc64/kernel/etrap.S b/arch/sparc64/kernel/etrap.S index d50b755c7e9c3..52cde3a262313 100644 --- a/arch/sparc64/kernel/etrap.S +++ b/arch/sparc64/kernel/etrap.S @@ -14,6 +14,7 @@ #include <asm/spitfire.h> #include <asm/head.h> #include <asm/processor.h> +#include <asm/mmu.h> #define TASK_REGOFF (THREAD_SIZE-TRACEREG_SZ-STACKFRAME_SZ) #define ETRAP_PSTATE1 (PSTATE_RMO | PSTATE_PRIV) @@ -67,7 +68,13 @@ etrap_irq: wrpr %g3, 0, %otherwin wrpr %g2, 0, %wstate - stxa %g0, [%l4] ASI_DMMU +cplus_etrap_insn_1: + sethi %hi(0), %g3 + sllx %g3, 32, %g3 +cplus_etrap_insn_2: + sethi %hi(0), %g2 + or %g3, %g2, %g3 + stxa %g3, [%l4] ASI_DMMU flush %l6 wr %g0, ASI_AIUS, %asi 2: wrpr %g0, 0x0, %tl @@ -95,11 +102,15 @@ etrap_irq: stx %i7, [%sp + PTREGS_OFF + PT_V9_I7] wrpr %g0, ETRAP_PSTATE2, %pstate mov %l6, %g6 +#ifdef CONFIG_SMP + ldub [%g6 + TI_CPU], %g3 + sethi %hi(__per_cpu_offset), %g2 + or %g2, %lo(__per_cpu_offset), %g2 + sllx %g3, 3, %g3 + ldx [%g2 + %g3], %g5 +#endif jmpl %l2 + 0x4, %g0 ldx [%g6 + TI_TASK], %g4 - nop - nop - nop 3: ldub [%l6 + TI_FPDEPTH], %l5 add %l6, TI_FPSAVED + 1, %l4 @@ -207,7 +218,13 @@ scetrap: rdpr %pil, %g2 mov PRIMARY_CONTEXT, %l4 wrpr %g3, 0, %otherwin wrpr %g2, 0, %wstate - stxa %g0, [%l4] ASI_DMMU +cplus_etrap_insn_3: + sethi %hi(0), %g3 + sllx %g3, 32, %g3 +cplus_etrap_insn_4: + sethi %hi(0), %g2 
+ or %g3, %g2, %g3 + stxa %g3, [%l4] ASI_DMMU flush %l6 mov ASI_AIUS, %l7 @@ -241,11 +258,50 @@ scetrap: rdpr %pil, %g2 stx %i6, [%sp + PTREGS_OFF + PT_V9_I6] mov %l6, %g6 stx %i7, [%sp + PTREGS_OFF + PT_V9_I7] +#ifdef CONFIG_SMP + ldub [%g6 + TI_CPU], %g3 + sethi %hi(__per_cpu_offset), %g2 + or %g2, %lo(__per_cpu_offset), %g2 + sllx %g3, 3, %g3 + ldx [%g2 + %g3], %g5 +#endif ldx [%g6 + TI_TASK], %g4 done - nop - nop #undef TASK_REGOFF #undef ETRAP_PSTATE1 -#undef ETRAP_PSTATE2 + +cplus_einsn_1: + sethi %uhi(CTX_CHEETAH_PLUS_NUC), %g3 +cplus_einsn_2: + sethi %hi(CTX_CHEETAH_PLUS_CTX0), %g2 + + .globl cheetah_plus_patch_etrap +cheetah_plus_patch_etrap: + /* We configure the dTLB512_0 for 4MB pages and the + * dTLB512_1 for 8K pages when in context zero. + */ + sethi %hi(cplus_einsn_1), %o0 + sethi %hi(cplus_etrap_insn_1), %o2 + lduw [%o0 + %lo(cplus_einsn_1)], %o1 + or %o2, %lo(cplus_etrap_insn_1), %o2 + stw %o1, [%o2] + flush %o2 + sethi %hi(cplus_etrap_insn_3), %o2 + or %o2, %lo(cplus_etrap_insn_3), %o2 + stw %o1, [%o2] + flush %o2 + + sethi %hi(cplus_einsn_2), %o0 + sethi %hi(cplus_etrap_insn_2), %o2 + lduw [%o0 + %lo(cplus_einsn_2)], %o1 + or %o2, %lo(cplus_etrap_insn_2), %o2 + stw %o1, [%o2] + flush %o2 + sethi %hi(cplus_etrap_insn_4), %o2 + or %o2, %lo(cplus_etrap_insn_4), %o2 + stw %o1, [%o2] + flush %o2 + + retl + nop diff --git a/arch/sparc64/kernel/head.S b/arch/sparc64/kernel/head.S index 4a286a8000b07..954093551597f 100644 --- a/arch/sparc64/kernel/head.S +++ b/arch/sparc64/kernel/head.S @@ -25,6 +25,7 @@ #include <asm/dcu.h> #include <asm/head.h> #include <asm/ttable.h> +#include <asm/mmu.h> /* This section from from _start to sparc64_boot_end should fit into * 0x0000.0000.0040.4000 to 0x0000.0000.0040.8000 and will be sharing space @@ -88,8 +89,8 @@ sparc_ramdisk_image64: * PROM entry point is on %o4 */ sparc64_boot: - BRANCH_IF_CHEETAH_BASE(g1,g5,cheetah_boot) - BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1,g5,cheetah_plus_boot) + 
BRANCH_IF_CHEETAH_BASE(g1,g7,cheetah_boot) + BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1,g7,cheetah_plus_boot) ba,pt %xcc, spitfire_boot nop @@ -102,11 +103,11 @@ cheetah_boot: mov DCR_BPE | DCR_RPE | DCR_SI | DCR_IFPOE | DCR_MS, %g1 wr %g1, %asr18 - sethi %uhi(DCU_ME|DCU_RE|DCU_HPE|DCU_SPE|DCU_SL|DCU_WE), %g5 - or %g5, %ulo(DCU_ME|DCU_RE|DCU_HPE|DCU_SPE|DCU_SL|DCU_WE), %g5 - sllx %g5, 32, %g5 - or %g5, DCU_DM | DCU_IM | DCU_DC | DCU_IC, %g5 - stxa %g5, [%g0] ASI_DCU_CONTROL_REG + sethi %uhi(DCU_ME|DCU_RE|DCU_HPE|DCU_SPE|DCU_SL|DCU_WE), %g7 + or %g7, %ulo(DCU_ME|DCU_RE|DCU_HPE|DCU_SPE|DCU_SL|DCU_WE), %g7 + sllx %g7, 32, %g7 + or %g7, DCU_DM | DCU_IM | DCU_DC | DCU_IC, %g7 + stxa %g7, [%g0] ASI_DCU_CONTROL_REG membar #Sync cheetah_generic_boot: @@ -491,7 +492,7 @@ sun4u_init: stxa %g3, [%g2] ASI_DMMU membar #Sync - BRANCH_IF_ANY_CHEETAH(g1,g5,cheetah_tlb_fixup) + BRANCH_IF_ANY_CHEETAH(g1,g7,cheetah_tlb_fixup) ba,pt %xcc, spitfire_tlb_fixup nop @@ -515,14 +516,31 @@ cheetah_tlb_fixup: membar #Sync mov 2, %g2 /* Set TLB type to cheetah+. */ - BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g5,g7,1f) + BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1,g7,1f) mov 1, %g2 /* Set TLB type to cheetah. */ -1: sethi %hi(tlb_type), %g5 - stw %g2, [%g5 + %lo(tlb_type)] +1: sethi %hi(tlb_type), %g1 + stw %g2, [%g1 + %lo(tlb_type)] - /* Patch copy/page operations to cheetah optimized versions. */ + BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1,g7,1f) + ba,pt %xcc, 2f + nop + +1: /* Patch context register writes to support nucleus page + * size correctly. + */ + call cheetah_plus_patch_etrap + nop + call cheetah_plus_patch_rtrap + nop + call cheetah_plus_patch_fpdis + nop + call cheetah_plus_patch_winfixup + nop + + +2: /* Patch copy/page operations to cheetah optimized versions. */ call cheetah_patch_copyops nop call cheetah_patch_cachetlbops @@ -549,8 +567,8 @@ spitfire_tlb_fixup: /* Set TLB type to spitfire. 
*/ mov 0, %g2 - sethi %hi(tlb_type), %g5 - stw %g2, [%g5 + %lo(tlb_type)] + sethi %hi(tlb_type), %g1 + stw %g2, [%g1 + %lo(tlb_type)] tlb_fixup_done: sethi %hi(init_thread_union), %g6 @@ -578,12 +596,18 @@ tlb_fixup_done: #endif wr %g0, ASI_P, %asi - mov 1, %g5 - sllx %g5, THREAD_SHIFT, %g5 - sub %g5, (STACKFRAME_SZ + STACK_BIAS), %g5 - add %g6, %g5, %sp + mov 1, %g1 + sllx %g1, THREAD_SHIFT, %g1 + sub %g1, (STACKFRAME_SZ + STACK_BIAS), %g1 + add %g6, %g1, %sp mov 0, %fp + /* Set per-cpu pointer initially to zero, this makes + * the boot-cpu use the in-kernel-image per-cpu areas + * before setup_per_cpu_area() is invoked. + */ + clr %g5 + wrpr %g0, 0, %wstate wrpr %g0, 0x0, %tl @@ -619,8 +643,8 @@ setup_tba: /* i0 = is_starfire */ rdpr %pstate, %o1 mov %g6, %o2 wrpr %o1, (PSTATE_AG|PSTATE_IE), %pstate - sethi %hi(sparc64_ttable_tl0), %g5 - wrpr %g5, %tba + sethi %hi(sparc64_ttable_tl0), %g1 + wrpr %g1, %tba mov %o2, %g6 /* Set up MMU globals */ @@ -685,10 +709,23 @@ spitfire_vpte_base: call init_irqwork_curcpu nop - sethi %hi(sparc64_ttable_tl0), %g5 call prom_set_trap_table - mov %g5, %o0 + sethi %hi(sparc64_ttable_tl0), %o0 + + BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g2,g3,1f) + ba,pt %xcc, 2f + nop +1: /* Start using proper page size encodings in ctx register. 
*/ + sethi %uhi(CTX_CHEETAH_PLUS_NUC), %g3 + mov PRIMARY_CONTEXT, %g1 + sllx %g3, 32, %g3 + sethi %hi(CTX_CHEETAH_PLUS_CTX0), %g2 + or %g3, %g2, %g3 + stxa %g3, [%g1] ASI_DMMU + membar #Sync + +2: rdpr %pstate, %o1 or %o1, PSTATE_IE, %o1 wrpr %o1, 0, %pstate diff --git a/arch/sparc64/kernel/rtrap.S b/arch/sparc64/kernel/rtrap.S index b7c3277bb92ac..e917752080062 100644 --- a/arch/sparc64/kernel/rtrap.S +++ b/arch/sparc64/kernel/rtrap.S @@ -222,8 +222,9 @@ rt_continue: ldx [%sp + PTREGS_OFF + PT_V9_G1], %g1 ldx [%sp + PTREGS_OFF + PT_V9_G3], %g3 ldx [%sp + PTREGS_OFF + PT_V9_G4], %g4 - ldx [%sp + PTREGS_OFF + PT_V9_G5], %g5 - ldx [%sp + PTREGS_OFF + PT_V9_G6], %g6 + brz,a,pn %l3, 1f + ldx [%sp + PTREGS_OFF + PT_V9_G5], %g5 +1: ldx [%sp + PTREGS_OFF + PT_V9_G6], %g6 ldx [%sp + PTREGS_OFF + PT_V9_G7], %g7 wrpr %g0, RTRAP_PSTATE_AG_IRQOFF, %pstate ldx [%sp + PTREGS_OFF + PT_V9_I0], %i0 @@ -250,6 +251,10 @@ rt_continue: ldx [%sp + PTREGS_OFF + PT_V9_G1], %g1 brnz,pn %l3, kern_rtt mov PRIMARY_CONTEXT, %l7 ldxa [%l7 + %l7] ASI_DMMU, %l0 +cplus_rtrap_insn_1: + sethi %hi(0), %l1 + sllx %l1, 32, %l1 + or %l0, %l1, %l0 stxa %l0, [%l7] ASI_DMMU flush %g6 rdpr %wstate, %l1 @@ -298,10 +303,10 @@ kern_fpucheck: ldub [%g6 + TI_FPDEPTH], %l5 andcc %l2, FPRS_FEF, %g0 be,pn %icc, 5f sll %o0, 3, %o5 - rd %fprs, %g5 + rd %fprs, %g1 - wr %g5, FPRS_FEF, %fprs - ldx [%o1 + %o5], %g5 + wr %g1, FPRS_FEF, %fprs + ldx [%o1 + %o5], %g1 add %g6, TI_XFSR, %o1 membar #StoreLoad | #LoadLoad sll %o0, 8, %o2 @@ -313,7 +318,7 @@ kern_fpucheck: ldub [%g6 + TI_FPDEPTH], %l5 ldda [%o4 + %o2] ASI_BLK_P, %f16 1: andcc %l2, FPRS_DU, %g0 be,pn %icc, 1f - wr %g5, 0, %gsr + wr %g1, 0, %gsr add %o2, 0x80, %o2 ldda [%o3 + %o2] ASI_BLK_P, %f32 ldda [%o4 + %o2] ASI_BLK_P, %f48 @@ -335,3 +340,21 @@ kern_fpucheck: ldub [%g6 + TI_FPDEPTH], %l5 wr %g0, FPRS_DU, %fprs ba,pt %xcc, rt_continue stb %l5, [%g6 + TI_FPDEPTH] + +cplus_rinsn_1: + sethi %uhi(CTX_CHEETAH_PLUS_NUC), %l1 + + .globl cheetah_plus_patch_rtrap 
+cheetah_plus_patch_rtrap: + /* We configure the dTLB512_0 for 4MB pages and the + * dTLB512_1 for 8K pages when in context zero. + */ + sethi %hi(cplus_rinsn_1), %o0 + sethi %hi(cplus_rtrap_insn_1), %o2 + lduw [%o0 + %lo(cplus_rinsn_1)], %o1 + or %o2, %lo(cplus_rtrap_insn_1), %o2 + stw %o1, [%o2] + flush %o2 + + retl + nop diff --git a/arch/sparc64/kernel/semaphore.c b/arch/sparc64/kernel/semaphore.c index 9ddfcb9a19001..63496c43fe173 100644 --- a/arch/sparc64/kernel/semaphore.c +++ b/arch/sparc64/kernel/semaphore.c @@ -65,30 +65,25 @@ void up(struct semaphore *sem) __asm__ __volatile__("\n" " ! up sem(%0)\n" " membar #StoreLoad | #LoadLoad\n" -"1: lduw [%0], %%g5\n" -" add %%g5, 1, %%g7\n" -" cas [%0], %%g5, %%g7\n" -" cmp %%g5, %%g7\n" +"1: lduw [%0], %%g1\n" +" add %%g1, 1, %%g7\n" +" cas [%0], %%g1, %%g7\n" +" cmp %%g1, %%g7\n" " bne,pn %%icc, 1b\n" " addcc %%g7, 1, %%g0\n" " ble,pn %%icc, 3f\n" " membar #StoreLoad | #StoreStore\n" "2:\n" " .subsection 2\n" -"3: mov %0, %%g5\n" +"3: mov %0, %%g1\n" " save %%sp, -160, %%sp\n" -" mov %%g1, %%l1\n" -" mov %%g2, %%l2\n" -" mov %%g3, %%l3\n" " call %1\n" -" mov %%g5, %%o0\n" -" mov %%l1, %%g1\n" -" mov %%l2, %%g2\n" +" mov %%g1, %%o0\n" " ba,pt %%xcc, 2b\n" -" restore %%l3, %%g0, %%g3\n" +" restore\n" " .previous\n" : : "r" (sem), "i" (__up) - : "g5", "g7", "memory", "cc"); + : "g1", "g2", "g3", "g7", "memory", "cc"); } static void __sched __down(struct semaphore * sem) @@ -127,30 +122,25 @@ void __sched down(struct semaphore *sem) __asm__ __volatile__("\n" " ! 
down sem(%0)\n" -"1: lduw [%0], %%g5\n" -" sub %%g5, 1, %%g7\n" -" cas [%0], %%g5, %%g7\n" -" cmp %%g5, %%g7\n" +"1: lduw [%0], %%g1\n" +" sub %%g1, 1, %%g7\n" +" cas [%0], %%g1, %%g7\n" +" cmp %%g1, %%g7\n" " bne,pn %%icc, 1b\n" " cmp %%g7, 1\n" " bl,pn %%icc, 3f\n" " membar #StoreLoad | #StoreStore\n" "2:\n" " .subsection 2\n" -"3: mov %0, %%g5\n" +"3: mov %0, %%g1\n" " save %%sp, -160, %%sp\n" -" mov %%g1, %%l1\n" -" mov %%g2, %%l2\n" -" mov %%g3, %%l3\n" " call %1\n" -" mov %%g5, %%o0\n" -" mov %%l1, %%g1\n" -" mov %%l2, %%g2\n" +" mov %%g1, %%o0\n" " ba,pt %%xcc, 2b\n" -" restore %%l3, %%g0, %%g3\n" +" restore\n" " .previous\n" : : "r" (sem), "i" (__down) - : "g5", "g7", "memory", "cc"); + : "g1", "g2", "g3", "g7", "memory", "cc"); } int down_trylock(struct semaphore *sem) @@ -175,20 +165,20 @@ int down_trylock(struct semaphore *sem) __asm__ __volatile__("\n" " ! down_trylock sem(%1) ret(%0)\n" -"1: lduw [%1], %%g5\n" -" sub %%g5, 1, %%g7\n" -" cmp %%g5, 1\n" +"1: lduw [%1], %%g1\n" +" sub %%g1, 1, %%g7\n" +" cmp %%g1, 1\n" " bl,pn %%icc, 2f\n" " mov 1, %0\n" -" cas [%1], %%g5, %%g7\n" -" cmp %%g5, %%g7\n" +" cas [%1], %%g1, %%g7\n" +" cmp %%g1, %%g7\n" " bne,pn %%icc, 1b\n" " mov 0, %0\n" " membar #StoreLoad | #StoreStore\n" "2:\n" : "=&r" (ret) : "r" (sem) - : "g5", "g7", "memory", "cc"); + : "g1", "g7", "memory", "cc"); return ret; } @@ -237,31 +227,25 @@ int __sched down_interruptible(struct semaphore *sem) __asm__ __volatile__("\n" " ! 
down_interruptible sem(%2) ret(%0)\n" -"1: lduw [%2], %%g5\n" -" sub %%g5, 1, %%g7\n" -" cas [%2], %%g5, %%g7\n" -" cmp %%g5, %%g7\n" +"1: lduw [%2], %%g1\n" +" sub %%g1, 1, %%g7\n" +" cas [%2], %%g1, %%g7\n" +" cmp %%g1, %%g7\n" " bne,pn %%icc, 1b\n" " cmp %%g7, 1\n" " bl,pn %%icc, 3f\n" " membar #StoreLoad | #StoreStore\n" "2:\n" " .subsection 2\n" -"3: mov %2, %%g5\n" +"3: mov %2, %%g1\n" " save %%sp, -160, %%sp\n" -" mov %%g1, %%l1\n" -" mov %%g2, %%l2\n" -" mov %%g3, %%l3\n" " call %3\n" -" mov %%g5, %%o0\n" -" mov %%l1, %%g1\n" -" mov %%l2, %%g2\n" -" mov %%l3, %%g3\n" +" mov %%g1, %%o0\n" " ba,pt %%xcc, 2b\n" -" restore %%o0, %%g0, %0\n" +" restore\n" " .previous\n" : "=r" (ret) : "0" (ret), "r" (sem), "i" (__down_interruptible) - : "g5", "g7", "memory", "cc"); + : "g1", "g2", "g3", "g7", "memory", "cc"); return ret; } diff --git a/arch/sparc64/kernel/setup.c b/arch/sparc64/kernel/setup.c index 0c9ce2bb5100a..12c3d84b7460c 100644 --- a/arch/sparc64/kernel/setup.c +++ b/arch/sparc64/kernel/setup.c @@ -47,6 +47,7 @@ #include <asm/timer.h> #include <asm/sections.h> #include <asm/setup.h> +#include <asm/mmu.h> #ifdef CONFIG_IP_PNP #include <net/ipconfig.h> @@ -157,11 +158,11 @@ int prom_callback(long *args) for_each_process(p) { mm = p->mm; - if (CTX_HWBITS(mm->context) == ctx) + if (CTX_NRBITS(mm->context) == ctx) break; } if (!mm || - CTX_HWBITS(mm->context) != ctx) + CTX_NRBITS(mm->context) != ctx) goto done; pgdp = pgd_offset(mm, va); @@ -187,12 +188,19 @@ int prom_callback(long *args) } if ((va >= KERNBASE) && (va < (KERNBASE + (4 * 1024 * 1024)))) { + unsigned long kernel_pctx = 0; + + if (tlb_type == cheetah_plus) + kernel_pctx |= (CTX_CHEETAH_PLUS_NUC | + CTX_CHEETAH_PLUS_CTX0); + /* Spitfire Errata #32 workaround */ __asm__ __volatile__("stxa %0, [%1] %2\n\t" "flush %%g6" : /* No outputs */ - : "r" (0), - "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); + : "r" (kernel_pctx), + "r" (PRIMARY_CONTEXT), + "i" (ASI_DMMU)); /* * Locked down tlb entry. 
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 1441ef81b8abe..6550d981b450c 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -89,7 +89,6 @@ void __init smp_store_cpu_info(int id) cpu_data(id).pgcache_size = 0; cpu_data(id).pte_cache[0] = NULL; cpu_data(id).pte_cache[1] = NULL; - cpu_data(id).pgdcache_size = 0; cpu_data(id).pgd_cache = NULL; cpu_data(id).idle_volume = 1; } @@ -108,6 +107,10 @@ void __init smp_callin(void) __flush_tlb_all(); + __asm__ __volatile__("mov %0, %%g5\n\t" + : /* no outputs */ + : "r" (__per_cpu_offset[cpuid])); + smp_setup_percpu_timer(); local_irq_enable(); @@ -627,7 +630,10 @@ extern unsigned long xcall_flush_tlb_all_spitfire; extern unsigned long xcall_flush_tlb_all_cheetah; extern unsigned long xcall_report_regs; extern unsigned long xcall_receive_signal; + +#ifdef DCACHE_ALIASING_POSSIBLE extern unsigned long xcall_flush_dcache_page_cheetah; +#endif extern unsigned long xcall_flush_dcache_page_spitfire; #ifdef CONFIG_DEBUG_DCFLUSH @@ -637,7 +643,7 @@ extern atomic_t dcpage_flushes_xcall; static __inline__ void __local_flush_dcache_page(struct page *page) { -#if (L1DCACHE_SIZE > PAGE_SIZE) +#ifdef DCACHE_ALIASING_POSSIBLE __flush_dcache_page(page_address(page), ((tlb_type == spitfire) && page_mapping(page) != NULL)); @@ -672,11 +678,13 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu) (u64) pg_addr, mask); } else { +#ifdef DCACHE_ALIASING_POSSIBLE data0 = ((u64)&xcall_flush_dcache_page_cheetah); cheetah_xcall_deliver(data0, __pa(pg_addr), 0, mask); +#endif } #ifdef CONFIG_DEBUG_DCFLUSH atomic_inc(&dcpage_flushes_xcall); @@ -709,10 +717,12 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page) (u64) pg_addr, mask); } else { +#ifdef DCACHE_ALIASING_POSSIBLE data0 = ((u64)&xcall_flush_dcache_page_cheetah); cheetah_xcall_deliver(data0, __pa(pg_addr), 0, mask); +#endif } #ifdef CONFIG_DEBUG_DCFLUSH atomic_inc(&dcpage_flushes_xcall); @@ -1055,74 +1065,6 @@ void 
__init smp_tick_init(void) prof_counter(boot_cpu_id) = prof_multiplier(boot_cpu_id) = 1; } -extern unsigned long cheetah_tune_scheduling(void); - -static void __init smp_tune_scheduling(void) -{ - unsigned long orig_flush_base, flush_base, flags, *p; - unsigned int ecache_size, order; - cycles_t tick1, tick2, raw; - int cpu_node; - - /* Approximate heuristic for SMP scheduling. It is an - * estimation of the time it takes to flush the L2 cache - * on the local processor. - * - * The ia32 chooses to use the L1 cache flush time instead, - * and I consider this complete nonsense. The Ultra can service - * a miss to the L1 with a hit to the L2 in 7 or 8 cycles, and - * L2 misses are what create extra bus traffic (ie. the "cost" - * of moving a process from one cpu to another). - */ - printk("SMP: Calibrating ecache flush... "); - if (tlb_type == cheetah || tlb_type == cheetah_plus) - return; - - cpu_find_by_instance(0, &cpu_node, NULL); - ecache_size = prom_getintdefault(cpu_node, - "ecache-size", (512 * 1024)); - if (ecache_size > (4 * 1024 * 1024)) - ecache_size = (4 * 1024 * 1024); - orig_flush_base = flush_base = - __get_free_pages(GFP_KERNEL, order = get_order(ecache_size)); - - if (flush_base != 0UL) { - local_irq_save(flags); - - /* Scan twice the size once just to get the TLB entries - * loaded and make sure the second scan measures pure misses. 
- */ - for (p = (unsigned long *)flush_base; - ((unsigned long)p) < (flush_base + (ecache_size<<1)); - p += (64 / sizeof(unsigned long))) - *((volatile unsigned long *)p); - - tick1 = tick_ops->get_tick(); - - __asm__ __volatile__("1:\n\t" - "ldx [%0 + 0x000], %%g1\n\t" - "ldx [%0 + 0x040], %%g2\n\t" - "ldx [%0 + 0x080], %%g3\n\t" - "ldx [%0 + 0x0c0], %%g5\n\t" - "add %0, 0x100, %0\n\t" - "cmp %0, %2\n\t" - "bne,pt %%xcc, 1b\n\t" - " nop" - : "=&r" (flush_base) - : "0" (flush_base), - "r" (flush_base + ecache_size) - : "g1", "g2", "g3", "g5"); - - tick2 = tick_ops->get_tick(); - - local_irq_restore(flags); - - raw = (tick2 - tick1); - - free_pages(orig_flush_base, order); - } -} - /* /proc/profile writes can call this, don't __init it please. */ static DEFINE_SPINLOCK(prof_setup_lock); @@ -1177,6 +1119,11 @@ void __devinit smp_prepare_boot_cpu(void) } current_thread_info()->cpu = hard_smp_processor_id(); + + __asm__ __volatile__("mov %0, %%g5\n\t" + : /* no outputs */ + : "r" (__per_cpu_offset[smp_processor_id()])); + cpu_set(smp_processor_id(), cpu_online_map); cpu_set(smp_processor_id(), phys_cpu_present_map); } @@ -1212,11 +1159,6 @@ void __init smp_cpus_done(unsigned int max_cpus) (long) num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100); - - /* We want to run this with all the other cpus spinning - * in the kernel. 
- */ - smp_tune_scheduling(); } /* This needn't do anything as we do not sleep the cpu diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c index 3cec1ebb083b0..cad5a11228006 100644 --- a/arch/sparc64/kernel/sparc64_ksyms.c +++ b/arch/sparc64/kernel/sparc64_ksyms.c @@ -59,6 +59,7 @@ #include <asm/ns87303.h> #include <asm/timer.h> #include <asm/cpudata.h> +#include <asm/rwsem.h> struct poll { int fd; @@ -174,6 +175,15 @@ EXPORT_SYMBOL(down_trylock); EXPORT_SYMBOL(down_interruptible); EXPORT_SYMBOL(up); +/* RW semaphores */ +EXPORT_SYMBOL(__down_read); +EXPORT_SYMBOL(__down_read_trylock); +EXPORT_SYMBOL(__down_write); +EXPORT_SYMBOL(__down_write_trylock); +EXPORT_SYMBOL(__up_read); +EXPORT_SYMBOL(__up_write); +EXPORT_SYMBOL(__downgrade_write); + /* Atomic counter implementation. */ EXPORT_SYMBOL(atomic_add); EXPORT_SYMBOL(atomic_add_ret); @@ -209,8 +219,11 @@ EXPORT_SYMBOL(__flushw_user); EXPORT_SYMBOL(tlb_type); EXPORT_SYMBOL(get_fb_unmapped_area); EXPORT_SYMBOL(flush_icache_range); + EXPORT_SYMBOL(flush_dcache_page); +#ifdef DCACHE_ALIASING_POSSIBLE EXPORT_SYMBOL(__flush_dcache_range); +#endif EXPORT_SYMBOL(mostek_lock); EXPORT_SYMBOL(mstk48t02_regs); @@ -350,7 +363,9 @@ EXPORT_SYMBOL(__memset); EXPORT_SYMBOL(memchr); EXPORT_SYMBOL(csum_partial); -EXPORT_SYMBOL(csum_partial_copy_sparc64); +EXPORT_SYMBOL(csum_partial_copy_nocheck); +EXPORT_SYMBOL(__csum_partial_copy_from_user); +EXPORT_SYMBOL(__csum_partial_copy_to_user); EXPORT_SYMBOL(ip_fast_csum); /* Moving data to/from/in userspace. 
*/ diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c index a9fa9a47074d8..567c91c77b20e 100644 --- a/arch/sparc64/kernel/sys_sparc32.c +++ b/arch/sparc64/kernel/sys_sparc32.c @@ -264,7 +264,7 @@ asmlinkage long compat_sys_ipc(u32 call, u32 first, u32 second, u32 third, compa switch (call) { case SEMTIMEDOP: - if (third) + if (fifth) /* sign extend semid */ return compat_sys_semtimedop((int)first, compat_ptr(ptr), second, diff --git a/arch/sparc64/kernel/trampoline.S b/arch/sparc64/kernel/trampoline.S index f1d764b2d39b5..2c8f9344b4eea 100644 --- a/arch/sparc64/kernel/trampoline.S +++ b/arch/sparc64/kernel/trampoline.S @@ -15,6 +15,7 @@ #include <asm/spitfire.h> #include <asm/processor.h> #include <asm/thread_info.h> +#include <asm/mmu.h> .data .align 8 @@ -334,6 +335,20 @@ do_unlock: call init_irqwork_curcpu nop + BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g2,g3,1f) + ba,pt %xcc, 2f + nop + +1: /* Start using proper page size encodings in ctx register. */ + sethi %uhi(CTX_CHEETAH_PLUS_NUC), %g3 + mov PRIMARY_CONTEXT, %g1 + sllx %g3, 32, %g3 + sethi %hi(CTX_CHEETAH_PLUS_CTX0), %g2 + or %g3, %g2, %g3 + stxa %g3, [%g1] ASI_DMMU + membar #Sync + +2: rdpr %pstate, %o1 or %o1, PSTATE_IE, %o1 wrpr %o1, 0, %pstate diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c index 7d0e96f00bd00..56b203a2af696 100644 --- a/arch/sparc64/kernel/traps.c +++ b/arch/sparc64/kernel/traps.c @@ -806,48 +806,6 @@ static void cheetah_flush_ecache_line(unsigned long physaddr) "i" (ASI_PHYS_USE_EC)); } -#ifdef CONFIG_SMP -unsigned long __init cheetah_tune_scheduling(void) -{ - unsigned long tick1, tick2, raw; - unsigned long flush_base = ecache_flush_physbase; - unsigned long flush_linesize = ecache_flush_linesize; - unsigned long flush_size = ecache_flush_size; - - /* Run through the whole cache to guarantee the timed loop - * is really displacing cache lines. 
- */ - __asm__ __volatile__("1: subcc %0, %4, %0\n\t" - " bne,pt %%xcc, 1b\n\t" - " ldxa [%2 + %0] %3, %%g0\n\t" - : "=&r" (flush_size) - : "0" (flush_size), "r" (flush_base), - "i" (ASI_PHYS_USE_EC), "r" (flush_linesize)); - - /* The flush area is 2 X Ecache-size, so cut this in half for - * the timed loop. - */ - flush_base = ecache_flush_physbase; - flush_linesize = ecache_flush_linesize; - flush_size = ecache_flush_size >> 1; - - tick1 = tick_ops->get_tick(); - - __asm__ __volatile__("1: subcc %0, %4, %0\n\t" - " bne,pt %%xcc, 1b\n\t" - " ldxa [%2 + %0] %3, %%g0\n\t" - : "=&r" (flush_size) - : "0" (flush_size), "r" (flush_base), - "i" (ASI_PHYS_USE_EC), "r" (flush_linesize)); - - tick2 = tick_ops->get_tick(); - - raw = (tick2 - tick1); - - return (raw - (raw >> 2)); -} -#endif - /* Unfortunately, the diagnostic access to the I-cache tags we need to * use to clear the thing interferes with I-cache coherency transactions. * diff --git a/arch/sparc64/kernel/unaligned.c b/arch/sparc64/kernel/unaligned.c index 8a9d3b6bfe5c9..4372bf32ecf6f 100644 --- a/arch/sparc64/kernel/unaligned.c +++ b/arch/sparc64/kernel/unaligned.c @@ -379,8 +379,8 @@ void kernel_mna_trap_fault(struct pt_regs *regs, unsigned int insn) printk(KERN_ALERT "Unable to handle kernel paging request in mna handler"); printk(KERN_ALERT " at virtual address %016lx\n",address); printk(KERN_ALERT "current->{mm,active_mm}->context = %016lx\n", - (current->mm ? current->mm->context : - current->active_mm->context)); + (current->mm ? CTX_HWBITS(current->mm->context) : + CTX_HWBITS(current->active_mm->context))); printk(KERN_ALERT "current->{mm,active_mm}->pgd = %016lx\n", (current->mm ? 
(unsigned long) current->mm->pgd : (unsigned long) current->active_mm->pgd)); @@ -413,7 +413,7 @@ asmlinkage void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn, u : : "r" (regs), "r" (insn) : "o0", "o1", "o2", "o3", "o4", "o5", "o7", - "g1", "g2", "g3", "g4", "g5", "g7", "cc"); + "g1", "g2", "g3", "g4", "g7", "cc"); } else { unsigned long addr = compute_effective_address(regs, insn, ((insn >> 25) & 0x1f)); diff --git a/arch/sparc64/kernel/winfixup.S b/arch/sparc64/kernel/winfixup.S index 3427d7a743e1f..ca9891a8dad82 100644 --- a/arch/sparc64/kernel/winfixup.S +++ b/arch/sparc64/kernel/winfixup.S @@ -14,6 +14,25 @@ #include <asm/thread_info.h> .text + +set_pcontext: +cplus_winfixup_insn_1: + sethi %hi(0), %l1 + mov PRIMARY_CONTEXT, %g1 + sllx %l1, 32, %l1 +cplus_winfixup_insn_2: + sethi %hi(0), %g2 + or %l1, %g2, %l1 + stxa %l1, [%g1] ASI_DMMU + flush %g6 + retl + nop + +cplus_wfinsn_1: + sethi %uhi(CTX_CHEETAH_PLUS_NUC), %l1 +cplus_wfinsn_2: + sethi %hi(CTX_CHEETAH_PLUS_CTX0), %g2 + .align 32 /* Here are the rules, pay attention. @@ -62,9 +81,8 @@ fill_fixup: wrpr %g0, 0x0, %canrestore ! Standard etrap stuff. wrpr %g2, 0x0, %wstate ! This must be consistent. wrpr %g0, 0x0, %otherwin ! We know this. - mov PRIMARY_CONTEXT, %g1 ! Change contexts... - stxa %g0, [%g1] ASI_DMMU ! Back into the nucleus. - flush %g6 ! Flush instruction buffers + call set_pcontext ! Change contexts... + nop rdpr %pstate, %l1 ! Prepare to change globals. mov %g6, %o7 ! Get current. @@ -75,6 +93,13 @@ fill_fixup: wrpr %l1, (PSTATE_IE | PSTATE_AG | PSTATE_RMO), %pstate mov %o7, %g6 ldx [%g6 + TI_TASK], %g4 +#ifdef CONFIG_SMP + ldub [%g6 + TI_CPU], %g1 + sethi %hi(__per_cpu_offset), %g2 + or %g2, %lo(__per_cpu_offset), %g2 + sllx %g1, 3, %g1 + ldx [%g2 + %g1], %g5 +#endif /* This is the same as below, except we handle this a bit special * since we must preserve %l5 and %l6, see comment above. @@ -183,9 +208,8 @@ fill_fixup_mna: wrpr %g2, 0x0, %wstate ! This must be consistent. 
wrpr %g0, 0x0, %otherwin ! We know this. - mov PRIMARY_CONTEXT, %g1 ! Change contexts... - stxa %g0, [%g1] ASI_DMMU ! Back into the nucleus. - flush %g6 ! Flush instruction buffers + call set_pcontext ! Change contexts... + nop rdpr %pstate, %l1 ! Prepare to change globals. mov %g4, %o2 ! Setup args for mov %g5, %o1 ! final call to mem_address_unaligned. @@ -196,6 +220,13 @@ fill_fixup_mna: wrpr %l1, (PSTATE_IE | PSTATE_AG | PSTATE_RMO), %pstate mov %o7, %g6 ! Get current back. ldx [%g6 + TI_TASK], %g4 ! Finish it. +#ifdef CONFIG_SMP + ldub [%g6 + TI_CPU], %g1 + sethi %hi(__per_cpu_offset), %g2 + or %g2, %lo(__per_cpu_offset), %g2 + sllx %g1, 3, %g1 + ldx [%g2 + %g1], %g5 +#endif call mem_address_unaligned add %sp, PTREGS_OFF, %o0 @@ -289,9 +320,8 @@ fill_fixup_dax: wrpr %g2, 0x0, %wstate ! This must be consistent. wrpr %g0, 0x0, %otherwin ! We know this. - mov PRIMARY_CONTEXT, %g1 ! Change contexts... - stxa %g0, [%g1] ASI_DMMU ! Back into the nucleus. - flush %g6 ! Flush instruction buffers + call set_pcontext ! Change contexts... + nop rdpr %pstate, %l1 ! Prepare to change globals. mov %g4, %o1 ! Setup args for mov %g5, %o2 ! final call to data_access_exception. @@ -302,6 +332,13 @@ fill_fixup_dax: wrpr %l1, (PSTATE_IE | PSTATE_AG | PSTATE_RMO), %pstate mov %o7, %g6 ! Get current back. ldx [%g6 + TI_TASK], %g4 ! Finish it. 
+#ifdef CONFIG_SMP + ldub [%g6 + TI_CPU], %g1 + sethi %hi(__per_cpu_offset), %g2 + or %g2, %lo(__per_cpu_offset), %g2 + sllx %g1, 3, %g1 + ldx [%g2 + %g1], %g5 +#endif call data_access_exception add %sp, PTREGS_OFF, %o0 @@ -368,3 +405,22 @@ window_dax_from_user_common: ba,pt %xcc, rtrap clr %l6 + + .globl cheetah_plus_patch_winfixup +cheetah_plus_patch_winfixup: + sethi %hi(cplus_wfinsn_1), %o0 + sethi %hi(cplus_winfixup_insn_1), %o2 + lduw [%o0 + %lo(cplus_wfinsn_1)], %o1 + or %o2, %lo(cplus_winfixup_insn_1), %o2 + stw %o1, [%o2] + flush %o2 + + sethi %hi(cplus_wfinsn_2), %o0 + sethi %hi(cplus_winfixup_insn_2), %o2 + lduw [%o0 + %lo(cplus_wfinsn_2)], %o1 + or %o2, %lo(cplus_winfixup_insn_2), %o2 + stw %o1, [%o2] + flush %o2 + + retl + nop diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile index 3cf408cb1695e..40dbeec7e5d6a 100644 --- a/arch/sparc64/lib/Makefile +++ b/arch/sparc64/lib/Makefile @@ -7,8 +7,8 @@ EXTRA_CFLAGS := -Werror lib-y := PeeCeeI.o copy_page.o clear_page.o strlen.o strncmp.o \ memscan.o strncpy_from_user.o strlen_user.o memcmp.o checksum.o \ - VISbzero.o VISmemset.o VIScsum.o VIScsumcopy.o \ - VIScsumcopyusr.o VISsave.o atomic.o bitops.o \ + bzero.o csum_copy.o csum_copy_from_user.o csum_copy_to_user.o \ + VISsave.o atomic.o bitops.o \ U1memcpy.o U1copy_from_user.o U1copy_to_user.o \ U3memcpy.o U3copy_from_user.o U3copy_to_user.o U3patch.o \ copy_in_user.o user_fixup.o memmove.o \ diff --git a/arch/sparc64/lib/U1memcpy.S b/arch/sparc64/lib/U1memcpy.S index fffec2e3cef8e..da9b520c71894 100644 --- a/arch/sparc64/lib/U1memcpy.S +++ b/arch/sparc64/lib/U1memcpy.S @@ -7,7 +7,9 @@ #ifdef __KERNEL__ #include <asm/visasm.h> #include <asm/asi.h> +#define GLOBAL_SPARE g7 #else +#define GLOBAL_SPARE g5 #define ASI_BLK_P 0xf0 #define FPRS_FEF 0x04 #ifdef MEMCPY_DEBUG @@ -123,7 +125,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ cmp %g2, 0 tne %xcc, 5 PREAMBLE - mov %o0, %g5 + mov %o0, %o4 cmp %o2, 0 be,pn %XCC, 85f or %o0, %o1, %o3 @@ -146,7 
+148,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ * of bytes to copy to make 'dst' 64-byte aligned. We pre- * subtract this from 'len'. */ - sub %o0, %o1, %o4 + sub %o0, %o1, %GLOBAL_SPARE sub %g2, 0x40, %g2 sub %g0, %g2, %g2 sub %o2, %g2, %o2 @@ -156,11 +158,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ 1: subcc %g1, 0x1, %g1 EX_LD(LOAD(ldub, %o1 + 0x00, %o3)) - EX_ST(STORE(stb, %o3, %o1 + %o4)) + EX_ST(STORE(stb, %o3, %o1 + %GLOBAL_SPARE)) bgu,pt %XCC, 1b add %o1, 0x1, %o1 - add %o1, %o4, %o0 + add %o1, %GLOBAL_SPARE, %o0 2: cmp %g2, 0x0 and %o1, 0x7, %g1 @@ -188,19 +190,19 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ 3: membar #LoadStore | #StoreStore | #StoreLoad - subcc %o2, 0x40, %o4 + subcc %o2, 0x40, %GLOBAL_SPARE add %o1, %g1, %g1 - andncc %o4, (0x40 - 1), %o4 + andncc %GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE srl %g1, 3, %g2 - sub %o2, %o4, %g3 + sub %o2, %GLOBAL_SPARE, %g3 andn %o1, (0x40 - 1), %o1 and %g2, 7, %g2 andncc %g3, 0x7, %g3 fmovd %f0, %f2 sub %g3, 0x8, %g3 - sub %o2, %o4, %o2 + sub %o2, %GLOBAL_SPARE, %o2 - add %g1, %o4, %g1 + add %g1, %GLOBAL_SPARE, %g1 subcc %o2, %g3, %o2 EX_LD(LOAD_BLK(%o1, %f0)) @@ -208,7 +210,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ add %g1, %g3, %g1 EX_LD(LOAD_BLK(%o1, %f16)) add %o1, 0x40, %o1 - sub %o4, 0x80, %o4 + sub %GLOBAL_SPARE, 0x80, %GLOBAL_SPARE EX_LD(LOAD_BLK(%o1, %f32)) add %o1, 0x40, %o1 @@ -229,11 +231,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ .align 64 1: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) - LOOP_CHUNK1(o1, o0, o4, 1f) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) - LOOP_CHUNK2(o1, o0, o4, 2f) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) - LOOP_CHUNK3(o1, o0, o4, 3f) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) ba,pt %xcc, 1b+4 faligndata %f0, %f2, %f48 1: FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) @@ -250,11 +252,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ STORE_JUMP(o0, f48, 56f) membar 
#Sync 1: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) - LOOP_CHUNK1(o1, o0, o4, 1f) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) - LOOP_CHUNK2(o1, o0, o4, 2f) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) - LOOP_CHUNK3(o1, o0, o4, 3f) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) ba,pt %xcc, 1b+4 faligndata %f2, %f4, %f48 1: FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) @@ -271,11 +273,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ STORE_JUMP(o0, f48, 57f) membar #Sync 1: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) - LOOP_CHUNK1(o1, o0, o4, 1f) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) - LOOP_CHUNK2(o1, o0, o4, 2f) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) - LOOP_CHUNK3(o1, o0, o4, 3f) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) ba,pt %xcc, 1b+4 faligndata %f4, %f6, %f48 1: FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) @@ -292,11 +294,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ STORE_JUMP(o0, f48, 58f) membar #Sync 1: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) - LOOP_CHUNK1(o1, o0, o4, 1f) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) - LOOP_CHUNK2(o1, o0, o4, 2f) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) - LOOP_CHUNK3(o1, o0, o4, 3f) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) ba,pt %xcc, 1b+4 faligndata %f6, %f8, %f48 1: FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) @@ -313,11 +315,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ STORE_JUMP(o0, f48, 59f) membar #Sync 1: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) - LOOP_CHUNK1(o1, o0, o4, 1f) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) - LOOP_CHUNK2(o1, o0, o4, 2f) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) - LOOP_CHUNK3(o1, o0, o4, 3f) + LOOP_CHUNK3(o1, o0, 
GLOBAL_SPARE, 3f) ba,pt %xcc, 1b+4 faligndata %f8, %f10, %f48 1: FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) @@ -334,11 +336,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ STORE_JUMP(o0, f48, 60f) membar #Sync 1: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) - LOOP_CHUNK1(o1, o0, o4, 1f) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) - LOOP_CHUNK2(o1, o0, o4, 2f) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) - LOOP_CHUNK3(o1, o0, o4, 3f) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) ba,pt %xcc, 1b+4 faligndata %f10, %f12, %f48 1: FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) @@ -355,11 +357,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ STORE_JUMP(o0, f48, 61f) membar #Sync 1: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) - LOOP_CHUNK1(o1, o0, o4, 1f) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) - LOOP_CHUNK2(o1, o0, o4, 2f) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) - LOOP_CHUNK3(o1, o0, o4, 3f) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) ba,pt %xcc, 1b+4 faligndata %f12, %f14, %f48 1: FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) @@ -376,11 +378,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ STORE_JUMP(o0, f48, 62f) membar #Sync 1: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) - LOOP_CHUNK1(o1, o0, o4, 1f) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) - LOOP_CHUNK2(o1, o0, o4, 2f) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) - LOOP_CHUNK3(o1, o0, o4, 3f) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) ba,pt %xcc, 1b+4 faligndata %f14, %f16, %f48 1: FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) @@ -449,18 +451,18 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ 2: membar #StoreLoad | #StoreStore VISExit retl - mov EX_RETVAL(%g5), %o0 + mov EX_RETVAL(%o4), %o0 .align 64 70: /* 16 < len <= (5 * 64) */ bne,pn %XCC, 75f sub %o0, 
%o1, %o3 -72: andn %o2, 0xf, %o4 +72: andn %o2, 0xf, %GLOBAL_SPARE and %o2, 0xf, %o2 1: EX_LD(LOAD(ldx, %o1 + 0x00, %o5)) EX_LD(LOAD(ldx, %o1 + 0x08, %g1)) - subcc %o4, 0x10, %o4 + subcc %GLOBAL_SPARE, 0x10, %GLOBAL_SPARE EX_ST(STORE(stx, %o5, %o1 + %o3)) add %o1, 0x8, %o1 EX_ST(STORE(stx, %g1, %o1 + %o3)) @@ -512,10 +514,10 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ andn %o1, 0x7, %o1 EX_LD(LOAD(ldx, %o1, %g2)) sub %o3, %g1, %o3 - andn %o2, 0x7, %o4 + andn %o2, 0x7, %GLOBAL_SPARE sllx %g2, %g1, %g2 1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3)) - subcc %o4, 0x8, %o4 + subcc %GLOBAL_SPARE, 0x8, %GLOBAL_SPARE add %o1, 0x8, %o1 srlx %g3, %o3, %o5 or %o5, %g2, %o5 @@ -544,7 +546,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ add %o1, 4, %o1 85: retl - mov EX_RETVAL(%g5), %o0 + mov EX_RETVAL(%o4), %o0 .align 32 90: EX_LD(LOAD(ldub, %o1, %g1)) @@ -553,6 +555,6 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ bgu,pt %XCC, 90b add %o1, 1, %o1 retl - mov EX_RETVAL(%g5), %o0 + mov EX_RETVAL(%o4), %o0 .size FUNC_NAME, .-FUNC_NAME diff --git a/arch/sparc64/lib/U3memcpy.S b/arch/sparc64/lib/U3memcpy.S index 8fe195a10bbad..7cae9cc6a204a 100644 --- a/arch/sparc64/lib/U3memcpy.S +++ b/arch/sparc64/lib/U3memcpy.S @@ -6,6 +6,7 @@ #ifdef __KERNEL__ #include <asm/visasm.h> #include <asm/asi.h> +#define GLOBAL_SPARE %g7 #else #define ASI_BLK_P 0xf0 #define FPRS_FEF 0x04 @@ -17,6 +18,7 @@ #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs #endif +#define GLOBAL_SPARE %g5 #endif #ifndef EX_LD @@ -84,7 +86,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ cmp %g2, 0 tne %xcc, 5 PREAMBLE - mov %o0, %g5 + mov %o0, %o4 cmp %o2, 0 be,pn %XCC, 85f or %o0, %o1, %o3 @@ -109,7 +111,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ * of bytes to copy to make 'dst' 64-byte aligned. We pre- * subtract this from 'len'. 
*/ - sub %o0, %o1, %o4 + sub %o0, %o1, GLOBAL_SPARE sub %g2, 0x40, %g2 sub %g0, %g2, %g2 sub %o2, %g2, %o2 @@ -119,11 +121,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ 1: subcc %g1, 0x1, %g1 EX_LD(LOAD(ldub, %o1 + 0x00, %o3)) - EX_ST(STORE(stb, %o3, %o1 + %o4)) + EX_ST(STORE(stb, %o3, %o1 + GLOBAL_SPARE)) bgu,pt %XCC, 1b add %o1, 0x1, %o1 - add %o1, %o4, %o0 + add %o1, GLOBAL_SPARE, %o0 2: cmp %g2, 0x0 and %o1, 0x7, %g1 @@ -149,7 +151,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ 3: LOAD(prefetch, %o1 + 0x000, #one_read) LOAD(prefetch, %o1 + 0x040, #one_read) - andn %o2, (0x40 - 1), %o4 + andn %o2, (0x40 - 1), GLOBAL_SPARE LOAD(prefetch, %o1 + 0x080, #one_read) LOAD(prefetch, %o1 + 0x0c0, #one_read) LOAD(prefetch, %o1 + 0x100, #one_read) @@ -173,10 +175,10 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ faligndata %f10, %f12, %f26 EX_LD(LOAD(ldd, %o1 + 0x040, %f0)) - subcc %o4, 0x80, %o4 + subcc GLOBAL_SPARE, 0x80, GLOBAL_SPARE add %o1, 0x40, %o1 bgu,pt %XCC, 1f - srl %o4, 6, %o3 + srl GLOBAL_SPARE, 6, %o3 ba,pt %xcc, 2f nop @@ -315,9 +317,9 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ sub %o0, %o1, %o3 72: - andn %o2, 0xf, %o4 + andn %o2, 0xf, GLOBAL_SPARE and %o2, 0xf, %o2 -1: subcc %o4, 0x10, %o4 +1: subcc GLOBAL_SPARE, 0x10, GLOBAL_SPARE EX_LD(LOAD(ldx, %o1 + 0x00, %o5)) EX_LD(LOAD(ldx, %o1 + 0x08, %g1)) EX_ST(STORE(stx, %o5, %o1 + %o3)) @@ -372,10 +374,10 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ andn %o1, 0x7, %o1 EX_LD(LOAD(ldx, %o1, %g2)) sub %o3, %g1, %o3 - andn %o2, 0x7, %o4 + andn %o2, 0x7, GLOBAL_SPARE sllx %g2, %g1, %g2 1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3)) - subcc %o4, 0x8, %o4 + subcc GLOBAL_SPARE, 0x8, GLOBAL_SPARE add %o1, 0x8, %o1 srlx %g3, %o3, %o5 or %o5, %g2, %o5 @@ -405,7 +407,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ add %o1, 4, %o1 85: retl - mov EX_RETVAL(%g5), %o0 + mov EX_RETVAL(%o4), %o0 .align 32 90: @@ -415,6 +417,6 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ bgu,pt %XCC, 90b add %o1, 1, %o1 retl - mov EX_RETVAL(%g5), 
%o0 + mov EX_RETVAL(%o4), %o0 .size FUNC_NAME, .-FUNC_NAME diff --git a/arch/sparc64/lib/VIS.h b/arch/sparc64/lib/VIS.h deleted file mode 100644 index 9d93a70e7081f..0000000000000 --- a/arch/sparc64/lib/VIS.h +++ /dev/null @@ -1,128 +0,0 @@ -/* $Id: VIS.h,v 1.4 1999/05/25 16:52:50 jj Exp $ - * VIS.h: High speed copy/clear operations utilizing the UltraSparc - * Visual Instruction Set. - * - * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) - * Copyright (C) 1996, 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) - */ - - /* VIS code can be used for numerous copy/set operation variants. - * It can be made to work in the kernel, one single instance, - * for all of memcpy, copy_to_user, and copy_from_user by setting - * the ASI src/dest globals correctly. Furthermore it can - * be used for kernel-->kernel page copies as well, a hook label - * is put in here just for this purpose. - * - * For userland, compiling this without __KERNEL__ defined makes - * it work just fine as a generic libc bcopy and memcpy. - * If for userland it is compiled with a 32bit gcc (but you need - * -Wa,-Av9a), the code will just rely on lower 32bits of - * IEU registers, if you compile it with 64bit gcc (ie. define - * __sparc_v9__), the code will use full 64bit. - */ - -#ifndef __VIS_H -#define __VIS_H - -#ifdef __KERNEL__ -#include <asm/head.h> -#include <asm/asi.h> -#else -#define ASI_AIUS 0x11 /* Secondary, user */ -#define ASI_BLK_AIUS 0x71 /* Secondary, user, blk ld/st */ -#define ASI_P 0x80 /* Primary, implicit */ -#define ASI_S 0x81 /* Secondary, implicit */ -#define ASI_BLK_COMMIT_P 0xe0 /* Primary, blk store commit */ -#define ASI_BLK_COMMIT_S 0xe1 /* Secondary, blk store commit */ -#define ASI_BLK_P 0xf0 /* Primary, blk ld/st */ -#define ASI_BLK_S 0xf1 /* Secondary, blk ld/st */ -#define FPRS_FEF 0x04 -#endif - - /* I'm telling you, they really did this chip right. 
- * Perhaps the SunSoft folks should visit some of the - * people in Sun Microelectronics and start some brain - * cell exchange program... - */ -#define ASI_BLK_XOR (ASI_P ^ ASI_BLK_P) - /* Well, things get more hairy if we use ASI_AIUS as - * USER_DS and ASI_P as KERNEL_DS, we'd reach - * commit block stores this way which is not what we want... - */ - /* ASI_P->ASI_BLK_P && ASI_AIUS->ASI_BLK_AIUS transitions can be done - * as blkasi = asi | ASI_BLK_OR - */ -#define ASI_BLK_OR (ASI_BLK_P & ~ASI_P) - /* Transition back from ASI_BLK_P->ASI_P && ASI_BLK_AIUS->ASI_AIUS is - * more complicated: - * asi = blkasi ^ (blkasi >> 3) ^ ASI_BLK_XOR1 - */ -#define ASI_BLK_XOR1 (ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P) - -#define asi_src %o3 -#define asi_dest %o4 - -#ifdef __KERNEL__ -#define ASI_SETSRC_BLK wr asi_src, 0, %asi; -#define ASI_SETSRC_NOBLK wr asi_src, 0, %asi; -#define ASI_SETDST_BLK wr asi_dest, 0, %asi; -#define ASI_SETDST_NOBLK wr asi_dest, 0, %asi; -#define ASIBLK %asi -#define ASINORMAL %asi -#define LDUB lduba -#define LDUH lduha -#define LDUW lduwa -#define LDX ldxa -#define LDD ldda -#define LDDF ldda -#define LDBLK ldda -#define STB stba -#define STH stha -#define STW stwa -#define STD stda -#define STX stxa -#define STDF stda -#define STBLK stda -#else -#define ASI_SETSRC_BLK -#define ASI_SETSRC_NOBLK -#define ASI_SETDST_BLK -#define ASI_SETDST_NOBLK -#define ASI_SETDST_SPECIAL -#define ASIBLK %asi -#define ASINORMAL -#define LDUB ldub -#define LDUH lduh -#define LDUW lduw -#define LDD ldd -#define LDX ldx -#define LDDF ldd -#define LDBLK ldda -#define STB stb -#define STH sth -#define STW stw -#define STD std -#define STX stx -#define STDF std -#define STBLK stda -#endif - -#ifdef __KERNEL__ - -#define REGS_64BIT - -#else - -#ifndef REGS_64BIT -#ifdef __sparc_v9__ -#define REGS_64BIT -#endif -#endif - -#endif - -#ifndef REGS_64BIT -#define xcc icc -#endif - -#endif diff --git a/arch/sparc64/lib/VISbzero.S b/arch/sparc64/lib/VISbzero.S deleted file mode 
100644 index 06b697bab974b..0000000000000 --- a/arch/sparc64/lib/VISbzero.S +++ /dev/null @@ -1,274 +0,0 @@ -/* $Id: VISbzero.S,v 1.11 2001/03/15 08:51:24 anton Exp $ - * VISbzero.S: High speed clear operations utilizing the UltraSparc - * Visual Instruction Set. - * - * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) - * Copyright (C) 1996, 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) - */ - -#include "VIS.h" - -#ifdef __KERNEL__ -#include <asm/visasm.h> - -#define EXN(x,y,a,b,z) \ -98: x,y; \ - .section .fixup; \ - .align 4; \ -99: ba VISbzerofixup_ret##z; \ - a, b, %o0; \ - .section __ex_table; \ - .align 4; \ - .word 98b, 99b; \ - .text; \ - .align 4; -#define EXC(x,y,a,b,c...) \ -98: x,y; \ - .section .fixup; \ - .align 4; \ -99: c; \ - ba VISbzerofixup_ret0; \ - a, b, %o0; \ - .section __ex_table; \ - .align 4; \ - .word 98b, 99b; \ - .text; \ - .align 4; -#define EXO1(x,y) \ -98: x,y; \ - .section __ex_table; \ - .align 4; \ - .word 98b, VISbzerofixup_reto1; \ - .text; \ - .align 4; -#define EX(x,y,a,b) EXN(x,y,a,b,0) -#define EX1(x,y,a,b) EXN(x,y,a,b,1) -#define EX2(x,y,a,b) EXN(x,y,a,b,2) -#define EXT(start,end,handler) \ - .section __ex_table; \ - .align 4; \ - .word start, 0, end, handler; \ - .text; \ - .align 4 -#else -#define EX(x,y,a,b) x,y -#define EX1(x,y,a,b) x,y -#define EX2(x,y,a,b) x,y -#define EXC(x,y,a,b,c...) 
x,y -#define EXO1(x,y) x,y -#define EXT(a,b,c) -#endif - -#define ZERO_BLOCKS(base, offset, source) \ - STX source, [base - offset - 0x38] ASINORMAL; \ - STX source, [base - offset - 0x30] ASINORMAL; \ - STX source, [base - offset - 0x28] ASINORMAL; \ - STX source, [base - offset - 0x20] ASINORMAL; \ - STX source, [base - offset - 0x18] ASINORMAL; \ - STX source, [base - offset - 0x10] ASINORMAL; \ - STX source, [base - offset - 0x08] ASINORMAL; \ - STX source, [base - offset - 0x00] ASINORMAL; - -#ifdef __KERNEL__ -#define RETL clr %o0 -#else -#define RETL mov %g3, %o0 -#endif - - /* Well, bzero is a lot easier to get right than bcopy... */ -#ifdef __KERNEL__ - .section __ex_table,#alloc - .section .fixup,#alloc,#execinstr -#endif - .text - .align 32 -#ifdef __KERNEL__ - .globl __bzero, __bzero_noasi -__bzero_noasi: - rd %asi, %g5 - ba,pt %xcc, __bzero+12 - mov %g5, %o4 -__bzero: - rd %asi, %g5 - wr %g0, ASI_P, %asi ! LSU Group - mov ASI_P, %o4 -#else - .globl bzero -bzero_private: -bzero: -#ifndef REGS_64BIT - srl %o1, 0, %o1 -#endif - mov %o0, %g3 -#endif - cmp %o1, 7 - bleu,pn %xcc, 17f - andcc %o0, 3, %o2 - be,a,pt %xcc, 4f - andcc %o0, 4, %g0 - cmp %o2, 3 - be,pn %xcc, 2f - EXO1(STB %g0, [%o0 + 0x00] ASINORMAL) - cmp %o2, 2 - be,pt %xcc, 2f - EX(STB %g0, [%o0 + 0x01] ASINORMAL, sub %o1, 1) - EX(STB %g0, [%o0 + 0x02] ASINORMAL, sub %o1, 2) -2: sub %o2, 4, %o2 - sub %o0, %o2, %o0 - add %o1, %o2, %o1 - andcc %o0, 4, %g0 -4: be,pt %xcc, 2f - cmp %o1, 128 - EXO1(STW %g0, [%o0] ASINORMAL) - sub %o1, 4, %o1 - add %o0, 4, %o0 -2: blu,pn %xcc, 9f - andcc %o0, 0x38, %o2 - be,pn %icc, 6f - mov 64, %o5 - andcc %o0, 8, %g0 - be,pn %icc, 1f - sub %o5, %o2, %o5 - EX(STX %g0, [%o0] ASINORMAL, sub %o1, 0) - add %o0, 8, %o0 -1: andcc %o5, 16, %g0 - be,pn %icc, 1f - sub %o1, %o5, %o1 - EX1(STX %g0, [%o0] ASINORMAL, add %g0, 0) - EX1(STX %g0, [%o0 + 8] ASINORMAL, sub %g0, 8) - add %o0, 16, %o0 -1: andcc %o5, 32, %g0 - be,pn %icc, 7f - andncc %o1, 0x3f, %o3 - EX(STX %g0, [%o0] 
ASINORMAL, add %o1, 32) - EX(STX %g0, [%o0 + 8] ASINORMAL, add %o1, 24) - EX(STX %g0, [%o0 + 16] ASINORMAL, add %o1, 16) - EX(STX %g0, [%o0 + 24] ASINORMAL, add %o1, 8) - add %o0, 32, %o0 -6: andncc %o1, 0x3f, %o3 -7: be,pn %xcc, 9f -#ifdef __KERNEL__ - or %o4, ASI_BLK_OR, %g7 - wr %g7, %g0, %asi - VISEntryHalf -#else - wr %g0, ASI_BLK_P, %asi -#endif - membar #StoreLoad | #StoreStore | #LoadStore - fzero %f0 - andcc %o3, 0xc0, %o2 - and %o1, 0x3f, %o1 - fzero %f2 - andn %o3, 0xff, %o3 - faddd %f0, %f2, %f4 - fmuld %f0, %f2, %f6 - cmp %o2, 64 - faddd %f0, %f2, %f8 - fmuld %f0, %f2, %f10 - faddd %f0, %f2, %f12 - brz,pn %o2, 10f - fmuld %f0, %f2, %f14 - be,pn %icc, 2f - EXC(STBLK %f0, [%o0 + 0x00] ASIBLK, add %o3, %o2, add %o2, %o1, %o2) - cmp %o2, 128 - be,pn %icc, 2f - EXC(STBLK %f0, [%o0 + 0x40] ASIBLK, add %o3, %o2, add %o2, %o1, %o2; sub %o2, 64, %o2) - EXC(STBLK %f0, [%o0 + 0x80] ASIBLK, add %o3, %o2, add %o2, %o1, %o2; sub %o2, 128, %o2) -2: brz,pn %o3, 12f - add %o0, %o2, %o0 -10: EX(STBLK %f0, [%o0 + 0x00] ASIBLK, add %o3, %o1) - EXC(STBLK %f0, [%o0 + 0x40] ASIBLK, add %o3, %o1, sub %o1, 64, %o1) - EXC(STBLK %f0, [%o0 + 0x80] ASIBLK, add %o3, %o1, sub %o1, 128, %o1) - EXC(STBLK %f0, [%o0 + 0xc0] ASIBLK, add %o3, %o1, sub %o1, 192, %o1) -11: subcc %o3, 256, %o3 - bne,pt %xcc, 10b - add %o0, 256, %o0 -12: -#ifdef __KERNEL__ - VISExitHalf - wr %o4, 0x0, %asi -#else -#ifndef REGS_64BIT - wr %g0, FPRS_FEF, %fprs -#endif -#endif - membar #StoreLoad | #StoreStore -9: andcc %o1, 0xf8, %o2 - be,pn %xcc, 13f - andcc %o1, 7, %o1 -#ifdef __KERNEL__ -14: sethi %hi(13f), %o4 - srl %o2, 1, %o3 - sub %o4, %o3, %o4 - jmpl %o4 + %lo(13f), %g0 - add %o0, %o2, %o0 -#else -14: rd %pc, %o4 - srl %o2, 1, %o3 - sub %o4, %o3, %o4 - jmpl %o4 + (13f - 14b), %g0 - add %o0, %o2, %o0 -#endif -12: ZERO_BLOCKS(%o0, 0xc8, %g0) - ZERO_BLOCKS(%o0, 0x88, %g0) - ZERO_BLOCKS(%o0, 0x48, %g0) - ZERO_BLOCKS(%o0, 0x08, %g0) - EXT(12b,13f,VISbzerofixup_zb) -13: be,pn %xcc, 8f - andcc %o1, 4, %g0 - 
be,pn %xcc, 1f - andcc %o1, 2, %g0 - EX(STW %g0, [%o0] ASINORMAL, and %o1, 7) - add %o0, 4, %o0 -1: be,pn %xcc, 1f - andcc %o1, 1, %g0 - EX(STH %g0, [%o0] ASINORMAL, and %o1, 3) - add %o0, 2, %o0 -1: bne,a,pn %xcc, 8f - EX(STB %g0, [%o0] ASINORMAL, add %g0, 1) -8: -#ifdef __KERNEL__ - wr %g5, %g0, %asi -#endif - retl - RETL -17: be,pn %xcc, 13b - orcc %o1, 0, %g0 - be,pn %xcc, 0f -8: add %o0, 1, %o0 - subcc %o1, 1, %o1 - bne,pt %xcc, 8b - EX(STB %g0, [%o0 - 1] ASINORMAL, add %o1, 1) -0: -#ifdef __KERNEL__ - wr %g5, %g0, %asi -#endif - retl - RETL - -#ifdef __KERNEL__ - .section .fixup - .align 4 -VISbzerofixup_reto1: - mov %o1, %o0 -VISbzerofixup_ret0: - wr %g5, %g0, %asi - retl - wr %g0, 0, %fprs -VISbzerofixup_ret1: - and %o5, 0x30, %o5 - add %o5, %o1, %o5 - ba,pt %xcc, VISbzerofixup_ret0 - add %o0, %o5, %o0 -VISbzerofixup_ret2: - and %o5, 0x20, %o5 - add %o5, %o1, %o5 - ba,pt %xcc, VISbzerofixup_ret0 - add %o0, %o5, %o0 -VISbzerofixup_zb: - andcc %o1, 7, %o1 - sll %g2, 3, %g2 - add %o1, 256, %o1 - ba,pt %xcc, VISbzerofixup_ret0 - sub %o1, %g2, %o0 -#endif diff --git a/arch/sparc64/lib/VIScsum.S b/arch/sparc64/lib/VIScsum.S deleted file mode 100644 index ae00e9fb17e6e..0000000000000 --- a/arch/sparc64/lib/VIScsum.S +++ /dev/null @@ -1,546 +0,0 @@ -/* $Id: VIScsum.S,v 1.7 2002/02/09 19:49:30 davem Exp $ - * VIScsum.S: High bandwidth IP checksumming utilizing the UltraSparc - * Visual Instruction Set. - * - * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) - * Copyright (C) 2000 David S. Miller (davem@redhat.com) - * - * Based on older sparc32/sparc64 checksum.S, which is: - * - * Copyright(C) 1995 Linus Torvalds - * Copyright(C) 1995 Miguel de Icaza - * Copyright(C) 1996, 1997 David S. Miller - * derived from: - * Linux/Alpha checksum c-code - * Linux/ix86 inline checksum assembly - * RFC1071 Computing the Internet Checksum (esp. 
Jacobsons m68k code) - * David Mosberger-Tang for optimized reference c-code - * BSD4.4 portable checksum routine - */ - -#ifdef __sparc_v9__ -#define STACKOFF 2175 -#else -#define STACKOFF 64 -#endif - -#ifdef __KERNEL__ -#include <asm/head.h> -#include <asm/asi.h> -#include <asm/visasm.h> -#include <asm/thread_info.h> -#else -#define ASI_BLK_P 0xf0 -#define FRPS_FEF 0x04 -#endif - -/* Dobrou noc, SunSoft engineers. Spete sladce. - * This has a couple of tricks in and those - * tricks are UltraLinux trade secrets :)) - */ - -#define START_THE_TRICK(fz,f0,f2,f4,f6,f8,f10) \ - fcmpgt32 %fz, %f0, %g1 /* FPM Group */; \ - fcmpgt32 %fz, %f2, %g2 /* FPM Group */; \ - fcmpgt32 %fz, %f4, %g3 /* FPM Group */; \ - inc %g1 /* IEU0 Group */; \ - fcmpgt32 %fz, %f6, %g5 /* FPM */; \ - srl %g1, 1, %g1 /* IEU0 Group */; \ - fcmpgt32 %fz, %f8, %g7 /* FPM */; \ - inc %g2 /* IEU0 Group */; \ - fcmpgt32 %fz, %f10, %o3 /* FPM */; \ - srl %g2, 1, %g2 /* IEU0 Group */; \ - inc %g3 /* IEU1 */; \ - srl %g3, 1, %g3 /* IEU0 Group */; \ - add %o2, %g1, %o2 /* IEU1 */; \ - add %o2, %g2, %o2 /* IEU0 Group */; \ - inc %g5 /* IEU1 */; \ - add %o2, %g3, %o2 /* IEU0 Group */; - -#define DO_THE_TRICK(O12,O14,f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14) \ - srl %g5, 1, %g5 /* IEU0 Group */; \ - fpadd32 %F0, %f0, %F0 /* FPA */; \ - fcmpgt32 %O12, %f12, %o4 /* FPM */; \ - inc %g7 /* IEU0 Group */; \ - fpadd32 %F2, %f2, %F2 /* FPA */; \ - fcmpgt32 %O14, %f14, %o5 /* FPM */; \ - add %o2, %g5, %o2 /* IEU1 Group */; \ - fpadd32 %F4, %f4, %F4 /* FPA */; \ - fcmpgt32 %f0, %F0, %g1 /* FPM */; \ - srl %g7, 1, %g7 /* IEU0 Group */; \ - fpadd32 %F6, %f6, %F6 /* FPA */; \ - fcmpgt32 %f2, %F2, %g2 /* FPM */; \ - add %o2, %g7, %o2 /* IEU0 Group */; \ - fpadd32 %F8, %f8, %F8 /* FPA */; \ - fcmpgt32 %f4, %F4, %g3 /* FPM */; \ - inc %o3 /* IEU0 Group */; \ - fpadd32 %F10, %f10, %F10 /* FPA */; \ - fcmpgt32 %f6, %F6, %g5 /* FPM */; \ - srl %o3, 1, %o3 /* IEU0 Group */; \ - fpadd32 %F12, %f12, %F12 /* FPA */; 
\ - fcmpgt32 %f8, %F8, %g7 /* FPM */; \ - add %o2, %o3, %o2 /* IEU0 Group */; \ - fpadd32 %F14, %f14, %F14 /* FPA */; \ - fcmpgt32 %f10, %F10, %o3 /* FPM */; \ - inc %o4 /* IEU0 Group */; \ - inc %o5 /* IEU1 */; \ - srl %o4, 1, %o4 /* IEU0 Group */; \ - inc %g1 /* IEU1 */; \ - srl %o5, 1, %o5 /* IEU0 Group */; \ - add %o2, %o4, %o2 /* IEU1 */; \ - srl %g1, 1, %g1 /* IEU0 Group */; \ - add %o2, %o5, %o2 /* IEU1 */; \ - inc %g2 /* IEU0 Group */; \ - add %o2, %g1, %o2 /* IEU1 */; \ - srl %g2, 1, %g2 /* IEU0 Group */; \ - inc %g3 /* IEU1 */; \ - srl %g3, 1, %g3 /* IEU0 Group */; \ - add %o2, %g2, %o2 /* IEU1 */; \ - inc %g5 /* IEU0 Group */; \ - add %o2, %g3, %o2 /* IEU0 */; - -#define END_THE_TRICK(O12,O14,f0,f2,f4,f6,f8,f10,f12,f14,S0,S1,S2,S3,T0,T1,U0,fz) \ - srl %g5, 1, %g5 /* IEU0 Group */; \ - fpadd32 %f2, %f0, %S0 /* FPA */; \ - fcmpgt32 %O12, %f12, %o4 /* FPM */; \ - inc %g7 /* IEU0 Group */; \ - fpadd32 %f6, %f4, %S1 /* FPA */; \ - fcmpgt32 %O14, %f14, %o5 /* FPM */; \ - srl %g7, 1, %g7 /* IEU0 Group */; \ - fpadd32 %f10, %f8, %S2 /* FPA */; \ - fcmpgt32 %f0, %S0, %g1 /* FPM */; \ - inc %o3 /* IEU0 Group */; \ - fpadd32 %f14, %f12, %S3 /* FPA */; \ - fcmpgt32 %f4, %S1, %g2 /* FPM */; \ - add %o2, %g5, %o2 /* IEU0 Group */; \ - fpadd32 %S0, %S1, %T0 /* FPA */; \ - fcmpgt32 %f8, %S2, %g3 /* FPM */; \ - add %o2, %g7, %o2 /* IEU0 Group */; \ - fzero %fz /* FPA */; \ - fcmpgt32 %f12, %S3, %g5 /* FPM */; \ - srl %o3, 1, %o3 /* IEU0 Group */; \ - fpadd32 %S2, %S3, %T1 /* FPA */; \ - fcmpgt32 %S0, %T0, %g7 /* FPM */; \ - add %o2, %o3, %o2 /* IEU0 Group */; \ - fpadd32 %T0, %T1, %U0 /* FPA */; \ - fcmpgt32 %S2, %T1, %o3 /* FPM */; \ - inc %o4 /* IEU0 Group */; \ - inc %o5 /* IEU1 */; \ - srl %o4, 1, %o4 /* IEU0 Group */; \ - inc %g1 /* IEU1 */; \ - add %o2, %o4, %o2 /* IEU0 Group */; \ - fcmpgt32 %fz, %f2, %o4 /* FPM */; \ - srl %o5, 1, %o5 /* IEU0 Group */; \ - inc %g2 /* IEU1 */; \ - add %o2, %o5, %o2 /* IEU0 Group */; \ - fcmpgt32 %fz, %f6, %o5 /* FPM */; \ - srl 
%g1, 1, %g1 /* IEU0 Group */; \ - inc %g3 /* IEU1 */; \ - add %o2, %g1, %o2 /* IEU0 Group */; \ - fcmpgt32 %fz, %f10, %g1 /* FPM */; \ - srl %g2, 1, %g2 /* IEU0 Group */; \ - inc %g5 /* IEU1 */; \ - add %o2, %g2, %o2 /* IEU0 Group */; \ - fcmpgt32 %fz, %f14, %g2 /* FPM */; \ - srl %g3, 1, %g3 /* IEU0 Group */; \ - inc %g7 /* IEU1 */; \ - add %o2, %g3, %o2 /* IEU0 Group */; \ - fcmpgt32 %fz, %S1, %g3 /* FPM */; \ - srl %g5, 1, %g5 /* IEU0 Group */; \ - inc %o3 /* IEU1 */; \ - add %o2, %g5, %o2 /* IEU0 Group */; \ - fcmpgt32 %fz, %S3, %g5 /* FPM */; \ - srl %g7, 1, %g7 /* IEU0 Group */; \ - inc %o4 /* IEU1 */; \ - add %o2, %g7, %o2 /* IEU0 Group */; \ - fcmpgt32 %fz, %T1, %g7 /* FPM */; \ - srl %o3, 1, %o3 /* IEU0 Group */; \ - inc %o5 /* IEU1 */; \ - add %o2, %o3, %o2 /* IEU0 Group */; \ - fcmpgt32 %T0, %U0, %o3 /* FPM */; \ - srl %o4, 1, %o4 /* IEU0 Group */; \ - inc %g1 /* IEU1 */; \ - sub %o2, %o4, %o2 /* IEU0 Group */; \ - fcmpgt32 %fz, %U0, %o4 /* FPM */; \ - srl %o5, 1, %o5 /* IEU0 Group */; \ - inc %g2 /* IEU1 */; \ - srl %g1, 1, %g1 /* IEU0 Group */; \ - sub %o2, %o5, %o2 /* IEU1 */; \ - std %U0, [%sp + STACKOFF] /* Store */; \ - srl %g2, 1, %g2 /* IEU0 Group */; \ - sub %o2, %g1, %o2 /* IEU1 */; \ - inc %g3 /* IEU0 Group */; \ - sub %o2, %g2, %o2 /* IEU1 */; \ - srl %g3, 1, %g3 /* IEU0 Group */; \ - inc %g5 /* IEU1 */; \ - srl %g5, 1, %g5 /* IEU0 Group */; \ - sub %o2, %g3, %o2 /* IEU1 */; \ - ldx [%sp + STACKOFF], %o5 /* Load Group */; \ - inc %g7 /* IEU0 */; \ - sub %o2, %g5, %o2 /* IEU1 */; \ - srl %g7, 1, %g7 /* IEU0 Group */; \ - inc %o3 /* IEU1 */; \ - srl %o3, 1, %o3 /* IEU0 Group */; \ - sub %o2, %g7, %o2 /* IEU1 */; \ - inc %o4 /* IEU0 Group */; \ - add %o2, %o3, %o2 /* IEU1 */; \ - srl %o4, 1, %o4 /* IEU0 Group */; \ - sub %o2, %o4, %o2 /* IEU0 Group */; \ - addcc %o2, %o5, %o2 /* IEU1 Group */; \ - bcs,a,pn %xcc, 33f /* CTI */; \ - add %o2, 1, %o2 /* IEU0 */; \ -33: /* That's it */; - -#define CSUM_LASTCHUNK(offset) \ - ldx [%o0 - offset - 0x10], 
%g2; \ - ldx [%o0 - offset - 0x08], %g3; \ - addcc %g2, %o2, %o2; \ - bcs,a,pn %xcc, 31f; \ - add %o2, 1, %o2; \ -31: addcc %g3, %o2, %o2; \ - bcs,a,pn %xcc, 32f; \ - add %o2, 1, %o2; \ -32: - - .text - .globl csum_partial - .align 32 -csum_partial: - andcc %o0, 7, %g0 /* IEU1 Group */ - be,pt %icc, 4f /* CTI */ - andcc %o0, 0x38, %g3 /* IEU1 */ - mov 1, %g5 /* IEU0 Group */ - cmp %o1, 6 /* IEU1 */ - bl,pn %icc, 21f /* CTI */ - andcc %o0, 1, %g0 /* IEU1 Group */ - bne,pn %icc, csump_really_slow /* CTI */ - andcc %o0, 2, %g0 /* IEU1 Group */ - be,pt %icc, 1f /* CTI */ - and %o0, 4, %g7 /* IEU0 */ - lduh [%o0], %g2 /* Load */ - sub %o1, 2, %o1 /* IEU0 Group */ - add %o0, 2, %o0 /* IEU1 */ - andcc %o0, 4, %g7 /* IEU1 Group */ - sll %g5, 16, %g5 /* IEU0 */ - sll %g2, 16, %g2 /* IEU0 Group */ - addcc %g2, %o2, %o2 /* IEU1 Group (regdep) */ - bcs,a,pn %icc, 1f /* CTI */ - add %o2, %g5, %o2 /* IEU0 */ -1: ld [%o0], %g2 /* Load */ - brz,a,pn %g7, 4f /* CTI+IEU1 Group */ - and %o0, 0x38, %g3 /* IEU0 */ - add %o0, 4, %o0 /* IEU0 Group */ - sub %o1, 4, %o1 /* IEU1 */ - addcc %g2, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %icc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: and %o0, 0x38, %g3 /* IEU1 Group */ -4: srl %o2, 0, %o2 /* IEU0 Group */ - mov 0x40, %g1 /* IEU1 */ - brz,pn %g3, 3f /* CTI+IEU1 Group */ - sub %g1, %g3, %g1 /* IEU0 */ - cmp %o1, 56 /* IEU1 Group */ - blu,pn %icc, 20f /* CTI */ - andcc %o0, 8, %g0 /* IEU1 Group */ - be,pn %icc, 1f /* CTI */ - ldx [%o0], %g2 /* Load */ - add %o0, 8, %o0 /* IEU0 Group */ - sub %o1, 8, %o1 /* IEU1 */ - addcc %g2, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: andcc %g1, 0x10, %g0 /* IEU1 Group */ - be,pn %icc, 2f /* CTI */ - and %g1, 0x20, %g1 /* IEU0 */ - ldx [%o0], %g2 /* Load */ - ldx [%o0+8], %g3 /* Load Group */ - add %o0, 16, %o0 /* IEU0 */ - sub %o1, 16, %o1 /* IEU1 */ - addcc %g2, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: addcc %g3, 
%o2, %o2 /* IEU1 Group */ - bcs,a,pn %xcc, 2f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -2: brz,pn %g1, 3f /* CTI+IEU1 Group */ - ldx [%o0], %g2 /* Load */ - ldx [%o0+8], %g3 /* Load Group */ - ldx [%o0+16], %g5 /* Load Group */ - ldx [%o0+24], %g7 /* Load Group */ - add %o0, 32, %o0 /* IEU0 */ - sub %o1, 32, %o1 /* IEU1 */ - addcc %g2, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: addcc %g3, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: addcc %g5, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: addcc %g7, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %xcc, 3f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -3: cmp %o1, 0xc0 /* IEU1 Group */ - blu,pn %icc, 20f /* CTI */ - sllx %o2, 32, %g5 /* IEU0 */ -#ifdef __KERNEL__ - VISEntry -#endif - addcc %o2, %g5, %o2 /* IEU1 Group */ - sub %o1, 0xc0, %o1 /* IEU0 */ - wr %g0, ASI_BLK_P, %asi /* LSU Group */ - membar #StoreLoad /* LSU Group */ - srlx %o2, 32, %o2 /* IEU0 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU1 */ -1: andcc %o1, 0x80, %g0 /* IEU1 Group */ - bne,pn %icc, 7f /* CTI */ - andcc %o1, 0x40, %g0 /* IEU1 Group */ - be,pn %icc, 6f /* CTI */ - fzero %f12 /* FPA */ - fzero %f14 /* FPA Group */ - ldda [%o0 + 0x000] %asi, %f16 - ldda [%o0 + 0x040] %asi, %f32 - ldda [%o0 + 0x080] %asi, %f48 - START_THE_TRICK(f12,f16,f18,f20,f22,f24,f26) - ba,a,pt %xcc, 3f -6: sub %o0, 0x40, %o0 /* IEU0 Group */ - fzero %f28 /* FPA */ - fzero %f30 /* FPA Group */ - ldda [%o0 + 0x040] %asi, %f32 - ldda [%o0 + 0x080] %asi, %f48 - ldda [%o0 + 0x0c0] %asi, %f0 - START_THE_TRICK(f28,f32,f34,f36,f38,f40,f42) - ba,a,pt %xcc, 4f -7: bne,pt %icc, 8f /* CTI */ - fzero %f44 /* FPA */ - add %o0, 0x40, %o0 /* IEU0 Group */ - fzero %f60 /* FPA */ - fzero %f62 /* FPA Group */ - ldda [%o0 - 0x040] %asi, %f0 - ldda [%o0 + 0x000] %asi, %f16 - ldda [%o0 + 0x040] %asi, %f32 - START_THE_TRICK(f60,f0,f2,f4,f6,f8,f10) - ba,a,pt %xcc, 
2f -8: add %o0, 0x80, %o0 /* IEU0 Group */ - fzero %f46 /* FPA */ - ldda [%o0 - 0x080] %asi, %f48 - ldda [%o0 - 0x040] %asi, %f0 - ldda [%o0 + 0x000] %asi, %f16 - START_THE_TRICK(f44,f48,f50,f52,f54,f56,f58) -1: DO_THE_TRICK(f44,f46,f48,f50,f52,f54,f56,f58,f60,f62,f0,f2,f4,f6,f8,f10,f12,f14) - ldda [%o0 + 0x040] %asi, %f32 -2: DO_THE_TRICK(f60,f62,f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30) - ldda [%o0 + 0x080] %asi, %f48 -3: DO_THE_TRICK(f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46) - ldda [%o0 + 0x0c0] %asi, %f0 -4: DO_THE_TRICK(f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,f48,f50,f52,f54,f56,f58,f60,f62) - add %o0, 0x100, %o0 /* IEU0 Group */ - subcc %o1, 0x100, %o1 /* IEU1 */ - bgeu,a,pt %icc, 1b /* CTI */ - ldda [%o0 + 0x000] %asi, %f16 - membar #Sync /* LSU Group */ - DO_THE_TRICK(f44,f46,f48,f50,f52,f54,f56,f58,f60,f62,f0,f2,f4,f6,f8,f10,f12,f14) - END_THE_TRICK(f60,f62,f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30) -#ifdef __KERNEL__ - ldub [%g6 + TI_CURRENT_DS], %g7 -#endif - and %o1, 0x3f, %o1 /* IEU0 Group */ -#ifdef __KERNEL__ - VISExit - wr %g7, %g0, %asi -#endif -20: andcc %o1, 0xf0, %g1 /* IEU1 Group */ - be,pn %icc, 23f /* CTI */ - and %o1, 0xf, %o3 /* IEU0 */ -#ifdef __KERNEL__ -22: sll %g1, 1, %o4 /* IEU0 Group */ - sethi %hi(23f), %g7 /* IEU1 */ - sub %g7, %o4, %g7 /* IEU0 Group */ - jmpl %g7 + %lo(23f), %g0 /* CTI Group brk forced*/ - add %o0, %g1, %o0 /* IEU0 */ -#else -22: rd %pc, %g7 /* LSU Group+4bubbles */ - sll %g1, 1, %o4 /* IEU0 Group */ - sub %g7, %o4, %g7 /* IEU0 Group (regdep) */ - jmpl %g7 + (23f - 22b), %g0 /* CTI Group brk forced*/ - add %o0, %g1, %o0 /* IEU0 */ -#endif - CSUM_LASTCHUNK(0xe0) - CSUM_LASTCHUNK(0xd0) - CSUM_LASTCHUNK(0xc0) - CSUM_LASTCHUNK(0xb0) - CSUM_LASTCHUNK(0xa0) - CSUM_LASTCHUNK(0x90) - CSUM_LASTCHUNK(0x80) - CSUM_LASTCHUNK(0x70) - CSUM_LASTCHUNK(0x60) - CSUM_LASTCHUNK(0x50) - CSUM_LASTCHUNK(0x40) - CSUM_LASTCHUNK(0x30) - CSUM_LASTCHUNK(0x20) - 
CSUM_LASTCHUNK(0x10) - CSUM_LASTCHUNK(0x00) -23: brnz,pn %o3, 26f /* CTI+IEU1 Group */ -24: sllx %o2, 32, %g1 /* IEU0 */ -25: addcc %o2, %g1, %o0 /* IEU1 Group */ - srlx %o0, 32, %o0 /* IEU0 Group (regdep) */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o0, 1, %o0 /* IEU1 */ -1: retl /* CTI Group brk forced*/ - srl %o0, 0, %o0 /* IEU0 */ -26: andcc %o1, 8, %g0 /* IEU1 Group */ - be,pn %icc, 1f /* CTI */ - ldx [%o0], %g3 /* Load */ - add %o0, 8, %o0 /* IEU0 Group */ - addcc %g3, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: andcc %o1, 4, %g0 /* IEU1 Group */ - be,a,pn %icc, 1f /* CTI */ - clr %g2 /* IEU0 */ - ld [%o0], %g2 /* Load */ - add %o0, 4, %o0 /* IEU0 Group */ - sllx %g2, 32, %g2 /* IEU0 Group */ -1: andcc %o1, 2, %g0 /* IEU1 */ - be,a,pn %icc, 1f /* CTI */ - clr %o4 /* IEU0 Group */ - lduh [%o0], %o4 /* Load */ - add %o0, 2, %o0 /* IEU1 */ - sll %o4, 16, %o4 /* IEU0 Group */ -1: andcc %o1, 1, %g0 /* IEU1 */ - be,a,pn %icc, 1f /* CTI */ - clr %o5 /* IEU0 Group */ - ldub [%o0], %o5 /* Load */ - sll %o5, 8, %o5 /* IEU0 Group */ -1: or %g2, %o4, %o4 /* IEU1 */ - or %o5, %o4, %o4 /* IEU0 Group (regdep) */ - addcc %o4, %o2, %o2 /* IEU1 Group (regdep) */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: ba,pt %xcc, 25b /* CTI Group */ - sllx %o2, 32, %g1 /* IEU0 */ -21: srl %o2, 0, %o2 /* IEU0 Group */ - cmp %o1, 0 /* IEU1 */ - be,pn %icc, 24b /* CTI */ - andcc %o1, 4, %g0 /* IEU1 Group */ - be,a,pn %icc, 1f /* CTI */ - clr %g2 /* IEU0 */ - lduh [%o0], %g3 /* Load */ - lduh [%o0+2], %g2 /* Load Group */ - add %o0, 4, %o0 /* IEU0 Group */ - sllx %g3, 48, %g3 /* IEU0 Group */ - sllx %g2, 32, %g2 /* IEU0 Group */ - or %g3, %g2, %g2 /* IEU0 Group */ -1: andcc %o1, 2, %g0 /* IEU1 */ - be,a,pn %icc, 1f /* CTI */ - clr %o4 /* IEU0 Group */ - lduh [%o0], %o4 /* Load */ - add %o0, 2, %o0 /* IEU1 */ - sll %o4, 16, %o4 /* IEU0 Group */ -1: andcc %o1, 1, %g0 /* IEU1 */ - be,a,pn %icc, 1f /* CTI */ - clr %o5 /* IEU0 Group */ - 
ldub [%o0], %o5 /* Load */ - sll %o5, 8, %o5 /* IEU0 Group */ -1: or %g2, %o4, %o4 /* IEU1 */ - or %o5, %o4, %o4 /* IEU0 Group (regdep) */ - addcc %o4, %o2, %o2 /* IEU1 Group (regdep) */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: ba,pt %xcc, 25b /* CTI Group */ - sllx %o2, 32, %g1 /* IEU0 */ - - /* When buff is byte aligned and len is large, we backoff to - * this really slow handling. The issue is that we cannot do - * the VIS stuff when buff is byte aligned as unaligned.c will - * not fix it up. - */ -csump_really_slow: - mov %o0, %o3 - mov %o1, %o4 - cmp %o1, 0 - ble,pn %icc, 9f - mov 0, %o0 - andcc %o3, 1, %o5 - be,pt %icc, 1f - sra %o4, 1, %g3 - add %o1, -1, %o4 - ldub [%o3], %o0 - add %o3, 1, %o3 - sra %o4, 1, %g3 -1: - cmp %g3, 0 - be,pt %icc, 3f - and %o4, 1, %g2 - and %o3, 2, %g2 - brz,a,pt %g2, 1f - sra %g3, 1, %g3 - add %g3, -1, %g3 - add %o4, -2, %o4 - lduh [%o3], %g2 - add %o3, 2, %o3 - add %o0, %g2, %o0 - sra %g3, 1, %g3 -1: - cmp %g3, 0 - be,pt %icc, 2f - and %o4, 2, %g2 -1: - ld [%o3], %g2 - addcc %o0, %g2, %o0 - addx %o0, %g0, %o0 - addcc %g3, -1, %g3 - bne,pt %icc, 1b - add %o3, 4, %o3 - srl %o0, 16, %o1 - sethi %hi(64512), %g2 - or %g2, 1023, %g2 - and %o0, %g2, %g3 - add %g3, %o1, %g3 - srl %g3, 16, %o0 - and %g3, %g2, %g2 - add %g2, %o0, %g3 - sll %g3, 16, %g3 - srl %g3, 16, %o0 - and %o4, 2, %g2 -2: - cmp %g2, 0 - be,pt %icc, 3f - and %o4, 1, %g2 - lduh [%o3], %g2 - add %o3, 2, %o3 - add %o0, %g2, %o0 - and %o4, 1, %g2 -3: - cmp %g2, 0 - be,pt %icc, 1f - srl %o0, 16, %o1 - ldub [%o3], %g2 - sll %g2, 8, %g2 - add %o0, %g2, %o0 - srl %o0, 16, %o1 -1: - sethi %hi(64512), %g2 - or %g2, 1023, %g2 - cmp %o5, 0 - and %o0, %g2, %g3 - add %g3, %o1, %g3 - srl %g3, 16, %o0 - and %g3, %g2, %g2 - add %g2, %o0, %g3 - sll %g3, 16, %g3 - srl %g3, 16, %o0 - srl %g3, 24, %g3 - and %o0, 255, %g2 - sll %g2, 8, %g2 - bne,pt %icc, 1f - or %g3, %g2, %g2 -9: - mov %o0, %g2 -1: - addcc %g2, %o2, %g2 - addx %g2, %g0, %g2 - retl - srl %g2, 0, %o0 
diff --git a/arch/sparc64/lib/VIScsumcopy.S b/arch/sparc64/lib/VIScsumcopy.S deleted file mode 100644 index d4caa955ea738..0000000000000 --- a/arch/sparc64/lib/VIScsumcopy.S +++ /dev/null @@ -1,897 +0,0 @@ -/* $Id: VIScsumcopy.S,v 1.8 2000/02/20 23:21:39 davem Exp $ - * VIScsumcopy.S: High bandwidth IP checksumming with simultaneous - * copying utilizing the UltraSparc Visual Instruction Set. - * - * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) - * - * Based on older sparc32/sparc64 checksum.S, which is: - * - * Copyright(C) 1995 Linus Torvalds - * Copyright(C) 1995 Miguel de Icaza - * Copyright(C) 1996,1997 David S. Miller - * derived from: - * Linux/Alpha checksum c-code - * Linux/ix86 inline checksum assembly - * RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code) - * David Mosberger-Tang for optimized reference c-code - * BSD4.4 portable checksum routine - */ - -#ifdef __sparc_v9__ -#define STACKOFF 0x7ff+128 -#else -#define STACKOFF 64 -#endif - -#ifdef __KERNEL__ -#include <asm/head.h> -#include <asm/asi.h> -#include <asm/page.h> -#include <asm/visasm.h> -#include <asm/thread_info.h> -#define ASI_BLK_XOR 0 -#define ASI_BLK_XOR1 (ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P) -#define ASI_BLK_OR (ASI_BLK_P & ~ASI_P) -#else -#define ASI_P 0x80 -#define ASI_BLK_P 0xf0 -#define FRPS_FEF 0x04 -#define FPRS_DU 0x02 -#define FPRS_DL 0x01 -#define ASI_BLK_XOR (ASI_BLK_P ^ ASI_P) -#endif - -#define src o0 -#define dst o1 -#define len o2 -#define sum o3 -#define x1 g1 -#define x2 g2 -#define x3 o4 -#define x4 g4 -#define x5 g5 -#define x6 g7 -#define x7 g3 -#define x8 o5 - -/* Dobrou noc, SunSoft engineers. Spete sladce. - * This has a couple of tricks in and those - * tricks are UltraLinux trade secrets :)) - * Once AGAIN, the SunSoft engineers are caught - * asleep at the keyboard :)). - * The main loop does about 20 superscalar cycles - * per 64bytes checksummed/copied. 
- */ - -#define LDBLK(O0) \ - ldda [%src] %asi, %O0 /* Load Group */ - -#define STBLK \ - stda %f48, [%dst] ASI_BLK_P /* Store */ - -#define ST(fx,off) \ - std %fx, [%dst + off] /* Store */ - -#define SYNC \ - membar #Sync - - -#define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...) \ - LOAD /* Load (Group) */; \ - faligndata %A14, %F0, %A14 /* FPA Group */; \ - inc %x5 /* IEU0 */; \ - STORE1 /* Store (optional) */; \ - faligndata %F0, %F2, %A0 /* FPA Group */; \ - srl %x5, 1, %x5 /* IEU0 */; \ - add %sum, %x4, %sum /* IEU1 */; \ - fpadd32 %F0, %f0, %F0 /* FPA Group */; \ - inc %x6 /* IEU0 */; \ - STORE2 /* Store (optional) */; \ - faligndata %F2, %F4, %A2 /* FPA Group */; \ - srl %x6, 1, %x6 /* IEU0 */; \ - add %sum, %x5, %sum /* IEU1 */; \ - fpadd32 %F2, %f2, %F2 /* FPA Group */; \ - add %src, 64, %src /* IEU0 */; \ - fcmpgt32 %f0, %F0, %x1 /* FPM */; \ - add %dst, 64, %dst /* IEU1 Group */; \ - inc %x7 /* IEU0 */; \ - STORE3 /* Store (optional) */; \ - faligndata %F4, %F6, %A4 /* FPA */; \ - fpadd32 %F4, %f4, %F4 /* FPA Group */; \ - add %sum, %x6, %sum /* IEU1 */; \ - fcmpgt32 %f2, %F2, %x2 /* FPM */; \ - srl %x7, 1, %x7 /* IEU0 Group */; \ - inc %x8 /* IEU1 */; \ - STORE4 /* Store (optional) */; \ - faligndata %F6, %F8, %A6 /* FPA */; \ - fpadd32 %F6, %f6, %F6 /* FPA Group */; \ - srl %x8, 1, %x8 /* IEU0 */; \ - fcmpgt32 %f4, %F4, %x3 /* FPM */; \ - add %sum, %x7, %sum /* IEU0 Group */; \ - inc %x1 /* IEU1 */; \ - STORE5 /* Store (optional) */; \ - faligndata %F8, %F10, %A8 /* FPA */; \ - fpadd32 %F8, %f8, %F8 /* FPA Group */; \ - srl %x1, 1, %x1 /* IEU0 */; \ - fcmpgt32 %f6, %F6, %x4 /* FPM */; \ - add %sum, %x8, %sum /* IEU0 Group */; \ - inc %x2 /* IEU1 */; \ - STORE6 /* Store (optional) */; \ - faligndata %F10, %F12, %A10 /* FPA */; \ - fpadd32 %F10, %f10, %F10 /* FPA Group */; \ - srl %x2, 1, %x2 /* IEU0 */; \ - 
fcmpgt32 %f8, %F8, %x5 /* FPM */; \ - add %sum, %x1, %sum /* IEU0 Group */; \ - inc %x3 /* IEU1 */; \ - STORE7 /* Store (optional) */; \ - faligndata %F12, %F14, %A12 /* FPA */; \ - fpadd32 %F12, %f12, %F12 /* FPA Group */; \ - srl %x3, 1, %x3 /* IEU0 */; \ - fcmpgt32 %f10, %F10, %x6 /* FPM */; \ - add %sum, %x2, %sum /* IEU0 Group */; \ - inc %x4 /* IEU1 */; \ - STORE8 /* Store (optional) */; \ - fmovd %F14, %B14 /* FPA */; \ - fpadd32 %F14, %f14, %F14 /* FPA Group */; \ - srl %x4, 1, %x4 /* IEU0 */; \ - fcmpgt32 %f12, %F12, %x7 /* FPM */; \ - add %sum, %x3, %sum /* IEU0 Group */; \ - subcc %len, 64, %len /* IEU1 */; \ - BRANCH /* CTI */; \ - fcmpgt32 %f14, %F14, %x8 /* FPM Group */; - -#define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz) \ - inc %x5 /* IEU0 Group */; \ - fpadd32 %f2, %f0, %S0 /* FPA */; \ - add %sum, %x4, %sum /* IEU1 */; \ - srl %x5, 1, %x5 /* IEU0 Group */; \ - fpadd32 %f6, %f4, %S1 /* FPA */; \ - inc %x6 /* IEU1 */; \ - fpadd32 %f10, %f8, %S2 /* FPA Group */; \ - add %sum, %x5, %sum /* IEU0 */; \ - fcmpgt32 %f0, %S0, %x1 /* FPM */; \ - fpadd32 %f14, %f12, %S3 /* FPA Group */; \ - srl %x6, 1, %x6 /* IEU0 */; \ - fcmpgt32 %f4, %S1, %x2 /* FPM */; \ - add %sum, %x6, %sum /* IEU0 Group */; \ - fzero %fz /* FPA */; \ - fcmpgt32 %f8, %S2, %x3 /* FPM */; \ - inc %x7 /* IEU0 Group */; \ - inc %x8 /* IEU1 */; \ - srl %x7, 1, %x7 /* IEU0 Group */; \ - inc %x1 /* IEU1 */; \ - fpadd32 %S0, %S1, %T0 /* FPA */; \ - fpadd32 %S2, %S3, %T1 /* FPA Group */; \ - add %sum, %x7, %sum /* IEU0 */; \ - fcmpgt32 %f12, %S3, %x4 /* FPM */; \ - srl %x8, 1, %x8 /* IEU0 Group */; \ - inc %x2 /* IEU1 */; \ - srl %x1, 1, %x1 /* IEU0 Group */; \ - add %sum, %x8, %sum /* IEU1 */; \ - add %sum, %x1, %sum /* IEU0 Group */; \ - fcmpgt32 %S0, %T0, %x5 /* FPM */; \ - srl %x2, 1, %x2 /* IEU0 Group */; \ - fcmpgt32 %S2, %T1, %x6 /* FPM */; \ - inc %x3 /* IEU0 Group */; \ - add %sum, %x2, %sum /* IEU1 */; \ - srl %x3, 1, %x3 /* IEU0 Group */; \ - inc %x4 /* 
IEU1 */; \ - fpadd32 %T0, %T1, %U0 /* FPA Group */; \ - add %sum, %x3, %sum /* IEU0 */; \ - fcmpgt32 %fz, %f2, %x7 /* FPM */; \ - srl %x4, 1, %x4 /* IEU0 Group */; \ - fcmpgt32 %fz, %f6, %x8 /* FPM */; \ - inc %x5 /* IEU0 Group */; \ - add %sum, %x4, %sum /* IEU1 */; \ - srl %x5, 1, %x5 /* IEU0 Group */; \ - fcmpgt32 %fz, %f10, %x1 /* FPM */; \ - inc %x6 /* IEU0 Group */; \ - add %sum, %x5, %sum /* IEU1 */; \ - fmovd %FA, %FB /* FPA Group */; \ - fcmpgt32 %fz, %f14, %x2 /* FPM */; \ - srl %x6, 1, %x6 /* IEU0 Group */; \ - ba,pt %xcc, ett /* CTI */; \ - inc %x7 /* IEU1 */; - -#define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB) \ - END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62) - -#define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz) \ - fpadd32 %U0, %U1, %V0 /* FPA Group */; \ - srl %x7, 1, %x7 /* IEU0 */; \ - add %sum, %x6, %sum /* IEU1 */; \ - std %V0, [%sp + STACKOFF] /* Store Group */; \ - inc %x8 /* IEU0 */; \ - sub %sum, %x7, %sum /* IEU1 */; \ - srl %x8, 1, %x8 /* IEU0 Group */; \ - fcmpgt32 %fz, %S1, %x3 /* FPM */; \ - inc %x1 /* IEU0 Group */; \ - fcmpgt32 %fz, %S3, %x4 /* FPM */; \ - srl %x1, 1, %x1 /* IEU0 Group */; \ - sub %sum, %x8, %sum /* IEU1 */; \ - ldx [%sp + STACKOFF], %x8 /* Load Group */; \ - inc %x2 /* IEU0 */; \ - sub %sum, %x1, %sum /* IEU1 */; \ - srl %x2, 1, %x2 /* IEU0 Group */; \ - fcmpgt32 %fz, %T1, %x5 /* FPM */; \ - inc %x3 /* IEU0 Group */; \ - fcmpgt32 %T0, %U0, %x6 /* FPM */; \ - srl %x3, 1, %x3 /* IEU0 Group */; \ - sub %sum, %x2, %sum /* IEU1 */; \ - inc %x4 /* IEU0 Group */; \ - sub %sum, %x3, %sum /* IEU1 */; \ - srl %x4, 1, %x4 /* IEU0 Group */; \ - fcmpgt32 %fz, %U1, %x7 /* FPM */; \ - inc %x5 /* IEU0 Group */; \ - fcmpgt32 %U0, %V0, %x1 /* FPM */; \ - srl %x5, 1, %x5 /* IEU0 Group */; \ - sub %sum, %x4, %sum /* IEU1 */; \ - sub %sum, %x5, %sum /* IEU0 Group */; \ - fcmpgt32 %fz, %V0, %x2 /* FPM */; \ - inc %x6 /* IEU0 Group */; \ - inc %x7 /* IEU1 */; \ - srl %x6, 1, %x6 /* IEU0 Group */; 
\ - inc %x1 /* IEU1 */; \ - srl %x7, 1, %x7 /* IEU0 Group */; \ - add %sum, %x6, %sum /* IEU1 */; \ - srl %x1, 1, %x1 /* IEU0 Group */; \ - sub %sum, %x7, %sum /* IEU1 */; \ - inc %x2 /* IEU0 Group */; \ - add %sum, %x1, %sum /* IEU1 */; \ - srl %x2, 1, %x2 /* IEU0 Group */; \ - sub %sum, %x2, %sum /* IEU0 Group */; \ - addcc %sum, %x8, %sum /* IEU1 Group */; \ - bcs,a,pn %xcc, 33f /* CTI */; \ - add %sum, 1, %sum /* IEU0 (Group) */; \ -33: /* That's it */; - - .text - .globl csum_partial_copy_vis - .align 32 -/* %asi should be either ASI_P or ASI_AIUS for csum_partial_copy resp. - * csum_partial_copy_from_user - * This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256 - */ -csum_partial_copy_vis: - andcc %dst, 7, %g0 /* IEU1 Group */ - be,pt %icc, 4f /* CTI */ - and %dst, 0x38, %o4 /* IEU0 */ - mov 1, %g5 /* IEU0 Group */ - andcc %dst, 2, %g0 /* IEU1 */ - be,pt %icc, 1f /* CTI */ - and %dst, 4, %g7 /* IEU0 Group */ - lduha [%src] %asi, %g2 /* Load */ - sub %len, 2, %len /* IEU0 Group */ - add %dst, 2, %dst /* IEU1 */ - andcc %dst, 4, %g7 /* IEU1 Group */ - sll %g5, 16, %g5 /* IEU0 */ - sth %g2, [%dst - 2] /* Store Group */ - sll %g2, 16, %g2 /* IEU0 */ - add %src, 2, %src /* IEU1 */ - addcc %g2, %sum, %sum /* IEU1 Group */ - bcs,a,pn %icc, 1f /* CTI */ - add %sum, %g5, %sum /* IEU0 */ -1: lduwa [%src] %asi, %g2 /* Load */ - brz,a,pn %g7, 4f /* CTI+IEU1 Group */ - and %dst, 0x38, %o4 /* IEU0 */ - add %dst, 4, %dst /* IEU0 Group */ - sub %len, 4, %len /* IEU1 */ - addcc %g2, %sum, %sum /* IEU1 Group */ - bcs,a,pn %icc, 1f /* CTI */ - add %sum, 1, %sum /* IEU0 */ -1: and %dst, 0x38, %o4 /* IEU0 Group */ - stw %g2, [%dst - 4] /* Store */ - add %src, 4, %src /* IEU1 */ -4: -#ifdef __KERNEL__ - VISEntry -#endif - mov %src, %g7 /* IEU1 Group */ - fzero %f48 /* FPA */ - alignaddr %src, %g0, %src /* Single Group */ - subcc %g7, %src, %g7 /* IEU1 Group */ - be,pt %xcc, 1f /* CTI */ - mov 0x40, %g1 /* IEU0 */ - lduwa [%src] %asi, %g2 /* Load Group */ - subcc 
%sum, %g2, %sum /* IEU1 Group+load stall*/ - bcs,a,pn %icc, 1f /* CTI */ - sub %sum, 1, %sum /* IEU0 */ -1: srl %sum, 0, %sum /* IEU0 Group */ - clr %g5 /* IEU1 */ - brz,pn %o4, 3f /* CTI+IEU1 Group */ - sub %g1, %o4, %g1 /* IEU0 */ - ldda [%src] %asi, %f0 /* Load */ - clr %o4 /* IEU0 Group */ - andcc %dst, 8, %g0 /* IEU1 */ - be,pn %icc, 1f /* CTI */ - ldda [%src + 8] %asi, %f2 /* Load Group */ - add %src, 8, %src /* IEU0 */ - sub %len, 8, %len /* IEU1 */ - fpadd32 %f0, %f48, %f50 /* FPA */ - addcc %dst, 8, %dst /* IEU1 Group */ - faligndata %f0, %f2, %f16 /* FPA */ - fcmpgt32 %f48, %f50, %o4 /* FPM Group */ - fmovd %f2, %f0 /* FPA Group */ - ldda [%src + 8] %asi, %f2 /* Load */ - std %f16, [%dst - 8] /* Store */ - fmovd %f50, %f48 /* FPA */ -1: andcc %g1, 0x10, %g0 /* IEU1 Group */ - be,pn %icc, 1f /* CTI */ - and %g1, 0x20, %g1 /* IEU0 */ - fpadd32 %f0, %f48, %f50 /* FPA */ - ldda [%src + 16] %asi, %f4 /* Load Group */ - add %src, 16, %src /* IEU0 */ - add %dst, 16, %dst /* IEU1 */ - faligndata %f0, %f2, %f16 /* FPA */ - fcmpgt32 %f48, %f50, %g5 /* FPM Group */ - sub %len, 16, %len /* IEU0 */ - inc %o4 /* IEU1 */ - std %f16, [%dst - 16] /* Store Group */ - fpadd32 %f2, %f50, %f48 /* FPA */ - srl %o4, 1, %o5 /* IEU0 */ - faligndata %f2, %f4, %f18 /* FPA Group */ - std %f18, [%dst - 8] /* Store */ - fcmpgt32 %f50, %f48, %o4 /* FPM Group */ - add %o5, %sum, %sum /* IEU0 */ - ldda [%src + 8] %asi, %f2 /* Load */ - fmovd %f4, %f0 /* FPA */ -1: brz,a,pn %g1, 4f /* CTI+IEU1 Group */ - rd %asi, %g2 /* LSU Group + 4 bubbles*/ - inc %g5 /* IEU0 */ - fpadd32 %f0, %f48, %f50 /* FPA */ - ldda [%src + 16] %asi, %f4 /* Load Group */ - srl %g5, 1, %g5 /* IEU0 */ - add %dst, 32, %dst /* IEU1 */ - faligndata %f0, %f2, %f16 /* FPA */ - fcmpgt32 %f48, %f50, %o5 /* FPM Group */ - inc %o4 /* IEU0 */ - ldda [%src + 24] %asi, %f6 /* Load */ - srl %o4, 1, %o4 /* IEU0 Group */ - add %g5, %sum, %sum /* IEU1 */ - ldda [%src + 32] %asi, %f8 /* Load */ - fpadd32 %f2, %f50, %f48 /* FPA */ - 
faligndata %f2, %f4, %f18 /* FPA Group */ - sub %len, 32, %len /* IEU0 */ - std %f16, [%dst - 32] /* Store */ - fcmpgt32 %f50, %f48, %g3 /* FPM Group */ - inc %o5 /* IEU0 */ - add %o4, %sum, %sum /* IEU1 */ - fpadd32 %f4, %f48, %f50 /* FPA */ - faligndata %f4, %f6, %f20 /* FPA Group */ - srl %o5, 1, %o5 /* IEU0 */ - fcmpgt32 %f48, %f50, %g5 /* FPM Group */ - add %o5, %sum, %sum /* IEU0 */ - std %f18, [%dst - 24] /* Store */ - fpadd32 %f6, %f50, %f48 /* FPA */ - inc %g3 /* IEU0 Group */ - std %f20, [%dst - 16] /* Store */ - add %src, 32, %src /* IEU1 */ - faligndata %f6, %f8, %f22 /* FPA */ - fcmpgt32 %f50, %f48, %o4 /* FPM Group */ - srl %g3, 1, %g3 /* IEU0 */ - std %f22, [%dst - 8] /* Store */ - add %g3, %sum, %sum /* IEU0 Group */ -3: rd %asi, %g2 /* LSU Group + 4 bubbles*/ -#ifdef __KERNEL__ -4: sethi %hi(vis0s), %g7 /* IEU0 Group */ - or %g2, ASI_BLK_OR, %g2 /* IEU1 */ -#else -4: rd %pc, %g7 /* LSU Group + 4 bubbles*/ -#endif - inc %g5 /* IEU0 Group */ - and %src, 0x38, %g3 /* IEU1 */ - membar #StoreLoad /* LSU Group */ - srl %g5, 1, %g5 /* IEU0 */ - inc %o4 /* IEU1 */ - sll %g3, 8, %g3 /* IEU0 Group */ - sub %len, 0xc0, %len /* IEU1 */ - addcc %g5, %sum, %sum /* IEU1 Group */ - srl %o4, 1, %o4 /* IEU0 */ - add %g7, %g3, %g7 /* IEU0 Group */ - add %o4, %sum, %sum /* IEU1 */ -#ifdef __KERNEL__ - jmpl %g7 + %lo(vis0s), %g0 /* CTI+IEU1 Group */ -#else - jmpl %g7 + (vis0s - 4b), %g0 /* CTI+IEU1 Group */ -#endif - fzero %f32 /* FPA */ - - .align 2048 -vis0s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - add %src, 128, %src /* IEU0 Group */ - ldda [%src-128] %asi, %f0 /* Load Group */ - ldda [%src-64] %asi, %f16 /* Load Group */ - fmovd %f48, %f62 /* FPA Group f0 available*/ - faligndata %f0, %f2, %f48 /* FPA Group f2 available*/ - fcmpgt32 %f32, %f2, %x1 /* FPM Group f4 available*/ - fpadd32 %f0, %f62, %f0 /* FPA */ - fcmpgt32 %f32, %f4, %x2 /* FPM Group f6 available*/ - faligndata %f2, %f4, %f50 /* FPA */ - fcmpgt32 %f62, %f0, %x3 /* FPM Group f8 available*/ - 
faligndata %f4, %f6, %f52 /* FPA */ - fcmpgt32 %f32, %f6, %x4 /* FPM Group f10 available*/ - inc %x1 /* IEU0 */ - faligndata %f6, %f8, %f54 /* FPA */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group f12 available*/ - srl %x1, 1, %x1 /* IEU0 */ - inc %x2 /* IEU1 */ - faligndata %f8, %f10, %f56 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group f14 available*/ - srl %x2, 1, %x2 /* IEU0 */ - add %sum, %x1, %sum /* IEU1 */ - faligndata %f10, %f12, %f58 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - inc %x3 /* IEU0 */ - add %sum, %x2, %sum /* IEU1 */ - faligndata %f12, %f14, %f60 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - srl %x3, 1, %x3 /* IEU0 */ - inc %x4 /* IEU1 */ - fmovd %f14, %f62 /* FPA */ - srl %x4, 1, %x4 /* IEU0 Group */ - add %sum, %x3, %sum /* IEU1 */ -vis0: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f48,f50,f52,f54,f56,f58,f60,f62,f62, - ,LDBLK(f32), STBLK,,,,,,,, - ,bcs,pn %icc, vis0e1) - DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f48,f50,f52,f54,f56,f58,f60,f62,f62, - ,LDBLK(f0), STBLK,,,,,,,, - ,bcs,pn %icc, vis0e2) - DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f48,f50,f52,f54,f56,f58,f60,f62,f62, - ,LDBLK(f16), STBLK,,,,,,,, - ,bcc,pt %icc, vis0) -vis0e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f48,f50,f52,f54,f56,f58,f60,f62,f32, - ,SYNC, STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48), - ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e2) -vis0e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f48,f50,f52,f54,f56,f58,f60,f62,f0, - ,SYNC, STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48), - ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e3) -vis0e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f48,f50,f52,f54,f56,f58,f60,f62,f16, - ,SYNC, 
STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48), - ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e1) - .align 2048 -vis1s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - add %src, 128 - 8, %src /* IEU0 Group */ - ldda [%src-128] %asi, %f0 /* Load Group */ - ldda [%src-64] %asi, %f16 /* Load Group */ - fmovd %f0, %f58 /* FPA Group */ - fmovd %f48, %f0 /* FPA Group */ - fcmpgt32 %f32, %f2, %x2 /* FPM Group */ - faligndata %f2, %f4, %f48 /* FPA */ - fcmpgt32 %f32, %f4, %x3 /* FPM Group */ - faligndata %f4, %f6, %f50 /* FPA */ - fcmpgt32 %f32, %f6, %x4 /* FPM Group */ - faligndata %f6, %f8, %f52 /* FPA */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group */ - inc %x2 /* IEU1 */ - faligndata %f8, %f10, %f54 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - srl %x2, 1, %x2 /* IEU0 */ - faligndata %f10, %f12, %f56 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - inc %x3 /* IEU0 */ - add %sum, %x2, %sum /* IEU1 */ - faligndata %f12, %f14, %f58 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - srl %x3, 1, %x3 /* IEU0 */ - inc %x4 /* IEU1 */ - fmovd %f14, %f60 /* FPA */ - srl %x4, 1, %x4 /* IEU0 Group */ - add %sum, %x3, %sum /* IEU1 */ -vis1: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f62,f48,f50,f52,f54,f56,f58,f60,f60, - ,LDBLK(f32), ,STBLK,,,,,,, - ,bcs,pn %icc, vis1e1) - DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f62,f48,f50,f52,f54,f56,f58,f60,f60, - ,LDBLK(f0), ,STBLK,,,,,,, - ,bcs,pn %icc, vis1e2) - DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f62,f48,f50,f52,f54,f56,f58,f60,f60, - ,LDBLK(f16), ,STBLK,,,,,,, - ,bcc,pt %icc, vis1) -vis1e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f62,f48,f50,f52,f54,f56,f58,f60,f32, - ,SYNC, ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40), - ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e2) -vis1e1: 
DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f62,f48,f50,f52,f54,f56,f58,f60,f0, - ,SYNC, ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40), - ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e3) -vis1e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f62,f48,f50,f52,f54,f56,f58,f60,f16, - ,SYNC, ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40), - ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e1) - .align 2048 -vis2s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - add %src, 128 - 16, %src /* IEU0 Group */ - ldda [%src-128] %asi, %f0 /* Load Group */ - ldda [%src-64] %asi, %f16 /* Load Group */ - fmovd %f0, %f56 /* FPA Group */ - fmovd %f48, %f0 /* FPA Group */ - sub %dst, 64, %dst /* IEU0 */ - fpsub32 %f2, %f2, %f2 /* FPA Group */ - fcmpgt32 %f32, %f4, %x3 /* FPM Group */ - faligndata %f4, %f6, %f48 /* FPA */ - fcmpgt32 %f32, %f6, %x4 /* FPM Group */ - faligndata %f6, %f8, %f50 /* FPA */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group */ - faligndata %f8, %f10, %f52 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - faligndata %f10, %f12, %f54 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - inc %x3 /* IEU0 */ - faligndata %f12, %f14, %f56 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - srl %x3, 1, %x3 /* IEU0 */ - inc %x4 /* IEU1 */ - fmovd %f14, %f58 /* FPA */ - srl %x4, 1, %x4 /* IEU0 Group */ - add %sum, %x3, %sum /* IEU1 */ -vis2: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f60,f62,f48,f50,f52,f54,f56,f58,f58, - ,LDBLK(f32), ,,STBLK,,,,,, - ,bcs,pn %icc, vis2e1) - DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f60,f62,f48,f50,f52,f54,f56,f58,f58, - ,LDBLK(f0), ,,STBLK,,,,,, - ,bcs,pn %icc, vis2e2) - DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f60,f62,f48,f50,f52,f54,f56,f58,f58, - ,LDBLK(f16), ,,STBLK,,,,,, - ,bcc,pt %icc, 
vis2) -vis2e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f60,f62,f48,f50,f52,f54,f56,f58,f32, - ,SYNC, ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96), - ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e2) -vis2e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f60,f62,f48,f50,f52,f54,f56,f58,f0, - ,SYNC, ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96), - ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e3) -vis2e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f60,f62,f48,f50,f52,f54,f56,f58,f16, - ,SYNC, ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96), - ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e1) - .align 2048 -vis3s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - add %src, 128 - 24, %src /* IEU0 Group */ - ldda [%src-128] %asi, %f0 /* Load Group */ - ldda [%src-64] %asi, %f16 /* Load Group */ - fmovd %f0, %f54 /* FPA Group */ - fmovd %f48, %f0 /* FPA Group */ - sub %dst, 64, %dst /* IEU0 */ - fpsub32 %f2, %f2, %f2 /* FPA Group */ - fpsub32 %f4, %f4, %f4 /* FPA Group */ - fcmpgt32 %f32, %f6, %x4 /* FPM Group */ - faligndata %f6, %f8, %f48 /* FPA */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group */ - faligndata %f8, %f10, %f50 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - faligndata %f10, %f12, %f52 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - faligndata %f12, %f14, %f54 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - fmovd %f14, %f56 /* FPA */ - inc %x4 /* IEU0 */ - srl %x4, 1, %x4 /* IEU0 Group */ -vis3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f58,f60,f62,f48,f50,f52,f54,f56,f56, - ,LDBLK(f32), ,,,STBLK,,,,, - ,bcs,pn %icc, vis3e1) - DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f58,f60,f62,f48,f50,f52,f54,f56,f56, - ,LDBLK(f0), ,,,STBLK,,,,, - ,bcs,pn %icc, vis3e2) - DO_THE_TRICK( 
f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f58,f60,f62,f48,f50,f52,f54,f56,f56, - ,LDBLK(f16), ,,,STBLK,,,,, - ,bcc,pt %icc, vis3) -vis3e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f58,f60,f62,f48,f50,f52,f54,f56,f32, - ,SYNC, ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88), - ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e2) -vis3e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f58,f60,f62,f48,f50,f52,f54,f56,f0, - ,SYNC, ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88), - ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e3) -vis3e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f58,f60,f62,f48,f50,f52,f54,f56,f16, - ,SYNC, ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88), - ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e1) - .align 2048 -vis4s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - add %src, 128 - 32, %src /* IEU0 Group */ - ldda [%src-128] %asi, %f0 /* Load Group */ - ldda [%src-64] %asi, %f16 /* Load Group */ - fmovd %f0, %f52 /* FPA Group */ - fmovd %f48, %f0 /* FPA Group */ - sub %dst, 64, %dst /* IEU0 */ - fpsub32 %f2, %f2, %f2 /* FPA Group */ - fpsub32 %f4, %f4, %f4 /* FPA Group */ - fpsub32 %f6, %f6, %f6 /* FPA Group */ - clr %x4 /* IEU0 */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group */ - faligndata %f8, %f10, %f48 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - faligndata %f10, %f12, %f50 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - faligndata %f12, %f14, %f52 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - fmovd %f14, %f54 /* FPA */ -vis4: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f56,f58,f60,f62,f48,f50,f52,f54,f54, - ,LDBLK(f32), ,,,,STBLK,,,, - ,bcs,pn %icc, vis4e1) - DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f56,f58,f60,f62,f48,f50,f52,f54,f54, - ,LDBLK(f0), ,,,,STBLK,,,, - 
,bcs,pn %icc, vis4e2) - DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f56,f58,f60,f62,f48,f50,f52,f54,f54, - ,LDBLK(f16), ,,,,STBLK,,,, - ,bcc,pt %icc, vis4) -vis4e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f56,f58,f60,f62,f48,f50,f52,f54,f32, - ,SYNC, ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80), - ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e2) -vis4e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f56,f58,f60,f62,f48,f50,f52,f54,f0, - ,SYNC, ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80), - ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e3) -vis4e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f56,f58,f60,f62,f48,f50,f52,f54,f16, - ,SYNC, ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80), - ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e1) - .align 2048 -vis5s: add %src, 128 - 40, %src /* IEU0 Group */ - ldda [%src-88] %asi, %f10 /* Load Group */ - ldda [%src-80] %asi, %f12 /* Load Group */ - ldda [%src-72] %asi, %f14 /* Load Group */ - wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - ldda [%src-64] %asi, %f16 /* Load Group */ - fmovd %f48, %f0 /* FPA Group */ - fmuld %f32, %f32, %f2 /* FPM */ - clr %x4 /* IEU0 */ - faddd %f32, %f32, %f4 /* FPA Group */ - fmuld %f32, %f32, %f6 /* FPM */ - clr %x5 /* IEU0 */ - faddd %f32, %f32, %f8 /* FPA Group */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - sub %dst, 64, %dst /* IEU0 */ - faligndata %f10, %f12, %f48 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - faligndata %f12, %f14, %f50 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - fmovd %f14, %f52 /* FPA */ -vis5: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f54,f56,f58,f60,f62,f48,f50,f52,f52, - ,LDBLK(f32), ,,,,,STBLK,,, - ,bcs,pn %icc, vis5e1) - DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f54,f56,f58,f60,f62,f48,f50,f52,f52, - 
,LDBLK(f0), ,,,,,STBLK,,, - ,bcs,pn %icc, vis5e2) - DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f54,f56,f58,f60,f62,f48,f50,f52,f52, - ,LDBLK(f16), ,,,,,STBLK,,, - ,bcc,pt %icc, vis5) -vis5e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f54,f56,f58,f60,f62,f48,f50,f52,f32, - ,SYNC, ,,,,,STBLK,ST(f48,64),ST(f50,72), - ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e2) -vis5e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f54,f56,f58,f60,f62,f48,f50,f52,f0, - ,SYNC, ,,,,,STBLK,ST(f48,64),ST(f50,72), - ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e3) -vis5e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f54,f56,f58,f60,f62,f48,f50,f52,f16, - ,SYNC, ,,,,,STBLK,ST(f48,64),ST(f50,72), - ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e1) - .align 2048 -vis6s: add %src, 128 - 48, %src /* IEU0 Group */ - ldda [%src-80] %asi, %f12 /* Load Group */ - ldda [%src-72] %asi, %f14 /* Load Group */ - wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - ldda [%src-64] %asi, %f16 /* Load Group */ - fmovd %f48, %f0 /* FPA Group */ - fmuld %f32, %f32, %f2 /* FPM */ - clr %x4 /* IEU0 */ - faddd %f32, %f32, %f4 /* FPA Group */ - fmuld %f32, %f32, %f6 /* FPM */ - clr %x5 /* IEU0 */ - faddd %f32, %f32, %f8 /* FPA Group */ - fmuld %f32, %f32, %f10 /* FPM */ - clr %x6 /* IEU0 */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - sub %dst, 64, %dst /* IEU0 */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - faligndata %f12, %f14, %f48 /* FPA */ - fmovd %f14, %f50 /* FPA Group */ -vis6: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f52,f54,f56,f58,f60,f62,f48,f50,f50, - ,LDBLK(f32), ,,,,,,STBLK,, - ,bcs,pn %icc, vis6e1) - DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f52,f54,f56,f58,f60,f62,f48,f50,f50, - ,LDBLK(f0), ,,,,,,STBLK,, - ,bcs,pn %icc, vis6e2) - DO_THE_TRICK( 
f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f52,f54,f56,f58,f60,f62,f48,f50,f50, - ,LDBLK(f16), ,,,,,,STBLK,, - ,bcc,pt %icc, vis6) -vis6e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f52,f54,f56,f58,f60,f62,f48,f50,f32, - ,SYNC, ,,,,,,STBLK,ST(f48,64), - ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e2) -vis6e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f52,f54,f56,f58,f60,f62,f48,f50,f0, - ,SYNC, ,,,,,,STBLK,ST(f48,64), - ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e3) -vis6e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f52,f54,f56,f58,f60,f62,f48,f50,f16, - ,SYNC, ,,,,,,STBLK,ST(f48,64), - ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e1) - .align 2048 -vis7s: add %src, 128 - 56, %src /* IEU0 Group */ - ldda [%src-72] %asi, %f14 /* Load Group */ - wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - ldda [%src-64] %asi, %f16 /* Load Group */ - fmovd %f48, %f0 /* FPA Group */ - fmuld %f32, %f32, %f2 /* FPM */ - clr %x4 /* IEU0 */ - faddd %f32, %f32, %f4 /* FPA Group */ - fmuld %f32, %f32, %f6 /* FPM */ - clr %x5 /* IEU0 */ - faddd %f32, %f32, %f8 /* FPA Group */ - fmuld %f32, %f32, %f10 /* FPM */ - clr %x6 /* IEU0 */ - faddd %f32, %f32, %f12 /* FPA Group */ - clr %x7 /* IEU0 */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - sub %dst, 64, %dst /* IEU0 */ - fmovd %f14, %f48 /* FPA */ -vis7: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f50,f52,f54,f56,f58,f60,f62,f48,f48, - ,LDBLK(f32), ,,,,,,,STBLK, - ,bcs,pn %icc, vis7e1) - DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f50,f52,f54,f56,f58,f60,f62,f48,f48, - ,LDBLK(f0), ,,,,,,,STBLK, - ,bcs,pn %icc, vis7e2) - DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f50,f52,f54,f56,f58,f60,f62,f48,f48, - ,LDBLK(f16), ,,,,,,,STBLK, - ,bcc,pt %icc, vis7) -vis7e3: 
DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f50,f52,f54,f56,f58,f60,f62,f48,f32, - ,SYNC, ,,,,,,,STBLK, - ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e2) -vis7e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f50,f52,f54,f56,f58,f60,f62,f48,f0, - ,SYNC, ,,,,,,,STBLK, - ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e3) -vis7e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f50,f52,f54,f56,f58,f60,f62,f48,f16, - ,SYNC, ,,,,,,,STBLK, - ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e1) -e1: END_THE_TRICK1( f0,f2,f4,f6,f8,f10,f12,f14,f16,f6) -e2: END_THE_TRICK1( f16,f18,f20,f22,f24,f26,f28,f30,f32,f6) -e3: END_THE_TRICK1( f32,f34,f36,f38,f40,f42,f44,f46,f0,f6) -ett: rd %asi, %x4 /* LSU Group+4bubbles */ - rd %gsr, %x3 /* LSU Group+4bubbles */ -#ifdef __KERNEL__ - srl %x4, 3, %x5 /* IEU0 Group */ - xor %x4, ASI_BLK_XOR1, %x4 /* IEU1 */ - wr %x4, %x5, %asi /* LSU Group+4bubbles */ -#else - wr %x4, ASI_BLK_XOR, %asi /* LSU Group+4bubbles */ -#endif - andcc %x3, 7, %x3 /* IEU1 Group */ - add %dst, 8, %dst /* IEU0 */ - bne,pn %icc, 1f /* CTI */ - fzero %f10 /* FPA */ - brz,a,pn %len, 2f /* CTI+IEU1 Group */ - std %f6, [%dst - 8] /* Store */ -1: cmp %len, 8 /* IEU1 */ - blu,pn %icc, 3f /* CTI */ - sub %src, 64, %src /* IEU0 Group */ -1: ldda [%src] %asi, %f2 /* Load Group */ - fpadd32 %f10, %f2, %f12 /* FPA Group+load stall*/ - add %src, 8, %src /* IEU0 */ - add %dst, 8, %dst /* IEU1 */ - faligndata %f6, %f2, %f14 /* FPA Group */ - fcmpgt32 %f10, %f12, %x5 /* FPM Group */ - std %f14, [%dst - 16] /* Store */ - fmovd %f2, %f6 /* FPA */ - fmovd %f12, %f10 /* FPA Group */ - sub %len, 8, %len /* IEU1 */ - fzero %f16 /* FPA Group - FPU nop */ - fzero %f18 /* FPA Group - FPU nop */ - inc %x5 /* IEU0 */ - srl %x5, 1, %x5 /* IEU0 Group (regdep) */ - cmp %len, 8 /* IEU1 */ - bgeu,pt %icc, 1b /* CTI */ - add %x5, %sum, %sum /* IEU0 Group */ -3: 
brz,a,pt %x3, 2f /* CTI+IEU1 */ - std %f6, [%dst - 8] /* Store Group */ - st %f7, [%dst - 8] /* Store Group */ - sub %dst, 4, %dst /* IEU0 */ - add %len, 4, %len /* IEU1 */ -2: -#ifdef __KERNEL__ - sub %sp, 8, %sp /* IEU0 Group */ -#endif - END_THE_TRICK2( f48,f50,f52,f54,f56,f58,f60,f10,f12,f62) - membar #Sync /* LSU Group */ -#ifdef __KERNEL__ - VISExit - add %sp, 8, %sp /* IEU0 Group */ -#endif -23: brnz,pn %len, 26f /* CTI+IEU1 Group */ -24: sllx %sum, 32, %g1 /* IEU0 */ -25: addcc %sum, %g1, %src /* IEU1 Group */ - srlx %src, 32, %src /* IEU0 Group (regdep) */ - bcs,a,pn %xcc, 1f /* CTI */ - add %src, 1, %src /* IEU1 */ -#ifndef __KERNEL__ -1: retl /* CTI Group brk forced*/ - srl %src, 0, %src /* IEU0 */ -#else -1: retl /* CTI Group brk forced*/ - ldx [%g6 + TI_TASK], %g4 /* Load */ -#endif -26: andcc %len, 8, %g0 /* IEU1 Group */ - be,pn %icc, 1f /* CTI */ - lduwa [%src] %asi, %o4 /* Load */ - lduwa [%src+4] %asi, %g2 /* Load Group */ - add %src, 8, %src /* IEU0 */ - add %dst, 8, %dst /* IEU1 */ - sllx %o4, 32, %g5 /* IEU0 Group */ - stw %o4, [%dst - 8] /* Store */ - or %g5, %g2, %g5 /* IEU0 Group */ - stw %g2, [%dst - 4] /* Store */ - addcc %g5, %sum, %sum /* IEU1 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %sum, 1, %sum /* IEU0 */ -1: andcc %len, 4, %g0 /* IEU1 Group */ - be,a,pn %icc, 1f /* CTI */ - clr %g2 /* IEU0 */ - lduwa [%src] %asi, %g7 /* Load */ - add %src, 4, %src /* IEU0 Group */ - add %dst, 4, %dst /* IEU1 */ - sllx %g7, 32, %g2 /* IEU0 Group */ - stw %g7, [%dst - 4] /* Store */ -1: andcc %len, 2, %g0 /* IEU1 */ - be,a,pn %icc, 1f /* CTI */ - clr %g3 /* IEU0 Group */ - lduha [%src] %asi, %g7 /* Load */ - add %src, 2, %src /* IEU1 */ - add %dst, 2, %dst /* IEU0 Group */ - sll %g7, 16, %g3 /* IEU0 Group */ - sth %g7, [%dst - 2] /* Store */ -1: andcc %len, 1, %g0 /* IEU1 */ - be,a,pn %icc, 1f /* CTI */ - clr %o5 /* IEU0 Group */ - lduba [%src] %asi, %g7 /* Load */ - sll %g7, 8, %o5 /* IEU0 Group */ - stb %g7, [%dst] /* Store */ -1: or %g2, %g3, 
%g3 /* IEU1 */ - or %o5, %g3, %g3 /* IEU0 Group (regdep) */ - addcc %g3, %sum, %sum /* IEU1 Group (regdep) */ - bcs,a,pn %xcc, 1f /* CTI */ - add %sum, 1, %sum /* IEU0 */ -1: ba,pt %xcc, 25b /* CTI Group */ - sllx %sum, 32, %g1 /* IEU0 */ - -#ifdef __KERNEL__ -end: - - .section __ex_table - .align 4 - .word csum_partial_copy_vis, 0, end, cpc_handler -#endif diff --git a/arch/sparc64/lib/VIScsumcopyusr.S b/arch/sparc64/lib/VIScsumcopyusr.S deleted file mode 100644 index fc27b7fa4117e..0000000000000 --- a/arch/sparc64/lib/VIScsumcopyusr.S +++ /dev/null @@ -1,916 +0,0 @@ -/* $Id: VIScsumcopyusr.S,v 1.2 2000/02/20 23:21:40 davem Exp $ - * VIScsumcopyusr.S: High bandwidth IP checksumming with simultaneous - * copying utilizing the UltraSparc Visual Instruction Set. - * - * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) - * Copyright (C) 2000 David S. Miller (davem@redhat.com) - * - * Based on older sparc32/sparc64 checksum.S, which is: - * - * Copyright(C) 1995 Linus Torvalds - * Copyright(C) 1995 Miguel de Icaza - * Copyright(C) 1996,1997 David S. Miller - * derived from: - * Linux/Alpha checksum c-code - * Linux/ix86 inline checksum assembly - * RFC1071 Computing the Internet Checksum (esp. 
Jacobsons m68k code) - * David Mosberger-Tang for optimized reference c-code - * BSD4.4 portable checksum routine - */ - -#ifdef __sparc_v9__ -#define STACKOFF 0x7ff+128 -#else -#define STACKOFF 64 -#endif - -#ifdef __KERNEL__ -#include <asm/head.h> -#include <asm/asi.h> -#include <asm/page.h> -#include <asm/visasm.h> -#include <asm/thread_info.h> -#define ASI_BLK_XOR 0 -#define ASI_BLK_XOR1 (ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P) -#define ASI_BLK_OR (ASI_BLK_P & ~ASI_P) -#else -#define ASI_P 0x80 -#define ASI_BLK_P 0xf0 -#define FRPS_FEF 0x04 -#define FPRS_DU 0x02 -#define FPRS_DL 0x01 -#define ASI_BLK_XOR (ASI_BLK_P ^ ASI_P) -#endif - -#define src o0 -#define dst o1 -#define len o2 -#define sum o3 -#define x1 g1 -#define x2 g2 -#define x3 o4 -#define x4 g4 -#define x5 g5 -#define x6 g7 -#define x7 g3 -#define x8 o5 - -/* Dobrou noc, SunSoft engineers. Spete sladce. - * This has a couple of tricks in and those - * tricks are UltraLinux trade secrets :)) - * Once AGAIN, the SunSoft engineers are caught - * asleep at the keyboard :)). - * The main loop does about 20 superscalar cycles - * per 64bytes checksummed/copied. - */ - -#define LDBLK(O0) \ - ldda [%src] ASI_BLK_P, %O0 /* Load Group */ - -#define STBLK \ - stda %f48, [%dst] %asi /* Store */ - -#ifdef __KERNEL__ -#define STBLK_XORASI(tmpreg1,tmpreg2) \ - stda %f48, [%dst] %asi /* Store */; \ - rd %asi, %tmpreg1; \ - srl %tmpreg1, 3, %tmpreg2; \ - xor %tmpreg1, ASI_BLK_XOR1, %tmpreg1; \ - wr %tmpreg1, %tmpreg2, %asi; -#else -#define STBLK_XORASI(tmpreg1,tmpreg2) \ - stda %f48, [%dst] %asi /* Store */; \ - rd %asi, %tmpreg1; \ - wr %tmpreg1, ASI_BLK_XOR, %asi; -#endif - -#define ST(fx,off) \ - stda %fx, [%dst + off] %asi /* Store */ - -#define SYNC \ - membar #Sync - - -#define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...) 
\ - LOAD /* Load (Group) */; \ - faligndata %A14, %F0, %A14 /* FPA Group */; \ - inc %x5 /* IEU0 */; \ - STORE1 /* Store (optional) */; \ - faligndata %F0, %F2, %A0 /* FPA Group */; \ - srl %x5, 1, %x5 /* IEU0 */; \ - add %sum, %x4, %sum /* IEU1 */; \ - fpadd32 %F0, %f0, %F0 /* FPA Group */; \ - inc %x6 /* IEU0 */; \ - STORE2 /* Store (optional) */; \ - faligndata %F2, %F4, %A2 /* FPA Group */; \ - srl %x6, 1, %x6 /* IEU0 */; \ - add %sum, %x5, %sum /* IEU1 */; \ - fpadd32 %F2, %f2, %F2 /* FPA Group */; \ - add %src, 64, %src /* IEU0 */; \ - fcmpgt32 %f0, %F0, %x1 /* FPM */; \ - add %dst, 64, %dst /* IEU1 Group */; \ - inc %x7 /* IEU0 */; \ - STORE3 /* Store (optional) */; \ - faligndata %F4, %F6, %A4 /* FPA */; \ - fpadd32 %F4, %f4, %F4 /* FPA Group */; \ - add %sum, %x6, %sum /* IEU1 */; \ - fcmpgt32 %f2, %F2, %x2 /* FPM */; \ - srl %x7, 1, %x7 /* IEU0 Group */; \ - inc %x8 /* IEU1 */; \ - STORE4 /* Store (optional) */; \ - faligndata %F6, %F8, %A6 /* FPA */; \ - fpadd32 %F6, %f6, %F6 /* FPA Group */; \ - srl %x8, 1, %x8 /* IEU0 */; \ - fcmpgt32 %f4, %F4, %x3 /* FPM */; \ - add %sum, %x7, %sum /* IEU0 Group */; \ - inc %x1 /* IEU1 */; \ - STORE5 /* Store (optional) */; \ - faligndata %F8, %F10, %A8 /* FPA */; \ - fpadd32 %F8, %f8, %F8 /* FPA Group */; \ - srl %x1, 1, %x1 /* IEU0 */; \ - fcmpgt32 %f6, %F6, %x4 /* FPM */; \ - add %sum, %x8, %sum /* IEU0 Group */; \ - inc %x2 /* IEU1 */; \ - STORE6 /* Store (optional) */; \ - faligndata %F10, %F12, %A10 /* FPA */; \ - fpadd32 %F10, %f10, %F10 /* FPA Group */; \ - srl %x2, 1, %x2 /* IEU0 */; \ - fcmpgt32 %f8, %F8, %x5 /* FPM */; \ - add %sum, %x1, %sum /* IEU0 Group */; \ - inc %x3 /* IEU1 */; \ - STORE7 /* Store (optional) */; \ - faligndata %F12, %F14, %A12 /* FPA */; \ - fpadd32 %F12, %f12, %F12 /* FPA Group */; \ - srl %x3, 1, %x3 /* IEU0 */; \ - fcmpgt32 %f10, %F10, %x6 /* FPM */; \ - add %sum, %x2, %sum /* IEU0 Group */; \ - inc %x4 /* IEU1 */; \ - STORE8 /* Store (optional) */; \ - fmovd %F14, %B14 /* FPA */; 
\ - fpadd32 %F14, %f14, %F14 /* FPA Group */; \ - srl %x4, 1, %x4 /* IEU0 */; \ - fcmpgt32 %f12, %F12, %x7 /* FPM */; \ - add %sum, %x3, %sum /* IEU0 Group */; \ - subcc %len, 64, %len /* IEU1 */; \ - BRANCH /* CTI */; \ - fcmpgt32 %f14, %F14, %x8 /* FPM Group */; - -#define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz) \ - inc %x5 /* IEU0 Group */; \ - fpadd32 %f2, %f0, %S0 /* FPA */; \ - add %sum, %x4, %sum /* IEU1 */; \ - srl %x5, 1, %x5 /* IEU0 Group */; \ - fpadd32 %f6, %f4, %S1 /* FPA */; \ - inc %x6 /* IEU1 */; \ - fpadd32 %f10, %f8, %S2 /* FPA Group */; \ - add %sum, %x5, %sum /* IEU0 */; \ - fcmpgt32 %f0, %S0, %x1 /* FPM */; \ - fpadd32 %f14, %f12, %S3 /* FPA Group */; \ - srl %x6, 1, %x6 /* IEU0 */; \ - fcmpgt32 %f4, %S1, %x2 /* FPM */; \ - add %sum, %x6, %sum /* IEU0 Group */; \ - fzero %fz /* FPA */; \ - fcmpgt32 %f8, %S2, %x3 /* FPM */; \ - inc %x7 /* IEU0 Group */; \ - inc %x8 /* IEU1 */; \ - srl %x7, 1, %x7 /* IEU0 Group */; \ - inc %x1 /* IEU1 */; \ - fpadd32 %S0, %S1, %T0 /* FPA */; \ - fpadd32 %S2, %S3, %T1 /* FPA Group */; \ - add %sum, %x7, %sum /* IEU0 */; \ - fcmpgt32 %f12, %S3, %x4 /* FPM */; \ - srl %x8, 1, %x8 /* IEU0 Group */; \ - inc %x2 /* IEU1 */; \ - srl %x1, 1, %x1 /* IEU0 Group */; \ - add %sum, %x8, %sum /* IEU1 */; \ - add %sum, %x1, %sum /* IEU0 Group */; \ - fcmpgt32 %S0, %T0, %x5 /* FPM */; \ - srl %x2, 1, %x2 /* IEU0 Group */; \ - fcmpgt32 %S2, %T1, %x6 /* FPM */; \ - inc %x3 /* IEU0 Group */; \ - add %sum, %x2, %sum /* IEU1 */; \ - srl %x3, 1, %x3 /* IEU0 Group */; \ - inc %x4 /* IEU1 */; \ - fpadd32 %T0, %T1, %U0 /* FPA Group */; \ - add %sum, %x3, %sum /* IEU0 */; \ - fcmpgt32 %fz, %f2, %x7 /* FPM */; \ - srl %x4, 1, %x4 /* IEU0 Group */; \ - fcmpgt32 %fz, %f6, %x8 /* FPM */; \ - inc %x5 /* IEU0 Group */; \ - add %sum, %x4, %sum /* IEU1 */; \ - srl %x5, 1, %x5 /* IEU0 Group */; \ - fcmpgt32 %fz, %f10, %x1 /* FPM */; \ - inc %x6 /* IEU0 Group */; \ - add %sum, %x5, %sum /* IEU1 */; \ - fmovd %FA, %FB 
/* FPA Group */; \ - fcmpgt32 %fz, %f14, %x2 /* FPM */; \ - srl %x6, 1, %x6 /* IEU0 Group */; \ - ba,pt %xcc, ett /* CTI */; \ - inc %x7 /* IEU1 */; - -#define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB) \ - END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62) - -#define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz) \ - fpadd32 %U0, %U1, %V0 /* FPA Group */; \ - srl %x7, 1, %x7 /* IEU0 */; \ - add %sum, %x6, %sum /* IEU1 */; \ - std %V0, [%sp + STACKOFF] /* Store Group */; \ - inc %x8 /* IEU0 */; \ - sub %sum, %x7, %sum /* IEU1 */; \ - srl %x8, 1, %x8 /* IEU0 Group */; \ - fcmpgt32 %fz, %S1, %x3 /* FPM */; \ - inc %x1 /* IEU0 Group */; \ - fcmpgt32 %fz, %S3, %x4 /* FPM */; \ - srl %x1, 1, %x1 /* IEU0 Group */; \ - sub %sum, %x8, %sum /* IEU1 */; \ - ldx [%sp + STACKOFF], %x8 /* Load Group */; \ - inc %x2 /* IEU0 */; \ - sub %sum, %x1, %sum /* IEU1 */; \ - srl %x2, 1, %x2 /* IEU0 Group */; \ - fcmpgt32 %fz, %T1, %x5 /* FPM */; \ - inc %x3 /* IEU0 Group */; \ - fcmpgt32 %T0, %U0, %x6 /* FPM */; \ - srl %x3, 1, %x3 /* IEU0 Group */; \ - sub %sum, %x2, %sum /* IEU1 */; \ - inc %x4 /* IEU0 Group */; \ - sub %sum, %x3, %sum /* IEU1 */; \ - srl %x4, 1, %x4 /* IEU0 Group */; \ - fcmpgt32 %fz, %U1, %x7 /* FPM */; \ - inc %x5 /* IEU0 Group */; \ - fcmpgt32 %U0, %V0, %x1 /* FPM */; \ - srl %x5, 1, %x5 /* IEU0 Group */; \ - sub %sum, %x4, %sum /* IEU1 */; \ - sub %sum, %x5, %sum /* IEU0 Group */; \ - fcmpgt32 %fz, %V0, %x2 /* FPM */; \ - inc %x6 /* IEU0 Group */; \ - inc %x7 /* IEU1 */; \ - srl %x6, 1, %x6 /* IEU0 Group */; \ - inc %x1 /* IEU1 */; \ - srl %x7, 1, %x7 /* IEU0 Group */; \ - add %sum, %x6, %sum /* IEU1 */; \ - srl %x1, 1, %x1 /* IEU0 Group */; \ - sub %sum, %x7, %sum /* IEU1 */; \ - inc %x2 /* IEU0 Group */; \ - add %sum, %x1, %sum /* IEU1 */; \ - srl %x2, 1, %x2 /* IEU0 Group */; \ - sub %sum, %x2, %sum /* IEU0 Group */; \ - addcc %sum, %x8, %sum /* IEU1 Group */; \ - bcs,a,pn %xcc, 33f /* CTI */; \ - add %sum, 1, %sum /* IEU0 
(Group) */; \ -33: /* That's it */; - - .text - .globl csum_partial_copy_user_vis - .align 32 -/* %asi should be either ASI_P or ASI_AIUS for csum_partial_copy resp. - * csum_partial_copy_from_user - * This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256 - */ -csum_partial_copy_user_vis: - andcc %dst, 7, %g0 /* IEU1 Group */ - be,pt %icc, 4f /* CTI */ - and %dst, 0x38, %o4 /* IEU0 */ - mov 1, %g5 /* IEU0 Group */ - andcc %dst, 2, %g0 /* IEU1 */ - be,pt %icc, 1f /* CTI */ - and %dst, 4, %g7 /* IEU0 Group */ - lduh [%src], %g2 /* Load */ - sub %len, 2, %len /* IEU0 Group */ - add %dst, 2, %dst /* IEU1 */ - andcc %dst, 4, %g7 /* IEU1 Group */ - sll %g5, 16, %g5 /* IEU0 */ - stha %g2, [%dst - 2] %asi /* Store Group */ - sll %g2, 16, %g2 /* IEU0 */ - add %src, 2, %src /* IEU1 */ - addcc %g2, %sum, %sum /* IEU1 Group */ - bcs,a,pn %icc, 1f /* CTI */ - add %sum, %g5, %sum /* IEU0 */ -1: lduw [%src], %g2 /* Load */ - brz,a,pn %g7, 4f /* CTI+IEU1 Group */ - and %dst, 0x38, %o4 /* IEU0 */ - add %dst, 4, %dst /* IEU0 Group */ - sub %len, 4, %len /* IEU1 */ - addcc %g2, %sum, %sum /* IEU1 Group */ - bcs,a,pn %icc, 1f /* CTI */ - add %sum, 1, %sum /* IEU0 */ -1: and %dst, 0x38, %o4 /* IEU0 Group */ - stwa %g2, [%dst - 4] %asi /* Store */ - add %src, 4, %src /* IEU1 */ -4: -#ifdef __KERNEL__ - VISEntry -#endif - mov %src, %g7 /* IEU1 Group */ - fzero %f48 /* FPA */ - alignaddr %src, %g0, %src /* Single Group */ - subcc %g7, %src, %g7 /* IEU1 Group */ - be,pt %xcc, 1f /* CTI */ - mov 0x40, %g1 /* IEU0 */ - lduw [%src], %g2 /* Load Group */ - subcc %sum, %g2, %sum /* IEU1 Group+load stall*/ - bcs,a,pn %icc, 1f /* CTI */ - sub %sum, 1, %sum /* IEU0 */ -1: srl %sum, 0, %sum /* IEU0 Group */ - clr %g5 /* IEU1 */ - brz,pn %o4, 3f /* CTI+IEU1 Group */ - sub %g1, %o4, %g1 /* IEU0 */ - ldd [%src], %f0 /* Load */ - clr %o4 /* IEU0 Group */ - andcc %dst, 8, %g0 /* IEU1 */ - be,pn %icc, 1f /* CTI */ - ldd [%src + 8], %f2 /* Load Group */ - add %src, 8, %src /* IEU0 */ - sub 
%len, 8, %len /* IEU1 */ - fpadd32 %f0, %f48, %f50 /* FPA */ - addcc %dst, 8, %dst /* IEU1 Group */ - faligndata %f0, %f2, %f16 /* FPA */ - fcmpgt32 %f48, %f50, %o4 /* FPM Group */ - fmovd %f2, %f0 /* FPA Group */ - ldd [%src + 8], %f2 /* Load */ - stda %f16, [%dst - 8] %asi /* Store */ - fmovd %f50, %f48 /* FPA */ -1: andcc %g1, 0x10, %g0 /* IEU1 Group */ - be,pn %icc, 1f /* CTI */ - and %g1, 0x20, %g1 /* IEU0 */ - fpadd32 %f0, %f48, %f50 /* FPA */ - ldd [%src + 16], %f4 /* Load Group */ - add %src, 16, %src /* IEU0 */ - add %dst, 16, %dst /* IEU1 */ - faligndata %f0, %f2, %f16 /* FPA */ - fcmpgt32 %f48, %f50, %g5 /* FPM Group */ - sub %len, 16, %len /* IEU0 */ - inc %o4 /* IEU1 */ - stda %f16, [%dst - 16] %asi /* Store Group */ - fpadd32 %f2, %f50, %f48 /* FPA */ - srl %o4, 1, %o5 /* IEU0 */ - faligndata %f2, %f4, %f18 /* FPA Group */ - stda %f18, [%dst - 8] %asi /* Store */ - fcmpgt32 %f50, %f48, %o4 /* FPM Group */ - add %o5, %sum, %sum /* IEU0 */ - ldd [%src + 8], %f2 /* Load */ - fmovd %f4, %f0 /* FPA */ -1: brz,a,pn %g1, 4f /* CTI+IEU1 Group */ - rd %asi, %g2 /* LSU Group + 4 bubbles*/ - inc %g5 /* IEU0 */ - fpadd32 %f0, %f48, %f50 /* FPA */ - ldd [%src + 16], %f4 /* Load Group */ - srl %g5, 1, %g5 /* IEU0 */ - add %dst, 32, %dst /* IEU1 */ - faligndata %f0, %f2, %f16 /* FPA */ - fcmpgt32 %f48, %f50, %o5 /* FPM Group */ - inc %o4 /* IEU0 */ - ldd [%src + 24], %f6 /* Load */ - srl %o4, 1, %o4 /* IEU0 Group */ - add %g5, %sum, %sum /* IEU1 */ - ldd [%src + 32], %f8 /* Load */ - fpadd32 %f2, %f50, %f48 /* FPA */ - faligndata %f2, %f4, %f18 /* FPA Group */ - sub %len, 32, %len /* IEU0 */ - stda %f16, [%dst - 32] %asi /* Store */ - fcmpgt32 %f50, %f48, %g3 /* FPM Group */ - inc %o5 /* IEU0 */ - add %o4, %sum, %sum /* IEU1 */ - fpadd32 %f4, %f48, %f50 /* FPA */ - faligndata %f4, %f6, %f20 /* FPA Group */ - srl %o5, 1, %o5 /* IEU0 */ - fcmpgt32 %f48, %f50, %g5 /* FPM Group */ - add %o5, %sum, %sum /* IEU0 */ - stda %f18, [%dst - 24] %asi /* Store */ - fpadd32 %f6, 
%f50, %f48 /* FPA */ - inc %g3 /* IEU0 Group */ - stda %f20, [%dst - 16] %asi /* Store */ - add %src, 32, %src /* IEU1 */ - faligndata %f6, %f8, %f22 /* FPA */ - fcmpgt32 %f50, %f48, %o4 /* FPM Group */ - srl %g3, 1, %g3 /* IEU0 */ - stda %f22, [%dst - 8] %asi /* Store */ - add %g3, %sum, %sum /* IEU0 Group */ -3: rd %asi, %g2 /* LSU Group + 4 bubbles*/ -#ifdef __KERNEL__ -4: sethi %hi(vis0s), %g7 /* IEU0 Group */ - or %g2, ASI_BLK_OR, %g2 /* IEU1 */ -#else -4: rd %pc, %g7 /* LSU Group + 4 bubbles*/ -#endif - inc %g5 /* IEU0 Group */ - and %src, 0x38, %g3 /* IEU1 */ - membar #StoreLoad /* LSU Group */ - srl %g5, 1, %g5 /* IEU0 */ - inc %o4 /* IEU1 */ - sll %g3, 8, %g3 /* IEU0 Group */ - sub %len, 0xc0, %len /* IEU1 */ - addcc %g5, %sum, %sum /* IEU1 Group */ - srl %o4, 1, %o4 /* IEU0 */ - add %g7, %g3, %g7 /* IEU0 Group */ - add %o4, %sum, %sum /* IEU1 */ -#ifdef __KERNEL__ - jmpl %g7 + %lo(vis0s), %g0 /* CTI+IEU1 Group */ -#else - jmpl %g7 + (vis0s - 4b), %g0 /* CTI+IEU1 Group */ -#endif - fzero %f32 /* FPA */ - - .align 2048 -vis0s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - ldda [%src] ASI_BLK_P, %f0 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f16 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - fmovd %f48, %f62 /* FPA Group f0 available*/ - faligndata %f0, %f2, %f48 /* FPA Group f2 available*/ - fcmpgt32 %f32, %f2, %x1 /* FPM Group f4 available*/ - fpadd32 %f0, %f62, %f0 /* FPA */ - fcmpgt32 %f32, %f4, %x2 /* FPM Group f6 available*/ - faligndata %f2, %f4, %f50 /* FPA */ - fcmpgt32 %f62, %f0, %x3 /* FPM Group f8 available*/ - faligndata %f4, %f6, %f52 /* FPA */ - fcmpgt32 %f32, %f6, %x4 /* FPM Group f10 available*/ - inc %x1 /* IEU0 */ - faligndata %f6, %f8, %f54 /* FPA */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group f12 available*/ - srl %x1, 1, %x1 /* IEU0 */ - inc %x2 /* IEU1 */ - faligndata %f8, %f10, %f56 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group f14 available*/ - srl %x2, 1, %x2 /* IEU0 */ - add %sum, %x1, 
%sum /* IEU1 */ - faligndata %f10, %f12, %f58 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - inc %x3 /* IEU0 */ - add %sum, %x2, %sum /* IEU1 */ - faligndata %f12, %f14, %f60 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - srl %x3, 1, %x3 /* IEU0 */ - inc %x4 /* IEU1 */ - fmovd %f14, %f62 /* FPA */ - srl %x4, 1, %x4 /* IEU0 Group */ - add %sum, %x3, %sum /* IEU1 */ -vis0: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f48,f50,f52,f54,f56,f58,f60,f62,f62, - ,LDBLK(f32), STBLK,,,,,,,, - ,bcs,pn %icc, vis0e1) - DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f48,f50,f52,f54,f56,f58,f60,f62,f62, - ,LDBLK(f0), STBLK,,,,,,,, - ,bcs,pn %icc, vis0e2) - DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f48,f50,f52,f54,f56,f58,f60,f62,f62, - ,LDBLK(f16), STBLK,,,,,,,, - ,bcc,pt %icc, vis0) -vis0e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f48,f50,f52,f54,f56,f58,f60,f62,f32, - ,SYNC, STBLK_XORASI(x1,x2),ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48), - ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e2) -vis0e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f48,f50,f52,f54,f56,f58,f60,f62,f0, - ,SYNC, STBLK_XORASI(x1,x2),ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48), - ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e3) -vis0e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f48,f50,f52,f54,f56,f58,f60,f62,f16, - ,SYNC, STBLK_XORASI(x1,x2),ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48), - ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e1) - .align 2048 -vis1s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - sub %src, 8, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f0 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, 
%f16 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - fmovd %f0, %f58 /* FPA Group */ - fmovd %f48, %f0 /* FPA Group */ - fcmpgt32 %f32, %f2, %x2 /* FPM Group */ - faligndata %f2, %f4, %f48 /* FPA */ - fcmpgt32 %f32, %f4, %x3 /* FPM Group */ - faligndata %f4, %f6, %f50 /* FPA */ - fcmpgt32 %f32, %f6, %x4 /* FPM Group */ - faligndata %f6, %f8, %f52 /* FPA */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group */ - inc %x2 /* IEU1 */ - faligndata %f8, %f10, %f54 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - srl %x2, 1, %x2 /* IEU0 */ - faligndata %f10, %f12, %f56 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - inc %x3 /* IEU0 */ - add %sum, %x2, %sum /* IEU1 */ - faligndata %f12, %f14, %f58 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - srl %x3, 1, %x3 /* IEU0 */ - inc %x4 /* IEU1 */ - fmovd %f14, %f60 /* FPA */ - srl %x4, 1, %x4 /* IEU0 Group */ - add %sum, %x3, %sum /* IEU1 */ -vis1: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f62,f48,f50,f52,f54,f56,f58,f60,f60, - ,LDBLK(f32), ,STBLK,,,,,,, - ,bcs,pn %icc, vis1e1) - DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f62,f48,f50,f52,f54,f56,f58,f60,f60, - ,LDBLK(f0), ,STBLK,,,,,,, - ,bcs,pn %icc, vis1e2) - DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f62,f48,f50,f52,f54,f56,f58,f60,f60, - ,LDBLK(f16), ,STBLK,,,,,,, - ,bcc,pt %icc, vis1) -vis1e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f62,f48,f50,f52,f54,f56,f58,f60,f32, - ,SYNC, ,STBLK_XORASI(x1,x2),ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40), - ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e2) -vis1e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f62,f48,f50,f52,f54,f56,f58,f60,f0, - ,SYNC, ,STBLK_XORASI(x1,x2),ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40), - ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e3) 
-vis1e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f62,f48,f50,f52,f54,f56,f58,f60,f16, - ,SYNC, ,STBLK_XORASI(x1,x2),ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40), - ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e1) - .align 2048 -vis2s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - sub %src, 16, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f0 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f16 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - fmovd %f0, %f56 /* FPA Group */ - fmovd %f48, %f0 /* FPA Group */ - sub %dst, 64, %dst /* IEU0 */ - fpsub32 %f2, %f2, %f2 /* FPA Group */ - fcmpgt32 %f32, %f4, %x3 /* FPM Group */ - faligndata %f4, %f6, %f48 /* FPA */ - fcmpgt32 %f32, %f6, %x4 /* FPM Group */ - faligndata %f6, %f8, %f50 /* FPA */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group */ - faligndata %f8, %f10, %f52 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - faligndata %f10, %f12, %f54 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - inc %x3 /* IEU0 */ - faligndata %f12, %f14, %f56 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - srl %x3, 1, %x3 /* IEU0 */ - inc %x4 /* IEU1 */ - fmovd %f14, %f58 /* FPA */ - srl %x4, 1, %x4 /* IEU0 Group */ - add %sum, %x3, %sum /* IEU1 */ -vis2: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f60,f62,f48,f50,f52,f54,f56,f58,f58, - ,LDBLK(f32), ,,STBLK,,,,,, - ,bcs,pn %icc, vis2e1) - DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f60,f62,f48,f50,f52,f54,f56,f58,f58, - ,LDBLK(f0), ,,STBLK,,,,,, - ,bcs,pn %icc, vis2e2) - DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f60,f62,f48,f50,f52,f54,f56,f58,f58, - ,LDBLK(f16), ,,STBLK,,,,,, - ,bcc,pt %icc, vis2) -vis2e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f60,f62,f48,f50,f52,f54,f56,f58,f32, - ,SYNC, 
,,STBLK_XORASI(x2,x3),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96), - ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e2) -vis2e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f60,f62,f48,f50,f52,f54,f56,f58,f0, - ,SYNC, ,,STBLK_XORASI(x2,x3),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96), - ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e3) -vis2e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f60,f62,f48,f50,f52,f54,f56,f58,f16, - ,SYNC, ,,STBLK_XORASI(x2,x3),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96), - ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e1) - .align 2048 -vis3s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - sub %src, 24, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f0 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f16 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - fmovd %f0, %f54 /* FPA Group */ - fmovd %f48, %f0 /* FPA Group */ - sub %dst, 64, %dst /* IEU0 */ - fpsub32 %f2, %f2, %f2 /* FPA Group */ - fpsub32 %f4, %f4, %f4 /* FPA Group */ - fcmpgt32 %f32, %f6, %x4 /* FPM Group */ - faligndata %f6, %f8, %f48 /* FPA */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group */ - faligndata %f8, %f10, %f50 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - faligndata %f10, %f12, %f52 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - faligndata %f12, %f14, %f54 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - fmovd %f14, %f56 /* FPA */ - inc %x4 /* IEU0 */ - srl %x4, 1, %x4 /* IEU0 Group */ -vis3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f58,f60,f62,f48,f50,f52,f54,f56,f56, - ,LDBLK(f32), ,,,STBLK,,,,, - ,bcs,pn %icc, vis3e1) - DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f58,f60,f62,f48,f50,f52,f54,f56,f56, - ,LDBLK(f0), ,,,STBLK,,,,, - ,bcs,pn %icc, vis3e2) - DO_THE_TRICK( 
f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f58,f60,f62,f48,f50,f52,f54,f56,f56, - ,LDBLK(f16), ,,,STBLK,,,,, - ,bcc,pt %icc, vis3) -vis3e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f58,f60,f62,f48,f50,f52,f54,f56,f32, - ,SYNC, ,,,STBLK_XORASI(x3,x4),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88), - ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e2) -vis3e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f58,f60,f62,f48,f50,f52,f54,f56,f0, - ,SYNC, ,,,STBLK_XORASI(x3,x4),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88), - ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e3) -vis3e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f58,f60,f62,f48,f50,f52,f54,f56,f16, - ,SYNC, ,,,STBLK_XORASI(x3,x4),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88), - ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e1) - .align 2048 -vis4s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - sub %src, 32, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f0 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f16 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - fmovd %f0, %f52 /* FPA Group */ - fmovd %f48, %f0 /* FPA Group */ - sub %dst, 64, %dst /* IEU0 */ - fpsub32 %f2, %f2, %f2 /* FPA Group */ - fpsub32 %f4, %f4, %f4 /* FPA Group */ - fpsub32 %f6, %f6, %f6 /* FPA Group */ - clr %x4 /* IEU0 */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group */ - faligndata %f8, %f10, %f48 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - faligndata %f10, %f12, %f50 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - faligndata %f12, %f14, %f52 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - fmovd %f14, %f54 /* FPA */ -vis4: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f56,f58,f60,f62,f48,f50,f52,f54,f54, - ,LDBLK(f32), ,,,,STBLK,,,, - ,bcs,pn %icc, vis4e1) - DO_THE_TRICK( 
f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f56,f58,f60,f62,f48,f50,f52,f54,f54, - ,LDBLK(f0), ,,,,STBLK,,,, - ,bcs,pn %icc, vis4e2) - DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f56,f58,f60,f62,f48,f50,f52,f54,f54, - ,LDBLK(f16), ,,,,STBLK,,,, - ,bcc,pt %icc, vis4) -vis4e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f56,f58,f60,f62,f48,f50,f52,f54,f32, - ,SYNC, ,,,,STBLK_XORASI(x4,x5),ST(f48,64),ST(f50,72),ST(f52,80), - ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e2) -vis4e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f56,f58,f60,f62,f48,f50,f52,f54,f0, - ,SYNC, ,,,,STBLK_XORASI(x4,x5),ST(f48,64),ST(f50,72),ST(f52,80), - ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e3) -vis4e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f56,f58,f60,f62,f48,f50,f52,f54,f16, - ,SYNC, ,,,,STBLK_XORASI(x4,x5),ST(f48,64),ST(f50,72),ST(f52,80), - ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e1) - .align 2048 -vis5s: ldd [%src+0], %f10 /* Load Group */ - ldd [%src+8], %f12 /* Load Group */ - ldd [%src+16], %f14 /* Load Group */ - add %src, 24, %src /* IEU0 Group */ - wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - ldda [%src] ASI_BLK_P, %f16 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - fmovd %f48, %f0 /* FPA Group */ - fmuld %f32, %f32, %f2 /* FPM */ - clr %x4 /* IEU0 */ - faddd %f32, %f32, %f4 /* FPA Group */ - fmuld %f32, %f32, %f6 /* FPM */ - clr %x5 /* IEU0 */ - faddd %f32, %f32, %f8 /* FPA Group */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - sub %dst, 64, %dst /* IEU0 */ - faligndata %f10, %f12, %f48 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - faligndata %f12, %f14, %f50 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - fmovd %f14, %f52 /* FPA */ -vis5: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - 
,f54,f56,f58,f60,f62,f48,f50,f52,f52, - ,LDBLK(f32), ,,,,,STBLK,,, - ,bcs,pn %icc, vis5e1) - DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f54,f56,f58,f60,f62,f48,f50,f52,f52, - ,LDBLK(f0), ,,,,,STBLK,,, - ,bcs,pn %icc, vis5e2) - DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f54,f56,f58,f60,f62,f48,f50,f52,f52, - ,LDBLK(f16), ,,,,,STBLK,,, - ,bcc,pt %icc, vis5) -vis5e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f54,f56,f58,f60,f62,f48,f50,f52,f32, - ,SYNC, ,,,,,STBLK_XORASI(x5,x6),ST(f48,64),ST(f50,72), - ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e2) -vis5e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f54,f56,f58,f60,f62,f48,f50,f52,f0, - ,SYNC, ,,,,,STBLK_XORASI(x5,x6),ST(f48,64),ST(f50,72), - ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e3) -vis5e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f54,f56,f58,f60,f62,f48,f50,f52,f16, - ,SYNC, ,,,,,STBLK_XORASI(x5,x6),ST(f48,64),ST(f50,72), - ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e1) - .align 2048 -vis6s: ldd [%src+0], %f12 /* Load Group */ - ldd [%src+8], %f14 /* Load Group */ - add %src, 16, %src /* IEU0 Group */ - wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - ldda [%src] ASI_BLK_P, %f16 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - fmovd %f48, %f0 /* FPA Group */ - fmuld %f32, %f32, %f2 /* FPM */ - clr %x4 /* IEU0 */ - faddd %f32, %f32, %f4 /* FPA Group */ - fmuld %f32, %f32, %f6 /* FPM */ - clr %x5 /* IEU0 */ - faddd %f32, %f32, %f8 /* FPA Group */ - fmuld %f32, %f32, %f10 /* FPM */ - clr %x6 /* IEU0 */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - sub %dst, 64, %dst /* IEU0 */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - faligndata %f12, %f14, %f48 /* FPA */ - fmovd %f14, %f50 /* FPA Group */ -vis6: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - 
,f52,f54,f56,f58,f60,f62,f48,f50,f50, - ,LDBLK(f32), ,,,,,,STBLK,, - ,bcs,pn %icc, vis6e1) - DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f52,f54,f56,f58,f60,f62,f48,f50,f50, - ,LDBLK(f0), ,,,,,,STBLK,, - ,bcs,pn %icc, vis6e2) - DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f52,f54,f56,f58,f60,f62,f48,f50,f50, - ,LDBLK(f16), ,,,,,,STBLK,, - ,bcc,pt %icc, vis6) -vis6e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f52,f54,f56,f58,f60,f62,f48,f50,f32, - ,SYNC, ,,,,,,STBLK_XORASI(x6,x7),ST(f48,64), - ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e2) -vis6e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f52,f54,f56,f58,f60,f62,f48,f50,f0, - ,SYNC, ,,,,,,STBLK_XORASI(x6,x7),ST(f48,64), - ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e3) -vis6e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f52,f54,f56,f58,f60,f62,f48,f50,f16, - ,SYNC, ,,,,,,STBLK_XORASI(x6,x7),ST(f48,64), - ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e1) - .align 2048 -vis7s: ldd [%src+0], %f14 /* Load Group */ - add %src, 8, %src /* IEU0 Group */ - wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - ldda [%src] ASI_BLK_P, %f16 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - fmovd %f48, %f0 /* FPA Group */ - fmuld %f32, %f32, %f2 /* FPM */ - clr %x4 /* IEU0 */ - faddd %f32, %f32, %f4 /* FPA Group */ - fmuld %f32, %f32, %f6 /* FPM */ - clr %x5 /* IEU0 */ - faddd %f32, %f32, %f8 /* FPA Group */ - fmuld %f32, %f32, %f10 /* FPM */ - clr %x6 /* IEU0 */ - faddd %f32, %f32, %f12 /* FPA Group */ - clr %x7 /* IEU0 */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - sub %dst, 64, %dst /* IEU0 */ - fmovd %f14, %f48 /* FPA */ -vis7: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f50,f52,f54,f56,f58,f60,f62,f48,f48, - ,LDBLK(f32), ,,,,,,,STBLK, - ,bcs,pn %icc, vis7e1) - 
DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f50,f52,f54,f56,f58,f60,f62,f48,f48, - ,LDBLK(f0), ,,,,,,,STBLK, - ,bcs,pn %icc, vis7e2) - DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f50,f52,f54,f56,f58,f60,f62,f48,f48, - ,LDBLK(f16), ,,,,,,,STBLK, - ,bcc,pt %icc, vis7) -vis7e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f50,f52,f54,f56,f58,f60,f62,f48,f32, - ,SYNC, ,,,,,,,STBLK_XORASI(x7,x8), - ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e2) -vis7e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f50,f52,f54,f56,f58,f60,f62,f48,f0, - ,SYNC, ,,,,,,,STBLK_XORASI(x7,x8), - ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e3) -vis7e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, - ,f50,f52,f54,f56,f58,f60,f62,f48,f16, - ,SYNC, ,,,,,,,STBLK_XORASI(x7,x8), - ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e1) -e1: END_THE_TRICK1( f0,f2,f4,f6,f8,f10,f12,f14,f16,f6) -e2: END_THE_TRICK1( f16,f18,f20,f22,f24,f26,f28,f30,f32,f6) -e3: END_THE_TRICK1( f32,f34,f36,f38,f40,f42,f44,f46,f0,f6) -ett: rd %gsr, %x3 /* LSU Group+4bubbles */ - andcc %x3, 7, %x3 /* IEU1 Group */ - add %dst, 8, %dst /* IEU0 */ - bne,pn %icc, 1f /* CTI */ - fzero %f10 /* FPA */ - brz,a,pn %len, 2f /* CTI+IEU1 Group */ - stda %f6, [%dst - 8] %asi /* Store */ -1: cmp %len, 8 /* IEU1 */ - blu,pn %icc, 3f /* CTI */ - sub %src, 64, %src /* IEU0 Group */ -1: ldd [%src], %f2 /* Load Group */ - fpadd32 %f10, %f2, %f12 /* FPA Group+load stall*/ - add %src, 8, %src /* IEU0 */ - add %dst, 8, %dst /* IEU1 */ - faligndata %f6, %f2, %f14 /* FPA Group */ - fcmpgt32 %f10, %f12, %x5 /* FPM Group */ - stda %f14, [%dst - 16] %asi /* Store */ - fmovd %f2, %f6 /* FPA */ - fmovd %f12, %f10 /* FPA Group */ - sub %len, 8, %len /* IEU1 */ - fzero %f16 /* FPA Group - FPU nop */ - fzero %f18 /* FPA Group - FPU nop */ - inc %x5 /* 
IEU0 */ - srl %x5, 1, %x5 /* IEU0 Group (regdep) */ - cmp %len, 8 /* IEU1 */ - bgeu,pt %icc, 1b /* CTI */ - add %x5, %sum, %sum /* IEU0 Group */ -3: brz,a,pt %x3, 2f /* CTI+IEU1 */ - stda %f6, [%dst - 8] %asi /* Store Group */ - sta %f7, [%dst - 8] %asi /* Store Group */ - sub %dst, 4, %dst /* IEU0 */ - add %len, 4, %len /* IEU1 */ -2: -#ifdef __KERNEL__ - sub %sp, 8, %sp /* IEU0 Group */ -#endif - END_THE_TRICK2( f48,f50,f52,f54,f56,f58,f60,f10,f12,f62) - membar #Sync /* LSU Group */ -#ifdef __KERNEL__ - VISExit - add %sp, 8, %sp /* IEU0 Group */ -#endif -23: brnz,pn %len, 26f /* CTI+IEU1 Group */ -24: sllx %sum, 32, %g1 /* IEU0 */ -25: addcc %sum, %g1, %src /* IEU1 Group */ - srlx %src, 32, %src /* IEU0 Group (regdep) */ - bcs,a,pn %xcc, 1f /* CTI */ - add %src, 1, %src /* IEU1 */ -#ifndef __KERNEL__ -1: retl /* CTI Group brk forced*/ - srl %src, 0, %src /* IEU0 */ -#else -1: retl /* CTI Group brk forced*/ - ldx [%g6 + TI_TASK], %g4 /* Load */ -#endif -26: andcc %len, 8, %g0 /* IEU1 Group */ - be,pn %icc, 1f /* CTI */ - lduw [%src], %o4 /* Load */ - lduw [%src+4], %g2 /* Load Group */ - add %src, 8, %src /* IEU0 */ - add %dst, 8, %dst /* IEU1 */ - sllx %o4, 32, %g5 /* IEU0 Group */ - stwa %o4, [%dst - 8] %asi /* Store */ - or %g5, %g2, %g5 /* IEU0 Group */ - stwa %g2, [%dst - 4] %asi /* Store */ - addcc %g5, %sum, %sum /* IEU1 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %sum, 1, %sum /* IEU0 */ -1: andcc %len, 4, %g0 /* IEU1 Group */ - be,a,pn %icc, 1f /* CTI */ - clr %g2 /* IEU0 */ - lduw [%src], %g7 /* Load */ - add %src, 4, %src /* IEU0 Group */ - add %dst, 4, %dst /* IEU1 */ - sllx %g7, 32, %g2 /* IEU0 Group */ - stwa %g7, [%dst - 4] %asi /* Store */ -1: andcc %len, 2, %g0 /* IEU1 */ - be,a,pn %icc, 1f /* CTI */ - clr %g3 /* IEU0 Group */ - lduh [%src], %g7 /* Load */ - add %src, 2, %src /* IEU1 */ - add %dst, 2, %dst /* IEU0 Group */ - sll %g7, 16, %g3 /* IEU0 Group */ - stha %g7, [%dst - 2] %asi /* Store */ -1: andcc %len, 1, %g0 /* IEU1 */ - be,a,pn %icc, 
1f /* CTI */ - clr %o5 /* IEU0 Group */ - ldub [%src], %g7 /* Load */ - sll %g7, 8, %o5 /* IEU0 Group */ - stba %g7, [%dst] %asi /* Store */ -1: or %g2, %g3, %g3 /* IEU1 */ - or %o5, %g3, %g3 /* IEU0 Group (regdep) */ - addcc %g3, %sum, %sum /* IEU1 Group (regdep) */ - bcs,a,pn %xcc, 1f /* CTI */ - add %sum, 1, %sum /* IEU0 */ -1: ba,pt %xcc, 25b /* CTI Group */ - sllx %sum, 32, %g1 /* IEU0 */ - -#ifdef __KERNEL__ -end: - - .section __ex_table - .align 4 - .word csum_partial_copy_user_vis, 0, end, cpc_handler -#endif diff --git a/arch/sparc64/lib/VISmemset.S b/arch/sparc64/lib/VISmemset.S deleted file mode 100644 index 152723a490141..0000000000000 --- a/arch/sparc64/lib/VISmemset.S +++ /dev/null @@ -1,240 +0,0 @@ -/* $Id: VISmemset.S,v 1.10 1999/12/23 17:02:16 jj Exp $ - * VISmemset.S: High speed memset operations utilizing the UltraSparc - * Visual Instruction Set. - * - * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) - * Copyright (C) 1996, 1997, 1999 Jakub Jelinek (jakub@redhat.com) - */ - -#include "VIS.h" - -#ifdef REGS_64BIT -#define SET_BLOCKS(base, offset, source) \ - stx source, [base - offset - 0x18]; \ - stx source, [base - offset - 0x10]; \ - stx source, [base - offset - 0x08]; \ - stx source, [base - offset - 0x00]; -#else -#define SET_BLOCKS(base, offset, source) \ - stw source, [base - offset - 0x18]; \ - stw source, [base - offset - 0x14]; \ - stw source, [base - offset - 0x10]; \ - stw source, [base - offset - 0x0c]; \ - stw source, [base - offset - 0x08]; \ - stw source, [base - offset - 0x04]; \ - stw source, [base - offset - 0x00]; \ - stw source, [base - offset + 0x04]; -#endif - -#ifndef __KERNEL__ -/* So that the brz,a,pt in memset doesn't have to get through PLT, here we go... */ -#include "VISbzero.S" -#endif - -#ifdef __KERNEL__ -#include <asm/visasm.h> -#endif - - /* Well, memset is a lot easier to get right than bcopy... 
*/ - .text - .align 32 -#ifdef __KERNEL__ - .globl __memset -__memset: -#endif - .globl memset -memset: -#ifndef __KERNEL__ - brz,a,pt %o1, bzero_private - mov %o2, %o1 -#ifndef REGS_64BIT - srl %o2, 0, %o2 -#endif -#endif - mov %o0, %o4 - cmp %o2, 7 - bleu,pn %xcc, 17f - andcc %o0, 3, %g5 - be,pt %xcc, 4f - and %o1, 0xff, %o1 - cmp %g5, 3 - be,pn %xcc, 2f - stb %o1, [%o0 + 0x00] - cmp %g5, 2 - be,pt %xcc, 2f - stb %o1, [%o0 + 0x01] - stb %o1, [%o0 + 0x02] -2: sub %g5, 4, %g5 - sub %o0, %g5, %o0 - add %o2, %g5, %o2 -4: sllx %o1, 8, %g1 - andcc %o0, 4, %g0 - or %o1, %g1, %o1 - sllx %o1, 16, %g1 - or %o1, %g1, %o1 - be,pt %xcc, 2f -#ifdef REGS_64BIT - sllx %o1, 32, %g1 -#else - cmp %o2, 128 -#endif - stw %o1, [%o0] - sub %o2, 4, %o2 - add %o0, 4, %o0 -2: -#ifdef REGS_64BIT - cmp %o2, 128 - or %o1, %g1, %o1 -#endif - blu,pn %xcc, 9f - andcc %o0, 0x38, %g5 - be,pn %icc, 6f - mov 64, %o5 - andcc %o0, 8, %g0 - be,pn %icc, 1f - sub %o5, %g5, %o5 -#ifdef REGS_64BIT - stx %o1, [%o0] -#else - stw %o1, [%o0] - stw %o1, [%o0 + 4] -#endif - add %o0, 8, %o0 -1: andcc %o5, 16, %g0 - be,pn %icc, 1f - sub %o2, %o5, %o2 -#ifdef REGS_64BIT - stx %o1, [%o0] - stx %o1, [%o0 + 8] -#else - stw %o1, [%o0] - stw %o1, [%o0 + 4] - stw %o1, [%o0 + 8] - stw %o1, [%o0 + 12] -#endif - add %o0, 16, %o0 -1: andcc %o5, 32, %g0 - be,pn %icc, 7f - andncc %o2, 0x3f, %o3 -#ifdef REGS_64BIT - stx %o1, [%o0] - stx %o1, [%o0 + 8] - stx %o1, [%o0 + 16] - stx %o1, [%o0 + 24] -#else - stw %o1, [%o0] - stw %o1, [%o0 + 4] - stw %o1, [%o0 + 8] - stw %o1, [%o0 + 12] - stw %o1, [%o0 + 16] - stw %o1, [%o0 + 20] - stw %o1, [%o0 + 24] - stw %o1, [%o0 + 28] -#endif - add %o0, 32, %o0 -7: be,pn %xcc, 9f - nop -#ifdef __KERNEL__ - VISEntryHalf -#endif - ldd [%o0 - 8], %f0 -18: rd %asi, %g2 - wr %g0, ASI_BLK_P, %asi - membar #StoreStore | #LoadStore - andcc %o3, 0xc0, %g5 - and %o2, 0x3f, %o2 - fmovd %f0, %f2 - fmovd %f0, %f4 - andn %o3, 0xff, %o3 - fmovd %f0, %f6 - cmp %g5, 64 - fmovd %f0, %f8 - fmovd %f0, %f10 - fmovd 
%f0, %f12 - brz,pn %g5, 10f - fmovd %f0, %f14 - be,pn %icc, 2f - stda %f0, [%o0 + 0x00] %asi - cmp %g5, 128 - be,pn %icc, 2f - stda %f0, [%o0 + 0x40] %asi - stda %f0, [%o0 + 0x80] %asi -2: brz,pn %o3, 12f - add %o0, %g5, %o0 -10: stda %f0, [%o0 + 0x00] %asi - stda %f0, [%o0 + 0x40] %asi - stda %f0, [%o0 + 0x80] %asi - stda %f0, [%o0 + 0xc0] %asi -11: subcc %o3, 256, %o3 - bne,pt %xcc, 10b - add %o0, 256, %o0 -12: -#ifdef __KERNEL__ - wr %g2, %g0, %asi - VISExitHalf -#else -#ifndef REGS_64BIT - wr %g0, FPRS_FEF, %fprs -#endif -#endif - membar #StoreLoad | #StoreStore -9: andcc %o2, 0x78, %g5 - be,pn %xcc, 13f - andcc %o2, 7, %o2 -#ifdef __KERNEL__ -14: srl %g5, 1, %o3 - sethi %hi(13f), %g3 - sub %g3, %o3, %g3 - jmpl %g3 + %lo(13f), %g0 - add %o0, %g5, %o0 -#else -14: rd %pc, %g3 -#ifdef REGS_64BIT - srl %g5, 1, %o3 - sub %g3, %o3, %g3 -#else - sub %g3, %g5, %g3 -#endif - jmpl %g3 + (13f - 14b), %g0 - add %o0, %g5, %o0 -#endif -12: SET_BLOCKS(%o0, 0x68, %o1) - SET_BLOCKS(%o0, 0x48, %o1) - SET_BLOCKS(%o0, 0x28, %o1) - SET_BLOCKS(%o0, 0x08, %o1) -13: be,pn %xcc, 8f - andcc %o2, 4, %g0 - be,pn %xcc, 1f - andcc %o2, 2, %g0 - stw %o1, [%o0] - add %o0, 4, %o0 -1: be,pn %xcc, 1f - andcc %o2, 1, %g0 - sth %o1, [%o0] - add %o0, 2, %o0 -1: bne,a,pn %xcc, 8f - stb %o1, [%o0] -8: retl - mov %o4, %o0 -17: brz,pn %o2, 0f -8: add %o0, 1, %o0 - subcc %o2, 1, %o2 - bne,pt %xcc, 8b - stb %o1, [%o0 - 1] -0: retl - mov %o4, %o0 -6: -#ifdef REGS_64BIT - stx %o1, [%o0] -#else - stw %o1, [%o0] - stw %o1, [%o0 + 4] -#endif - andncc %o2, 0x3f, %o3 - be,pn %xcc, 9b - nop -#ifdef __KERNEL__ - VISEntryHalf -#endif - ba,pt %xcc, 18b - ldd [%o0], %f0 diff --git a/arch/sparc64/lib/atomic.S b/arch/sparc64/lib/atomic.S index 41be4131f8008..e528b8d1a3e69 100644 --- a/arch/sparc64/lib/atomic.S +++ b/arch/sparc64/lib/atomic.S @@ -29,10 +29,10 @@ .globl atomic_add .type atomic_add,#function atomic_add: /* %o0 = increment, %o1 = atomic_ptr */ -1: lduw [%o1], %g5 - add %g5, %o0, %g7 - cas [%o1], %g5, %g7 
- cmp %g5, %g7 +1: lduw [%o1], %g1 + add %g1, %o0, %g7 + cas [%o1], %g1, %g7 + cmp %g1, %g7 bne,pn %icc, 1b nop retl @@ -42,10 +42,10 @@ atomic_add: /* %o0 = increment, %o1 = atomic_ptr */ .globl atomic_sub .type atomic_sub,#function atomic_sub: /* %o0 = decrement, %o1 = atomic_ptr */ -1: lduw [%o1], %g5 - sub %g5, %o0, %g7 - cas [%o1], %g5, %g7 - cmp %g5, %g7 +1: lduw [%o1], %g1 + sub %g1, %o0, %g7 + cas [%o1], %g1, %g7 + cmp %g1, %g7 bne,pn %icc, 1b nop retl @@ -56,10 +56,10 @@ atomic_sub: /* %o0 = decrement, %o1 = atomic_ptr */ .type atomic_add_ret,#function atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */ ATOMIC_PRE_BARRIER -1: lduw [%o1], %g5 - add %g5, %o0, %g7 - cas [%o1], %g5, %g7 - cmp %g5, %g7 +1: lduw [%o1], %g1 + add %g1, %o0, %g7 + cas [%o1], %g1, %g7 + cmp %g1, %g7 bne,pn %icc, 1b add %g7, %o0, %g7 ATOMIC_POST_BARRIER @@ -71,10 +71,10 @@ atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */ .type atomic_sub_ret,#function atomic_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */ ATOMIC_PRE_BARRIER -1: lduw [%o1], %g5 - sub %g5, %o0, %g7 - cas [%o1], %g5, %g7 - cmp %g5, %g7 +1: lduw [%o1], %g1 + sub %g1, %o0, %g7 + cas [%o1], %g1, %g7 + cmp %g1, %g7 bne,pn %icc, 1b sub %g7, %o0, %g7 ATOMIC_POST_BARRIER @@ -85,10 +85,10 @@ atomic_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */ .globl atomic64_add .type atomic64_add,#function atomic64_add: /* %o0 = increment, %o1 = atomic_ptr */ -1: ldx [%o1], %g5 - add %g5, %o0, %g7 - casx [%o1], %g5, %g7 - cmp %g5, %g7 +1: ldx [%o1], %g1 + add %g1, %o0, %g7 + casx [%o1], %g1, %g7 + cmp %g1, %g7 bne,pn %xcc, 1b nop retl @@ -98,10 +98,10 @@ atomic64_add: /* %o0 = increment, %o1 = atomic_ptr */ .globl atomic64_sub .type atomic64_sub,#function atomic64_sub: /* %o0 = decrement, %o1 = atomic_ptr */ -1: ldx [%o1], %g5 - sub %g5, %o0, %g7 - casx [%o1], %g5, %g7 - cmp %g5, %g7 +1: ldx [%o1], %g1 + sub %g1, %o0, %g7 + casx [%o1], %g1, %g7 + cmp %g1, %g7 bne,pn %xcc, 1b nop retl @@ -112,10 +112,10 @@ atomic64_sub: /* %o0 = 
decrement, %o1 = atomic_ptr */ .type atomic64_add_ret,#function atomic64_add_ret: /* %o0 = increment, %o1 = atomic_ptr */ ATOMIC_PRE_BARRIER -1: ldx [%o1], %g5 - add %g5, %o0, %g7 - casx [%o1], %g5, %g7 - cmp %g5, %g7 +1: ldx [%o1], %g1 + add %g1, %o0, %g7 + casx [%o1], %g1, %g7 + cmp %g1, %g7 bne,pn %xcc, 1b add %g7, %o0, %g7 ATOMIC_POST_BARRIER @@ -127,10 +127,10 @@ atomic64_add_ret: /* %o0 = increment, %o1 = atomic_ptr */ .type atomic64_sub_ret,#function atomic64_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */ ATOMIC_PRE_BARRIER -1: ldx [%o1], %g5 - sub %g5, %o0, %g7 - casx [%o1], %g5, %g7 - cmp %g5, %g7 +1: ldx [%o1], %g1 + sub %g1, %o0, %g7 + casx [%o1], %g1, %g7 + cmp %g1, %g7 bne,pn %xcc, 1b sub %g7, %o0, %g7 ATOMIC_POST_BARRIER diff --git a/arch/sparc64/lib/bitops.S b/arch/sparc64/lib/bitops.S index fd20171ecfd10..886dcd2b376a0 100644 --- a/arch/sparc64/lib/bitops.S +++ b/arch/sparc64/lib/bitops.S @@ -26,17 +26,17 @@ test_and_set_bit: /* %o0=nr, %o1=addr */ BITOP_PRE_BARRIER srlx %o0, 6, %g1 - mov 1, %g5 + mov 1, %o2 sllx %g1, 3, %g3 and %o0, 63, %g2 - sllx %g5, %g2, %g5 + sllx %o2, %g2, %o2 add %o1, %g3, %o1 1: ldx [%o1], %g7 - or %g7, %g5, %g1 + or %g7, %o2, %g1 casx [%o1], %g7, %g1 cmp %g7, %g1 bne,pn %xcc, 1b - and %g7, %g5, %g2 + and %g7, %o2, %g2 BITOP_POST_BARRIER clr %o0 retl @@ -48,17 +48,17 @@ test_and_set_bit: /* %o0=nr, %o1=addr */ test_and_clear_bit: /* %o0=nr, %o1=addr */ BITOP_PRE_BARRIER srlx %o0, 6, %g1 - mov 1, %g5 + mov 1, %o2 sllx %g1, 3, %g3 and %o0, 63, %g2 - sllx %g5, %g2, %g5 + sllx %o2, %g2, %o2 add %o1, %g3, %o1 1: ldx [%o1], %g7 - andn %g7, %g5, %g1 + andn %g7, %o2, %g1 casx [%o1], %g7, %g1 cmp %g7, %g1 bne,pn %xcc, 1b - and %g7, %g5, %g2 + and %g7, %o2, %g2 BITOP_POST_BARRIER clr %o0 retl @@ -70,17 +70,17 @@ test_and_clear_bit: /* %o0=nr, %o1=addr */ test_and_change_bit: /* %o0=nr, %o1=addr */ BITOP_PRE_BARRIER srlx %o0, 6, %g1 - mov 1, %g5 + mov 1, %o2 sllx %g1, 3, %g3 and %o0, 63, %g2 - sllx %g5, %g2, %g5 + sllx %o2, %g2, %o2 
add %o1, %g3, %o1 1: ldx [%o1], %g7 - xor %g7, %g5, %g1 + xor %g7, %o2, %g1 casx [%o1], %g7, %g1 cmp %g7, %g1 bne,pn %xcc, 1b - and %g7, %g5, %g2 + and %g7, %o2, %g2 BITOP_POST_BARRIER clr %o0 retl @@ -91,13 +91,13 @@ test_and_change_bit: /* %o0=nr, %o1=addr */ .type set_bit,#function set_bit: /* %o0=nr, %o1=addr */ srlx %o0, 6, %g1 - mov 1, %g5 + mov 1, %o2 sllx %g1, 3, %g3 and %o0, 63, %g2 - sllx %g5, %g2, %g5 + sllx %o2, %g2, %o2 add %o1, %g3, %o1 1: ldx [%o1], %g7 - or %g7, %g5, %g1 + or %g7, %o2, %g1 casx [%o1], %g7, %g1 cmp %g7, %g1 bne,pn %xcc, 1b @@ -110,13 +110,13 @@ set_bit: /* %o0=nr, %o1=addr */ .type clear_bit,#function clear_bit: /* %o0=nr, %o1=addr */ srlx %o0, 6, %g1 - mov 1, %g5 + mov 1, %o2 sllx %g1, 3, %g3 and %o0, 63, %g2 - sllx %g5, %g2, %g5 + sllx %o2, %g2, %o2 add %o1, %g3, %o1 1: ldx [%o1], %g7 - andn %g7, %g5, %g1 + andn %g7, %o2, %g1 casx [%o1], %g7, %g1 cmp %g7, %g1 bne,pn %xcc, 1b @@ -129,13 +129,13 @@ clear_bit: /* %o0=nr, %o1=addr */ .type change_bit,#function change_bit: /* %o0=nr, %o1=addr */ srlx %o0, 6, %g1 - mov 1, %g5 + mov 1, %o2 sllx %g1, 3, %g3 and %o0, 63, %g2 - sllx %g5, %g2, %g5 + sllx %o2, %g2, %o2 add %o1, %g3, %o1 1: ldx [%o1], %g7 - xor %g7, %g5, %g1 + xor %g7, %o2, %g1 casx [%o1], %g7, %g1 cmp %g7, %g1 bne,pn %xcc, 1b diff --git a/arch/sparc64/lib/bzero.S b/arch/sparc64/lib/bzero.S new file mode 100644 index 0000000000000..21a933ffb7c29 --- /dev/null +++ b/arch/sparc64/lib/bzero.S @@ -0,0 +1,158 @@ +/* bzero.S: Simple prefetching memset, bzero, and clear_user + * implementations. + * + * Copyright (C) 2005 David S. 
Miller <davem@davemloft.net> + */ + + .text + + .globl __memset + .type __memset, #function +__memset: /* %o0=buf, %o1=pat, %o2=len */ + + .globl memset + .type memset, #function +memset: /* %o0=buf, %o1=pat, %o2=len */ + and %o1, 0xff, %o3 + mov %o2, %o1 + sllx %o3, 8, %g1 + or %g1, %o3, %o2 + sllx %o2, 16, %g1 + or %g1, %o2, %o2 + sllx %o2, 32, %g1 + ba,pt %xcc, 1f + or %g1, %o2, %o2 + + .globl __bzero + .type __bzero, #function +__bzero: /* %o0=buf, %o1=len */ + clr %o2 +1: mov %o0, %o3 + brz,pn %o1, __bzero_done + cmp %o1, 16 + bl,pn %icc, __bzero_tiny + prefetch [%o0 + 0x000], #n_writes + andcc %o0, 0x3, %g0 + be,pt %icc, 2f +1: stb %o2, [%o0 + 0x00] + add %o0, 1, %o0 + andcc %o0, 0x3, %g0 + bne,pn %icc, 1b + sub %o1, 1, %o1 +2: andcc %o0, 0x7, %g0 + be,pt %icc, 3f + stw %o2, [%o0 + 0x00] + sub %o1, 4, %o1 + add %o0, 4, %o0 +3: and %o1, 0x38, %g1 + cmp %o1, 0x40 + andn %o1, 0x3f, %o4 + bl,pn %icc, 5f + and %o1, 0x7, %o1 + prefetch [%o0 + 0x040], #n_writes + prefetch [%o0 + 0x080], #n_writes + prefetch [%o0 + 0x0c0], #n_writes + prefetch [%o0 + 0x100], #n_writes + prefetch [%o0 + 0x140], #n_writes +4: prefetch [%o0 + 0x180], #n_writes + stx %o2, [%o0 + 0x00] + stx %o2, [%o0 + 0x08] + stx %o2, [%o0 + 0x10] + stx %o2, [%o0 + 0x18] + stx %o2, [%o0 + 0x20] + stx %o2, [%o0 + 0x28] + stx %o2, [%o0 + 0x30] + stx %o2, [%o0 + 0x38] + subcc %o4, 0x40, %o4 + bne,pt %icc, 4b + add %o0, 0x40, %o0 + brz,pn %g1, 6f + nop +5: stx %o2, [%o0 + 0x00] + subcc %g1, 8, %g1 + bne,pt %icc, 5b + add %o0, 0x8, %o0 +6: brz,pt %o1, __bzero_done + nop +__bzero_tiny: +1: stb %o2, [%o0 + 0x00] + subcc %o1, 1, %o1 + bne,pt %icc, 1b + add %o0, 1, %o0 +__bzero_done: + retl + mov %o3, %o0 + .size __bzero, .-__bzero + .size __memset, .-__memset + .size memset, .-memset + +#define EX_ST(x,y) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: retl; \ + mov %o1, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; + + .globl __bzero_noasi + .type __bzero_noasi, 
#function +__bzero_noasi: /* %o0=buf, %o1=len */ + brz,pn %o1, __bzero_noasi_done + cmp %o1, 16 + bl,pn %icc, __bzero_noasi_tiny + EX_ST(prefetcha [%o0 + 0x00] %asi, #n_writes) + andcc %o0, 0x3, %g0 + be,pt %icc, 2f +1: EX_ST(stba %g0, [%o0 + 0x00] %asi) + add %o0, 1, %o0 + andcc %o0, 0x3, %g0 + bne,pn %icc, 1b + sub %o1, 1, %o1 +2: andcc %o0, 0x7, %g0 + be,pt %icc, 3f + EX_ST(stwa %g0, [%o0 + 0x00] %asi) + sub %o1, 4, %o1 + add %o0, 4, %o0 +3: and %o1, 0x38, %g1 + cmp %o1, 0x40 + andn %o1, 0x3f, %o4 + bl,pn %icc, 5f + and %o1, 0x7, %o1 + EX_ST(prefetcha [%o0 + 0x040] %asi, #n_writes) + EX_ST(prefetcha [%o0 + 0x080] %asi, #n_writes) + EX_ST(prefetcha [%o0 + 0x0c0] %asi, #n_writes) + EX_ST(prefetcha [%o0 + 0x100] %asi, #n_writes) + EX_ST(prefetcha [%o0 + 0x140] %asi, #n_writes) +4: EX_ST(prefetcha [%o0 + 0x180] %asi, #n_writes) + EX_ST(stxa %g0, [%o0 + 0x00] %asi) + EX_ST(stxa %g0, [%o0 + 0x08] %asi) + EX_ST(stxa %g0, [%o0 + 0x10] %asi) + EX_ST(stxa %g0, [%o0 + 0x18] %asi) + EX_ST(stxa %g0, [%o0 + 0x20] %asi) + EX_ST(stxa %g0, [%o0 + 0x28] %asi) + EX_ST(stxa %g0, [%o0 + 0x30] %asi) + EX_ST(stxa %g0, [%o0 + 0x38] %asi) + subcc %o4, 0x40, %o4 + bne,pt %icc, 4b + add %o0, 0x40, %o0 + brz,pn %g1, 6f + nop +5: EX_ST(stxa %g0, [%o0 + 0x00] %asi) + subcc %g1, 8, %g1 + bne,pt %icc, 5b + add %o0, 0x8, %o0 +6: brz,pt %o1, __bzero_noasi_done + nop +__bzero_noasi_tiny: +1: EX_ST(stba %g0, [%o0 + 0x00] %asi) + subcc %o1, 1, %o1 + bne,pt %icc, 1b + add %o0, 1, %o0 +__bzero_noasi_done: + retl + clr %o0 + .size __bzero_noasi, .-__bzero_noasi diff --git a/arch/sparc64/lib/checksum.S b/arch/sparc64/lib/checksum.S index dc7c887ca17a2..ba9cd3ccc2b26 100644 --- a/arch/sparc64/lib/checksum.S +++ b/arch/sparc64/lib/checksum.S @@ -13,500 +13,160 @@ * BSD4.4 portable checksum routine */ -#include <asm/errno.h> -#include <asm/head.h> -#include <asm/ptrace.h> -#include <asm/asi.h> -#include <asm/page.h> -#include <asm/thread_info.h> - - /* The problem with the "add with carry" instructions on 
Ultra - * are two fold. Firstly, they cannot pair with jack shit, - * and also they only add in the 32-bit carry condition bit - * into the accumulated sum. The following is much better. - * For larger chunks we use VIS code, which is faster ;) - */ - -#define src o0 -#define dst o1 -#define len o2 -#define sum o3 - .text - /* I think I have an erection... Once _AGAIN_ the SunSoft - * engineers are caught asleep at the keyboard, tsk tsk... - */ - -#define CSUMCOPY_LASTCHUNK(off, t0, t1) \ - ldxa [%src - off - 0x08] %asi, t0; \ - ldxa [%src - off - 0x00] %asi, t1; \ - nop; nop; \ - addcc t0, %sum, %sum; \ - stw t0, [%dst - off - 0x04]; \ - srlx t0, 32, t0; \ - bcc,pt %xcc, 51f; \ - stw t0, [%dst - off - 0x08]; \ - add %sum, 1, %sum; \ -51: addcc t1, %sum, %sum; \ - stw t1, [%dst - off + 0x04]; \ - srlx t1, 32, t1; \ - bcc,pt %xcc, 52f; \ - stw t1, [%dst - off - 0x00]; \ - add %sum, 1, %sum; \ -52: - -cpc_start: -cc_end_cruft: - andcc %g7, 8, %g0 ! IEU1 Group - be,pn %icc, 1f ! CTI - and %g7, 4, %g5 ! IEU0 - ldxa [%src + 0x00] %asi, %g2 ! Load Group - add %dst, 8, %dst ! IEU0 - add %src, 8, %src ! IEU1 - addcc %g2, %sum, %sum ! IEU1 Group + 2 bubbles - stw %g2, [%dst - 0x04] ! Store - srlx %g2, 32, %g2 ! IEU0 - bcc,pt %xcc, 1f ! CTI Group - stw %g2, [%dst - 0x08] ! Store - add %sum, 1, %sum ! IEU0 -1: brz,pt %g5, 1f ! CTI Group - clr %g2 ! IEU0 - lduwa [%src + 0x00] %asi, %g2 ! Load - add %dst, 4, %dst ! IEU0 Group - add %src, 4, %src ! IEU1 - stw %g2, [%dst - 0x04] ! Store Group + 2 bubbles - sllx %g2, 32, %g2 ! IEU0 -1: andcc %g7, 2, %g0 ! IEU1 - be,pn %icc, 1f ! CTI Group - clr %o4 ! IEU1 - lduha [%src + 0x00] %asi, %o4 ! Load - add %src, 2, %src ! IEU0 Group - add %dst, 2, %dst ! IEU1 - sth %o4, [%dst - 0x2] ! Store Group + 2 bubbles - sll %o4, 16, %o4 ! IEU0 -1: andcc %g7, 1, %g0 ! IEU1 - be,pn %icc, 1f ! CTI Group - clr %o5 ! IEU0 - lduba [%src + 0x00] %asi, %o5 ! Load - stb %o5, [%dst + 0x00] ! Store Group + 2 bubbles - sll %o5, 8, %o5 ! 
IEU0 -1: or %g2, %o4, %o4 ! IEU1 - or %o5, %o4, %o4 ! IEU0 Group - addcc %o4, %sum, %sum ! IEU1 - bcc,pt %xcc, ccfold ! CTI - nop ! IEU0 Group - b,pt %xcc, ccfold ! CTI - add %sum, 1, %sum ! IEU1 - -cc_fixit: - cmp %len, 6 ! IEU1 Group - bl,a,pn %icc, ccte ! CTI - andcc %len, 0xf, %g7 ! IEU1 Group - andcc %src, 2, %g0 ! IEU1 Group - be,pn %icc, 1f ! CTI - andcc %src, 0x4, %g0 ! IEU1 Group - lduha [%src + 0x00] %asi, %g4 ! Load - sub %len, 2, %len ! IEU0 - add %src, 2, %src ! IEU0 Group - add %dst, 2, %dst ! IEU1 - sll %g4, 16, %g3 ! IEU0 Group + 1 bubble - addcc %g3, %sum, %sum ! IEU1 - bcc,pt %xcc, 0f ! CTI - srl %sum, 16, %g3 ! IEU0 Group - add %g3, 1, %g3 ! IEU0 4 clocks (mispredict) -0: andcc %src, 0x4, %g0 ! IEU1 Group - sth %g4, [%dst - 0x2] ! Store - sll %sum, 16, %sum ! IEU0 - sll %g3, 16, %g3 ! IEU0 Group - srl %sum, 16, %sum ! IEU0 Group - or %g3, %sum, %sum ! IEU0 Group (regdep) -1: be,pt %icc, ccmerge ! CTI - andcc %len, 0xf0, %g1 ! IEU1 - lduwa [%src + 0x00] %asi, %g4 ! Load Group - sub %len, 4, %len ! IEU0 - add %src, 4, %src ! IEU1 - add %dst, 4, %dst ! IEU0 Group - addcc %g4, %sum, %sum ! IEU1 Group + 1 bubble - stw %g4, [%dst - 0x4] ! Store - bcc,pt %xcc, ccmerge ! CTI - andcc %len, 0xf0, %g1 ! IEU1 Group - b,pt %xcc, ccmerge ! CTI 4 clocks (mispredict) - add %sum, 1, %sum ! IEU0 - - .align 32 - .globl csum_partial_copy_sparc64 -csum_partial_copy_sparc64: /* %o0=src, %o1=dest, %o2=len, %o3=sum */ - xorcc %src, %dst, %o4 ! IEU1 Group - srl %sum, 0, %sum ! IEU0 - andcc %o4, 3, %g0 ! IEU1 Group - srl %len, 0, %len ! IEU0 - bne,pn %icc, ccslow ! CTI - andcc %src, 1, %g0 ! IEU1 Group - bne,pn %icc, ccslow ! CTI - cmp %len, 256 ! IEU1 Group - bgeu,pt %icc, csum_partial_copy_vis ! CTI - andcc %src, 7, %g0 ! IEU1 Group - bne,pn %icc, cc_fixit ! CTI - andcc %len, 0xf0, %g1 ! IEU1 Group -ccmerge:be,pn %icc, ccte ! CTI - andcc %len, 0xf, %g7 ! IEU1 Group - sll %g1, 2, %o4 ! IEU0 -13: sethi %hi(12f), %o5 ! IEU0 Group - add %src, %g1, %src ! 
IEU1 - sub %o5, %o4, %o5 ! IEU0 Group - jmpl %o5 + %lo(12f), %g0 ! CTI Group brk forced - add %dst, %g1, %dst ! IEU0 Group -cctbl: CSUMCOPY_LASTCHUNK(0xe8,%g2,%g3) - CSUMCOPY_LASTCHUNK(0xd8,%g2,%g3) - CSUMCOPY_LASTCHUNK(0xc8,%g2,%g3) - CSUMCOPY_LASTCHUNK(0xb8,%g2,%g3) - CSUMCOPY_LASTCHUNK(0xa8,%g2,%g3) - CSUMCOPY_LASTCHUNK(0x98,%g2,%g3) - CSUMCOPY_LASTCHUNK(0x88,%g2,%g3) - CSUMCOPY_LASTCHUNK(0x78,%g2,%g3) - CSUMCOPY_LASTCHUNK(0x68,%g2,%g3) - CSUMCOPY_LASTCHUNK(0x58,%g2,%g3) - CSUMCOPY_LASTCHUNK(0x48,%g2,%g3) - CSUMCOPY_LASTCHUNK(0x38,%g2,%g3) - CSUMCOPY_LASTCHUNK(0x28,%g2,%g3) - CSUMCOPY_LASTCHUNK(0x18,%g2,%g3) - CSUMCOPY_LASTCHUNK(0x08,%g2,%g3) -12: - andcc %len, 0xf, %g7 ! IEU1 Group -ccte: bne,pn %icc, cc_end_cruft ! CTI - nop ! IEU0 -ccfold: sllx %sum, 32, %o0 ! IEU0 Group - addcc %sum, %o0, %o0 ! IEU1 Group (regdep) - srlx %o0, 32, %o0 ! IEU0 Group (regdep) - bcs,a,pn %xcc, 1f ! CTI - add %o0, 1, %o0 ! IEU1 4 clocks (mispredict) -1: retl ! CTI Group brk forced - ldx [%g6 + TI_TASK], %g4 ! 
Load - -ccslow: mov 0, %g5 - brlez,pn %len, 4f - andcc %src, 1, %o5 - be,a,pt %icc, 1f - srl %len, 1, %g7 - sub %len, 1, %len - lduba [%src] %asi, %g5 - add %src, 1, %src - stb %g5, [%dst] - srl %len, 1, %g7 - add %dst, 1, %dst -1: brz,a,pn %g7, 3f - andcc %len, 1, %g0 - andcc %src, 2, %g0 - be,a,pt %icc, 1f - srl %g7, 1, %g7 - lduha [%src] %asi, %o4 - sub %len, 2, %len - srl %o4, 8, %g2 - sub %g7, 1, %g7 - stb %g2, [%dst] - add %o4, %g5, %g5 - stb %o4, [%dst + 1] - add %src, 2, %src - srl %g7, 1, %g7 - add %dst, 2, %dst -1: brz,a,pn %g7, 2f - andcc %len, 2, %g0 - lduwa [%src] %asi, %o4 -5: srl %o4, 24, %g2 - srl %o4, 16, %g3 - stb %g2, [%dst] - srl %o4, 8, %g2 - stb %g3, [%dst + 1] - add %src, 4, %src - stb %g2, [%dst + 2] - addcc %o4, %g5, %g5 - stb %o4, [%dst + 3] - addc %g5, %g0, %g5 - add %dst, 4, %dst - subcc %g7, 1, %g7 - bne,a,pt %icc, 5b - lduwa [%src] %asi, %o4 - sll %g5, 16, %g2 - srl %g5, 16, %g5 - srl %g2, 16, %g2 - andcc %len, 2, %g0 - add %g2, %g5, %g5 -2: be,a,pt %icc, 3f - andcc %len, 1, %g0 - lduha [%src] %asi, %o4 - andcc %len, 1, %g0 - srl %o4, 8, %g2 - add %src, 2, %src - stb %g2, [%dst] - add %g5, %o4, %g5 - stb %o4, [%dst + 1] - add %dst, 2, %dst -3: be,a,pt %icc, 1f - sll %g5, 16, %o4 - lduba [%src] %asi, %g2 - sll %g2, 8, %o4 - stb %g2, [%dst] - add %g5, %o4, %g5 - sll %g5, 16, %o4 -1: addcc %o4, %g5, %g5 - srl %g5, 16, %o4 - addc %g0, %o4, %g5 - brz,pt %o5, 4f - srl %g5, 8, %o4 - and %g5, 0xff, %g2 - and %o4, 0xff, %o4 - sll %g2, 8, %g2 - or %g2, %o4, %g5 -4: addcc %sum, %g5, %sum - addc %g0, %sum, %o0 - retl - srl %o0, 0, %o0 -cpc_end: - - /* Now the version with userspace as the destination */ -#define CSUMCOPY_LASTCHUNK_USER(off, t0, t1) \ - ldx [%src - off - 0x08], t0; \ - ldx [%src - off - 0x00], t1; \ - nop; nop; \ - addcc t0, %sum, %sum; \ - stwa t0, [%dst - off - 0x04] %asi; \ - srlx t0, 32, t0; \ - bcc,pt %xcc, 51f; \ - stwa t0, [%dst - off - 0x08] %asi; \ - add %sum, 1, %sum; \ -51: addcc t1, %sum, %sum; \ - stwa t1, [%dst - off 
+ 0x04] %asi; \ - srlx t1, 32, t1; \ - bcc,pt %xcc, 52f; \ - stwa t1, [%dst - off - 0x00] %asi; \ - add %sum, 1, %sum; \ -52: -cpc_user_start: -cc_user_end_cruft: - andcc %g7, 8, %g0 ! IEU1 Group - be,pn %icc, 1f ! CTI - and %g7, 4, %g5 ! IEU0 - ldx [%src + 0x00], %g2 ! Load Group - add %dst, 8, %dst ! IEU0 - add %src, 8, %src ! IEU1 - addcc %g2, %sum, %sum ! IEU1 Group + 2 bubbles - stwa %g2, [%dst - 0x04] %asi ! Store - srlx %g2, 32, %g2 ! IEU0 - bcc,pt %xcc, 1f ! CTI Group - stwa %g2, [%dst - 0x08] %asi ! Store - add %sum, 1, %sum ! IEU0 -1: brz,pt %g5, 1f ! CTI Group - clr %g2 ! IEU0 - lduw [%src + 0x00], %g2 ! Load - add %dst, 4, %dst ! IEU0 Group - add %src, 4, %src ! IEU1 - stwa %g2, [%dst - 0x04] %asi ! Store Group + 2 bubbles - sllx %g2, 32, %g2 ! IEU0 -1: andcc %g7, 2, %g0 ! IEU1 - be,pn %icc, 1f ! CTI Group - clr %o4 ! IEU1 - lduh [%src + 0x00], %o4 ! Load - add %src, 2, %src ! IEU0 Group - add %dst, 2, %dst ! IEU1 - stha %o4, [%dst - 0x2] %asi ! Store Group + 2 bubbles - sll %o4, 16, %o4 ! IEU0 -1: andcc %g7, 1, %g0 ! IEU1 - be,pn %icc, 1f ! CTI Group - clr %o5 ! IEU0 - ldub [%src + 0x00], %o5 ! Load - stba %o5, [%dst + 0x00] %asi ! Store Group + 2 bubbles - sll %o5, 8, %o5 ! IEU0 -1: or %g2, %o4, %o4 ! IEU1 - or %o5, %o4, %o4 ! IEU0 Group - addcc %o4, %sum, %sum ! IEU1 - bcc,pt %xcc, ccuserfold ! CTI - nop ! IEU0 Group - b,pt %xcc, ccuserfold ! CTI - add %sum, 1, %sum ! IEU1 - -cc_user_fixit: - cmp %len, 6 ! IEU1 Group - bl,a,pn %icc, ccuserte ! CTI - andcc %len, 0xf, %g7 ! IEU1 Group - andcc %src, 2, %g0 ! IEU1 Group - be,pn %icc, 1f ! CTI - andcc %src, 0x4, %g0 ! IEU1 Group - lduh [%src + 0x00], %g4 ! Load - sub %len, 2, %len ! IEU0 - add %src, 2, %src ! IEU0 Group - add %dst, 2, %dst ! IEU1 - sll %g4, 16, %g3 ! IEU0 Group + 1 bubble - addcc %g3, %sum, %sum ! IEU1 - bcc,pt %xcc, 0f ! CTI - srl %sum, 16, %g3 ! IEU0 Group - add %g3, 1, %g3 ! IEU0 4 clocks (mispredict) -0: andcc %src, 0x4, %g0 ! IEU1 Group - stha %g4, [%dst - 0x2] %asi ! 
Store - sll %sum, 16, %sum ! IEU0 - sll %g3, 16, %g3 ! IEU0 Group - srl %sum, 16, %sum ! IEU0 Group - or %g3, %sum, %sum ! IEU0 Group (regdep) -1: be,pt %icc, ccusermerge ! CTI - andcc %len, 0xf0, %g1 ! IEU1 - lduw [%src + 0x00], %g4 ! Load Group - sub %len, 4, %len ! IEU0 - add %src, 4, %src ! IEU1 - add %dst, 4, %dst ! IEU0 Group - addcc %g4, %sum, %sum ! IEU1 Group + 1 bubble - stwa %g4, [%dst - 0x4] %asi ! Store - bcc,pt %xcc, ccusermerge ! CTI - andcc %len, 0xf0, %g1 ! IEU1 Group - b,pt %xcc, ccusermerge ! CTI 4 clocks (mispredict) - add %sum, 1, %sum ! IEU0 +csum_partial_fix_alignment: + /* We checked for zero length already, so there must be + * at least one byte. + */ + be,pt %icc, 1f + nop + ldub [%o0 + 0x00], %o4 + add %o0, 1, %o0 + sub %o1, 1, %o1 +1: andcc %o0, 0x2, %g0 + be,pn %icc, csum_partial_post_align + cmp %o1, 2 + blu,pn %icc, csum_partial_end_cruft + nop + lduh [%o0 + 0x00], %o5 + add %o0, 2, %o0 + sub %o1, 2, %o1 + ba,pt %xcc, csum_partial_post_align + add %o5, %o4, %o4 .align 32 - .globl csum_partial_copy_user_sparc64 -csum_partial_copy_user_sparc64: /* %o0=src, %o1=dest, %o2=len, %o3=sum */ - xorcc %src, %dst, %o4 ! IEU1 Group - srl %sum, 0, %sum ! IEU0 - andcc %o4, 3, %g0 ! IEU1 Group - srl %len, 0, %len ! IEU0 - bne,pn %icc, ccuserslow ! CTI - andcc %src, 1, %g0 ! IEU1 Group - bne,pn %icc, ccuserslow ! CTI - cmp %len, 256 ! IEU1 Group - bgeu,pt %icc, csum_partial_copy_user_vis ! CTI - andcc %src, 7, %g0 ! IEU1 Group - bne,pn %icc, cc_user_fixit ! CTI - andcc %len, 0xf0, %g1 ! IEU1 Group -ccusermerge: - be,pn %icc, ccuserte ! CTI - andcc %len, 0xf, %g7 ! IEU1 Group - sll %g1, 2, %o4 ! IEU0 -13: sethi %hi(12f), %o5 ! IEU0 Group - add %src, %g1, %src ! IEU1 - sub %o5, %o4, %o5 ! IEU0 Group - jmpl %o5 + %lo(12f), %g0 ! CTI Group brk forced - add %dst, %g1, %dst ! 
IEU0 Group -ccusertbl: - CSUMCOPY_LASTCHUNK_USER(0xe8,%g2,%g3) - CSUMCOPY_LASTCHUNK_USER(0xd8,%g2,%g3) - CSUMCOPY_LASTCHUNK_USER(0xc8,%g2,%g3) - CSUMCOPY_LASTCHUNK_USER(0xb8,%g2,%g3) - CSUMCOPY_LASTCHUNK_USER(0xa8,%g2,%g3) - CSUMCOPY_LASTCHUNK_USER(0x98,%g2,%g3) - CSUMCOPY_LASTCHUNK_USER(0x88,%g2,%g3) - CSUMCOPY_LASTCHUNK_USER(0x78,%g2,%g3) - CSUMCOPY_LASTCHUNK_USER(0x68,%g2,%g3) - CSUMCOPY_LASTCHUNK_USER(0x58,%g2,%g3) - CSUMCOPY_LASTCHUNK_USER(0x48,%g2,%g3) - CSUMCOPY_LASTCHUNK_USER(0x38,%g2,%g3) - CSUMCOPY_LASTCHUNK_USER(0x28,%g2,%g3) - CSUMCOPY_LASTCHUNK_USER(0x18,%g2,%g3) - CSUMCOPY_LASTCHUNK_USER(0x08,%g2,%g3) -12: - andcc %len, 0xf, %g7 ! IEU1 Group -ccuserte: - bne,pn %icc, cc_user_end_cruft ! CTI - nop ! IEU0 -ccuserfold: - sllx %sum, 32, %o0 ! IEU0 Group - addcc %sum, %o0, %o0 ! IEU1 Group (regdep) - srlx %o0, 32, %o0 ! IEU0 Group (regdep) - bcs,a,pn %xcc, 1f ! CTI - add %o0, 1, %o0 ! IEU1 4 clocks (mispredict) -1: retl ! CTI Group brk forced - ldx [%g6 + TI_TASK], %g4 ! IEU0 Group - -ccuserslow: - mov 0, %g5 - brlez,pn %len, 4f - andcc %src, 1, %o5 - be,a,pt %icc, 1f - srl %len, 1, %g7 - sub %len, 1, %len - ldub [%src], %g5 - add %src, 1, %src - stba %g5, [%dst] %asi - srl %len, 1, %g7 - add %dst, 1, %dst -1: brz,a,pn %g7, 3f - andcc %len, 1, %g0 - andcc %src, 2, %g0 - be,a,pt %icc, 1f - srl %g7, 1, %g7 - lduh [%src], %o4 - sub %len, 2, %len - srl %o4, 8, %g2 - sub %g7, 1, %g7 - stba %g2, [%dst] %asi - add %o4, %g5, %g5 - stba %o4, [%dst + 1] %asi - add %src, 2, %src - srl %g7, 1, %g7 - add %dst, 2, %dst -1: brz,a,pn %g7, 2f - andcc %len, 2, %g0 - lduw [%src], %o4 -5: srl %o4, 24, %g2 - srl %o4, 16, %g3 - stba %g2, [%dst] %asi - srl %o4, 8, %g2 - stba %g3, [%dst + 1] %asi - add %src, 4, %src - stba %g2, [%dst + 2] %asi - addcc %o4, %g5, %g5 - stba %o4, [%dst + 3] %asi - addc %g5, %g0, %g5 - add %dst, 4, %dst - subcc %g7, 1, %g7 - bne,a,pt %icc, 5b - lduw [%src], %o4 - sll %g5, 16, %g2 - srl %g5, 16, %g5 - srl %g2, 16, %g2 - andcc %len, 2, %g0 - add %g2, 
%g5, %g5 -2: be,a,pt %icc, 3f - andcc %len, 1, %g0 - lduh [%src], %o4 - andcc %len, 1, %g0 - srl %o4, 8, %g2 - add %src, 2, %src - stba %g2, [%dst] %asi - add %g5, %o4, %g5 - stba %o4, [%dst + 1] %asi - add %dst, 2, %dst -3: be,a,pt %icc, 1f - sll %g5, 16, %o4 - ldub [%src], %g2 - sll %g2, 8, %o4 - stba %g2, [%dst] %asi - add %g5, %o4, %g5 - sll %g5, 16, %o4 -1: addcc %o4, %g5, %g5 - srl %g5, 16, %o4 - addc %g0, %o4, %g5 - brz,pt %o5, 4f - srl %g5, 8, %o4 - and %g5, 0xff, %g2 - and %o4, 0xff, %o4 - sll %g2, 8, %g2 - or %g2, %o4, %g5 -4: addcc %sum, %g5, %sum - addc %g0, %sum, %o0 - retl - srl %o0, 0, %o0 -cpc_user_end: - - .globl cpc_handler -cpc_handler: - ldx [%sp + 0x7ff + 128], %g1 - ldub [%g6 + TI_CURRENT_DS], %g3 - sub %g0, EFAULT, %g2 - brnz,a,pt %g1, 1f - st %g2, [%g1] -1: wr %g3, %g0, %asi + .globl csum_partial +csum_partial: /* %o0=buff, %o1=len, %o2=sum */ + prefetch [%o0 + 0x000], #n_reads + clr %o4 + prefetch [%o0 + 0x040], #n_reads + brz,pn %o1, csum_partial_finish + andcc %o0, 0x3, %g0 + + /* We "remember" whether the lowest bit in the address + * was set in %g7. Because if it is, we have to swap + * upper and lower 8 bit fields of the sum we calculate. + */ + bne,pn %icc, csum_partial_fix_alignment + andcc %o0, 0x1, %g7 + +csum_partial_post_align: + prefetch [%o0 + 0x080], #n_reads + andncc %o1, 0x3f, %o3 + + prefetch [%o0 + 0x0c0], #n_reads + sub %o1, %o3, %o1 + brz,pn %o3, 2f + prefetch [%o0 + 0x100], #n_reads + + /* So that we don't need to use the non-pairing + * add-with-carry instructions we accumulate 32-bit + * values into a 64-bit register. At the end of the + * loop we fold it down to 32-bits and so on. 
+ */ + prefetch [%o0 + 0x140], #n_reads +1: lduw [%o0 + 0x00], %o5 + lduw [%o0 + 0x04], %g1 + lduw [%o0 + 0x08], %g2 + add %o4, %o5, %o4 + lduw [%o0 + 0x0c], %g3 + add %o4, %g1, %o4 + lduw [%o0 + 0x10], %o5 + add %o4, %g2, %o4 + lduw [%o0 + 0x14], %g1 + add %o4, %g3, %o4 + lduw [%o0 + 0x18], %g2 + add %o4, %o5, %o4 + lduw [%o0 + 0x1c], %g3 + add %o4, %g1, %o4 + lduw [%o0 + 0x20], %o5 + add %o4, %g2, %o4 + lduw [%o0 + 0x24], %g1 + add %o4, %g3, %o4 + lduw [%o0 + 0x28], %g2 + add %o4, %o5, %o4 + lduw [%o0 + 0x2c], %g3 + add %o4, %g1, %o4 + lduw [%o0 + 0x30], %o5 + add %o4, %g2, %o4 + lduw [%o0 + 0x34], %g1 + add %o4, %g3, %o4 + lduw [%o0 + 0x38], %g2 + add %o4, %o5, %o4 + lduw [%o0 + 0x3c], %g3 + add %o4, %g1, %o4 + prefetch [%o0 + 0x180], #n_reads + add %o4, %g2, %o4 + subcc %o3, 0x40, %o3 + add %o0, 0x40, %o0 + bne,pt %icc, 1b + add %o4, %g3, %o4 + +2: and %o1, 0x3c, %o3 + brz,pn %o3, 2f + sub %o1, %o3, %o1 +1: lduw [%o0 + 0x00], %o5 + subcc %o3, 0x4, %o3 + add %o0, 0x4, %o0 + bne,pt %icc, 1b + add %o4, %o5, %o4 + +2: + /* fold 64-->32 */ + srlx %o4, 32, %o5 + srl %o4, 0, %o4 + add %o4, %o5, %o4 + srlx %o4, 32, %o5 + srl %o4, 0, %o4 + add %o4, %o5, %o4 + + /* fold 32-->16 */ + sethi %hi(0xffff0000), %g1 + srl %o4, 16, %o5 + andn %o4, %g1, %g2 + add %o5, %g2, %o4 + srl %o4, 16, %o5 + andn %o4, %g1, %g2 + add %o5, %g2, %o4 + +csum_partial_end_cruft: + /* %o4 has the 16-bit sum we have calculated so-far. */ + cmp %o1, 2 + blu,pt %icc, 1f + nop + lduh [%o0 + 0x00], %o5 + sub %o1, 2, %o1 + add %o0, 2, %o0 + add %o4, %o5, %o4 +1: brz,pt %o1, 1f + nop + ldub [%o0 + 0x00], %o5 + sub %o1, 1, %o1 + add %o0, 1, %o0 + sllx %o5, 8, %o5 + add %o4, %o5, %o4 +1: + /* fold 32-->16 */ + sethi %hi(0xffff0000), %g1 + srl %o4, 16, %o5 + andn %o4, %g1, %g2 + add %o5, %g2, %o4 + srl %o4, 16, %o5 + andn %o4, %g1, %g2 + add %o5, %g2, %o4 + +1: brz,pt %g7, 1f + nop + + /* We started with an odd byte, byte-swap the result. 
*/ + srl %o4, 8, %o5 + and %o4, 0xff, %g1 + sll %g1, 8, %g1 + or %o5, %g1, %o4 + +1: add %o2, %o4, %o2 + +csum_partial_finish: retl - ldx [%g6 + TI_TASK], %g4 - - .section __ex_table - .align 4 - .word cpc_start, 0, cpc_end, cpc_handler - .word cpc_user_start, 0, cpc_user_end, cpc_handler + mov %o2, %o0 diff --git a/arch/sparc64/lib/csum_copy.S b/arch/sparc64/lib/csum_copy.S new file mode 100644 index 0000000000000..71af488390646 --- /dev/null +++ b/arch/sparc64/lib/csum_copy.S @@ -0,0 +1,308 @@ +/* csum_copy.S: Checksum+copy code for sparc64 + * + * Copyright (C) 2005 David S. Miller <davem@davemloft.net> + */ + +#ifdef __KERNEL__ +#define GLOBAL_SPARE %g7 +#else +#define GLOBAL_SPARE %g5 +#endif + +#ifndef EX_LD +#define EX_LD(x) x +#endif + +#ifndef EX_ST +#define EX_ST(x) x +#endif + +#ifndef EX_RETVAL +#define EX_RETVAL(x) x +#endif + +#ifndef LOAD +#define LOAD(type,addr,dest) type [addr], dest +#endif + +#ifndef STORE +#define STORE(type,src,addr) type src, [addr] +#endif + +#ifndef FUNC_NAME +#define FUNC_NAME csum_partial_copy_nocheck +#endif + + .register %g2, #scratch + .register %g3, #scratch + + .text + +90: + /* We checked for zero length already, so there must be + * at least one byte. + */ + be,pt %icc, 1f + nop + EX_LD(LOAD(ldub, %o0 + 0x00, %o4)) + add %o0, 1, %o0 + sub %o2, 1, %o2 + EX_ST(STORE(stb, %o4, %o1 + 0x00)) + add %o1, 1, %o1 +1: andcc %o0, 0x2, %g0 + be,pn %icc, 80f + cmp %o2, 2 + blu,pn %icc, 60f + nop + EX_LD(LOAD(lduh, %o0 + 0x00, %o5)) + add %o0, 2, %o0 + sub %o2, 2, %o2 + EX_ST(STORE(sth, %o5, %o1 + 0x00)) + add %o1, 2, %o1 + ba,pt %xcc, 80f + add %o5, %o4, %o4 + + .globl FUNC_NAME +FUNC_NAME: /* %o0=src, %o1=dst, %o2=len, %o3=sum */ + LOAD(prefetch, %o0 + 0x000, #n_reads) + xor %o0, %o1, %g1 + clr %o4 + andcc %g1, 0x3, %g0 + bne,pn %icc, 95f + LOAD(prefetch, %o0 + 0x040, #n_reads) + + brz,pn %o2, 70f + andcc %o0, 0x3, %g0 + + /* We "remember" whether the lowest bit in the address + * was set in GLOBAL_SPARE. 
Because if it is, we have to swap + * upper and lower 8 bit fields of the sum we calculate. + */ + bne,pn %icc, 90b + andcc %o0, 0x1, GLOBAL_SPARE + +80: + LOAD(prefetch, %o0 + 0x080, #n_reads) + andncc %o2, 0x3f, %g3 + + LOAD(prefetch, %o0 + 0x0c0, #n_reads) + sub %o2, %g3, %o2 + brz,pn %g3, 2f + LOAD(prefetch, %o0 + 0x100, #n_reads) + + /* So that we don't need to use the non-pairing + * add-with-carry instructions we accumulate 32-bit + * values into a 64-bit register. At the end of the + * loop we fold it down to 32-bits and so on. + */ + ba,pt %xcc, 1f + LOAD(prefetch, %o0 + 0x140, #n_reads) + + .align 32 +1: EX_LD(LOAD(lduw, %o0 + 0x00, %o5)) + EX_LD(LOAD(lduw, %o0 + 0x04, %g1)) + EX_LD(LOAD(lduw, %o0 + 0x08, %g2)) + add %o4, %o5, %o4 + EX_ST(STORE(stw, %o5, %o1 + 0x00)) + EX_LD(LOAD(lduw, %o0 + 0x0c, %o5)) + add %o4, %g1, %o4 + EX_ST(STORE(stw, %g1, %o1 + 0x04)) + EX_LD(LOAD(lduw, %o0 + 0x10, %g1)) + add %o4, %g2, %o4 + EX_ST(STORE(stw, %g2, %o1 + 0x08)) + EX_LD(LOAD(lduw, %o0 + 0x14, %g2)) + add %o4, %o5, %o4 + EX_ST(STORE(stw, %o5, %o1 + 0x0c)) + EX_LD(LOAD(lduw, %o0 + 0x18, %o5)) + add %o4, %g1, %o4 + EX_ST(STORE(stw, %g1, %o1 + 0x10)) + EX_LD(LOAD(lduw, %o0 + 0x1c, %g1)) + add %o4, %g2, %o4 + EX_ST(STORE(stw, %g2, %o1 + 0x14)) + EX_LD(LOAD(lduw, %o0 + 0x20, %g2)) + add %o4, %o5, %o4 + EX_ST(STORE(stw, %o5, %o1 + 0x18)) + EX_LD(LOAD(lduw, %o0 + 0x24, %o5)) + add %o4, %g1, %o4 + EX_ST(STORE(stw, %g1, %o1 + 0x1c)) + EX_LD(LOAD(lduw, %o0 + 0x28, %g1)) + add %o4, %g2, %o4 + EX_ST(STORE(stw, %g2, %o1 + 0x20)) + EX_LD(LOAD(lduw, %o0 + 0x2c, %g2)) + add %o4, %o5, %o4 + EX_ST(STORE(stw, %o5, %o1 + 0x24)) + EX_LD(LOAD(lduw, %o0 + 0x30, %o5)) + add %o4, %g1, %o4 + EX_ST(STORE(stw, %g1, %o1 + 0x28)) + EX_LD(LOAD(lduw, %o0 + 0x34, %g1)) + add %o4, %g2, %o4 + EX_ST(STORE(stw, %g2, %o1 + 0x2c)) + EX_LD(LOAD(lduw, %o0 + 0x38, %g2)) + add %o4, %o5, %o4 + EX_ST(STORE(stw, %o5, %o1 + 0x30)) + EX_LD(LOAD(lduw, %o0 + 0x3c, %o5)) + add %o4, %g1, %o4 + EX_ST(STORE(stw, %g1, 
%o1 + 0x34)) + LOAD(prefetch, %o0 + 0x180, #n_reads) + add %o4, %g2, %o4 + EX_ST(STORE(stw, %g2, %o1 + 0x38)) + subcc %g3, 0x40, %g3 + add %o0, 0x40, %o0 + add %o4, %o5, %o4 + EX_ST(STORE(stw, %o5, %o1 + 0x3c)) + bne,pt %icc, 1b + add %o1, 0x40, %o1 + +2: and %o2, 0x3c, %g3 + brz,pn %g3, 2f + sub %o2, %g3, %o2 +1: EX_LD(LOAD(lduw, %o0 + 0x00, %o5)) + subcc %g3, 0x4, %g3 + add %o0, 0x4, %o0 + add %o4, %o5, %o4 + EX_ST(STORE(stw, %o5, %o1 + 0x00)) + bne,pt %icc, 1b + add %o1, 0x4, %o1 + +2: + /* fold 64-->32 */ + srlx %o4, 32, %o5 + srl %o4, 0, %o4 + add %o4, %o5, %o4 + srlx %o4, 32, %o5 + srl %o4, 0, %o4 + add %o4, %o5, %o4 + + /* fold 32-->16 */ + sethi %hi(0xffff0000), %g1 + srl %o4, 16, %o5 + andn %o4, %g1, %g2 + add %o5, %g2, %o4 + srl %o4, 16, %o5 + andn %o4, %g1, %g2 + add %o5, %g2, %o4 + +60: + /* %o4 has the 16-bit sum we have calculated so-far. */ + cmp %o2, 2 + blu,pt %icc, 1f + nop + EX_LD(LOAD(lduh, %o0 + 0x00, %o5)) + sub %o2, 2, %o2 + add %o0, 2, %o0 + add %o4, %o5, %o4 + EX_ST(STORE(sth, %o5, %o1 + 0x00)) + add %o1, 0x2, %o1 +1: brz,pt %o2, 1f + nop + EX_LD(LOAD(ldub, %o0 + 0x00, %o5)) + sub %o2, 1, %o2 + add %o0, 1, %o0 + EX_ST(STORE(stb, %o5, %o1 + 0x00)) + sllx %o5, 8, %o5 + add %o1, 1, %o1 + add %o4, %o5, %o4 +1: + /* fold 32-->16 */ + sethi %hi(0xffff0000), %g1 + srl %o4, 16, %o5 + andn %o4, %g1, %g2 + add %o5, %g2, %o4 + srl %o4, 16, %o5 + andn %o4, %g1, %g2 + add %o5, %g2, %o4 + +1: brz,pt GLOBAL_SPARE, 1f + nop + + /* We started with an odd byte, byte-swap the result. 
*/ + srl %o4, 8, %o5 + and %o4, 0xff, %g1 + sll %g1, 8, %g1 + or %o5, %g1, %o4 + +1: add %o3, %o4, %o3 + +70: + retl + mov %o3, %o0 + +95: mov 0, GLOBAL_SPARE + brlez,pn %o2, 4f + andcc %o0, 1, %o5 + be,a,pt %icc, 1f + srl %o2, 1, %g1 + sub %o2, 1, %o2 + EX_LD(LOAD(ldub, %o0, GLOBAL_SPARE)) + add %o0, 1, %o0 + EX_ST(STORE(stb, GLOBAL_SPARE, %o1)) + srl %o2, 1, %g1 + add %o1, 1, %o1 +1: brz,a,pn %g1, 3f + andcc %o2, 1, %g0 + andcc %o0, 2, %g0 + be,a,pt %icc, 1f + srl %g1, 1, %g1 + EX_LD(LOAD(lduh, %o0, %o4)) + sub %o2, 2, %o2 + srl %o4, 8, %g2 + sub %g1, 1, %g1 + EX_ST(STORE(stb, %g2, %o1)) + add %o4, GLOBAL_SPARE, GLOBAL_SPARE + EX_ST(STORE(stb, %o4, %o1 + 1)) + add %o0, 2, %o0 + srl %g1, 1, %g1 + add %o1, 2, %o1 +1: brz,a,pn %g1, 2f + andcc %o2, 2, %g0 + EX_LD(LOAD(lduw, %o0, %o4)) +5: srl %o4, 24, %g2 + srl %o4, 16, %g3 + EX_ST(STORE(stb, %g2, %o1)) + srl %o4, 8, %g2 + EX_ST(STORE(stb, %g3, %o1 + 1)) + add %o0, 4, %o0 + EX_ST(STORE(stb, %g2, %o1 + 2)) + addcc %o4, GLOBAL_SPARE, GLOBAL_SPARE + EX_ST(STORE(stb, %o4, %o1 + 3)) + addc GLOBAL_SPARE, %g0, GLOBAL_SPARE + add %o1, 4, %o1 + subcc %g1, 1, %g1 + bne,a,pt %icc, 5b + EX_LD(LOAD(lduw, %o0, %o4)) + sll GLOBAL_SPARE, 16, %g2 + srl GLOBAL_SPARE, 16, GLOBAL_SPARE + srl %g2, 16, %g2 + andcc %o2, 2, %g0 + add %g2, GLOBAL_SPARE, GLOBAL_SPARE +2: be,a,pt %icc, 3f + andcc %o2, 1, %g0 + EX_LD(LOAD(lduh, %o0, %o4)) + andcc %o2, 1, %g0 + srl %o4, 8, %g2 + add %o0, 2, %o0 + EX_ST(STORE(stb, %g2, %o1)) + add GLOBAL_SPARE, %o4, GLOBAL_SPARE + EX_ST(STORE(stb, %o4, %o1 + 1)) + add %o1, 2, %o1 +3: be,a,pt %icc, 1f + sll GLOBAL_SPARE, 16, %o4 + EX_LD(LOAD(ldub, %o0, %g2)) + sll %g2, 8, %o4 + EX_ST(STORE(stb, %g2, %o1)) + add GLOBAL_SPARE, %o4, GLOBAL_SPARE + sll GLOBAL_SPARE, 16, %o4 +1: addcc %o4, GLOBAL_SPARE, GLOBAL_SPARE + srl GLOBAL_SPARE, 16, %o4 + addc %g0, %o4, GLOBAL_SPARE + brz,pt %o5, 4f + srl GLOBAL_SPARE, 8, %o4 + and GLOBAL_SPARE, 0xff, %g2 + and %o4, 0xff, %o4 + sll %g2, 8, %g2 + or %g2, %o4, GLOBAL_SPARE +4: 
addcc %o3, GLOBAL_SPARE, %o3 + addc %g0, %o3, %o0 + retl + srl %o0, 0, %o0 + .size FUNC_NAME, .-FUNC_NAME diff --git a/arch/sparc64/lib/csum_copy_from_user.S b/arch/sparc64/lib/csum_copy_from_user.S new file mode 100644 index 0000000000000..817ebdae39f8e --- /dev/null +++ b/arch/sparc64/lib/csum_copy_from_user.S @@ -0,0 +1,21 @@ +/* csum_copy_from_user.S: Checksum+copy from userspace. + * + * Copyright (C) 2005 David S. Miller (davem@davemloft.net) + */ + +#define EX_LD(x) \ +98: x; \ + .section .fixup; \ + .align 4; \ +99: retl; \ + mov -1, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; + +#define FUNC_NAME __csum_partial_copy_from_user +#define LOAD(type,addr,dest) type##a [addr] %asi, dest + +#include "csum_copy.S" diff --git a/arch/sparc64/lib/csum_copy_to_user.S b/arch/sparc64/lib/csum_copy_to_user.S new file mode 100644 index 0000000000000..c2f9463ea1e26 --- /dev/null +++ b/arch/sparc64/lib/csum_copy_to_user.S @@ -0,0 +1,21 @@ +/* csum_copy_to_user.S: Checksum+copy to userspace. + * + * Copyright (C) 2005 David S. Miller (davem@davemloft.net) + */ + +#define EX_ST(x) \ +98: x; \ + .section .fixup; \ + .align 4; \ +99: retl; \ + mov -1, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; + +#define FUNC_NAME __csum_partial_copy_to_user +#define STORE(type,src,addr) type##a src, [addr] %asi + +#include "csum_copy.S" diff --git a/arch/sparc64/lib/debuglocks.c b/arch/sparc64/lib/debuglocks.c index 46e5ebfb4b7ce..c421e0c653253 100644 --- a/arch/sparc64/lib/debuglocks.c +++ b/arch/sparc64/lib/debuglocks.c @@ -138,15 +138,15 @@ wlock_again: } /* Try once to increment the counter. 
*/ __asm__ __volatile__( -" ldx [%0], %%g5\n" -" brlz,a,pn %%g5, 2f\n" +" ldx [%0], %%g1\n" +" brlz,a,pn %%g1, 2f\n" " mov 1, %0\n" -" add %%g5, 1, %%g7\n" -" casx [%0], %%g5, %%g7\n" -" sub %%g5, %%g7, %0\n" +" add %%g1, 1, %%g7\n" +" casx [%0], %%g1, %%g7\n" +" sub %%g1, %%g7, %0\n" "2:" : "=r" (val) : "0" (&(rw->lock)) - : "g5", "g7", "memory"); + : "g1", "g7", "memory"); membar("#StoreLoad | #StoreStore"); if (val) goto wlock_again; @@ -173,14 +173,14 @@ runlock_again: /* Spin trying to decrement the counter using casx. */ __asm__ __volatile__( " membar #StoreLoad | #LoadLoad\n" -" ldx [%0], %%g5\n" -" sub %%g5, 1, %%g7\n" -" casx [%0], %%g5, %%g7\n" +" ldx [%0], %%g1\n" +" sub %%g1, 1, %%g7\n" +" casx [%0], %%g1, %%g7\n" " membar #StoreLoad | #StoreStore\n" -" sub %%g5, %%g7, %0\n" +" sub %%g1, %%g7, %0\n" : "=r" (val) : "0" (&(rw->lock)) - : "g5", "g7", "memory"); + : "g1", "g7", "memory"); if (val) { if (!--stuck) { if (shown++ <= 2) @@ -216,17 +216,17 @@ wlock_again: __asm__ __volatile__( " mov 1, %%g3\n" " sllx %%g3, 63, %%g3\n" -" ldx [%0], %%g5\n" -" brlz,pn %%g5, 1f\n" -" or %%g5, %%g3, %%g7\n" -" casx [%0], %%g5, %%g7\n" +" ldx [%0], %%g1\n" +" brlz,pn %%g1, 1f\n" +" or %%g1, %%g3, %%g7\n" +" casx [%0], %%g1, %%g7\n" " membar #StoreLoad | #StoreStore\n" " ba,pt %%xcc, 2f\n" -" sub %%g5, %%g7, %0\n" +" sub %%g1, %%g7, %0\n" "1: mov 1, %0\n" "2:" : "=r" (val) : "0" (&(rw->lock)) - : "g3", "g5", "g7", "memory"); + : "g3", "g1", "g7", "memory"); if (val) { /* We couldn't get the write bit. 
*/ if (!--stuck) { @@ -248,15 +248,15 @@ wlock_again: __asm__ __volatile__( " mov 1, %%g3\n" " sllx %%g3, 63, %%g3\n" -"1: ldx [%0], %%g5\n" -" andn %%g5, %%g3, %%g7\n" -" casx [%0], %%g5, %%g7\n" -" cmp %%g5, %%g7\n" +"1: ldx [%0], %%g1\n" +" andn %%g1, %%g3, %%g7\n" +" casx [%0], %%g1, %%g7\n" +" cmp %%g1, %%g7\n" " bne,pn %%xcc, 1b\n" " membar #StoreLoad | #StoreStore" : /* no outputs */ : "r" (&(rw->lock)) - : "g3", "g5", "g7", "cc", "memory"); + : "g3", "g1", "g7", "cc", "memory"); while(rw->lock != 0) { if (!--stuck) { if (shown++ <= 2) @@ -294,14 +294,14 @@ wlock_again: " membar #StoreLoad | #LoadLoad\n" " mov 1, %%g3\n" " sllx %%g3, 63, %%g3\n" -" ldx [%0], %%g5\n" -" andn %%g5, %%g3, %%g7\n" -" casx [%0], %%g5, %%g7\n" +" ldx [%0], %%g1\n" +" andn %%g1, %%g3, %%g7\n" +" casx [%0], %%g1, %%g7\n" " membar #StoreLoad | #StoreStore\n" -" sub %%g5, %%g7, %0\n" +" sub %%g1, %%g7, %0\n" : "=r" (val) : "0" (&(rw->lock)) - : "g3", "g5", "g7", "memory"); + : "g3", "g1", "g7", "memory"); if (val) { if (!--stuck) { if (shown++ <= 2) @@ -323,17 +323,17 @@ int _do_write_trylock (rwlock_t *rw, char *str) __asm__ __volatile__( " mov 1, %%g3\n" " sllx %%g3, 63, %%g3\n" -" ldx [%0], %%g5\n" -" brlz,pn %%g5, 1f\n" -" or %%g5, %%g3, %%g7\n" -" casx [%0], %%g5, %%g7\n" +" ldx [%0], %%g1\n" +" brlz,pn %%g1, 1f\n" +" or %%g1, %%g3, %%g7\n" +" casx [%0], %%g1, %%g7\n" " membar #StoreLoad | #StoreStore\n" " ba,pt %%xcc, 2f\n" -" sub %%g5, %%g7, %0\n" +" sub %%g1, %%g7, %0\n" "1: mov 1, %0\n" "2:" : "=r" (val) : "0" (&(rw->lock)) - : "g3", "g5", "g7", "memory"); + : "g3", "g1", "g7", "memory"); if (val) { put_cpu(); @@ -347,15 +347,15 @@ int _do_write_trylock (rwlock_t *rw, char *str) __asm__ __volatile__( " mov 1, %%g3\n" " sllx %%g3, 63, %%g3\n" -"1: ldx [%0], %%g5\n" -" andn %%g5, %%g3, %%g7\n" -" casx [%0], %%g5, %%g7\n" -" cmp %%g5, %%g7\n" +"1: ldx [%0], %%g1\n" +" andn %%g1, %%g3, %%g7\n" +" casx [%0], %%g1, %%g7\n" +" cmp %%g1, %%g7\n" " bne,pn %%xcc, 1b\n" " membar 
#StoreLoad | #StoreStore" : /* no outputs */ : "r" (&(rw->lock)) - : "g3", "g5", "g7", "cc", "memory"); + : "g3", "g1", "g7", "cc", "memory"); put_cpu(); diff --git a/arch/sparc64/lib/dec_and_lock.S b/arch/sparc64/lib/dec_and_lock.S index e86906744cf6f..7e6fdaebedbab 100644 --- a/arch/sparc64/lib/dec_and_lock.S +++ b/arch/sparc64/lib/dec_and_lock.S @@ -27,12 +27,12 @@ .globl _atomic_dec_and_lock _atomic_dec_and_lock: /* %o0 = counter, %o1 = lock */ -loop1: lduw [%o0], %g5 - subcc %g5, 1, %g7 +loop1: lduw [%o0], %g2 + subcc %g2, 1, %g7 be,pn %icc, start_to_zero nop -nzero: cas [%o0], %g5, %g7 - cmp %g5, %g7 +nzero: cas [%o0], %g2, %g7 + cmp %g2, %g7 bne,pn %icc, loop1 mov 0, %g1 @@ -50,13 +50,13 @@ to_zero: ldstub [%o1], %g3 brnz,pn %g3, spin_on_lock membar #StoreLoad | #StoreStore -loop2: cas [%o0], %g5, %g7 /* ASSERT(g7 == 0) */ - cmp %g5, %g7 +loop2: cas [%o0], %g2, %g7 /* ASSERT(g7 == 0) */ + cmp %g2, %g7 be,pt %icc, out mov 1, %g1 - lduw [%o0], %g5 - subcc %g5, 1, %g7 + lduw [%o0], %g2 + subcc %g2, 1, %g7 be,pn %icc, loop2 nop membar #StoreStore | #LoadStore diff --git a/arch/sparc64/lib/mcount.S b/arch/sparc64/lib/mcount.S index 4e8c7928c49f1..2ef2e268bdcfd 100644 --- a/arch/sparc64/lib/mcount.S +++ b/arch/sparc64/lib/mcount.S @@ -38,22 +38,22 @@ _mcount: * Check whether %sp is dangerously low. */ ldub [%g6 + TI_FPDEPTH], %g1 - srl %g1, 1, %g5 - add %g5, 1, %g5 - sllx %g5, 8, %g5 ! each fpregs frame is 256b - add %g5, 192, %g5 - add %g6, %g5, %g5 ! where does task_struct+frame end? - sub %g5, STACK_BIAS, %g5 - cmp %sp, %g5 + srl %g1, 1, %g3 + add %g3, 1, %g3 + sllx %g3, 8, %g3 ! each fpregs frame is 256b + add %g3, 192, %g3 + add %g6, %g3, %g3 ! where does task_struct+frame end? + sub %g3, STACK_BIAS, %g3 + cmp %sp, %g3 bg,pt %xcc, 1f - sethi %hi(panicstring), %g5 + sethi %hi(panicstring), %g3 sethi %hi(ovstack), %g7 ! 
cant move to panic stack fast enough or %g7, %lo(ovstack), %g7 add %g7, OVSTACKSIZE, %g7 sub %g7, STACK_BIAS, %g7 mov %g7, %sp call prom_printf - or %g5, %lo(panicstring), %o0 + or %g3, %lo(panicstring), %o0 call prom_halt nop #endif diff --git a/arch/sparc64/lib/memcmp.S b/arch/sparc64/lib/memcmp.S index d34dc3d874dae..c90ad96c51b9c 100644 --- a/arch/sparc64/lib/memcmp.S +++ b/arch/sparc64/lib/memcmp.S @@ -13,12 +13,12 @@ memcmp: cmp %o2, 0 ! IEU1 Group loop: be,pn %icc, ret_0 ! CTI nop ! IEU0 - ldub [%o0], %g5 ! LSU Group + ldub [%o0], %g7 ! LSU Group ldub [%o1], %g3 ! LSU Group sub %o2, 1, %o2 ! IEU0 add %o0, 1, %o0 ! IEU1 add %o1, 1, %o1 ! IEU0 Group - subcc %g5, %g3, %g3 ! IEU1 Group + subcc %g7, %g3, %g3 ! IEU1 Group be,pt %icc, loop ! CTI cmp %o2, 0 ! IEU1 Group diff --git a/arch/sparc64/lib/memmove.S b/arch/sparc64/lib/memmove.S index 1c1ebbbdf830e..97395802c23c4 100644 --- a/arch/sparc64/lib/memmove.S +++ b/arch/sparc64/lib/memmove.S @@ -12,17 +12,17 @@ memmove: /* o0=dst o1=src o2=len */ mov %o0, %g1 cmp %o0, %o1 bleu,pt %xcc, memcpy - add %o1, %o2, %g5 - cmp %g5, %o0 + add %o1, %o2, %g7 + cmp %g7, %o0 bleu,pt %xcc, memcpy add %o0, %o2, %o5 - sub %g5, 1, %o1 + sub %g7, 1, %o1 sub %o5, 1, %o0 -1: ldub [%o1], %g5 +1: ldub [%o1], %g7 subcc %o2, 1, %o2 sub %o1, 1, %o1 - stb %g5, [%o0] + stb %g7, [%o0] bne,pt %icc, 1b sub %o0, 1, %o0 diff --git a/arch/sparc64/lib/memscan.S b/arch/sparc64/lib/memscan.S index a34c6b9d21e85..5e72d49114179 100644 --- a/arch/sparc64/lib/memscan.S +++ b/arch/sparc64/lib/memscan.S @@ -52,43 +52,43 @@ check_bytes: andcc %o5, 0xff, %g0 add %o0, -5, %g2 ba,pt %xcc, 3f - srlx %o5, 32, %g5 + srlx %o5, 32, %g7 -2: srlx %o5, 8, %g5 +2: srlx %o5, 8, %g7 be,pn %icc, 1f add %o0, -8, %g2 - andcc %g5, 0xff, %g0 - srlx %g5, 8, %g5 + andcc %g7, 0xff, %g0 + srlx %g7, 8, %g7 be,pn %icc, 1f inc %g2 - andcc %g5, 0xff, %g0 + andcc %g7, 0xff, %g0 - srlx %g5, 8, %g5 + srlx %g7, 8, %g7 be,pn %icc, 1f inc %g2 - andcc %g5, 0xff, %g0 - srlx %g5, 8, %g5 + 
andcc %g7, 0xff, %g0 + srlx %g7, 8, %g7 be,pn %icc, 1f inc %g2 andcc %g3, %o3, %g0 be,a,pn %icc, 2f mov %o0, %g2 -3: andcc %g5, 0xff, %g0 - srlx %g5, 8, %g5 +3: andcc %g7, 0xff, %g0 + srlx %g7, 8, %g7 be,pn %icc, 1f inc %g2 - andcc %g5, 0xff, %g0 - srlx %g5, 8, %g5 + andcc %g7, 0xff, %g0 + srlx %g7, 8, %g7 be,pn %icc, 1f inc %g2 - andcc %g5, 0xff, %g0 - srlx %g5, 8, %g5 + andcc %g7, 0xff, %g0 + srlx %g7, 8, %g7 be,pn %icc, 1f inc %g2 - andcc %g5, 0xff, %g0 - srlx %g5, 8, %g5 + andcc %g7, 0xff, %g0 + srlx %g7, 8, %g7 be,pn %icc, 1f inc %g2 diff --git a/arch/sparc64/lib/rwsem.S b/arch/sparc64/lib/rwsem.S new file mode 100644 index 0000000000000..174ff7b9164c5 --- /dev/null +++ b/arch/sparc64/lib/rwsem.S @@ -0,0 +1,165 @@ +/* rwsem.S: RW semaphore assembler. + * + * Written by David S. Miller (davem@redhat.com), 2001. + * Derived from asm-i386/rwsem.h + */ + +#include <asm/rwsem-const.h> + + .section .sched.text + + .globl __down_read +__down_read: +1: lduw [%o0], %g1 + add %g1, 1, %g7 + cas [%o0], %g1, %g7 + cmp %g1, %g7 + bne,pn %icc, 1b + add %g7, 1, %g7 + cmp %g7, 0 + bl,pn %icc, 3f + membar #StoreLoad | #StoreStore +2: + retl + nop +3: + save %sp, -192, %sp + call rwsem_down_read_failed + mov %i0, %o0 + ret + restore + .size __down_read, .-__down_read + + .globl __down_read_trylock +__down_read_trylock: +1: lduw [%o0], %g1 + add %g1, 1, %g7 + cmp %g7, 0 + bl,pn %icc, 2f + mov 0, %o1 + cas [%o0], %g1, %g7 + cmp %g1, %g7 + bne,pn %icc, 1b + mov 1, %o1 + membar #StoreLoad | #StoreStore +2: retl + mov %o1, %o0 + .size __down_read_trylock, .-__down_read_trylock + + .globl __down_write +__down_write: + sethi %hi(RWSEM_ACTIVE_WRITE_BIAS), %g1 + or %g1, %lo(RWSEM_ACTIVE_WRITE_BIAS), %g1 +1: + lduw [%o0], %g3 + add %g3, %g1, %g7 + cas [%o0], %g3, %g7 + cmp %g3, %g7 + bne,pn %icc, 1b + cmp %g7, 0 + bne,pn %icc, 3f + membar #StoreLoad | #StoreStore +2: retl + nop +3: + save %sp, -192, %sp + call rwsem_down_write_failed + mov %i0, %o0 + ret + restore + .size __down_write, 
.-__down_write + + .globl __down_write_trylock +__down_write_trylock: + sethi %hi(RWSEM_ACTIVE_WRITE_BIAS), %g1 + or %g1, %lo(RWSEM_ACTIVE_WRITE_BIAS), %g1 +1: + lduw [%o0], %g3 + cmp %g3, 0 + bne,pn %icc, 2f + mov 0, %o1 + add %g3, %g1, %g7 + cas [%o0], %g3, %g7 + cmp %g3, %g7 + bne,pn %icc, 1b + mov 1, %o1 + membar #StoreLoad | #StoreStore +2: retl + mov %o1, %o0 + .size __down_write_trylock, .-__down_write_trylock + + .globl __up_read +__up_read: +1: + lduw [%o0], %g1 + sub %g1, 1, %g7 + cas [%o0], %g1, %g7 + cmp %g1, %g7 + bne,pn %icc, 1b + cmp %g7, 0 + bl,pn %icc, 3f + membar #StoreLoad | #StoreStore +2: retl + nop +3: sethi %hi(RWSEM_ACTIVE_MASK), %g1 + sub %g7, 1, %g7 + or %g1, %lo(RWSEM_ACTIVE_MASK), %g1 + andcc %g7, %g1, %g0 + bne,pn %icc, 2b + nop + save %sp, -192, %sp + call rwsem_wake + mov %i0, %o0 + ret + restore + .size __up_read, .-__up_read + + .globl __up_write +__up_write: + sethi %hi(RWSEM_ACTIVE_WRITE_BIAS), %g1 + or %g1, %lo(RWSEM_ACTIVE_WRITE_BIAS), %g1 +1: + lduw [%o0], %g3 + sub %g3, %g1, %g7 + cas [%o0], %g3, %g7 + cmp %g3, %g7 + bne,pn %icc, 1b + sub %g7, %g1, %g7 + cmp %g7, 0 + bl,pn %icc, 3f + membar #StoreLoad | #StoreStore +2: + retl + nop +3: + save %sp, -192, %sp + call rwsem_wake + mov %i0, %o0 + ret + restore + .size __up_write, .-__up_write + + .globl __downgrade_write +__downgrade_write: + sethi %hi(RWSEM_WAITING_BIAS), %g1 + or %g1, %lo(RWSEM_WAITING_BIAS), %g1 +1: + lduw [%o0], %g3 + sub %g3, %g1, %g7 + cas [%o0], %g3, %g7 + cmp %g3, %g7 + bne,pn %icc, 1b + sub %g7, %g1, %g7 + cmp %g7, 0 + bl,pn %icc, 3f + membar #StoreLoad | #StoreStore +2: + retl + nop +3: + save %sp, -192, %sp + call rwsem_downgrade_wake + mov %i0, %o0 + ret + restore + .size __downgrade_write, .-__downgrade_write diff --git a/arch/sparc64/lib/rwsem.c b/arch/sparc64/lib/rwsem.c deleted file mode 100644 index e19968dbc2d15..0000000000000 --- a/arch/sparc64/lib/rwsem.c +++ /dev/null @@ -1,239 +0,0 @@ -/* rwsem.c: Don't inline expand these suckers all over the 
place. - * - * Written by David S. Miller (davem@redhat.com), 2001. - * Derived from asm-i386/rwsem.h - */ - -#include <linux/kernel.h> -#include <linux/rwsem.h> -#include <linux/init.h> -#include <linux/module.h> - -extern struct rw_semaphore *FASTCALL(rwsem_down_read_failed(struct rw_semaphore *sem)); -extern struct rw_semaphore *FASTCALL(rwsem_down_write_failed(struct rw_semaphore *sem)); -extern struct rw_semaphore *FASTCALL(rwsem_wake(struct rw_semaphore *)); -extern struct rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct rw_semaphore *)); - -void __sched __down_read(struct rw_semaphore *sem) -{ - __asm__ __volatile__( - "! beginning __down_read\n" - "1:\tlduw [%0], %%g5\n\t" - "add %%g5, 1, %%g7\n\t" - "cas [%0], %%g5, %%g7\n\t" - "cmp %%g5, %%g7\n\t" - "bne,pn %%icc, 1b\n\t" - " add %%g7, 1, %%g7\n\t" - "cmp %%g7, 0\n\t" - "bl,pn %%icc, 3f\n\t" - " membar #StoreLoad | #StoreStore\n" - "2:\n\t" - ".subsection 2\n" - "3:\tmov %0, %%g5\n\t" - "save %%sp, -160, %%sp\n\t" - "mov %%g1, %%l1\n\t" - "mov %%g2, %%l2\n\t" - "mov %%g3, %%l3\n\t" - "call %1\n\t" - " mov %%g5, %%o0\n\t" - "mov %%l1, %%g1\n\t" - "mov %%l2, %%g2\n\t" - "ba,pt %%xcc, 2b\n\t" - " restore %%l3, %%g0, %%g3\n\t" - ".previous\n\t" - "! ending __down_read" - : : "r" (sem), "i" (rwsem_down_read_failed) - : "g5", "g7", "memory", "cc"); -} -EXPORT_SYMBOL(__down_read); - -int __down_read_trylock(struct rw_semaphore *sem) -{ - int result; - - __asm__ __volatile__( - "! beginning __down_read_trylock\n" - "1:\tlduw [%1], %%g5\n\t" - "add %%g5, 1, %%g7\n\t" - "cmp %%g7, 0\n\t" - "bl,pn %%icc, 2f\n\t" - " mov 0, %0\n\t" - "cas [%1], %%g5, %%g7\n\t" - "cmp %%g5, %%g7\n\t" - "bne,pn %%icc, 1b\n\t" - " mov 1, %0\n\t" - "membar #StoreLoad | #StoreStore\n" - "2:\n\t" - "! ending __down_read_trylock" - : "=&r" (result) - : "r" (sem) - : "g5", "g7", "memory", "cc"); - - return result; -} -EXPORT_SYMBOL(__down_read_trylock); - -void __sched __down_write(struct rw_semaphore *sem) -{ - __asm__ __volatile__( - "! 
beginning __down_write\n\t" - "sethi %%hi(%2), %%g1\n\t" - "or %%g1, %%lo(%2), %%g1\n" - "1:\tlduw [%0], %%g5\n\t" - "add %%g5, %%g1, %%g7\n\t" - "cas [%0], %%g5, %%g7\n\t" - "cmp %%g5, %%g7\n\t" - "bne,pn %%icc, 1b\n\t" - " cmp %%g7, 0\n\t" - "bne,pn %%icc, 3f\n\t" - " membar #StoreLoad | #StoreStore\n" - "2:\n\t" - ".subsection 2\n" - "3:\tmov %0, %%g5\n\t" - "save %%sp, -160, %%sp\n\t" - "mov %%g2, %%l2\n\t" - "mov %%g3, %%l3\n\t" - "call %1\n\t" - " mov %%g5, %%o0\n\t" - "mov %%l2, %%g2\n\t" - "ba,pt %%xcc, 2b\n\t" - " restore %%l3, %%g0, %%g3\n\t" - ".previous\n\t" - "! ending __down_write" - : : "r" (sem), "i" (rwsem_down_write_failed), - "i" (RWSEM_ACTIVE_WRITE_BIAS) - : "g1", "g5", "g7", "memory", "cc"); -} -EXPORT_SYMBOL(__down_write); - -int __down_write_trylock(struct rw_semaphore *sem) -{ - int result; - - __asm__ __volatile__( - "! beginning __down_write_trylock\n\t" - "sethi %%hi(%2), %%g1\n\t" - "or %%g1, %%lo(%2), %%g1\n" - "1:\tlduw [%1], %%g5\n\t" - "cmp %%g5, 0\n\t" - "bne,pn %%icc, 2f\n\t" - " mov 0, %0\n\t" - "add %%g5, %%g1, %%g7\n\t" - "cas [%1], %%g5, %%g7\n\t" - "cmp %%g5, %%g7\n\t" - "bne,pn %%icc, 1b\n\t" - " mov 1, %0\n\t" - "membar #StoreLoad | #StoreStore\n" - "2:\n\t" - "! ending __down_write_trylock" - : "=&r" (result) - : "r" (sem), "i" (RWSEM_ACTIVE_WRITE_BIAS) - : "g1", "g5", "g7", "memory", "cc"); - - return result; -} -EXPORT_SYMBOL(__down_write_trylock); - -void __up_read(struct rw_semaphore *sem) -{ - __asm__ __volatile__( - "! 
beginning __up_read\n\t" - "1:\tlduw [%0], %%g5\n\t" - "sub %%g5, 1, %%g7\n\t" - "cas [%0], %%g5, %%g7\n\t" - "cmp %%g5, %%g7\n\t" - "bne,pn %%icc, 1b\n\t" - " cmp %%g7, 0\n\t" - "bl,pn %%icc, 3f\n\t" - " membar #StoreLoad | #StoreStore\n" - "2:\n\t" - ".subsection 2\n" - "3:\tsethi %%hi(%2), %%g1\n\t" - "sub %%g7, 1, %%g7\n\t" - "or %%g1, %%lo(%2), %%g1\n\t" - "andcc %%g7, %%g1, %%g0\n\t" - "bne,pn %%icc, 2b\n\t" - " mov %0, %%g5\n\t" - "save %%sp, -160, %%sp\n\t" - "mov %%g2, %%l2\n\t" - "mov %%g3, %%l3\n\t" - "call %1\n\t" - " mov %%g5, %%o0\n\t" - "mov %%l2, %%g2\n\t" - "ba,pt %%xcc, 2b\n\t" - " restore %%l3, %%g0, %%g3\n\t" - ".previous\n\t" - "! ending __up_read" - : : "r" (sem), "i" (rwsem_wake), - "i" (RWSEM_ACTIVE_MASK) - : "g1", "g5", "g7", "memory", "cc"); -} -EXPORT_SYMBOL(__up_read); - -void __up_write(struct rw_semaphore *sem) -{ - __asm__ __volatile__( - "! beginning __up_write\n\t" - "sethi %%hi(%2), %%g1\n\t" - "or %%g1, %%lo(%2), %%g1\n" - "1:\tlduw [%0], %%g5\n\t" - "sub %%g5, %%g1, %%g7\n\t" - "cas [%0], %%g5, %%g7\n\t" - "cmp %%g5, %%g7\n\t" - "bne,pn %%icc, 1b\n\t" - " sub %%g7, %%g1, %%g7\n\t" - "cmp %%g7, 0\n\t" - "bl,pn %%icc, 3f\n\t" - " membar #StoreLoad | #StoreStore\n" - "2:\n\t" - ".subsection 2\n" - "3:\tmov %0, %%g5\n\t" - "save %%sp, -160, %%sp\n\t" - "mov %%g2, %%l2\n\t" - "mov %%g3, %%l3\n\t" - "call %1\n\t" - " mov %%g5, %%o0\n\t" - "mov %%l2, %%g2\n\t" - "ba,pt %%xcc, 2b\n\t" - " restore %%l3, %%g0, %%g3\n\t" - ".previous\n\t" - "! ending __up_write" - : : "r" (sem), "i" (rwsem_wake), - "i" (RWSEM_ACTIVE_WRITE_BIAS) - : "g1", "g5", "g7", "memory", "cc"); -} -EXPORT_SYMBOL(__up_write); - -void __downgrade_write(struct rw_semaphore *sem) -{ - __asm__ __volatile__( - "! 
beginning __downgrade_write\n\t" - "sethi %%hi(%2), %%g1\n\t" - "or %%g1, %%lo(%2), %%g1\n" - "1:\tlduw [%0], %%g5\n\t" - "sub %%g5, %%g1, %%g7\n\t" - "cas [%0], %%g5, %%g7\n\t" - "cmp %%g5, %%g7\n\t" - "bne,pn %%icc, 1b\n\t" - " sub %%g7, %%g1, %%g7\n\t" - "cmp %%g7, 0\n\t" - "bl,pn %%icc, 3f\n\t" - " membar #StoreLoad | #StoreStore\n" - "2:\n\t" - ".subsection 2\n" - "3:\tmov %0, %%g5\n\t" - "save %%sp, -160, %%sp\n\t" - "mov %%g2, %%l2\n\t" - "mov %%g3, %%l3\n\t" - "call %1\n\t" - " mov %%g5, %%o0\n\t" - "mov %%l2, %%g2\n\t" - "ba,pt %%xcc, 2b\n\t" - " restore %%l3, %%g0, %%g3\n\t" - ".previous\n\t" - "! ending __up_write" - : : "r" (sem), "i" (rwsem_downgrade_wake), - "i" (RWSEM_WAITING_BIAS) - : "g1", "g5", "g7", "memory", "cc"); -} -EXPORT_SYMBOL(__downgrade_write); diff --git a/arch/sparc64/lib/strlen.S b/arch/sparc64/lib/strlen.S index 066ec1ed7d0dd..e9ba1920d818e 100644 --- a/arch/sparc64/lib/strlen.S +++ b/arch/sparc64/lib/strlen.S @@ -48,16 +48,16 @@ strlen: add %o0, 4, %o0 /* Check every byte. */ - srl %o5, 24, %g5 - andcc %g5, 0xff, %g0 + srl %o5, 24, %g7 + andcc %g7, 0xff, %g0 be,pn %icc, 1f add %o0, -4, %o4 - srl %o5, 16, %g5 - andcc %g5, 0xff, %g0 + srl %o5, 16, %g7 + andcc %g7, 0xff, %g0 be,pn %icc, 1f add %o4, 1, %o4 - srl %o5, 8, %g5 - andcc %g5, 0xff, %g0 + srl %o5, 8, %g7 + andcc %g7, 0xff, %g0 be,pn %icc, 1f add %o4, 1, %o4 andcc %o5, 0xff, %g0 diff --git a/arch/sparc64/lib/strlen_user.S b/arch/sparc64/lib/strlen_user.S index 4af69a0adfbcc..9ed54ba14fc63 100644 --- a/arch/sparc64/lib/strlen_user.S +++ b/arch/sparc64/lib/strlen_user.S @@ -54,16 +54,16 @@ __strnlen_user: ba,a,pt %xcc, 1f /* Check every byte. 
*/ -82: srl %o5, 24, %g5 - andcc %g5, 0xff, %g0 +82: srl %o5, 24, %g7 + andcc %g7, 0xff, %g0 be,pn %icc, 1f add %o0, -3, %o4 - srl %o5, 16, %g5 - andcc %g5, 0xff, %g0 + srl %o5, 16, %g7 + andcc %g7, 0xff, %g0 be,pn %icc, 1f add %o4, 1, %o4 - srl %o5, 8, %g5 - andcc %g5, 0xff, %g0 + srl %o5, 8, %g7 + andcc %g7, 0xff, %g0 be,pn %icc, 1f add %o4, 1, %o4 andcc %o5, 0xff, %g0 diff --git a/arch/sparc64/lib/strncpy_from_user.S b/arch/sparc64/lib/strncpy_from_user.S index 93d600a319763..09cbbaa0ebf43 100644 --- a/arch/sparc64/lib/strncpy_from_user.S +++ b/arch/sparc64/lib/strncpy_from_user.S @@ -34,15 +34,15 @@ .type __strncpy_from_user,#function __strncpy_from_user: /* %o0=dest, %o1=src, %o2=count */ - sethi %hi(0b), %o5 ! IEU0 Group - andcc %o1, 7, %g0 ! IEU1 + andcc %o1, 7, %g0 ! IEU1 Group bne,pn %icc, 30f ! CTI - ldx [%o5 + %lo(0b)], %o4 ! Load Group - add %o0, %o2, %g3 ! IEU0 + add %o0, %o2, %g3 ! IEU0 60: ldxa [%o1] %asi, %g1 ! Load Group brlez,pn %o2, 10f ! CTI - sllx %o4, 7, %o5 ! IEU0 Group - mov %o0, %o3 ! IEU1 + mov %o0, %o3 ! IEU0 +50: sethi %hi(0b), %o4 ! IEU0 Group + ldx [%o4 + %lo(0b)], %o4 ! Load + sllx %o4, 7, %o5 ! IEU1 Group 1: sub %g1, %o4, %g2 ! IEU0 Group stx %g1, [%o0] ! Store add %o0, 8, %o0 ! IEU1 @@ -55,34 +55,34 @@ __strncpy_from_user: 10: retl ! CTI Group mov %o2, %o0 ! IEU0 5: srlx %g2, 32, %g7 ! IEU0 Group - sethi %hi(0xff00), %g5 ! IEU1 + sethi %hi(0xff00), %o4 ! IEU1 andcc %g7, %o5, %g0 ! IEU1 Group be,pn %icc, 2f ! CTI - or %g5, %lo(0xff00), %g5 ! IEU0 + or %o4, %lo(0xff00), %o4 ! IEU0 srlx %g1, 48, %g7 ! IEU0 Group - andcc %g7, %g5, %g0 ! IEU1 Group + andcc %g7, %o4, %g0 ! IEU1 Group be,pn %icc, 50f ! CTI andcc %g7, 0xff, %g0 ! IEU1 Group be,pn %icc, 51f ! CTI srlx %g1, 32, %g7 ! IEU0 - andcc %g7, %g5, %g0 ! IEU1 Group + andcc %g7, %o4, %g0 ! IEU1 Group be,pn %icc, 52f ! CTI andcc %g7, 0xff, %g0 ! IEU1 Group be,pn %icc, 53f ! CTI 2: andcc %g2, %o5, %g0 ! IEU1 Group be,pn %icc, 2f ! CTI srl %g1, 16, %g7 ! IEU0 - andcc %g7, %g5, %g0 ! 
IEU1 Group + andcc %g7, %o4, %g0 ! IEU1 Group be,pn %icc, 54f ! CTI andcc %g7, 0xff, %g0 ! IEU1 Group be,pn %icc, 55f ! CTI - andcc %g1, %g5, %g0 ! IEU1 Group + andcc %g1, %o4, %g0 ! IEU1 Group be,pn %icc, 56f ! CTI andcc %g1, 0xff, %g0 ! IEU1 Group be,a,pn %icc, 57f ! CTI sub %o0, %o3, %o0 ! IEU0 2: cmp %o0, %g3 ! IEU1 Group - bl,a,pt %xcc, 1b ! CTI + bl,a,pt %xcc, 50b ! CTI 62: ldxa [%o1] %asi, %g1 ! Load retl ! CTI Group mov %o2, %o0 ! IEU0 diff --git a/arch/sparc64/lib/xor.S b/arch/sparc64/lib/xor.S index f748fd6bbc389..4cd5d2be1ae1f 100644 --- a/arch/sparc64/lib/xor.S +++ b/arch/sparc64/lib/xor.S @@ -248,7 +248,7 @@ xor_vis_4: .globl xor_vis_5 .type xor_vis_5,#function xor_vis_5: - mov %o5, %g5 + save %sp, -192, %sp rd %fprs, %o5 andcc %o5, FPRS_FEF|FPRS_DU, %g0 be,pt %icc, 0f @@ -256,61 +256,60 @@ xor_vis_5: jmpl %g1 + %lo(VISenter), %g7 add %g7, 8, %g7 0: wr %g0, FPRS_FEF, %fprs - mov %g5, %o5 rd %asi, %g1 wr %g0, ASI_BLK_P, %asi membar #LoadStore|#StoreLoad|#StoreStore - sub %o0, 64, %o0 - ldda [%o1] %asi, %f0 - ldda [%o2] %asi, %f16 + sub %i0, 64, %i0 + ldda [%i1] %asi, %f0 + ldda [%i2] %asi, %f16 -5: ldda [%o3] %asi, %f32 +5: ldda [%i3] %asi, %f32 fxor %f0, %f16, %f48 fxor %f2, %f18, %f50 - add %o1, 64, %o1 + add %i1, 64, %i1 fxor %f4, %f20, %f52 fxor %f6, %f22, %f54 - add %o2, 64, %o2 + add %i2, 64, %i2 fxor %f8, %f24, %f56 fxor %f10, %f26, %f58 fxor %f12, %f28, %f60 fxor %f14, %f30, %f62 - ldda [%o4] %asi, %f16 + ldda [%i4] %asi, %f16 fxor %f48, %f32, %f48 fxor %f50, %f34, %f50 fxor %f52, %f36, %f52 fxor %f54, %f38, %f54 - add %o3, 64, %o3 + add %i3, 64, %i3 fxor %f56, %f40, %f56 fxor %f58, %f42, %f58 fxor %f60, %f44, %f60 fxor %f62, %f46, %f62 - ldda [%o5] %asi, %f32 + ldda [%i5] %asi, %f32 fxor %f48, %f16, %f48 fxor %f50, %f18, %f50 - add %o4, 64, %o4 + add %i4, 64, %i4 fxor %f52, %f20, %f52 fxor %f54, %f22, %f54 - add %o5, 64, %o5 + add %i5, 64, %i5 fxor %f56, %f24, %f56 fxor %f58, %f26, %f58 fxor %f60, %f28, %f60 fxor %f62, %f30, %f62 - ldda [%o1] 
%asi, %f0 + ldda [%i1] %asi, %f0 fxor %f48, %f32, %f48 fxor %f50, %f34, %f50 fxor %f52, %f36, %f52 fxor %f54, %f38, %f54 fxor %f56, %f40, %f56 fxor %f58, %f42, %f58 - subcc %o0, 64, %o0 + subcc %i0, 64, %i0 fxor %f60, %f44, %f60 fxor %f62, %f46, %f62 - stda %f48, [%o1 - 64] %asi + stda %f48, [%i1 - 64] %asi bne,pt %xcc, 5b - ldda [%o2] %asi, %f16 + ldda [%i2] %asi, %f16 - ldda [%o3] %asi, %f32 + ldda [%i3] %asi, %f32 fxor %f0, %f16, %f48 fxor %f2, %f18, %f50 fxor %f4, %f20, %f52 @@ -319,7 +318,7 @@ xor_vis_5: fxor %f10, %f26, %f58 fxor %f12, %f28, %f60 fxor %f14, %f30, %f62 - ldda [%o4] %asi, %f16 + ldda [%i4] %asi, %f16 fxor %f48, %f32, %f48 fxor %f50, %f34, %f50 fxor %f52, %f36, %f52 @@ -328,7 +327,7 @@ xor_vis_5: fxor %f58, %f42, %f58 fxor %f60, %f44, %f60 fxor %f62, %f46, %f62 - ldda [%o5] %asi, %f32 + ldda [%i5] %asi, %f32 fxor %f48, %f16, %f48 fxor %f50, %f18, %f50 fxor %f52, %f20, %f52 @@ -346,9 +345,10 @@ xor_vis_5: fxor %f58, %f42, %f58 fxor %f60, %f44, %f60 fxor %f62, %f46, %f62 - stda %f48, [%o1] %asi + stda %f48, [%i1] %asi membar #Sync|#StoreStore|#StoreLoad wr %g1, %g0, %asi - retl - wr %g0, 0, %fprs + wr %g0, 0, %fprs + ret + restore .size xor_vis_5, .-xor_vis_5 diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c index 45edb9459bcdf..3ffee7b51aed5 100644 --- a/arch/sparc64/mm/fault.c +++ b/arch/sparc64/mm/fault.c @@ -144,7 +144,9 @@ static void unhandled_fault(unsigned long address, struct task_struct *tsk, "at virtual address %016lx\n", (unsigned long)address); } printk(KERN_ALERT "tsk->{mm,active_mm}->context = %016lx\n", - (tsk->mm ? tsk->mm->context : tsk->active_mm->context)); + (tsk->mm ? + CTX_HWBITS(tsk->mm->context) : + CTX_HWBITS(tsk->active_mm->context))); printk(KERN_ALERT "tsk->{mm,active_mm}->pgd = %016lx\n", (tsk->mm ? 
(unsigned long) tsk->mm->pgd : (unsigned long) tsk->active_mm->pgd)); diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c index ffa207795f1df..5a1f831b2de1b 100644 --- a/arch/sparc64/mm/hugetlbpage.c +++ b/arch/sparc64/mm/hugetlbpage.c @@ -20,6 +20,7 @@ #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/cacheflush.h> +#include <asm/mmu_context.h> static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { @@ -217,12 +218,50 @@ void unmap_hugepage_range(struct vm_area_struct *vma, flush_tlb_range(vma, start, end); } +static void context_reload(void *__data) +{ + struct mm_struct *mm = __data; + + if (mm == current->mm) + load_secondary_context(mm); +} + int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) { struct mm_struct *mm = current->mm; unsigned long addr; int ret = 0; + /* On UltraSPARC-III+ and later, configure the second half of + * the Data-TLB for huge pages. + */ + if (tlb_type == cheetah_plus) { + unsigned long ctx; + + spin_lock(&ctx_alloc_lock); + ctx = mm->context.sparc64_ctx_val; + ctx &= ~CTX_PGSZ_MASK; + ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT; + ctx |= CTX_PGSZ_HUGE << CTX_PGSZ1_SHIFT; + + if (ctx != mm->context.sparc64_ctx_val) { + /* When changing the page size fields, we + * must perform a context flush so that no + * stale entries match. This flush must + * occur with the original context register + * settings. + */ + do_flush_tlb_mm(mm); + + /* Reload the context register of all processors + * also executing in this address space. 
+ */ + mm->context.sparc64_ctx_val = ctx; + on_each_cpu(context_reload, mm, 0, 0); + } + spin_unlock(&ctx_alloc_lock); + } + BUG_ON(vma->vm_start & ~HPAGE_MASK); BUG_ON(vma->vm_end & ~HPAGE_MASK); diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index 0e62b62c7dd44..89022ccaa75bb 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -61,7 +61,7 @@ static unsigned long bootmap_base; /* get_new_mmu_context() uses "cache + 1". */ DEFINE_SPINLOCK(ctx_alloc_lock); unsigned long tlb_context_cache = CTX_FIRST_VERSION - 1; -#define CTX_BMAP_SLOTS (1UL << (CTX_VERSION_SHIFT - 6)) +#define CTX_BMAP_SLOTS (1UL << (CTX_NR_BITS - 6)) unsigned long mmu_context_bmap[CTX_BMAP_SLOTS]; /* References to special section boundaries */ @@ -85,40 +85,14 @@ void check_pgt_cache(void) preempt_disable(); if (pgtable_cache_size > PGT_CACHE_HIGH) { do { -#ifdef CONFIG_SMP if (pgd_quicklist) free_pgd_slow(get_pgd_fast()); -#endif if (pte_quicklist[0]) free_pte_slow(pte_alloc_one_fast(NULL, 0)); if (pte_quicklist[1]) free_pte_slow(pte_alloc_one_fast(NULL, 1 << (PAGE_SHIFT + 10))); } while (pgtable_cache_size > PGT_CACHE_LOW); } -#ifndef CONFIG_SMP - if (pgd_cache_size > PGT_CACHE_HIGH / 4) { - struct page *page, *page2; - for (page2 = NULL, page = (struct page *)pgd_quicklist; page;) { - if ((unsigned long)page->lru.prev == 3) { - if (page2) - page2->lru.next = page->lru.next; - else - pgd_quicklist = (void *) page->lru.next; - pgd_cache_size -= 2; - __free_page(page); - if (page2) - page = (struct page *)page2->lru.next; - else - page = (struct page *)pgd_quicklist; - if (pgd_cache_size <= PGT_CACHE_LOW / 4) - break; - continue; - } - page2 = page; - page = (struct page *)page->lru.next; - } - } -#endif preempt_enable(); } @@ -135,7 +109,7 @@ __inline__ void flush_dcache_page_impl(struct page *page) atomic_inc(&dcpage_flushes); #endif -#if (L1DCACHE_SIZE > PAGE_SIZE) +#ifdef DCACHE_ALIASING_POSSIBLE __flush_dcache_page(page_address(page), ((tlb_type == spitfire) && 
page_mapping(page) != NULL)); @@ -158,15 +132,15 @@ static __inline__ void set_dcache_dirty(struct page *page, int this_cpu) mask = (mask << 24) | (1UL << PG_dcache_dirty); __asm__ __volatile__("1:\n\t" "ldx [%2], %%g7\n\t" - "and %%g7, %1, %%g5\n\t" - "or %%g5, %0, %%g5\n\t" - "casx [%2], %%g7, %%g5\n\t" - "cmp %%g7, %%g5\n\t" + "and %%g7, %1, %%g1\n\t" + "or %%g1, %0, %%g1\n\t" + "casx [%2], %%g7, %%g1\n\t" + "cmp %%g7, %%g1\n\t" "bne,pn %%xcc, 1b\n\t" " membar #StoreLoad | #StoreStore" : /* no outputs */ : "r" (mask), "r" (non_cpu_bits), "r" (&page->flags) - : "g5", "g7"); + : "g1", "g7"); } static __inline__ void clear_dcache_dirty_cpu(struct page *page, unsigned long cpu) @@ -176,20 +150,20 @@ static __inline__ void clear_dcache_dirty_cpu(struct page *page, unsigned long c __asm__ __volatile__("! test_and_clear_dcache_dirty\n" "1:\n\t" "ldx [%2], %%g7\n\t" - "srlx %%g7, 24, %%g5\n\t" - "and %%g5, %3, %%g5\n\t" - "cmp %%g5, %0\n\t" + "srlx %%g7, 24, %%g1\n\t" + "and %%g1, %3, %%g1\n\t" + "cmp %%g1, %0\n\t" "bne,pn %%icc, 2f\n\t" - " andn %%g7, %1, %%g5\n\t" - "casx [%2], %%g7, %%g5\n\t" - "cmp %%g7, %%g5\n\t" + " andn %%g7, %1, %%g1\n\t" + "casx [%2], %%g7, %%g1\n\t" + "cmp %%g7, %%g1\n\t" "bne,pn %%xcc, 1b\n\t" " membar #StoreLoad | #StoreStore\n" "2:" : /* no outputs */ : "r" (cpu), "r" (mask), "r" (&page->flags), "i" (NR_CPUS - 1UL) - : "g5", "g7"); + : "g1", "g7"); } extern void __update_mmu_cache(unsigned long mmu_context_hw, unsigned long address, pte_t pte, int code); @@ -219,8 +193,9 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t p put_cpu(); } + if (get_thread_fault_code()) - __update_mmu_cache(vma->vm_mm->context & TAG_CONTEXT_BITS, + __update_mmu_cache(CTX_NRBITS(vma->vm_mm->context), address, pte, get_thread_fault_code()); } @@ -281,9 +256,6 @@ void show_mem(void) printk("%ld pages of RAM\n", num_physpages); printk("%d free pages\n", nr_free_pages()); printk("%d pages in page table cache\n",pgtable_cache_size); 
-#ifndef CONFIG_SMP - printk("%d entries in page dir cache\n",pgd_cache_size); -#endif } void mmu_info(struct seq_file *m) @@ -392,10 +364,10 @@ static void inherit_prom_mappings(void) n = n / sizeof(*trans); /* - * The obp translations are saved based on 8k pagesize, since obp can use - * a mixture of pagesizes. Misses to the 0xf0000000 - 0x100000000, ie obp - * range, are handled in entry.S and do not use the vpte scheme (see rant - * in inherit_locked_prom_mappings()). + * The obp translations are saved based on 8k pagesize, since obp can + * use a mixture of pagesizes. Misses to the 0xf0000000 - 0x100000000, + * ie obp range, are handled in entry.S and do not use the vpte scheme + * (see rant in inherit_locked_prom_mappings()). */ #define OBP_PMD_SIZE 2048 prompmd = __alloc_bootmem(OBP_PMD_SIZE, OBP_PMD_SIZE, bootmap_base); @@ -449,11 +421,15 @@ static void inherit_prom_mappings(void) prom_printf("Remapping the kernel... "); /* Spitfire Errata #32 workaround */ + /* NOTE: Using plain zero for the context value is + * correct here, we are not using the Linux trap + * tables yet so we should not use the special + * UltraSPARC-III+ page size encodings yet. + */ __asm__ __volatile__("stxa %0, [%1] %2\n\t" "flush %%g6" : /* No outputs */ - : "r" (0), - "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); + : "r" (0), "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); switch (tlb_type) { default: @@ -513,6 +489,11 @@ static void inherit_prom_mappings(void) tte_vaddr = (unsigned long) KERNBASE; /* Spitfire Errata #32 workaround */ + /* NOTE: Using plain zero for the context value is + * correct here, we are not using the Linux trap + * tables yet so we should not use the special + * UltraSPARC-III+ page size encodings yet. 
+ */ __asm__ __volatile__("stxa %0, [%1] %2\n\t" "flush %%g6" : /* No outputs */ @@ -531,6 +512,11 @@ static void inherit_prom_mappings(void) /* Spitfire Errata #32 workaround */ + /* NOTE: Using plain zero for the context value is + * correct here, we are not using the Linux trap + * tables yet so we should not use the special + * UltraSPARC-III+ page size encodings yet. + */ __asm__ __volatile__("stxa %0, [%1] %2\n\t" "flush %%g6" : /* No outputs */ @@ -617,6 +603,9 @@ static void __flush_nucleus_vptes(void) unsigned long tag; /* Spitfire Errata #32 workaround */ + /* NOTE: Always runs on spitfire, so no cheetah+ + * page size encodings. + */ __asm__ __volatile__("stxa %0, [%1] %2\n\t" "flush %%g6" : /* No outputs */ @@ -783,6 +772,9 @@ void inherit_locked_prom_mappings(int save_p) unsigned long data; /* Spitfire Errata #32 workaround */ + /* NOTE: Always runs on spitfire, so no cheetah+ + * page size encodings. + */ __asm__ __volatile__("stxa %0, [%1] %2\n\t" "flush %%g6" : /* No outputs */ @@ -794,6 +786,9 @@ void inherit_locked_prom_mappings(int save_p) unsigned long tag; /* Spitfire Errata #32 workaround */ + /* NOTE: Always runs on spitfire, so no + * cheetah+ page size encodings. + */ __asm__ __volatile__("stxa %0, [%1] %2\n\t" "flush %%g6" : /* No outputs */ @@ -821,6 +816,9 @@ void inherit_locked_prom_mappings(int save_p) unsigned long data; /* Spitfire Errata #32 workaround */ + /* NOTE: Always runs on spitfire, so no + * cheetah+ page size encodings. + */ __asm__ __volatile__("stxa %0, [%1] %2\n\t" "flush %%g6" : /* No outputs */ @@ -832,6 +830,9 @@ void inherit_locked_prom_mappings(int save_p) unsigned long tag; /* Spitfire Errata #32 workaround */ + /* NOTE: Always runs on spitfire, so no + * cheetah+ page size encodings. 
+ */ __asm__ __volatile__("stxa %0, [%1] %2\n\t" "flush %%g6" : /* No outputs */ @@ -947,6 +948,7 @@ void prom_reload_locked(void) } } +#ifdef DCACHE_ALIASING_POSSIBLE void __flush_dcache_range(unsigned long start, unsigned long end) { unsigned long va; @@ -970,6 +972,7 @@ void __flush_dcache_range(unsigned long start, unsigned long end) "i" (ASI_DCACHE_INVALIDATE)); } } +#endif /* DCACHE_ALIASING_POSSIBLE */ /* If not locked, zap it. */ void __flush_tlb_all(void) @@ -985,6 +988,9 @@ void __flush_tlb_all(void) if (tlb_type == spitfire) { for (i = 0; i < 64; i++) { /* Spitfire Errata #32 workaround */ + /* NOTE: Always runs on spitfire, so no + * cheetah+ page size encodings. + */ __asm__ __volatile__("stxa %0, [%1] %2\n\t" "flush %%g6" : /* No outputs */ @@ -1000,6 +1006,9 @@ void __flush_tlb_all(void) } /* Spitfire Errata #32 workaround */ + /* NOTE: Always runs on spitfire, so no + * cheetah+ page size encodings. + */ __asm__ __volatile__("stxa %0, [%1] %2\n\t" "flush %%g6" : /* No outputs */ @@ -1033,11 +1042,14 @@ void __flush_tlb_all(void) void get_new_mmu_context(struct mm_struct *mm) { unsigned long ctx, new_ctx; + unsigned long orig_pgsz_bits; + spin_lock(&ctx_alloc_lock); - ctx = CTX_HWBITS(tlb_context_cache + 1); - new_ctx = find_next_zero_bit(mmu_context_bmap, 1UL << CTX_VERSION_SHIFT, ctx); - if (new_ctx >= (1UL << CTX_VERSION_SHIFT)) { + orig_pgsz_bits = (mm->context.sparc64_ctx_val & CTX_PGSZ_MASK); + ctx = (tlb_context_cache + 1) & CTX_NR_MASK; + new_ctx = find_next_zero_bit(mmu_context_bmap, 1 << CTX_NR_BITS, ctx); + if (new_ctx >= (1 << CTX_NR_BITS)) { new_ctx = find_next_zero_bit(mmu_context_bmap, ctx, 1); if (new_ctx >= ctx) { int i; @@ -1066,9 +1078,8 @@ void get_new_mmu_context(struct mm_struct *mm) new_ctx |= (tlb_context_cache & CTX_VERSION_MASK); out: tlb_context_cache = new_ctx; + mm->context.sparc64_ctx_val = new_ctx | orig_pgsz_bits; spin_unlock(&ctx_alloc_lock); - - mm->context = new_ctx; } #ifndef CONFIG_SMP @@ -1087,7 +1098,7 @@ struct 
pgtable_cache_struct pgt_quicklists; * using the later address range, accesses with the first address * range will see the newly initialized data rather than the garbage. */ -#if (L1DCACHE_SIZE > PAGE_SIZE) /* is there D$ aliasing problem */ +#ifdef DCACHE_ALIASING_POSSIBLE #define DC_ALIAS_SHIFT 1 #else #define DC_ALIAS_SHIFT 0 @@ -1111,7 +1122,7 @@ pte_t *__pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) unsigned long paddr; pte_t *pte; -#if (L1DCACHE_SIZE > PAGE_SIZE) /* is there D$ aliasing problem */ +#ifdef DCACHE_ALIASING_POSSIBLE set_page_count(page, 1); ClearPageCompound(page); @@ -1129,7 +1140,7 @@ pte_t *__pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) to_free = (unsigned long *) paddr; } -#if (L1DCACHE_SIZE > PAGE_SIZE) /* is there D$ aliasing problem */ +#ifdef DCACHE_ALIASING_POSSIBLE /* Now free the other one up, adjust cache size. */ preempt_disable(); *to_free = (unsigned long) pte_quicklist[color ^ 0x1]; @@ -1702,22 +1713,6 @@ void __init mem_init(void) initpages = (((unsigned long) __init_end) - ((unsigned long) __init_begin)); initpages = PAGE_ALIGN(initpages) >> PAGE_SHIFT; -#ifndef CONFIG_SMP - { - /* Put empty_pg_dir on pgd_quicklist */ - extern pgd_t empty_pg_dir[1024]; - unsigned long addr = (unsigned long)empty_pg_dir; - unsigned long alias_base = kern_base + PAGE_OFFSET - - (long)(KERNBASE); - - memset(empty_pg_dir, 0, sizeof(empty_pg_dir)); - addr += alias_base; - free_pgd_fast((pgd_t *)addr); - num_physpages++; - totalram_pages++; - } -#endif - printk("Memory: %uk available (%ldk kernel code, %ldk data, %ldk init) [%016lx,%016lx]\n", nr_free_pages() << (PAGE_SHIFT-10), codepages << (PAGE_SHIFT-10), diff --git a/arch/sparc64/mm/tlb.c b/arch/sparc64/mm/tlb.c index 6255d6ef48eb0..90ca99d0b89cd 100644 --- a/arch/sparc64/mm/tlb.c +++ b/arch/sparc64/mm/tlb.c @@ -26,15 +26,13 @@ void flush_tlb_pending(void) struct mmu_gather *mp = &__get_cpu_var(mmu_gathers); if (mp->tlb_nr) { - unsigned long context = 
mp->mm->context; - - if (CTX_VALID(context)) { + if (CTX_VALID(mp->mm->context)) { #ifdef CONFIG_SMP smp_flush_tlb_pending(mp->mm, mp->tlb_nr, &mp->vaddrs[0]); #else - __flush_tlb_pending(CTX_HWBITS(context), mp->tlb_nr, - &mp->vaddrs[0]); + __flush_tlb_pending(CTX_HWBITS(mp->mm->context), + mp->tlb_nr, &mp->vaddrs[0]); #endif } mp->tlb_nr = 0; @@ -73,6 +71,7 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t } no_cache_flush: + if (mp->tlb_frozen) return; @@ -101,11 +100,10 @@ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long if (mp->tlb_frozen) return; - /* Nobody should call us with start below VM hole and end above. - * See if it is really true. - */ - BUG_ON(s > e); + /* If start is greater than end, that is a real problem. */ + BUG_ON(start > end); + /* However, straddling the VA space hole is quite normal. */ s &= PMD_MASK; e = (e + PMD_SIZE - 1) & PMD_MASK; @@ -123,6 +121,22 @@ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long start = vpte_base + (s >> (PAGE_SHIFT - 3)); end = vpte_base + (e >> (PAGE_SHIFT - 3)); + + /* If the request straddles the VA space hole, we + * need to swap start and end. The reason this + * occurs is that "vpte_base" is the center of + * the linear page table mapping area. Thus, + * high addresses with the sign bit set map to + * addresses below vpte_base and non-sign bit + * addresses map to addresses above vpte_base. 
+ */ + if (end < start) { + unsigned long tmp = start; + + start = end; + end = tmp; + } + while (start < end) { mp->vaddrs[nr] = start; mp->tlb_nr = ++nr; @@ -135,10 +149,3 @@ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long if (nr) flush_tlb_pending(); } - -unsigned long __ptrs_per_pmd(void) -{ - if (test_thread_flag(TIF_32BIT)) - return (1UL << (32 - (PAGE_SHIFT-3) - PAGE_SHIFT)); - return REAL_PTRS_PER_PMD; -} diff --git a/arch/sparc64/mm/ultra.S b/arch/sparc64/mm/ultra.S index af8205edfbd0f..7a0934321010a 100644 --- a/arch/sparc64/mm/ultra.S +++ b/arch/sparc64/mm/ultra.S @@ -13,6 +13,7 @@ #include <asm/pil.h> #include <asm/head.h> #include <asm/thread_info.h> +#include <asm/cacheflush.h> /* Basically, most of the Spitfire vs. Cheetah madness * has to do with the fact that Cheetah does not support @@ -49,9 +50,9 @@ __flush_tlb_mm: /* %o0=(ctx & TAG_CONTEXT_BITS), %o1=SECONDARY_CONTEXT */ .globl __flush_tlb_pending __flush_tlb_pending: /* %o0 = context, %o1 = nr, %o2 = vaddrs[] */ - rdpr %pstate, %g5 + rdpr %pstate, %g7 sllx %o1, 3, %o1 - andn %g5, PSTATE_IE, %g2 + andn %g7, PSTATE_IE, %g2 wrpr %g2, %pstate mov SECONDARY_CONTEXT, %o4 ldxa [%o4] ASI_DMMU, %g2 @@ -70,7 +71,7 @@ __flush_tlb_pending: stxa %g2, [%o4] ASI_DMMU flush %g6 retl - wrpr %g5, 0x0, %pstate + wrpr %g7, 0x0, %pstate .align 32 .globl __flush_tlb_kernel_range @@ -114,64 +115,27 @@ __spitfire_flush_tlb_mm_slow: .align 32 .globl __flush_icache_page __flush_icache_page: /* %o0 = phys_page */ - sethi %hi(1 << 13), %o2 ! IC_set bit - mov 1, %g1 - srlx %o0, 5, %o0 - clr %o1 ! IC_addr - sllx %g1, 36, %g1 - ldda [%o1] ASI_IC_TAG, %o4 - sub %g1, 1, %g2 - or %o0, %g1, %o0 ! VALID+phys-addr comparitor - - sllx %g2, 1, %g2 - andn %g2, ITAG_MASK, %g2 ! 
IC_tag mask - nop - nop - nop - nop - nop - nop - -1: addx %g0, %g0, %g0 - ldda [%o1 + %o2] ASI_IC_TAG, %g4 - addx %g0, %g0, %g0 - and %o5, %g2, %g3 - cmp %g3, %o0 - add %o1, 0x20, %o1 - ldda [%o1] ASI_IC_TAG, %o4 - be,pn %xcc, iflush1 - -2: nop - and %g5, %g2, %g5 - cmp %g5, %o0 - be,pn %xcc, iflush2 -3: cmp %o1, %o2 - bne,pt %xcc, 1b - addx %g0, %g0, %g0 - nop - + membar #StoreStore + srlx %o0, PAGE_SHIFT, %o0 + sethi %uhi(PAGE_OFFSET), %g1 + sllx %o0, PAGE_SHIFT, %o0 + sethi %hi(PAGE_SIZE), %g2 + sllx %g1, 32, %g1 + add %o0, %g1, %o0 +1: subcc %g2, 32, %g2 + bne,pt %icc, 1b + flush %o0 + %g2 retl - ldx [%g6 + TI_TASK], %g4 + nop -iflush1:sub %o1, 0x20, %g3 - stxa %g0, [%g3] ASI_IC_TAG - flush %g6 - ba,a,pt %xcc, 2b -iflush2:sub %o1, 0x20, %g3 - stxa %g0, [%o1 + %o2] ASI_IC_TAG - flush %g6 - ba,a,pt %xcc, 3b +#ifdef DCACHE_ALIASING_POSSIBLE -#if (PAGE_SHIFT == 13) -#define DTAG_MASK 0x3 -#elif (PAGE_SHIFT == 16) -#define DTAG_MASK 0x1f -#elif (PAGE_SHIFT == 19) -#define DTAG_MASK 0xff -#elif (PAGE_SHIFT == 22) -#define DTAG_MASK 0x3ff +#if (PAGE_SHIFT != 13) +#error only page shift of 13 is supported by dcache flush #endif +#define DTAG_MASK 0x3 + .align 64 .globl __flush_dcache_page __flush_dcache_page: /* %o0=kaddr, %o1=flush_icache */ @@ -228,6 +192,7 @@ dflush4:stxa %g0, [%o4] ASI_DCACHE_TAG membar #Sync ba,pt %xcc, 2b nop +#endif /* DCACHE_ALIASING_POSSIBLE */ .align 32 __prefill_dtlb: @@ -258,10 +223,18 @@ __update_mmu_cache: /* %o0=hw_context, %o1=address, %o2=pte, %o3=fault_code */ or %o5, %o0, %o5 ba,a,pt %xcc, __prefill_itlb - /* Cheetah specific versions, patched at boot time. */ + /* Cheetah specific versions, patched at boot time. + * + * These writes of the PRIMARY_CONTEXT register in this file are + * safe even on Cheetah+ and later wrt. the page size fields. + * The nucleus page size fields do not matter because we make + * no data references, and these instructions execute out of a + * locked I-TLB entry sitting in the fully associative I-TLB. 
+ * This sequence should also never trap. + */ __cheetah_flush_tlb_mm: /* 15 insns */ - rdpr %pstate, %g5 - andn %g5, PSTATE_IE, %g2 + rdpr %pstate, %g7 + andn %g7, PSTATE_IE, %g2 wrpr %g2, 0x0, %pstate wrpr %g0, 1, %tl mov PRIMARY_CONTEXT, %o2 @@ -274,13 +247,13 @@ __cheetah_flush_tlb_mm: /* 15 insns */ flush %g6 wrpr %g0, 0, %tl retl - wrpr %g5, 0x0, %pstate + wrpr %g7, 0x0, %pstate __cheetah_flush_tlb_pending: /* 22 insns */ /* %o0 = context, %o1 = nr, %o2 = vaddrs[] */ - rdpr %pstate, %g5 + rdpr %pstate, %g7 sllx %o1, 3, %o1 - andn %g5, PSTATE_IE, %g2 + andn %g7, PSTATE_IE, %g2 wrpr %g2, 0x0, %pstate wrpr %g0, 1, %tl mov PRIMARY_CONTEXT, %o4 @@ -299,8 +272,9 @@ __cheetah_flush_tlb_pending: /* 22 insns */ flush %g6 wrpr %g0, 0, %tl retl - wrpr %g5, 0x0, %pstate + wrpr %g7, 0x0, %pstate +#ifdef DCACHE_ALIASING_POSSIBLE flush_dcpage_cheetah: /* 11 insns */ sethi %uhi(PAGE_OFFSET), %g1 sllx %g1, 32, %g1 @@ -313,6 +287,7 @@ flush_dcpage_cheetah: /* 11 insns */ nop retl /* I-cache flush never needed on Cheetah, see callers. 
*/ nop +#endif /* DCACHE_ALIASING_POSSIBLE */ cheetah_patch_one: 1: lduw [%o1], %g1 @@ -343,12 +318,14 @@ cheetah_patch_cachetlbops: call cheetah_patch_one mov 22, %o2 +#ifdef DCACHE_ALIASING_POSSIBLE sethi %hi(__flush_dcache_page), %o0 or %o0, %lo(__flush_dcache_page), %o0 sethi %hi(flush_dcpage_cheetah), %o1 or %o1, %lo(flush_dcpage_cheetah), %o1 call cheetah_patch_one mov 11, %o2 +#endif /* DCACHE_ALIASING_POSSIBLE */ ret restore @@ -464,6 +441,7 @@ xcall_report_regs: b rtrap_xcall ldx [%sp + PTREGS_OFF + PT_V9_TSTATE], %l1 +#ifdef DCACHE_ALIASING_POSSIBLE .align 32 .globl xcall_flush_dcache_page_cheetah xcall_flush_dcache_page_cheetah: /* %g1 == physical page address */ @@ -475,12 +453,13 @@ xcall_flush_dcache_page_cheetah: /* %g1 == physical page address */ nop retry nop +#endif /* DCACHE_ALIASING_POSSIBLE */ .globl xcall_flush_dcache_page_spitfire xcall_flush_dcache_page_spitfire: /* %g1 == physical page address %g7 == kernel page virtual address %g5 == (page->mapping != NULL) */ -#if (L1DCACHE_SIZE > PAGE_SIZE) +#ifdef DCACHE_ALIASING_POSSIBLE srlx %g1, (13 - 2), %g1 ! Form tag comparitor sethi %hi(L1DCACHE_SIZE), %g3 ! D$ size == 16K sub %g3, (1 << 5), %g3 ! D$ linesize == 32 @@ -499,7 +478,7 @@ xcall_flush_dcache_page_spitfire: /* %g1 == physical page address sub %g3, (1 << 5), %g3 brz,pn %g5, 2f -#endif /* L1DCACHE_SIZE > PAGE_SIZE */ +#endif /* DCACHE_ALIASING_POSSIBLE */ sethi %hi(PAGE_SIZE), %g3 1: flush %g7 diff --git a/arch/sparc64/prom/map.S b/arch/sparc64/prom/map.S index 509f7b4abef1e..21b3f9c99ea77 100644 --- a/arch/sparc64/prom/map.S +++ b/arch/sparc64/prom/map.S @@ -32,6 +32,7 @@ prom_remap: /* %o0 = physpage, %o1 = virtpage, %o2 = mmu_ihandle */ ldx [%g2 + 0x08], %l0 ! prom_cif_handler mov %g6, %i3 mov %g4, %i4 + mov %g5, %i5 flushw sethi %hi(prom_remap - call_method), %g7 @@ -62,6 +63,7 @@ prom_remap: /* %o0 = physpage, %o1 = virtpage, %o2 = mmu_ihandle */ /* Restore hard-coded globals. */ mov %i3, %g6 mov %i4, %g4 + mov %i5, %g5 /* Wheee.... 
we are done. */ ret diff --git a/arch/sparc64/prom/p1275.c b/arch/sparc64/prom/p1275.c index 9eab4421e1e4c..59fe38bba39e8 100644 --- a/arch/sparc64/prom/p1275.c +++ b/arch/sparc64/prom/p1275.c @@ -30,6 +30,16 @@ extern void prom_world(int); extern void prom_cif_interface(void); extern void prom_cif_callback(void); +static inline unsigned long spitfire_get_primary_context(void) +{ + unsigned long ctx; + + __asm__ __volatile__("ldxa [%1] %2, %0" + : "=r" (ctx) + : "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); + return ctx; +} + /* * This provides SMP safety on the p1275buf. prom_callback() drops this lock * to allow recursuve acquisition. @@ -43,14 +53,9 @@ long p1275_cmd (char *service, long fmt, ...) int nargs, nrets, i; va_list list; long attrs, x; - long ctx = 0; p = p1275buf.prom_buffer; - ctx = spitfire_get_primary_context (); - if (ctx) { - flushw_user (); - spitfire_set_primary_context (0); - } + BUG_ON((spitfire_get_primary_context() & CTX_NR_MASK) != 0); spin_lock_irqsave(&prom_entry_lock, flags); @@ -146,9 +151,6 @@ long p1275_cmd (char *service, long fmt, ...) 
spin_unlock_irqrestore(&prom_entry_lock, flags); - if (ctx) - spitfire_set_primary_context (ctx); - return x; } diff --git a/include/asm-parisc/unaligned.h b/include/asm-parisc/unaligned.h index 0896a9f66529d..53c905838d933 100644 --- a/include/asm-parisc/unaligned.h +++ b/include/asm-parisc/unaligned.h @@ -1,7 +1,7 @@ #ifndef _ASM_PARISC_UNALIGNED_H_ #define _ASM_PARISC_UNALIGNED_H_ -#include <asm-parisc/unaligned.h> +#include <asm-generic/unaligned.h> #ifdef __KERNEL__ struct pt_regs; diff --git a/include/asm-sparc/pgtable.h b/include/asm-sparc/pgtable.h index 3d2418c28ff58..373a6c327590d 100644 --- a/include/asm-sparc/pgtable.h +++ b/include/asm-sparc/pgtable.h @@ -150,6 +150,7 @@ BTFIXUPDEF_CALL_CONST(unsigned long, pgd_page, pgd_t) BTFIXUPDEF_SETHI(none_mask) BTFIXUPDEF_CALL_CONST(int, pte_present, pte_t) BTFIXUPDEF_CALL(void, pte_clear, pte_t *) +BTFIXUPDEF_CALL(int, pte_read, pte_t) extern __inline__ int pte_none(pte_t pte) { @@ -158,6 +159,7 @@ extern __inline__ int pte_none(pte_t pte) #define pte_present(pte) BTFIXUP_CALL(pte_present)(pte) #define pte_clear(mm,addr,pte) BTFIXUP_CALL(pte_clear)(pte) +#define pte_read(pte) BTFIXUP_CALL(pte_read)(pte) BTFIXUPDEF_CALL_CONST(int, pmd_bad, pmd_t) BTFIXUPDEF_CALL_CONST(int, pmd_present, pmd_t) @@ -186,31 +188,10 @@ BTFIXUPDEF_CALL(void, pgd_clear, pgd_t *) * The following only work if pte_present() is true. * Undefined behaviour if not.. 
*/ -BTFIXUPDEF_HALF(pte_readi) BTFIXUPDEF_HALF(pte_writei) BTFIXUPDEF_HALF(pte_dirtyi) BTFIXUPDEF_HALF(pte_youngi) -extern int pte_read(pte_t pte) __attribute_const__; -extern __inline__ int pte_read(pte_t pte) -{ - switch (sparc_cpu_model){ - case sun4: - case sun4c: - return pte_val(pte) & BTFIXUP_HALF(pte_readi); - case sun4d: - case sun4e: - case sun4m: - return !(pte_val(pte) & BTFIXUP_HALF(pte_readi)); - /* pacify gcc warnings */ - case sun4u: - case sun_unknown: - case ap1000: - default: - return 0; - } -} - extern int pte_write(pte_t pte) __attribute_const__; extern __inline__ int pte_write(pte_t pte) { diff --git a/include/asm-sparc64/cacheflush.h b/include/asm-sparc64/cacheflush.h index f1f8661cf83a8..86f02937ff1b7 100644 --- a/include/asm-sparc64/cacheflush.h +++ b/include/asm-sparc64/cacheflush.h @@ -2,6 +2,17 @@ #define _SPARC64_CACHEFLUSH_H #include <linux/config.h> +#include <asm/page.h> + +/* Flushing for D-cache alias handling is only needed if + * the page size is smaller than 16K. + */ +#if PAGE_SHIFT < 14 +#define DCACHE_ALIASING_POSSIBLE +#endif + +#ifndef __ASSEMBLY__ + #include <linux/mm.h> /* Cache flush operations. */ @@ -20,9 +31,9 @@ * module load, so we need this. 
*/ extern void flush_icache_range(unsigned long start, unsigned long end); +extern void __flush_icache_page(unsigned long); extern void __flush_dcache_page(void *addr, int flush_icache); -extern void __flush_icache_page(unsigned long); extern void flush_dcache_page_impl(struct page *page); #ifdef CONFIG_SMP extern void smp_flush_dcache_page_impl(struct page *page, int cpu); @@ -33,6 +44,7 @@ extern void flush_dcache_page_all(struct mm_struct *mm, struct page *page); #endif extern void __flush_dcache_range(unsigned long start, unsigned long end); +extern void flush_dcache_page(struct page *page); #define flush_icache_page(vma, pg) do { } while(0) #define flush_icache_user_range(vma,pg,adr,len) do { } while (0) @@ -49,11 +61,12 @@ extern void __flush_dcache_range(unsigned long start, unsigned long end); memcpy(dst, src, len); \ } while (0) -extern void flush_dcache_page(struct page *page); #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_cache_vmap(start, end) do { } while (0) #define flush_cache_vunmap(start, end) do { } while (0) +#endif /* !__ASSEMBLY__ */ + #endif /* _SPARC64_CACHEFLUSH_H */ diff --git a/include/asm-sparc64/checksum.h b/include/asm-sparc64/checksum.h index 91136a643c27f..dc8bed246fc98 100644 --- a/include/asm-sparc64/checksum.h +++ b/include/asm-sparc64/checksum.h @@ -38,47 +38,44 @@ extern unsigned int csum_partial(const unsigned char * buff, int len, unsigned i * here even more important to align src and dst on a 32-bit (or even * better 64-bit) boundary */ -extern unsigned int csum_partial_copy_sparc64(const unsigned char *src, unsigned char *dst, +extern unsigned int csum_partial_copy_nocheck(const unsigned char *src, + unsigned char *dst, int len, unsigned int sum); - -static inline unsigned int -csum_partial_copy_nocheck (const unsigned char *src, unsigned char *dst, int len, - unsigned int sum) -{ - int ret; - unsigned char cur_ds = get_thread_current_ds(); - 
__asm__ __volatile__ ("wr %%g0, %0, %%asi" : : "i" (ASI_P)); - ret = csum_partial_copy_sparc64(src, dst, len, sum); - __asm__ __volatile__ ("wr %%g0, %0, %%asi" : : "r" (cur_ds)); - return ret; -} -static inline unsigned int -csum_partial_copy_from_user(const unsigned char __user *src, unsigned char *dst, int len, +extern long __csum_partial_copy_from_user(const unsigned char __user *src, + unsigned char *dst, int len, + unsigned int sum); + +static inline unsigned int +csum_partial_copy_from_user(const unsigned char __user *src, + unsigned char *dst, int len, unsigned int sum, int *err) { - __asm__ __volatile__ ("stx %0, [%%sp + 0x7ff + 128]" - : : "r" (err)); - return csum_partial_copy_sparc64((__force const char *) src, - dst, len, sum); + long ret = __csum_partial_copy_from_user(src, dst, len, sum); + if (ret < 0) + *err = -EFAULT; + return (unsigned int) ret; } /* * Copy and checksum to user */ #define HAVE_CSUM_COPY_USER -extern unsigned int csum_partial_copy_user_sparc64(const unsigned char *src, unsigned char __user *dst, - int len, unsigned int sum); +extern long __csum_partial_copy_to_user(const unsigned char *src, + unsigned char __user *dst, int len, + unsigned int sum); -static inline unsigned int -csum_and_copy_to_user(const unsigned char *src, unsigned char __user *dst, int len, +static inline unsigned int +csum_and_copy_to_user(const unsigned char *src, + unsigned char __user *dst, int len, unsigned int sum, int *err) { - __asm__ __volatile__ ("stx %0, [%%sp + 0x7ff + 128]" - : : "r" (err)); - return csum_partial_copy_user_sparc64(src, dst, len, sum); + long ret = __csum_partial_copy_to_user(src, dst, len, sum); + if (ret < 0) + *err = -EFAULT; + return (unsigned int) ret; } - + /* ihl is always 5 or greater, almost always is 5, and iph is word aligned * the majority of the time. 
*/ diff --git a/include/asm-sparc64/cpudata.h b/include/asm-sparc64/cpudata.h index d7625ffc0b85a..cc7198aaac505 100644 --- a/include/asm-sparc64/cpudata.h +++ b/include/asm-sparc64/cpudata.h @@ -19,12 +19,13 @@ typedef struct { /* Dcache line 2 */ unsigned int pgcache_size; - unsigned int pgdcache_size; + unsigned int __pad1; unsigned long *pte_cache[2]; unsigned long *pgd_cache; } cpuinfo_sparc; DECLARE_PER_CPU(cpuinfo_sparc, __cpu_data); -#define cpu_data(__cpu) per_cpu(__cpu_data, (__cpu)) +#define cpu_data(__cpu) per_cpu(__cpu_data, (__cpu)) +#define local_cpu_data() __get_cpu_var(__cpu_data) #endif /* _SPARC64_CPUDATA_H */ diff --git a/include/asm-sparc64/ide.h b/include/asm-sparc64/ide.h index 6b327402277fd..4c1098474c73f 100644 --- a/include/asm-sparc64/ide.h +++ b/include/asm-sparc64/ide.h @@ -13,8 +13,8 @@ #include <linux/config.h> #include <asm/pgalloc.h> #include <asm/io.h> -#include <asm/page.h> #include <asm/spitfire.h> +#include <asm/cacheflush.h> #ifndef MAX_HWIFS # ifdef CONFIG_BLK_DEV_IDEPCI @@ -51,7 +51,7 @@ static inline unsigned int inw_be(void __iomem *addr) static inline void __ide_insw(void __iomem *port, void *dst, u32 count) { -#if (L1DCACHE_SIZE > PAGE_SIZE) /* is there D$ aliasing problem */ +#ifdef DCACHE_ALIASING_POSSIBLE unsigned long end = (unsigned long)dst + (count << 1); #endif u16 *ps = dst; @@ -74,7 +74,7 @@ static inline void __ide_insw(void __iomem *port, void *dst, u32 count) if(count) *ps++ = inw_be(port); -#if (L1DCACHE_SIZE > PAGE_SIZE) /* is there D$ aliasing problem */ +#ifdef DCACHE_ALIASING_POSSIBLE __flush_dcache_range((unsigned long)dst, end); #endif } @@ -88,7 +88,7 @@ static inline void outw_be(unsigned short w, void __iomem *addr) static inline void __ide_outsw(void __iomem *port, void *src, u32 count) { -#if (L1DCACHE_SIZE > PAGE_SIZE) /* is there D$ aliasing problem */ +#ifdef DCACHE_ALIASING_POSSIBLE unsigned long end = (unsigned long)src + (count << 1); #endif const u16 *ps = src; @@ -111,7 +111,7 @@ static 
inline void __ide_outsw(void __iomem *port, void *src, u32 count) if(count) outw_be(*ps, port); -#if (L1DCACHE_SIZE > PAGE_SIZE) /* is there D$ aliasing problem */ +#ifdef DCACHE_ALIASING_POSSIBLE __flush_dcache_range((unsigned long)src, end); #endif } diff --git a/include/asm-sparc64/mmu.h b/include/asm-sparc64/mmu.h index ccd36d26615a7..8627eed6e83df 100644 --- a/include/asm-sparc64/mmu.h +++ b/include/asm-sparc64/mmu.h @@ -1,7 +1,99 @@ #ifndef __MMU_H #define __MMU_H -/* Default "unsigned long" context */ -typedef unsigned long mm_context_t; +#include <linux/config.h> +#include <asm/page.h> +#include <asm/const.h> +/* + * For the 8k pagesize kernel, use only 10 hw context bits to optimize some + * shifts in the fast tlbmiss handlers, instead of all 13 bits (specifically + * for vpte offset calculation). For other pagesizes, this optimization in + * the tlbhandlers can not be done; but still, all 13 bits can not be used + * because the tlb handlers use "andcc" instruction which sign extends 13 + * bit arguments. + */ +#if PAGE_SHIFT == 13 +#define CTX_NR_BITS 10 +#else +#define CTX_NR_BITS 12 #endif + +#define TAG_CONTEXT_BITS ((_AC(1,UL) << CTX_NR_BITS) - _AC(1,UL)) + +/* UltraSPARC-III+ and later have a feature whereby you can + * select what page size the various Data-TLB instances in the + * chip use. In order to gracefully support this, we put the version + * field in a spot outside of the areas of the context register + * where this parameter is specified. 
+ */ +#define CTX_VERSION_SHIFT 22 +#define CTX_VERSION_MASK ((~0UL) << CTX_VERSION_SHIFT) + +#define CTX_PGSZ_8KB _AC(0x0,UL) +#define CTX_PGSZ_64KB _AC(0x1,UL) +#define CTX_PGSZ_512KB _AC(0x2,UL) +#define CTX_PGSZ_4MB _AC(0x3,UL) +#define CTX_PGSZ_BITS _AC(0x7,UL) +#define CTX_PGSZ0_NUC_SHIFT 61 +#define CTX_PGSZ1_NUC_SHIFT 58 +#define CTX_PGSZ0_SHIFT 16 +#define CTX_PGSZ1_SHIFT 19 +#define CTX_PGSZ_MASK ((CTX_PGSZ_BITS << CTX_PGSZ0_SHIFT) | \ + (CTX_PGSZ_BITS << CTX_PGSZ1_SHIFT)) + +#if defined(CONFIG_SPARC64_PAGE_SIZE_8KB) +#define CTX_PGSZ_BASE CTX_PGSZ_8KB +#elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB) +#define CTX_PGSZ_BASE CTX_PGSZ_64KB +#elif defined(CONFIG_SPARC64_PAGE_SIZE_512KB) +#define CTX_PGSZ_BASE CTX_PGSZ_512KB +#elif defined(CONFIG_SPARC64_PAGE_SIZE_4MB) +#define CTX_PGSZ_BASE CTX_PGSZ_4MB +#else +#error No page size specified in kernel configuration +#endif + +#if defined(CONFIG_HUGETLB_PAGE_SIZE_4MB) +#define CTX_PGSZ_HUGE CTX_PGSZ_4MB +#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) +#define CTX_PGSZ_HUGE CTX_PGSZ_512KB +#elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K) +#define CTX_PGSZ_HUGE CTX_PGSZ_64KB +#endif + +#define CTX_PGSZ_KERN CTX_PGSZ_4MB + +/* Thus, when running on UltraSPARC-III+ and later, we use the following + * PRIMARY_CONTEXT register values for the kernel context. + */ +#define CTX_CHEETAH_PLUS_NUC \ + ((CTX_PGSZ_KERN << CTX_PGSZ0_NUC_SHIFT) | \ + (CTX_PGSZ_BASE << CTX_PGSZ1_NUC_SHIFT)) + +#define CTX_CHEETAH_PLUS_CTX0 \ + ((CTX_PGSZ_KERN << CTX_PGSZ0_SHIFT) | \ + (CTX_PGSZ_BASE << CTX_PGSZ1_SHIFT)) + +/* If you want "the TLB context number" use CTX_NR_MASK. If you + * want "the bits I program into the context registers" use + * CTX_HW_MASK. 
+ */ +#define CTX_NR_MASK TAG_CONTEXT_BITS +#define CTX_HW_MASK (CTX_NR_MASK | CTX_PGSZ_MASK) + +#define CTX_FIRST_VERSION ((_AC(1,UL) << CTX_VERSION_SHIFT) + _AC(1,UL)) +#define CTX_VALID(__ctx) \ + (!(((__ctx.sparc64_ctx_val) ^ tlb_context_cache) & CTX_VERSION_MASK)) +#define CTX_HWBITS(__ctx) ((__ctx.sparc64_ctx_val) & CTX_HW_MASK) +#define CTX_NRBITS(__ctx) ((__ctx.sparc64_ctx_val) & CTX_NR_MASK) + +#ifndef __ASSEMBLY__ + +typedef struct { + unsigned long sparc64_ctx_val; +} mm_context_t; + +#endif /* !__ASSEMBLY__ */ + +#endif /* __MMU_H */ diff --git a/include/asm-sparc64/mmu_context.h b/include/asm-sparc64/mmu_context.h index 08275bc3478ac..87c43c67866e9 100644 --- a/include/asm-sparc64/mmu_context.h +++ b/include/asm-sparc64/mmu_context.h @@ -4,23 +4,6 @@ /* Derived heavily from Linus's Alpha/AXP ASN code... */ -#include <asm/page.h> - -/* - * For the 8k pagesize kernel, use only 10 hw context bits to optimize some shifts in - * the fast tlbmiss handlers, instead of all 13 bits (specifically for vpte offset - * calculation). For other pagesizes, this optimization in the tlbhandlers can not be - * done; but still, all 13 bits can not be used because the tlb handlers use "andcc" - * instruction which sign extends 13 bit arguments. - */ -#if PAGE_SHIFT == 13 -#define CTX_VERSION_SHIFT 10 -#define TAG_CONTEXT_BITS 0x3ff -#else -#define CTX_VERSION_SHIFT 12 -#define TAG_CONTEXT_BITS 0xfff -#endif - #ifndef __ASSEMBLY__ #include <linux/spinlock.h> @@ -35,19 +18,14 @@ extern spinlock_t ctx_alloc_lock; extern unsigned long tlb_context_cache; extern unsigned long mmu_context_bmap[]; -#define CTX_VERSION_MASK ((~0UL) << CTX_VERSION_SHIFT) -#define CTX_FIRST_VERSION ((1UL << CTX_VERSION_SHIFT) + 1UL) -#define CTX_VALID(__ctx) \ - (!(((__ctx) ^ tlb_context_cache) & CTX_VERSION_MASK)) -#define CTX_HWBITS(__ctx) ((__ctx) & ~CTX_VERSION_MASK) - extern void get_new_mmu_context(struct mm_struct *mm); /* Initialize a new mmu context. 
This is invoked when a new * address space instance (unique or shared) is instantiated. * This just needs to set mm->context to an invalid context. */ -#define init_new_context(__tsk, __mm) (((__mm)->context = 0UL), 0) +#define init_new_context(__tsk, __mm) \ + (((__mm)->context.sparc64_ctx_val = 0UL), 0) /* Destroy a dead context. This occurs when mmput drops the * mm_users count to zero, the mmaps have been released, and @@ -59,7 +37,7 @@ extern void get_new_mmu_context(struct mm_struct *mm); #define destroy_context(__mm) \ do { spin_lock(&ctx_alloc_lock); \ if (CTX_VALID((__mm)->context)) { \ - unsigned long nr = CTX_HWBITS((__mm)->context); \ + unsigned long nr = CTX_NRBITS((__mm)->context); \ mmu_context_bmap[nr>>6] &= ~(1UL << (nr & 63)); \ } \ spin_unlock(&ctx_alloc_lock); \ @@ -101,7 +79,7 @@ do { \ "flush %%g6" \ : /* No outputs */ \ : "r" (CTX_HWBITS((__mm)->context)), \ - "r" (0x10), "i" (ASI_DMMU)) + "r" (SECONDARY_CONTEXT), "i" (ASI_DMMU)) extern void __flush_tlb_mm(unsigned long, unsigned long); @@ -135,7 +113,8 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str */ if (!ctx_valid || !cpu_isset(cpu, mm->cpu_vm_mask)) { cpu_set(cpu, mm->cpu_vm_mask); - __flush_tlb_mm(CTX_HWBITS(mm->context), SECONDARY_CONTEXT); + __flush_tlb_mm(CTX_HWBITS(mm->context), + SECONDARY_CONTEXT); } } spin_unlock(&mm->page_table_lock); diff --git a/include/asm-sparc64/page.h b/include/asm-sparc64/page.h index c3dc444563e07..219ea043a14a8 100644 --- a/include/asm-sparc64/page.h +++ b/include/asm-sparc64/page.h @@ -6,7 +6,18 @@ #include <linux/config.h> #include <asm/const.h> +#if defined(CONFIG_SPARC64_PAGE_SIZE_8KB) #define PAGE_SHIFT 13 +#elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB) +#define PAGE_SHIFT 16 +#elif defined(CONFIG_SPARC64_PAGE_SIZE_512KB) +#define PAGE_SHIFT 19 +#elif defined(CONFIG_SPARC64_PAGE_SIZE_4MB) +#define PAGE_SHIFT 22 +#else +#error No page size specified in kernel configuration +#endif + #define PAGE_SIZE (_AC(1,UL) << 
PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) diff --git a/include/asm-sparc64/percpu.h b/include/asm-sparc64/percpu.h index 8571d6d1a9dfe..80d66d31b62d6 100644 --- a/include/asm-sparc64/percpu.h +++ b/include/asm-sparc64/percpu.h @@ -1,6 +1,45 @@ #ifndef __ARCH_SPARC64_PERCPU__ #define __ARCH_SPARC64_PERCPU__ -#include <asm-generic/percpu.h> +#include <linux/compiler.h> + +#define __GENERIC_PER_CPU +#ifdef CONFIG_SMP + +extern unsigned long __per_cpu_offset[NR_CPUS]; + +/* Separate out the type, so (int[3], foo) works. */ +#define DEFINE_PER_CPU(type, name) \ + __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name + +register unsigned long __local_per_cpu_offset asm("g5"); + +/* var is in discarded region: offset to particular copy we want */ +#define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu])) +#define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __local_per_cpu_offset)) + +/* A macro to avoid #include hell... */ +#define percpu_modcopy(pcpudst, src, size) \ +do { \ + unsigned int __i; \ + for (__i = 0; __i < NR_CPUS; __i++) \ + if (cpu_possible(__i)) \ + memcpy((pcpudst)+__per_cpu_offset[__i], \ + (src), (size)); \ +} while (0) +#else /* ! 
SMP */ + +#define DEFINE_PER_CPU(type, name) \ + __typeof__(type) per_cpu__##name + +#define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) +#define __get_cpu_var(var) per_cpu__##var + +#endif /* SMP */ + +#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name + +#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) +#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) #endif /* __ARCH_SPARC64_PERCPU__ */ diff --git a/include/asm-sparc64/pgalloc.h b/include/asm-sparc64/pgalloc.h index 167d514bdf6ee..2c28e1f605b76 100644 --- a/include/asm-sparc64/pgalloc.h +++ b/include/asm-sparc64/pgalloc.h @@ -9,84 +9,23 @@ #include <asm/spitfire.h> #include <asm/cpudata.h> +#include <asm/cacheflush.h> /* Page table allocation/freeing. */ #ifdef CONFIG_SMP /* Sliiiicck */ -#define pgt_quicklists cpu_data(smp_processor_id()) +#define pgt_quicklists local_cpu_data() #else extern struct pgtable_cache_struct { unsigned long *pgd_cache; unsigned long *pte_cache[2]; unsigned int pgcache_size; - unsigned int pgdcache_size; } pgt_quicklists; #endif #define pgd_quicklist (pgt_quicklists.pgd_cache) #define pmd_quicklist ((unsigned long *)0) #define pte_quicklist (pgt_quicklists.pte_cache) #define pgtable_cache_size (pgt_quicklists.pgcache_size) -#define pgd_cache_size (pgt_quicklists.pgdcache_size) - -#ifndef CONFIG_SMP - -static __inline__ void free_pgd_fast(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - preempt_disable(); - if (!page->lru.prev) { - page->lru.next = (void *) pgd_quicklist; - pgd_quicklist = (unsigned long *)page; - } - page->lru.prev = (void *) - (((unsigned long)page->lru.prev) | - (((unsigned long)pgd & (PAGE_SIZE / 2)) ? 
2 : 1)); - pgd_cache_size++; - preempt_enable(); -} - -static __inline__ pgd_t *get_pgd_fast(void) -{ - struct page *ret; - - preempt_disable(); - if ((ret = (struct page *)pgd_quicklist) != NULL) { - unsigned long mask = (unsigned long)ret->lru.prev; - unsigned long off = 0; - - if (mask & 1) - mask &= ~1; - else { - off = PAGE_SIZE / 2; - mask &= ~2; - } - ret->lru.prev = (void *) mask; - if (!mask) - pgd_quicklist = (unsigned long *)ret->lru.next; - ret = (struct page *)(__page_address(ret) + off); - pgd_cache_size--; - preempt_enable(); - } else { - struct page *page; - - preempt_enable(); - page = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); - if (page) { - ret = (struct page *)page_address(page); - page->lru.prev = (void *) 2UL; - - preempt_disable(); - page->lru.next = (void *) pgd_quicklist; - pgd_quicklist = (unsigned long *)page; - pgd_cache_size++; - preempt_enable(); - } - } - return (pgd_t *)ret; -} - -#else /* CONFIG_SMP */ static __inline__ void free_pgd_fast(pgd_t *pgd) { @@ -121,9 +60,7 @@ static __inline__ void free_pgd_slow(pgd_t *pgd) free_page((unsigned long)pgd); } -#endif /* CONFIG_SMP */ - -#if (L1DCACHE_SIZE > PAGE_SIZE) /* is there D$ aliasing problem */ +#ifdef DCACHE_ALIASING_POSSIBLE #define VPTE_COLOR(address) (((address) >> (PAGE_SHIFT + 10)) & 1UL) #define DCACHE_COLOR(address) (((address) >> PAGE_SHIFT) & 1UL) #else diff --git a/include/asm-sparc64/pgtable.h b/include/asm-sparc64/pgtable.h index dfb8a88863186..ca04ac105b694 100644 --- a/include/asm-sparc64/pgtable.h +++ b/include/asm-sparc64/pgtable.h @@ -60,44 +60,24 @@ #define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3)) #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) -#define PMD_BITS 11 +#define PMD_BITS (PAGE_SHIFT - 2) /* PGDIR_SHIFT determines what a third-level page table entry can map */ #define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3) + PMD_BITS) #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) +#define PGDIR_BITS (PAGE_SHIFT 
- 2) #ifndef __ASSEMBLY__ #include <linux/sched.h> /* Entries per page directory level. */ -#define PTRS_PER_PTE (1UL << (PAGE_SHIFT-3)) - -/* We the first one in this file, what we export to the kernel - * is different so we can optimize correctly for 32-bit tasks. - */ -#define REAL_PTRS_PER_PMD (1UL << PMD_BITS) - -/* This is gross, but unless we do this gcc retests the - * thread flag every interation in pmd traversal loops. - */ -extern unsigned long __ptrs_per_pmd(void) __attribute_const__; -#define PTRS_PER_PMD __ptrs_per_pmd() - -/* - * We cannot use the top address range because VPTE table lives there. This - * formula finds the total legal virtual space in the processor, subtracts the - * vpte size, then aligns it to the number of bytes mapped by one pgde, and - * thus calculates the number of pgdes needed. - */ -#define PTRS_PER_PGD (((1UL << VA_BITS) - VPTE_SIZE + (1UL << (PAGE_SHIFT + \ - (PAGE_SHIFT-3) + PMD_BITS)) - 1) / (1UL << (PAGE_SHIFT + \ - (PAGE_SHIFT-3) + PMD_BITS))) +#define PTRS_PER_PTE (1UL << (PAGE_SHIFT-3)) +#define PTRS_PER_PMD (1UL << PMD_BITS) +#define PTRS_PER_PGD (1UL << PGDIR_BITS) /* Kernel has a separate 44bit address space. */ -#define USER_PTRS_PER_PGD ((const int)(test_thread_flag(TIF_32BIT)) ? \ - (1) : (PTRS_PER_PGD)) #define FIRST_USER_PGD_NR 0 #define pte_ERROR(e) __builtin_trap() @@ -236,8 +216,8 @@ extern struct page *mem_map_zero; /* PFNs are real physical page numbers. However, mem_map only begins to record * per-page information starting at pfn_base. This is to handle systems where - * the first physical page in the machine is at some huge physical address, such - * as 4GB. This is common on a partitioned E10000, for example. + * the first physical page in the machine is at some huge physical address, + * such as 4GB. This is common on a partitioned E10000, for example. 
*/ #define pfn_pte(pfn, prot) \ @@ -308,7 +288,7 @@ static inline pte_t pte_modify(pte_t orig_pte, pgprot_t new_prot) #define pte_mkdirty(pte) (__pte(pte_val(pte) | _PAGE_MODIFIED | _PAGE_W)) /* to find an entry in a page-table-directory. */ -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD)) +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) #define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address)) /* to find an entry in a kernel page-table-directory */ @@ -322,7 +302,7 @@ static inline pte_t pte_modify(pte_t orig_pte, pgprot_t new_prot) /* Find an entry in the second-level page table.. */ #define pmd_offset(pudp, address) \ ((pmd_t *) pud_page(*(pudp)) + \ - (((address) >> PMD_SHIFT) & (REAL_PTRS_PER_PMD-1))) + (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))) /* Find an entry in the third-level page table.. */ #define pte_index(dir, address) \ diff --git a/include/asm-sparc64/rwsem-const.h b/include/asm-sparc64/rwsem-const.h new file mode 100644 index 0000000000000..a303c9d64d845 --- /dev/null +++ b/include/asm-sparc64/rwsem-const.h @@ -0,0 +1,12 @@ +/* rwsem-const.h: RW semaphore counter constants. 
*/ +#ifndef _SPARC64_RWSEM_CONST_H +#define _SPARC64_RWSEM_CONST_H + +#define RWSEM_UNLOCKED_VALUE 0x00000000 +#define RWSEM_ACTIVE_BIAS 0x00000001 +#define RWSEM_ACTIVE_MASK 0x0000ffff +#define RWSEM_WAITING_BIAS 0xffff0000 +#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS +#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) + +#endif /* _SPARC64_RWSEM_CONST_H */ diff --git a/include/asm-sparc64/rwsem.h b/include/asm-sparc64/rwsem.h index 82fffac5b0b81..bf2ae90ed3df8 100644 --- a/include/asm-sparc64/rwsem.h +++ b/include/asm-sparc64/rwsem.h @@ -15,17 +15,12 @@ #include <linux/list.h> #include <linux/spinlock.h> +#include <asm/rwsem-const.h> struct rwsem_waiter; struct rw_semaphore { signed int count; -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS 0xffff0000 -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) spinlock_t wait_lock; struct list_head wait_list; }; @@ -56,16 +51,16 @@ static __inline__ int rwsem_atomic_update(int delta, struct rw_semaphore *sem) int tmp = delta; __asm__ __volatile__( - "1:\tlduw [%2], %%g5\n\t" - "add %%g5, %1, %%g7\n\t" - "cas [%2], %%g5, %%g7\n\t" - "cmp %%g5, %%g7\n\t" + "1:\tlduw [%2], %%g1\n\t" + "add %%g1, %1, %%g7\n\t" + "cas [%2], %%g1, %%g7\n\t" + "cmp %%g1, %%g7\n\t" "bne,pn %%icc, 1b\n\t" " membar #StoreLoad | #StoreStore\n\t" "mov %%g7, %0\n\t" : "=&r" (tmp) : "0" (tmp), "r" (sem) - : "g5", "g7", "memory", "cc"); + : "g1", "g7", "memory", "cc"); return tmp + delta; } diff --git a/include/asm-sparc64/spitfire.h b/include/asm-sparc64/spitfire.h index 6ee83ff2fde36..ad78ce64d69ee 100644 --- a/include/asm-sparc64/spitfire.h +++ b/include/asm-sparc64/spitfire.h @@ -34,6 +34,9 @@ #define PHYS_WATCHPOINT 0x0000000000000040 #define SPITFIRE_HIGHEST_LOCKED_TLBENT (64 - 1) +#define CHEETAH_HIGHEST_LOCKED_TLBENT (16 - 1) + +#define L1DCACHE_SIZE 
0x4000 #ifndef __ASSEMBLY__ @@ -45,10 +48,6 @@ enum ultra_tlb_layout { extern enum ultra_tlb_layout tlb_type; -#define CHEETAH_HIGHEST_LOCKED_TLBENT (16 - 1) - -#define L1DCACHE_SIZE 0x4000 - #define sparc64_highest_locked_tlbent() \ (tlb_type == spitfire ? \ SPITFIRE_HIGHEST_LOCKED_TLBENT : \ @@ -100,46 +99,6 @@ static __inline__ void spitfire_put_dsfsr(unsigned long sfsr) : "r" (sfsr), "r" (TLB_SFSR), "i" (ASI_DMMU)); } -static __inline__ unsigned long spitfire_get_primary_context(void) -{ - unsigned long ctx; - - __asm__ __volatile__("ldxa [%1] %2, %0" - : "=r" (ctx) - : "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); - return ctx; -} - -static __inline__ void spitfire_set_primary_context(unsigned long ctx) -{ - __asm__ __volatile__("stxa %0, [%1] %2\n\t" - "membar #Sync" - : /* No outputs */ - : "r" (ctx & 0x3ff), - "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); - __asm__ __volatile__ ("membar #Sync" : : : "memory"); -} - -static __inline__ unsigned long spitfire_get_secondary_context(void) -{ - unsigned long ctx; - - __asm__ __volatile__("ldxa [%1] %2, %0" - : "=r" (ctx) - : "r" (SECONDARY_CONTEXT), "i" (ASI_DMMU)); - return ctx; -} - -static __inline__ void spitfire_set_secondary_context(unsigned long ctx) -{ - __asm__ __volatile__("stxa %0, [%1] %2\n\t" - "membar #Sync" - : /* No outputs */ - : "r" (ctx & 0x3ff), - "r" (SECONDARY_CONTEXT), "i" (ASI_DMMU)); - __asm__ __volatile__ ("membar #Sync" : : : "memory"); -} - /* The data cache is write through, so this just invalidates the * specified line. 
*/ diff --git a/include/asm-sparc64/system.h b/include/asm-sparc64/system.h index e8ba9d5277e15..fd12ca386f486 100644 --- a/include/asm-sparc64/system.h +++ b/include/asm-sparc64/system.h @@ -182,7 +182,7 @@ do { if (test_thread_flag(TIF_PERFCTR)) { \ __asm__ __volatile__("wr %%g0, %0, %%asi" \ : : "r" (__thread_flag_byte_ptr(next->thread_info)[TI_FLAG_BYTE_CURRENT_DS]));\ __asm__ __volatile__( \ - "mov %%g4, %%g5\n\t" \ + "mov %%g4, %%g7\n\t" \ "wrpr %%g0, 0x95, %%pstate\n\t" \ "stx %%i6, [%%sp + 2047 + 0x70]\n\t" \ "stx %%i7, [%%sp + 2047 + 0x78]\n\t" \ @@ -207,7 +207,7 @@ do { if (test_thread_flag(TIF_PERFCTR)) { \ "wrpr %%g0, 0x96, %%pstate\n\t" \ "andcc %%o7, %6, %%g0\n\t" \ "beq,pt %%icc, 1f\n\t" \ - " mov %%g5, %0\n\t" \ + " mov %%g7, %0\n\t" \ "b,a ret_from_syscall\n\t" \ "1:\n\t" \ : "=&r" (last) \ @@ -215,7 +215,7 @@ do { if (test_thread_flag(TIF_PERFCTR)) { \ "i" (TI_WSTATE), "i" (TI_KSP), "i" (TI_FLAGS), "i" (TI_CWP), \ "i" (_TIF_NEWCHILD), "i" (TI_TASK) \ : "cc", \ - "g1", "g2", "g3", "g5", "g7", \ + "g1", "g2", "g3", "g7", \ "l2", "l3", "l4", "l5", "l6", "l7", \ "i0", "i1", "i2", "i3", "i4", "i5", \ "o0", "o1", "o2", "o3", "o4", "o5", "o7" EXTRA_CLOBBER);\ @@ -226,37 +226,41 @@ do { if (test_thread_flag(TIF_PERFCTR)) { \ } \ } while(0) -static __inline__ unsigned long xchg32(__volatile__ unsigned int *m, unsigned int val) +static inline unsigned long xchg32(__volatile__ unsigned int *m, unsigned int val) { + unsigned long tmp1, tmp2; + __asm__ __volatile__( " membar #StoreLoad | #LoadLoad\n" -" mov %0, %%g5\n" -"1: lduw [%2], %%g7\n" -" cas [%2], %%g7, %0\n" -" cmp %%g7, %0\n" +" mov %0, %1\n" +"1: lduw [%4], %2\n" +" cas [%4], %2, %0\n" +" cmp %2, %0\n" " bne,a,pn %%icc, 1b\n" -" mov %%g5, %0\n" +" mov %1, %0\n" " membar #StoreLoad | #StoreStore\n" - : "=&r" (val) + : "=&r" (val), "=&r" (tmp1), "=&r" (tmp2) : "0" (val), "r" (m) - : "g5", "g7", "cc", "memory"); + : "cc", "memory"); return val; } -static __inline__ unsigned long xchg64(__volatile__ 
unsigned long *m, unsigned long val) +static inline unsigned long xchg64(__volatile__ unsigned long *m, unsigned long val) { + unsigned long tmp1, tmp2; + __asm__ __volatile__( " membar #StoreLoad | #LoadLoad\n" -" mov %0, %%g5\n" -"1: ldx [%2], %%g7\n" -" casx [%2], %%g7, %0\n" -" cmp %%g7, %0\n" +" mov %0, %1\n" +"1: ldx [%4], %2\n" +" casx [%4], %2, %0\n" +" cmp %2, %0\n" " bne,a,pn %%xcc, 1b\n" -" mov %%g5, %0\n" +" mov %1, %0\n" " membar #StoreLoad | #StoreStore\n" - : "=&r" (val) + : "=&r" (val), "=&r" (tmp1), "=&r" (tmp2) : "0" (val), "r" (m) - : "g5", "g7", "cc", "memory"); + : "cc", "memory"); return val; } diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h index fa0ebf6786fc9..9baf57db01d20 100644 --- a/include/asm-sparc64/tlb.h +++ b/include/asm-sparc64/tlb.h @@ -44,7 +44,7 @@ extern void flush_tlb_pending(void); static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) { - struct mmu_gather *mp = &per_cpu(mmu_gathers, smp_processor_id()); + struct mmu_gather *mp = &__get_cpu_var(mmu_gathers); BUG_ON(mp->tlb_nr); @@ -89,9 +89,7 @@ static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, un tlb_flush_mmu(mp); if (mp->tlb_frozen) { - unsigned long context = mm->context; - - if (CTX_VALID(context)) + if (CTX_VALID(mm->context)) do_flush_tlb_mm(mm); mp->tlb_frozen = 0; } else |