[ppc64] __hash_page rewrite, from Ben Herrenschmidt

Rewrite the __hash_page function in assembly so that we no longer need
the page table lock.  We now rely on a BUSY bit in the Linux PTE, on
which we spin while doing an update of the PTE.

---

 /dev/null                       |  494 ----------------------------------------
 arch/ppc64/kernel/Makefile      |    2 
 arch/ppc64/kernel/setup.c       |    4 
 arch/ppc64/kernel/vmlinux.lds.S |    2 
 arch/ppc64/mm/Makefile          |    2 
 arch/ppc64/mm/hash_low.S        |  283 ++++++++++++++++++++++
 arch/ppc64/mm/hash_utils.c      |  358 ++++++++++++++++++++++++++++
 arch/ppc64/mm/hugetlbpage.c     |   10 
 arch/ppc64/mm/init.c            |    7 
 include/asm-ppc64/mmu.h         |   12 
 include/asm-ppc64/pgtable.h     |   66 +++--
 11 files changed, 717 insertions(+), 523 deletions(-)
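For readers coming to this cold: the heart of the patch is that __hash_page
no longer takes mm->page_table_lock and instead treats a software bit in the
Linux PTE as a per-PTE lock.  A minimal C sketch of the protocol, with a
generic compare-and-swap standing in for the ldarx/stdcx. pair the real
assembly uses; the constants mirror the pgtable.h hunk at the end of this
patch, everything else (names, helper) is illustrative, not a kernel API:

    #include <stdint.h>
    #include <stdbool.h>

    #define _PAGE_PRESENT   0x0001UL
    #define _PAGE_RW        0x0004UL
    #define _PAGE_DIRTY     0x0080UL
    #define _PAGE_ACCESSED  0x0100UL
    #define _PAGE_HASHPTE   0x0400UL
    #define _PAGE_BUSY      0x0800UL

    /* hypothetical stand-in for the ldarx/stdcx. sequence */
    static bool cas64(volatile uint64_t *p, uint64_t old, uint64_t new)
    {
            return __sync_bool_compare_and_swap(p, old, new);
    }

    /* Returns 1 on access fault, 0 once the PTE is marked busy. */
    static int lock_and_update_pte(volatile uint64_t *ptep, uint64_t access)
    {
            uint64_t old, new;

            access |= _PAGE_PRESENT;
            for (;;) {
                    old = *ptep;
                    if (access & ~old)          /* permission check */
                            return 1;           /* up to do_page_fault */
                    if (old & _PAGE_BUSY)       /* another CPU is updating */
                            continue;           /* spin */
                    new = old | _PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE;
                    if (access & _PAGE_RW)
                            new |= _PAGE_DIRTY; /* store access dirties */
                    if (cas64(ptep, old, new))
                            break;              /* we own the PTE now */
            }
            /* ... insert/update the HPTE, then store the PTE back with
             * _PAGE_BUSY cleared, which unlocks it. */
            return 0;
    }

Everything else that updates a PTE (see the pte_update() hunk near the end)
spins while _PAGE_BUSY is set, which is what makes dropping the lock safe.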
diff -puN arch/ppc64/kernel/Makefile~ppc64-hash_page_rewrite arch/ppc64/kernel/Makefile
--- 25/arch/ppc64/kernel/Makefile~ppc64-hash_page_rewrite	2004-01-13 23:23:06.000000000 -0800
+++ 25-akpm/arch/ppc64/kernel/Makefile	2004-01-13 23:23:06.000000000 -0800
@@ -7,7 +7,7 @@ extra-y := head.o vmlinux.lds.s
 obj-y := setup.o entry.o traps.o irq.o idle.o \
 	time.o process.o signal.o syscalls.o misc.o ptrace.o \
-	align.o semaphore.o bitops.o stab.o htab.o pacaData.o \
+	align.o semaphore.o bitops.o stab.o pacaData.o \
 	udbg.o binfmt_elf32.o sys_ppc32.o ioctl32.o \
 	ptrace32.o signal32.o pmc.o rtc.o init_task.o \
 	lmb.o cputable.o
diff -puN -L arch/ppc64/kernel/htab.c arch/ppc64/kernel/htab.c~ppc64-hash_page_rewrite /dev/null
--- 25/arch/ppc64/kernel/htab.c
+++ /dev/null	2002-08-30 16:31:37.000000000 -0700
@@ -1,494 +0,0 @@
-/*
- * PowerPC64 port by Mike Corrigan and Dave Engebretsen
- *   {mikejc|engebret}@us.ibm.com
- *
- * Copyright (c) 2000 Mike Corrigan
- *
- * SMP scalability work:
- *    Copyright (C) 2001 Anton Blanchard, IBM
- *
- * Module name: htab.c
- *
- * Description:
- *      PowerPC Hashed Page Table functions
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-/*
- * Note: pte   --> Linux PTE
- *       HPTE  --> PowerPC Hashed Page Table Entry
- *
- * Execution context:
- *   htab_initialize is called with the MMU off (of course), but
- *   the kernel has been copied down to zero so it can directly
- *   reference global data.  At this point it is very difficult
- *   to print debug info.
- *
- */
-
-HTAB htab_data = {NULL, 0, 0, 0, 0};
-
-extern unsigned long _SDR1;
-
-#define KB (1024)
-#define MB (1024*KB)
-
-static inline void
-loop_forever(void)
-{
-	volatile unsigned long x = 1;
-	for(;x;x|=1)
-		;
-}
-
-#ifdef CONFIG_PPC_PSERIES
-static inline void
-create_pte_mapping(unsigned long start, unsigned long end,
-		   unsigned long mode, int large)
-{
-	unsigned long addr;
-	unsigned int step;
-
-	if (large)
-		step = 16*MB;
-	else
-		step = 4*KB;
-
-	for (addr = start; addr < end; addr += step) {
-		unsigned long vpn, hash, hpteg;
-		unsigned long vsid = get_kernel_vsid(addr);
-		unsigned long va = (vsid << 28) | (addr & 0xfffffff);
-		int ret;
-
-		if (large)
-			vpn = va >> LARGE_PAGE_SHIFT;
-		else
-			vpn = va >> PAGE_SHIFT;
-
-		hash = hpt_hash(vpn, large);
-
-		hpteg = ((hash & htab_data.htab_hash_mask)*HPTES_PER_GROUP);
-
-		if (systemcfg->platform == PLATFORM_PSERIES_LPAR)
-			ret = pSeries_lpar_hpte_insert(hpteg, va,
-				(unsigned long)__v2a(addr) >> PAGE_SHIFT,
-				0, mode, 1, large);
-		else
-			ret = pSeries_hpte_insert(hpteg, va,
-				(unsigned long)__v2a(addr) >> PAGE_SHIFT,
-				0, mode, 1, large);
-
-		if (ret == -1) {
-			ppc64_terminate_msg(0x20, "create_pte_mapping");
-			loop_forever();
-		}
-	}
-}
-
-void
-htab_initialize(void)
-{
-	unsigned long table, htab_size_bytes;
-	unsigned long pteg_count;
-	unsigned long mode_rw;
-
-	/*
-	 * Calculate the required size of the htab.  We want the number of
-	 * PTEGs to equal one half the number of real pages.
-	 */
-	htab_size_bytes = 1UL << naca->pftSize;
-	pteg_count = htab_size_bytes >> 7;
-
-	/* For debug, make the HTAB 1/8 as big as it normally would be. */
-	ifppcdebug(PPCDBG_HTABSIZE) {
-		pteg_count >>= 3;
-		htab_size_bytes = pteg_count << 7;
-	}
-
-	htab_data.htab_num_ptegs = pteg_count;
-	htab_data.htab_hash_mask = pteg_count - 1;
-
-	if (systemcfg->platform == PLATFORM_PSERIES) {
-		/* Find storage for the HPT.  Must be contiguous in
-		 * the absolute address space.
-		 */
-		table = lmb_alloc(htab_size_bytes, htab_size_bytes);
-		if ( !table ) {
-			ppc64_terminate_msg(0x20, "hpt space");
-			loop_forever();
-		}
-		htab_data.htab = (HPTE *)__a2v(table);
-
-		/* htab absolute addr + encoded htabsize */
-		_SDR1 = table + __ilog2(pteg_count) - 11;
-
-		/* Initialize the HPT with no entries */
-		memset((void *)table, 0, htab_size_bytes);
-	} else {
-		/* Using a hypervisor which owns the htab */
-		htab_data.htab = NULL;
-		_SDR1 = 0;
-	}
-
-	mode_rw = _PAGE_ACCESSED | _PAGE_COHERENT | PP_RWXX;
-
-	/* XXX we currently map kernel text rw, should fix this */
-	if ((cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE)
-	    && systemcfg->physicalMemorySize > 256*MB) {
-		create_pte_mapping((unsigned long)KERNELBASE,
-				   KERNELBASE + 256*MB, mode_rw, 0);
-		create_pte_mapping((unsigned long)KERNELBASE + 256*MB,
-				   KERNELBASE + (systemcfg->physicalMemorySize),
-				   mode_rw, 1);
-	} else {
-		create_pte_mapping((unsigned long)KERNELBASE,
-				   KERNELBASE+(systemcfg->physicalMemorySize),
-				   mode_rw, 0);
-	}
-}
-#undef KB
-#undef MB
-#endif
-
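The deleted create_pte_mapping() above is also a compact reference for the
address math the rest of the patch relies on.  A worked example under the
usual 2.6-era assumptions (28-bit segment offsets, 4K pages, and the hash
folding the new assembly uses); the vsid value is made up:

    #include <stdio.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
            unsigned long ea   = 0xC000000000001000UL;
            unsigned long vsid = 0x123456UL;  /* pretend get_kernel_vsid() */
            /* low 28 bits of the EA are the segment offset */
            unsigned long va   = (vsid << 28) | (ea & 0x0fffffffUL);
            unsigned long vpn  = va >> PAGE_SHIFT;  /* virtual page number */
            /* hpt_hash() for 4K pages, as hash_low.S computes it below:
             * fold the page index into the low bits of the VSID */
            unsigned long hash = (vsid & 0x7fffffffffUL)
                               ^ ((ea >> 12) & 0xffffUL);

            printf("va=%#lx vpn=%#lx hash=%#lx\n", va, vpn, hash);
            return 0;
    }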
-/*
- * find_linux_pte returns the address of a linux pte for a given
- * effective address and directory.  If not found, it returns zero.
- */
-pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea)
-{
-	pgd_t *pg;
-	pmd_t *pm;
-	pte_t *pt = NULL;
-	pte_t pte;
-
-	pg = pgdir + pgd_index(ea);
-	if (!pgd_none(*pg)) {
-
-		pm = pmd_offset(pg, ea);
-		if (pmd_present(*pm)) {
-			pt = pte_offset_kernel(pm, ea);
-			pte = *pt;
-			if (!pte_present(pte))
-				pt = NULL;
-		}
-	}
-
-	return pt;
-}
-
-static inline unsigned long computeHptePP(unsigned long pte)
-{
-	return (pte & _PAGE_USER) |
-		(((pte & _PAGE_USER) >> 1) &
-		 ((~((pte >> 2) &	/* _PAGE_RW */
-		     (pte >> 7))) &	/* _PAGE_DIRTY */
-		  1));
-}
-
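computeHptePP() above is dense enough to deserve a truth table.  My reading
of the expression, spelled out (PP is the 2-bit hardware protection field;
this helper is illustrative, not a kernel API):

    /*
     *   user  rw  dirty -> PP
     *    0     x    x      00  (kernel read/write, no user access)
     *    1     1    1      10  (user read/write)
     *    1     0    x      11  (user read-only)
     *    1     1    0      11  (read-only until dirtied: soft C bit)
     */
    static unsigned long compute_pp(int user, int rw, int dirty)
    {
            if (!user)
                    return 0;       /* PP=00 */
            if (rw && dirty)
                    return 2;       /* PP=10 */
            return 3;               /* PP=11 */
    }

The "read-only until dirtied" row is how software DIRTY-bit management
works: the first store faults, and the fault handler upgrades PP.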
-/*
- * Handle a fault by adding an HPTE. If the address can't be determined
- * to be valid via Linux page tables, return 1. If handled return 0
- */
-int __hash_page(unsigned long ea, unsigned long access, unsigned long vsid,
-		pte_t *ptep, unsigned long trap, int local)
-{
-	unsigned long va, vpn;
-	unsigned long newpp, prpn;
-	unsigned long hpteflags;
-	long slot;
-	pte_t old_pte, new_pte;
-
-	/* XXX fix for large ptes */
-	int large = 0;
-
-	/* Search the Linux page table for a match with va */
-	va = (vsid << 28) | (ea & 0x0fffffff);
-
-	if (large)
-		vpn = va >> LARGE_PAGE_SHIFT;
-	else
-		vpn = va >> PAGE_SHIFT;
-
-	/*
-	 * If no pte found or not present, send the problem up to
-	 * do_page_fault
-	 */
-	if (unlikely(!ptep || !pte_present(*ptep)))
-		return 1;
-
-	/*
-	 * Check the user's access rights to the page.  If access should be
-	 * prevented then send the problem up to do_page_fault.
-	 */
-	access |= _PAGE_PRESENT;
-	if (unlikely(access & ~(pte_val(*ptep))))
-		return 1;
-
-	/*
-	 * At this point, we have a pte (old_pte) which can be used to build
-	 * or update an HPTE. There are 2 cases:
-	 *
-	 * 1. There is a valid (present) pte with no associated HPTE (this is
-	 *	the most common case)
-	 * 2. There is a valid (present) pte with an associated HPTE. The
-	 *	current values of the pp bits in the HPTE prevent access
-	 *	because we are doing software DIRTY bit management and the
-	 *	page is currently not DIRTY.
-	 */
-
-	old_pte = *ptep;
-	new_pte = old_pte;
-	/* If the attempted access was a store */
-	if (access & _PAGE_RW)
-		pte_val(new_pte) |= _PAGE_ACCESSED | _PAGE_DIRTY;
-	else
-		pte_val(new_pte) |= _PAGE_ACCESSED;
-
-	newpp = computeHptePP(pte_val(new_pte));
-
-#define PPC64_HWNOEXEC (1 << 2)
-
-	/* We do lazy icache flushing on cpus that support it */
-	if (unlikely((cur_cpu_spec->cpu_features & CPU_FTR_NOEXECUTE)
-		     && pfn_valid(pte_pfn(new_pte)))) {
-		struct page *page = pte_page(new_pte);
-
-		/* page is dirty */
-		if (!PageReserved(page) &&
-		    !test_bit(PG_arch_1, &page->flags)) {
-			if (trap == 0x400) {
-				__flush_dcache_icache(page_address(page));
-				set_bit(PG_arch_1, &page->flags);
-			} else {
-				newpp |= PPC64_HWNOEXEC;
-			}
-		}
-	}
-
-	/* Check if pte already has an hpte (case 2) */
-	if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
-		/* There MIGHT be an HPTE for this pte */
-		unsigned long hash, slot, secondary;
-
-		hash = hpt_hash(vpn, large);
-		secondary = (pte_val(old_pte) & _PAGE_SECONDARY) >> 15;
-		if (secondary)
-			hash = ~hash;
-		slot = (hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
-
-		if (ppc_md.hpte_updatepp(slot, newpp, va, large, local) == -1)
-			pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
-		else
-			if (!pte_same(old_pte, new_pte))
-				*ptep = new_pte;
-	}
-
-	if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
-		unsigned long hash = hpt_hash(vpn, large);
-		unsigned long hpte_group;
-		prpn = pte_val(old_pte) >> PTE_SHIFT;
-
-repeat:
-		hpte_group = ((hash & htab_data.htab_hash_mask) *
-			      HPTES_PER_GROUP) & ~0x7UL;
-
-		/* Update the linux pte with the HPTE slot */
-		pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
-		pte_val(new_pte) |= _PAGE_HASHPTE;
-
-		/* copy appropriate flags from linux pte */
-		hpteflags = (pte_val(new_pte) & 0x1f8) | newpp;
-
-		slot = ppc_md.hpte_insert(hpte_group, va, prpn, 0,
-					  hpteflags, 0, large);
-
-		/* Primary is full, try the secondary */
-		if (unlikely(slot == -1)) {
-			pte_val(new_pte) |= 1 << 15;
-			hpte_group = ((~hash & htab_data.htab_hash_mask) *
-				      HPTES_PER_GROUP) & ~0x7UL;
-			slot = ppc_md.hpte_insert(hpte_group, va, prpn,
-						  1, hpteflags, 0, large);
-			if (slot == -1) {
-				if (mftb() & 0x1)
-					hpte_group = ((hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
-
-				ppc_md.hpte_remove(hpte_group);
-				goto repeat;
-			}
-		}
-
-		if (unlikely(slot == -2))
-			panic("hash_page: pte_insert failed\n");
-
-		pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;
-
-		/*
-		 * No need to use ldarx/stdcx here because all who
-		 * might be updating the pte will hold the
-		 * page_table_lock or the hash_table_lock
-		 * (we hold both)
-		 */
-		*ptep = new_pte;
-	}
-
-	return 0;
-}
-
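The insert path above (which the new hash_low.S reproduces almost
mechanically) is a two-choice policy with random eviction.  A
self-contained toy model, with the ppc_md hooks reduced to counters; the
real code derives group addresses from (hash & mask) and (~hash & mask),
HPTES_PER_GROUP is 8, and the timebase supplies the randomness:

    #include <stdio.h>
    #include <stdlib.h>

    /* Toy stand-ins for the ppc_md hooks: a free count per group. */
    static int free_slots[2] = {0, 0};      /* both groups start full */

    static long hpte_insert(int group, int secondary)
    {
            if (free_slots[group] == 0)
                    return -1;              /* group is full */
            free_slots[group]--;
            return secondary ? 8 + 5 : 3;   /* arbitrary slot numbers */
    }

    static void hpte_remove(int group)
    {
            free_slots[group]++;            /* evict one entry */
    }

    static long insert_hpte(void)
    {
            for (;;) {
                    long slot = hpte_insert(0, 0);  /* 1. primary group */
                    if (slot >= 0)
                            return slot;
                    slot = hpte_insert(1, 1);       /* 2. secondary (~hash) */
                    if (slot >= 0)
                            return slot;
                    hpte_remove(rand() & 1);        /* 3. evict, retry */
            }
    }

    int main(void)
    {
            printf("got slot %ld\n", insert_hpte());
            return 0;
    }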
-int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
-{
-	void *pgdir;
-	unsigned long vsid;
-	struct mm_struct *mm;
-	pte_t *ptep;
-	int ret;
-	int user_region = 0;
-	int local = 0;
-	cpumask_t tmp;
-
-	/* Check for invalid addresses. */
-	if (!IS_VALID_EA(ea))
-		return 1;
-
-	switch (REGION_ID(ea)) {
-	case USER_REGION_ID:
-		user_region = 1;
-		mm = current->mm;
-		if (mm == NULL)
-			return 1;
-
-		vsid = get_vsid(mm->context, ea);
-		break;
-	case IO_REGION_ID:
-		mm = &ioremap_mm;
-		vsid = get_kernel_vsid(ea);
-		break;
-	case VMALLOC_REGION_ID:
-		mm = &init_mm;
-		vsid = get_kernel_vsid(ea);
-		break;
-#if 0
-	case EEH_REGION_ID:
-		/*
-		 * Should only be hit if there is an access to MMIO space
-		 * which is protected by EEH.
-		 * Send the problem up to do_page_fault
-		 */
-	case KERNEL_REGION_ID:
-		/*
-		 * Should never get here - entire 0xC0... region is bolted.
-		 * Send the problem up to do_page_fault
-		 */
-#endif
-	default:
-		/* Not a valid range
-		 * Send the problem up to do_page_fault
-		 */
-		return 1;
-		break;
-	}
-
-	pgdir = mm->pgd;
-
-	if (pgdir == NULL)
-		return 1;
-
-	/*
-	 * Lock the Linux page table to prevent mmap and kswapd
-	 * from modifying entries while we search and update
-	 */
-	spin_lock(&mm->page_table_lock);
-
-	tmp = cpumask_of_cpu(smp_processor_id());
-	if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
-		local = 1;
-
-	ret = hash_huge_page(mm, access, ea, vsid, local);
-	if (ret < 0) {
-		ptep = find_linux_pte(pgdir, ea);
-		ret = __hash_page(ea, access, vsid, ptep, trap, local);
-	}
-
-	spin_unlock(&mm->page_table_lock);
-
-	return ret;
-}
-
-void flush_hash_page(unsigned long context, unsigned long ea, pte_t pte,
-		     int local)
-{
-	unsigned long vsid, vpn, va, hash, secondary, slot;
-
-	/* XXX fix for large ptes */
-	unsigned long large = 0;
-
-	if ((ea >= USER_START) && (ea <= USER_END))
-		vsid = get_vsid(context, ea);
-	else
-		vsid = get_kernel_vsid(ea);
-
-	va = (vsid << 28) | (ea & 0x0fffffff);
-	if (large)
-		vpn = va >> LARGE_PAGE_SHIFT;
-	else
-		vpn = va >> PAGE_SHIFT;
-	hash = hpt_hash(vpn, large);
-	secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15;
-	if (secondary)
-		hash = ~hash;
-	slot = (hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP;
-	slot += (pte_val(pte) & _PAGE_GROUP_IX) >> 12;
-
-	ppc_md.hpte_invalidate(slot, va, large, local);
-}
-
-void flush_hash_range(unsigned long context, unsigned long number, int local)
-{
-	if (ppc_md.flush_hash_range) {
-		ppc_md.flush_hash_range(context, number, local);
-	} else {
-		int i;
-		struct ppc64_tlb_batch *batch =
-			&ppc64_tlb_batch[smp_processor_id()];
-
-		for (i = 0; i < number; i++)
-			flush_hash_page(context, batch->addr[i], batch->pte[i],
-					local);
-	}
-}
diff -puN arch/ppc64/kernel/setup.c~ppc64-hash_page_rewrite arch/ppc64/kernel/setup.c
--- 25/arch/ppc64/kernel/setup.c~ppc64-hash_page_rewrite	2004-01-13 23:23:06.000000000 -0800
+++ 25-akpm/arch/ppc64/kernel/setup.c	2004-01-13 23:23:06.000000000 -0800
@@ -213,6 +213,10 @@ void setup_system(unsigned long r3, unsi
 #endif
 	}
 #endif
+	/* Finish initializing the hash table (do the dynamic
+	 * patching for the fast-path hashtable.S code)
+	 */
+	htab_finish_init();
 
 	printk("Starting Linux PPC64 %s\n", UTS_RELEASE);
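The htab_finish_init() call added above exists because the new assembly
cannot cheaply make an indirect call through ppc_md at fault time; instead
its "bl ." placeholders are patched once at boot.  A sketch of the branch
word being generated, assuming the standard PowerPC I-form layout (opcode
18, LK=1); the offset must be word-aligned and fit in +/-32MB:

    #include <stdint.h>

    static uint32_t encode_bl(uint64_t insn_addr, uint64_t target)
    {
            int64_t offset = (int64_t)(target - insn_addr);

            /* opcode 18 in the top 6 bits, LK=1 in the low bit;
             * LI is the signed 24-bit word offset in between */
            return 0x48000001u | ((uint32_t)offset & 0x03fffffcu);
    }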
diff -puN arch/ppc64/kernel/vmlinux.lds.S~ppc64-hash_page_rewrite arch/ppc64/kernel/vmlinux.lds.S
--- 25/arch/ppc64/kernel/vmlinux.lds.S~ppc64-hash_page_rewrite	2004-01-13 23:23:06.000000000 -0800
+++ 25-akpm/arch/ppc64/kernel/vmlinux.lds.S	2004-01-13 23:23:06.000000000 -0800
@@ -53,7 +53,6 @@ SECTIONS
     *(.data1)
     *(.sdata)
     *(.sdata2)
-    *(.got.plt) *(.got)
     *(.dynamic)
     CONSTRUCTORS
     }
@@ -126,6 +125,7 @@ SECTIONS
   /* freed after init ends here */
 
   __toc_start = .;
+  .got : { *(.got.plt) *(.got) }
   .toc : { *(.toc) }
   . = ALIGN(4096);
   __toc_end = .;
diff -puN arch/ppc64/mm/Makefile~ppc64-hash_page_rewrite arch/ppc64/mm/Makefile
--- 25/arch/ppc64/mm/Makefile~ppc64-hash_page_rewrite	2004-01-13 23:23:06.000000000 -0800
+++ 25-akpm/arch/ppc64/mm/Makefile	2004-01-13 23:23:06.000000000 -0800
@@ -4,6 +4,6 @@
 EXTRA_CFLAGS += -mno-minimal-toc
 
-obj-y := fault.o init.o extable.o imalloc.o
+obj-y := fault.o init.o extable.o imalloc.o hash_utils.o hash_low.o
 obj-$(CONFIG_DISCONTIGMEM) += numa.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff -puN /dev/null arch/ppc64/mm/hash_low.S
--- /dev/null	2002-08-30 16:31:37.000000000 -0700
+++ 25-akpm/arch/ppc64/mm/hash_low.S	2004-01-13 23:23:06.000000000 -0800
@@ -0,0 +1,283 @@
+/*
+ * ppc64 MMU hashtable management routines
+ *
+ * (c) Copyright IBM Corp. 2003
+ *
+ * Maintained by: Benjamin Herrenschmidt
+ *
+ * This file is covered by the GNU Public Licence v2 as
+ * described in the kernel's COPYING file.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+	.text
+
+/*
+ * Stackframe:
+ *
+ *         +-> Back chain (SP + 256)
+ *         |   General register save area (SP + 112)
+ *         |   Parameter save area (SP + 48)
+ *         |   TOC save area (SP + 40)
+ *         |   link editor doubleword (SP + 32)
+ *         |   compiler doubleword (SP + 24)
+ *         |   LR save area (SP + 16)
+ *         |   CR save area (SP + 8)
+ * SP ---> +-- Back chain (SP + 0)
+ */
+#define STACKFRAMESIZE	256
+
+/* Save parameters offsets */
+#define STK_PARM(i)	(STACKFRAMESIZE + 48 + ((i)-3)*8)
+
+/* Save non-volatile offsets */
+#define STK_REG(i)	(112 + ((i)-14)*8)
+
+/*
+ * __hash_page(unsigned long ea, unsigned long access, unsigned long vsid,
+ *	       pte_t *ptep, unsigned long trap, int local)
+ *
+ * Adds a page to the hash table. This is the non-LPAR version for now
+ */
+
+_GLOBAL(__hash_page)
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	/* Save all params that we need after a function call */
+	std	r6,STK_PARM(r6)(r1)
+	std	r8,STK_PARM(r8)(r1)
+
+	/* Add _PAGE_PRESENT to access */
+	ori	r4,r4,_PAGE_PRESENT
+
+	/* Save non-volatile registers.
+	 * r31 will hold "old PTE"
+	 * r30 is "new PTE"
+	 * r29 is "va"
+	 * r28 is a hash value
+	 * r27 is hashtab mask (maybe dynamic patched instead ?)
+	 */
+	std	r27,STK_REG(r27)(r1)
+	std	r28,STK_REG(r28)(r1)
+	std	r29,STK_REG(r29)(r1)
+	std	r30,STK_REG(r30)(r1)
+	std	r31,STK_REG(r31)(r1)
+
+	/* Step 1:
+	 *
+	 * Check permissions, atomically mark the linux PTE busy
+	 * and hashed.
+	 */
+1:
+	ldarx	r31,0,r6
+	/* Check access rights (access & ~(pte_val(*ptep))) */
+	andc.	r0,r4,r31
+	bne-	htab_wrong_access
+	/* Check if PTE is busy */
+	andi.	r0,r31,_PAGE_BUSY
+	bne-	1b
+	/* Prepare new PTE value (turn access RW into DIRTY, then
+	 * add BUSY, HASHPTE and ACCESSED)
+	 */
+	rlwinm	r30,r4,5,24,24		/* _PAGE_RW -> _PAGE_DIRTY */
+	or	r30,r30,r31
+	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
+	/* Write the linux PTE atomically (setting busy) */
+	stdcx.	r30,0,r6
+	bne-	1b
+
+	/* Step 2:
+	 *
+	 * Insert/Update the HPTE in the hash table. At this point,
+	 * r4 (access) is re-usable, we use it for the new HPTE flags
+	 */
+
+	/* Calc va and put it in r29 */
+	rldicr	r29,r5,28,63-28
+	rldicl	r3,r3,0,36
+	or	r29,r3,r29
+
+	/* Calculate hash value for primary slot and store it in r28 */
+	rldicl	r5,r5,0,25		/* vsid & 0x0000007fffffffff */
+	rldicl	r0,r3,64-12,48		/* (ea >> 12) & 0xffff */
+	xor	r28,r5,r0
+
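A C rendering of the PTE value built in "Step 1" above.  The rlwinm trick
rotates the access mask left by 5 so that _PAGE_RW (0x004) lands exactly on
_PAGE_DIRTY (0x080), i.e. a store fault dirties the page in one instruction;
bit values as in the pgtable.h hunk at the end of the patch, helper name
illustrative:

    static unsigned long make_new_pte(unsigned long old, unsigned long access)
    {
            unsigned long new = old;

            new |= (access & 0x004UL) << 5;         /* _PAGE_RW -> _PAGE_DIRTY */
            new |= 0x0800UL | 0x0100UL | 0x0400UL;  /* BUSY | ACCESSED | HASHPTE */
            return new;
    }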
+	/* Convert linux PTE bits into HW equivalents */
+	andi.	r3,r30,0x1fa		/* Get basic set of flags */
+	rlwinm	r0,r30,32-2+1,30,30	/* _PAGE_RW -> _PAGE_USER (r0) */
+	rlwinm	r4,r30,32-7+1,30,30	/* _PAGE_DIRTY -> _PAGE_USER (r4) */
+	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */
+	andc	r0,r30,r0		/* r0 = pte & ~r0 */
+	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
+
+	/* We eventually do the icache sync here (maybe inline that
+	 * code rather than call a C function...)
+	 */
+BEGIN_FTR_SECTION
+	mr	r4,r30
+	mr	r5,r7
+	bl	.hash_page_do_lazy_icache
+END_FTR_SECTION_IFSET(CPU_FTR_NOEXECUTE)
+
+	/* At this point, r3 contains new PP bits, save them in
+	 * place of "access" in the param area (sic)
+	 */
+	std	r3,STK_PARM(r4)(r1)
+
+	/* Get htab_hash_mask */
+	ld	r4,htab_data@got(2)
+	ld	r27,16(r4)		/* htab_data.htab_hash_mask -> r27 */
+
+	/* Check if we may already be in the hashtable, in this case, we
+	 * go to out-of-line code to try to modify the HPTE
+	 */
+	andi.	r0,r31,_PAGE_HASHPTE
+	bne	htab_modify_pte
+
+htab_insert_pte:
+	/* Clear hpte bits in new pte (we also clear BUSY btw) and
+	 * add _PAGE_HASHPTE
+	 */
+	lis	r0,_PAGE_HPTEFLAGS@h
+	ori	r0,r0,_PAGE_HPTEFLAGS@l
+	andc	r30,r30,r0
+	ori	r30,r30,_PAGE_HASHPTE
+
+	/* page number in r5 */
+	rldicl	r5,r31,64-PTE_SHIFT,PTE_SHIFT
+
+	/* Calculate primary group hash */
+	and	r0,r28,r27
+	rldicr	r3,r0,3,63-3		/* r0 = (hash & mask) << 3 */
+
+	/* Call ppc_md.hpte_insert */
+	ld	r7,STK_PARM(r4)(r1)	/* Retrieve new pp bits */
+	mr	r4,r29			/* Retrieve va */
+	li	r6,0			/* primary slot */
+	li	r8,0			/* not bolted and not large */
+	li	r9,0
+_GLOBAL(htab_call_hpte_insert1)
+	bl	.			/* Will be patched by htab_finish_init() */
+	cmpi	0,r3,0
+	bge	htab_pte_insert_ok	/* Insertion successful */
+	cmpi	0,r3,-2			/* Critical failure */
+	beq-	htab_pte_insert_failure
+
+	/* Now try secondary slot */
+
+	ori	r30,r30,_PAGE_SECONDARY
+
+	/* page number in r5 */
+	rldicl	r5,r31,64-PTE_SHIFT,PTE_SHIFT
+
+	/* Calculate secondary group hash */
+	andc	r0,r27,r28
+	rldicr	r3,r0,3,63-3		/* r0 = (~hash & mask) << 3 */
+
+	/* Call ppc_md.hpte_insert */
+	ld	r7,STK_PARM(r4)(r1)	/* Retrieve new pp bits */
+	mr	r4,r29			/* Retrieve va */
+	li	r6,1			/* secondary slot */
+	li	r8,0			/* not bolted and not large */
+	li	r9,0
+_GLOBAL(htab_call_hpte_insert2)
+	bl	.			/* Will be patched by htab_finish_init() */
+	cmpi	0,r3,0
+	bge+	htab_pte_insert_ok	/* Insertion successful */
+	cmpi	0,r3,-2			/* Critical failure */
+	beq-	htab_pte_insert_failure
+
+	/* Both are full, we need to evict something */
+	mftb	r0
+	/* Pick a random group based on TB */
+	andi.	r0,r0,1
+	mr	r5,r28
+	bne	2f
+	not	r5,r5
+2:	and	r0,r5,r27
+	rldicr	r3,r0,3,63-3		/* r0 = (hash & mask) << 3 */
+	/* Call ppc_md.hpte_remove */
+_GLOBAL(htab_call_hpte_remove)
+	bl	.			/* Will be patched by htab_finish_init() */
+
+	/* Try all again */
+	b	htab_insert_pte
+
+htab_pte_insert_ok:
+	/* Insert slot number in PTE */
+	rldimi	r30,r3,12,63-14
+
+	/* Write out the PTE with a normal write
+	 * (maybe adding an eieio would still be good?)
+	 */
+htab_write_out_pte:
+	ld	r6,STK_PARM(r6)(r1)
+	std	r30,0(r6)
+	li	r3,0
+bail:
+	ld	r27,STK_REG(r27)(r1)
+	ld	r28,STK_REG(r28)(r1)
+	ld	r29,STK_REG(r29)(r1)
+	ld	r30,STK_REG(r30)(r1)
+	ld	r31,STK_REG(r31)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+	ld	r0,16(r1)
+	mtlr	r0
+	blr
+
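Once an insert succeeds, the slot number is folded back into the Linux PTE
(the rldimi above) so that the next update or flush can find the HPTE
without searching.  In C, with the _PAGE_GROUP_IX (0x7000) and
_PAGE_SECONDARY (0x8000) values from pgtable.h; helper name illustrative:

    static unsigned long cache_slot_in_pte(unsigned long pte, long slot)
    {
            /* low 3 bits of the slot index land in bits 12-14 */
            pte |= ((unsigned long)slot << 12) & 0x7000UL;
            /* _PAGE_SECONDARY (0x8000) was already set before the second
             * hpte_insert attempt, so together these bits pinpoint one of
             * the 16 candidate HPTE slots for this page. */
            return pte;
    }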
htab_modify_pte:
+	/* Keep PP bits in r4 and slot idx from the PTE around in r3 */
+	mr	r4,r3
+	rlwinm	r3,r31,32-12,29,31
+
+	/* Secondary group ? if yes, get an inverted hash value */
+	mr	r5,r28
+	andi.	r0,r31,_PAGE_SECONDARY
+	beq	1f
+	not	r5,r5
+1:
+	/* Calculate proper slot value for ppc_md.hpte_updatepp */
+	and	r0,r5,r27
+	rldicr	r0,r0,3,63-3		/* r0 = (hash & mask) << 3 */
+	add	r3,r0,r3		/* add slot idx */
+
+	/* Call ppc_md.hpte_updatepp */
+	mr	r5,r29			/* va */
+	li	r6,0			/* large is 0 */
+	ld	r7,STK_PARM(r8)(r1)	/* get "local" param */
+_GLOBAL(htab_call_hpte_updatepp)
+	bl	.			/* Will be patched by htab_finish_init() */
+
+	/* If we failed (typically because the HPTE wasn't really
+	 * there), we try an insertion.
+	 */
+	cmpi	0,r3,-1
+	beq-	htab_insert_pte
+
+	/* Clear the BUSY bit and write out the PTE */
+	li	r0,_PAGE_BUSY
+	andc	r30,r30,r0
+	b	htab_write_out_pte
+
+htab_wrong_access:
+	/* Bail out clearing reservation */
+	stdcx.	r31,0,r6
+	li	r3,1
+	b	bail
+
+htab_pte_insert_failure:
+	b	.htab_insert_failure
+
+
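The "convert linux PTE bits into HW equivalents" sequence near the top of
this file is the assembly twin of the old computeHptePP().  Transcribed to
C for reference, assuming the bit values from the pgtable.h hunk below
(_PAGE_USER 0x002, _PAGE_RW 0x004, _PAGE_DIRTY 0x080); not a kernel API:

    static unsigned long hpte_flags_from_pte(unsigned long pte)
    {
            unsigned long flags = pte & 0x1fa;      /* basic set of flags */
            unsigned long rw    = (pte >> 1) & 0x2; /* _PAGE_RW    -> 0x2 */
            unsigned long dirty = (pte >> 6) & 0x2; /* _PAGE_DIRTY -> 0x2 */
            /* read-only unless both RW and DIRTY are set: */
            unsigned long ro    = pte & ~(rw & dirty);  /* tests _PAGE_USER */

            flags |= (ro >> 1) & 0x1;               /* PP lsb */
            return flags;
    }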
diff -puN /dev/null arch/ppc64/mm/hash_utils.c
--- /dev/null	2002-08-30 16:31:37.000000000 -0700
+++ 25-akpm/arch/ppc64/mm/hash_utils.c	2004-01-13 23:23:06.000000000 -0800
@@ -0,0 +1,358 @@
+/*
+ * PowerPC64 port by Mike Corrigan and Dave Engebretsen
+ *   {mikejc|engebret}@us.ibm.com
+ *
+ * Copyright (c) 2000 Mike Corrigan
+ *
+ * SMP scalability work:
+ *    Copyright (C) 2001 Anton Blanchard, IBM
+ *
+ * Module name: htab.c
+ *
+ * Description:
+ *      PowerPC Hashed Page Table functions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Note: pte   --> Linux PTE
+ *       HPTE  --> PowerPC Hashed Page Table Entry
+ *
+ * Execution context:
+ *   htab_initialize is called with the MMU off (of course), but
+ *   the kernel has been copied down to zero so it can directly
+ *   reference global data.  At this point it is very difficult
+ *   to print debug info.
+ *
+ */
+
+HTAB htab_data = {NULL, 0, 0, 0, 0};
+
+extern unsigned long _SDR1;
+
+#define KB (1024)
+#define MB (1024*KB)
+
+static inline void loop_forever(void)
+{
+	volatile unsigned long x = 1;
+	for(;x;x|=1)
+		;
+}
+
+#ifdef CONFIG_PPC_PSERIES
+static inline void create_pte_mapping(unsigned long start, unsigned long end,
+				      unsigned long mode, int large)
+{
+	unsigned long addr;
+	unsigned int step;
+
+	if (large)
+		step = 16*MB;
+	else
+		step = 4*KB;
+
+	for (addr = start; addr < end; addr += step) {
+		unsigned long vpn, hash, hpteg;
+		unsigned long vsid = get_kernel_vsid(addr);
+		unsigned long va = (vsid << 28) | (addr & 0xfffffff);
+		int ret;
+
+		if (large)
+			vpn = va >> LARGE_PAGE_SHIFT;
+		else
+			vpn = va >> PAGE_SHIFT;
+
+		hash = hpt_hash(vpn, large);
+
+		hpteg = ((hash & htab_data.htab_hash_mask)*HPTES_PER_GROUP);
+
+		if (systemcfg->platform == PLATFORM_PSERIES_LPAR)
+			ret = pSeries_lpar_hpte_insert(hpteg, va,
+				(unsigned long)__v2a(addr) >> PAGE_SHIFT,
+				0, mode, 1, large);
+		else
+			ret = pSeries_hpte_insert(hpteg, va,
+				(unsigned long)__v2a(addr) >> PAGE_SHIFT,
+				0, mode, 1, large);
+
+		if (ret == -1) {
+			ppc64_terminate_msg(0x20, "create_pte_mapping");
+			loop_forever();
+		}
+	}
+}
+
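A quick sanity check on why create_pte_mapping() above bothers with 16M
pages for the bolted kernel mapping; the machine size is made up:

    #include <stdio.h>

    int main(void)
    {
            unsigned long mem   = 1UL << 30;        /* 1GB, illustrative */
            unsigned long small = 256UL << 20;      /* 4K-mapped head */

            printf("4K HPTEs:  %lu\n", small >> 12);        /* 65536 */
            printf("16M HPTEs: %lu\n", (mem - small) >> 24);/* 48 */
            printf("all-4K:    %lu\n", mem >> 12);          /* 262144 */
            return 0;
    }

Bolted entries can never be evicted, so mapping everything with 4K pages
would permanently consume a large fraction of the hash table; hence the
large-page path whenever the CPU supports it and memory exceeds 256MB.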
+void __init htab_initialize(void)
+{
+	unsigned long table, htab_size_bytes;
+	unsigned long pteg_count;
+	unsigned long mode_rw;
+
+	/*
+	 * Calculate the required size of the htab.  We want the number of
+	 * PTEGs to equal one half the number of real pages.
+	 */
+	htab_size_bytes = 1UL << naca->pftSize;
+	pteg_count = htab_size_bytes >> 7;
+
+	/* For debug, make the HTAB 1/8 as big as it normally would be. */
+	ifppcdebug(PPCDBG_HTABSIZE) {
+		pteg_count >>= 3;
+		htab_size_bytes = pteg_count << 7;
+	}
+
+	htab_data.htab_num_ptegs = pteg_count;
+	htab_data.htab_hash_mask = pteg_count - 1;
+
+	if (systemcfg->platform == PLATFORM_PSERIES) {
+		/* Find storage for the HPT.  Must be contiguous in
+		 * the absolute address space.
+		 */
+		table = lmb_alloc(htab_size_bytes, htab_size_bytes);
+		if ( !table ) {
+			ppc64_terminate_msg(0x20, "hpt space");
+			loop_forever();
+		}
+		htab_data.htab = (HPTE *)__a2v(table);
+
+		/* htab absolute addr + encoded htabsize */
+		_SDR1 = table + __ilog2(pteg_count) - 11;
+
+		/* Initialize the HPT with no entries */
+		memset((void *)table, 0, htab_size_bytes);
+	} else {
+		/* Using a hypervisor which owns the htab */
+		htab_data.htab = NULL;
+		_SDR1 = 0;
+	}
+
+	mode_rw = _PAGE_ACCESSED | _PAGE_COHERENT | PP_RWXX;
+
+	/* XXX we currently map kernel text rw, should fix this */
+	if ((cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE)
+	    && systemcfg->physicalMemorySize > 256*MB) {
+		create_pte_mapping((unsigned long)KERNELBASE,
+				   KERNELBASE + 256*MB, mode_rw, 0);
+		create_pte_mapping((unsigned long)KERNELBASE + 256*MB,
+				   KERNELBASE + (systemcfg->physicalMemorySize),
+				   mode_rw, 1);
+	} else {
+		create_pte_mapping((unsigned long)KERNELBASE,
+				   KERNELBASE+(systemcfg->physicalMemorySize),
+				   mode_rw, 0);
+	}
+}
+#undef KB
+#undef MB
+#endif
+
+/*
+ * Called by asm hashtable.S for doing lazy icache flush
+ */
+unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
+{
+	struct page *page;
+
+#define PPC64_HWNOEXEC (1 << 2)
+
+	if (!pfn_valid(pte_pfn(pte)))
+		return pp;
+
+	page = pte_page(pte);
+
+	/* page is dirty */
+	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+		if (trap == 0x400) {
+			__flush_dcache_icache(page_address(page));
+			set_bit(PG_arch_1, &page->flags);
+		} else
+			pp |= PPC64_HWNOEXEC;
+	}
+	return pp;
+}
+
+/*
+ * Called by asm hashtable.S in case of critical insert failure
+ */
+void htab_insert_failure(void)
+{
+	panic("hash_page: pte_insert failed\n");
+}
+
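hash_page_do_lazy_icache() above implements the PG_arch_1 protocol: the
icache is synchronized the first time a page is actually executed, not on
every fault.  The decision table as a self-contained helper (trap 0x400 is
the instruction storage interrupt; PPC64_HWNOEXEC as in the function above;
this helper is purely illustrative):

    static unsigned int lazy_icache(unsigned int pp, int pg_arch_1_set,
                                    int trap, void (*flush_dcache_icache)(void))
    {
            if (pg_arch_1_set)
                    return pp;      /* icache already in sync: allow exec */
            if (trap == 0x400) {    /* first execution of this page */
                    flush_dcache_icache();  /* sync now, PG_arch_1 gets set */
                    return pp;
            }
            return pp | (1 << 2);   /* plain data access: map no-exec */
    }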
+int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
+{
+	void *pgdir;
+	unsigned long vsid;
+	struct mm_struct *mm;
+	pte_t *ptep;
+	int ret;
+	int user_region = 0;
+	int local = 0;
+	cpumask_t tmp;
+
+	/* Check for invalid addresses. */
+	if (!IS_VALID_EA(ea))
+		return 1;
+
+	switch (REGION_ID(ea)) {
+	case USER_REGION_ID:
+		user_region = 1;
+		mm = current->mm;
+		if (mm == NULL)
+			return 1;
+
+		vsid = get_vsid(mm->context, ea);
+		break;
+	case IO_REGION_ID:
+		mm = &ioremap_mm;
+		vsid = get_kernel_vsid(ea);
+		break;
+	case VMALLOC_REGION_ID:
+		mm = &init_mm;
+		vsid = get_kernel_vsid(ea);
+		break;
+#if 0
+	case EEH_REGION_ID:
+		/*
+		 * Should only be hit if there is an access to MMIO space
+		 * which is protected by EEH.
+		 * Send the problem up to do_page_fault
+		 */
+	case KERNEL_REGION_ID:
+		/*
+		 * Should never get here - entire 0xC0... region is bolted.
+		 * Send the problem up to do_page_fault
+		 */
+#endif
+	default:
+		/* Not a valid range
+		 * Send the problem up to do_page_fault
+		 */
+		return 1;
+		break;
+	}
+
+	pgdir = mm->pgd;
+
+	if (pgdir == NULL)
+		return 1;
+
+	tmp = cpumask_of_cpu(smp_processor_id());
+	if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
+		local = 1;
+
+	/* Is this a huge page ? */
+	if (unlikely(in_hugepage_area(mm->context, ea)))
+		ret = hash_huge_page(mm, access, ea, vsid, local);
+	else {
+		ptep = find_linux_pte(pgdir, ea);
+		if (ptep == NULL)
+			return 1;
+		ret = __hash_page(ea, access, vsid, ptep, trap, local);
+	}
+
+	return ret;
+}
+
+void flush_hash_page(unsigned long context, unsigned long ea, pte_t pte,
+		     int local)
+{
+	unsigned long vsid, vpn, va, hash, secondary, slot;
+
+	/* XXX fix for large ptes */
+	unsigned long large = 0;
+
+	if ((ea >= USER_START) && (ea <= USER_END))
+		vsid = get_vsid(context, ea);
+	else
+		vsid = get_kernel_vsid(ea);
+
+	va = (vsid << 28) | (ea & 0x0fffffff);
+	if (large)
+		vpn = va >> LARGE_PAGE_SHIFT;
+	else
+		vpn = va >> PAGE_SHIFT;
+	hash = hpt_hash(vpn, large);
+	secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15;
+	if (secondary)
+		hash = ~hash;
+	slot = (hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP;
+	slot += (pte_val(pte) & _PAGE_GROUP_IX) >> 12;
+
+	ppc_md.hpte_invalidate(slot, va, large, local);
+}
+
+void flush_hash_range(unsigned long context, unsigned long number, int local)
+{
+	if (ppc_md.flush_hash_range) {
+		ppc_md.flush_hash_range(context, number, local);
+	} else {
+		int i;
+		struct ppc64_tlb_batch *batch =
+			&ppc64_tlb_batch[smp_processor_id()];
+
+		for (i = 0; i < number; i++)
+			flush_hash_page(context, batch->addr[i], batch->pte[i],
+					local);
+	}
+}
+
+static inline void make_bl(unsigned int *insn_addr, void *func)
+{
+	unsigned long funcp = *((unsigned long *)func);
+	int offset = funcp - (unsigned long)insn_addr;
+
+	*insn_addr = (unsigned int)(0x48000001 | (offset & 0x03fffffc));
+	flush_icache_range((unsigned long)insn_addr, 4+
+			   (unsigned long)insn_addr);
+}
+
+void __init htab_finish_init(void)
+{
+	extern unsigned int *htab_call_hpte_insert1;
+	extern unsigned int *htab_call_hpte_insert2;
+	extern unsigned int *htab_call_hpte_remove;
+	extern unsigned int *htab_call_hpte_updatepp;
+
+	make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert);
+	make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert);
+	make_bl(htab_call_hpte_remove, ppc_md.hpte_remove);
+	make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp);
+}
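A note on make_bl() above for anyone puzzled by the double indirection:
under the ppc64 ELF ABI a C function pointer addresses a descriptor, not
code, so the entry point must be loaded before the branch offset can be
computed.  Illustrative layout (matching the 2003-era ABI):

    struct ppc64_func_desc {
            unsigned long entry;    /* address of the first instruction */
            unsigned long toc;      /* TOC base for the callee */
            unsigned long env;      /* environment pointer (unused in C) */
    };

So *(unsigned long *)func is the ->entry field, which is what the relative
branch must target.  This also relies on caller and callee sharing the
kernel's single TOC (the .got/.toc linker script change earlier in this
patch keeps those sections together), and on the hook living within the
+/-32MB reach of an I-form branch.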
diff -puN arch/ppc64/mm/hugetlbpage.c~ppc64-hash_page_rewrite arch/ppc64/mm/hugetlbpage.c
--- 25/arch/ppc64/mm/hugetlbpage.c~ppc64-hash_page_rewrite	2004-01-13 23:23:06.000000000 -0800
+++ 25-akpm/arch/ppc64/mm/hugetlbpage.c	2004-01-13 23:23:06.000000000 -0800
@@ -652,13 +652,9 @@ int hash_huge_page(struct mm_struct *mm,
 	unsigned long va, vpn;
 	int is_write;
 	hugepte_t old_pte, new_pte;
-	unsigned long hpteflags, prpn;
+	unsigned long hpteflags, prpn, flags;
 	long slot;
 
-	/* Is this for us? */
-	if (!in_hugepage_area(mm->context, ea))
-		return -1;
-
 	ea &= ~(HPAGE_SIZE-1);
 
 	/* We have to find the first hugepte in the batch, since
@@ -698,6 +694,8 @@ int hash_huge_page(struct mm_struct *mm,
 	 * page is currently not DIRTY.
 	 */
 
+	spin_lock_irqsave(&mm->page_table_lock, flags);
+
 	old_pte = *ptep;
 	new_pte = old_pte;
 
@@ -769,6 +767,8 @@ repeat:
 		*ptep = new_pte;
 	}
 
+	spin_unlock_irqrestore(&mm->page_table_lock, flags);
+
 	return 0;
 }
 
diff -puN include/asm-ppc64/mmu.h~ppc64-hash_page_rewrite include/asm-ppc64/mmu.h
--- 25/include/asm-ppc64/mmu.h~ppc64-hash_page_rewrite	2004-01-13 23:23:06.000000000 -0800
+++ 25-akpm/include/asm-ppc64/mmu.h	2004-01-13 23:23:06.000000000 -0800
@@ -13,6 +13,8 @@
 #ifndef _PPC64_MMU_H_
 #define _PPC64_MMU_H_
 
+#include
+
 #ifndef __ASSEMBLY__
 
 /* Default "unsigned long" context */
@@ -245,6 +247,16 @@ static inline void tlbiel(unsigned long
 	asm volatile("ptesync": : :"memory");
 }
 
+/*
+ * Handle a fault by adding an HPTE. If the address can't be determined
+ * to be valid via Linux page tables, return 1. If handled return 0
+ */
+extern int __hash_page(unsigned long ea, unsigned long access,
+		       unsigned long vsid, pte_t *ptep, unsigned long trap,
+		       int local);
+
+extern void htab_finish_init(void);
+
 #endif /* __ASSEMBLY__ */
 
 /*
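The pgtable.h diff below is where the other half of the locking protocol
lives: pte_update() now refuses to complete while _PAGE_BUSY is set, so
generic PTE updates cannot tear an update that __hash_page has in flight.
The same logic in C, with a compare-and-swap standing in for the
ldarx/stdcx. pair (sketch only):

    static unsigned long pte_update_sketch(volatile unsigned long *p,
                                           unsigned long clr, unsigned long set)
    {
            unsigned long old;

            for (;;) {
                    old = *p;
                    if (old & 0x0800UL)     /* _PAGE_BUSY: hash fault */
                            continue;       /* in flight, spin */
                    if (__sync_bool_compare_and_swap(p, old,
                                                     (old & ~clr) | set))
                            return old;
            }
    }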
diff -puN include/asm-ppc64/pgtable.h~ppc64-hash_page_rewrite include/asm-ppc64/pgtable.h
--- 25/include/asm-ppc64/pgtable.h~ppc64-hash_page_rewrite	2004-01-13 23:23:06.000000000 -0800
+++ 25-akpm/include/asm-ppc64/pgtable.h	2004-01-13 23:23:06.000000000 -0800
@@ -7,6 +7,7 @@
  */
 
 #ifndef __ASSEMBLY__
+#include
 #include		/* For TASK_SIZE */
 #include
 #include
@@ -74,22 +75,23 @@
  * Bits in a linux-style PTE.  These match the bits in the
  * (hardware-defined) PowerPC PTE as closely as possible.
  */
-#define _PAGE_PRESENT	0x001UL	/* software: pte contains a translation */
-#define _PAGE_USER	0x002UL	/* matches one of the PP bits */
-#define _PAGE_RW	0x004UL	/* software: user write access allowed */
-#define _PAGE_GUARDED	0x008UL
-#define _PAGE_COHERENT	0x010UL	/* M: enforce memory coherence (SMP systems) */
-#define _PAGE_NO_CACHE	0x020UL	/* I: cache inhibit */
-#define _PAGE_WRITETHRU	0x040UL	/* W: cache write-through */
-#define _PAGE_DIRTY	0x080UL	/* C: page changed */
-#define _PAGE_ACCESSED	0x100UL	/* R: page referenced */
-#define _PAGE_FILE	0x200UL	/* software: pte holds file offset */
-#define _PAGE_HASHPTE	0x400UL	/* software: pte has an associated HPTE */
-#define _PAGE_EXEC	0x800UL	/* software: i-cache coherence required */
-#define _PAGE_SECONDARY 0x8000UL /* software: HPTE is in secondary group */
-#define _PAGE_GROUP_IX  0x7000UL /* software: HPTE index within group */
+#define _PAGE_PRESENT	0x0001 /* software: pte contains a translation */
+#define _PAGE_USER	0x0002 /* matches one of the PP bits */
+#define _PAGE_FILE	0x0002 /* (!present only) software: pte holds file offset */
+#define _PAGE_RW	0x0004 /* software: user write access allowed */
+#define _PAGE_GUARDED	0x0008
+#define _PAGE_COHERENT	0x0010 /* M: enforce memory coherence (SMP systems) */
+#define _PAGE_NO_CACHE	0x0020 /* I: cache inhibit */
+#define _PAGE_WRITETHRU	0x0040 /* W: cache write-through */
+#define _PAGE_DIRTY	0x0080 /* C: page changed */
+#define _PAGE_ACCESSED	0x0100 /* R: page referenced */
+#define _PAGE_EXEC	0x0200 /* software: i-cache coherence required */
+#define _PAGE_HASHPTE	0x0400 /* software: pte has an associated HPTE */
+#define _PAGE_BUSY	0x0800 /* software: PTE & hash are busy */
+#define _PAGE_SECONDARY	0x8000 /* software: HPTE is in secondary group */
+#define _PAGE_GROUP_IX	0x7000 /* software: HPTE index within group */
 /* Bits 0x7000 identify the index within an HPT Group */
-#define _PAGE_HPTEFLAGS (_PAGE_HASHPTE | _PAGE_SECONDARY | _PAGE_GROUP_IX)
+#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | _PAGE_SECONDARY | _PAGE_GROUP_IX)
 /* PAGE_MASK gives the right answer below, but only by accident */
 /* It should be preserving the high 48 bits and then specifically */
 /* preserving _PAGE_SECONDARY | _PAGE_GROUP_IX */
@@ -157,8 +159,10 @@ extern unsigned long empty_zero_page[PAG
 #define _PMD_HUGEPAGE	0x00000001U
 #define HUGEPTE_BATCH_SIZE (1<<(HPAGE_SHIFT-PMD_SHIFT))
 
+#ifndef __ASSEMBLY__
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
 		   unsigned long ea, unsigned long vsid, int local);
+#endif /* __ASSEMBLY__ */
 
 #define HAVE_ARCH_UNMAPPED_AREA
 #else
@@ -288,15 +292,17 @@ static inline unsigned long pte_update(
 					unsigned long set )
 {
 	unsigned long old, tmp;
-	
+
 	__asm__ __volatile__(
 	"1:	ldarx	%0,0,%3		# pte_update\n\
+	andi.	%1,%0,%7\n\
+	bne-	1b \n\
 	andc	%1,%0,%4 \n\
 	or	%1,%1,%5 \n\
 	stdcx.	%1,0,%3 \n\
 	bne-	1b"
 	: "=&r" (old), "=&r" (tmp), "=m" (*p)
-	: "r" (p), "r" (clr), "r" (set), "m" (*p)
+	: "r" (p), "r" (clr), "r" (set), "m" (*p), "i" (_PAGE_BUSY)
 	: "cc" );
 	return old;
}
@@ -422,5 +428,31 @@ long pSeries_hpte_insert(unsigned long h
 			 unsigned long prpn, int secondary,
 			 unsigned long hpteflags, int bolted, int large);
 
+/*
+ * find_linux_pte returns the address of a linux pte for a given
+ * effective address and directory.  If not found, it returns zero.
+ */
+static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea)
+{
+	pgd_t *pg;
+	pmd_t *pm;
+	pte_t *pt = NULL;
+	pte_t pte;
+
+	pg = pgdir + pgd_index(ea);
+	if (!pgd_none(*pg)) {
+
+		pm = pmd_offset(pg, ea);
+		if (pmd_present(*pm)) {
+			pt = pte_offset_kernel(pm, ea);
+			pte = *pt;
+			if (!pte_present(pte))
+				pt = NULL;
+		}
+	}
+
+	return pt;
+}
+
 #endif /* __ASSEMBLY__ */
 #endif /* _PPC64_PGTABLE_H */
diff -puN arch/ppc64/mm/init.c~ppc64-hash_page_rewrite arch/ppc64/mm/init.c
--- 25/arch/ppc64/mm/init.c~ppc64-hash_page_rewrite	2004-01-13 23:23:06.000000000 -0800
+++ 25-akpm/arch/ppc64/mm/init.c	2004-01-13 23:23:06.000000000 -0800
@@ -901,10 +901,6 @@ void flush_icache_user_range(struct vm_a
 	flush_icache_range(maddr, maddr + len);
 }
 
-extern pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea);
-int __hash_page(unsigned long ea, unsigned long access, unsigned long vsid,
-		pte_t *ptep, unsigned long trap, int local);
-
 /*
  * This is called at the end of handling a user page fault, when the
  * fault has been handled by updating a PTE in the linux page tables.
@@ -944,6 +940,9 @@ void update_mmu_cache(struct vm_area_str
 		return;
 
 	ptep = find_linux_pte(pgdir, ea);
+	if (!ptep)
+		return;
+
 	vsid = get_vsid(vma->vm_mm->context, ea);
 
 	tmp = cpumask_of_cpu(smp_processor_id());