From: Paul Mundt

This implements hugetlb support for SH-4.  SH-4 supports 1k/4k/64k/1MB page
sizes; since we're only interested in the 64k and 1MB sizes, those are what
we support.  The sh hugetlbpage code borrows heavily from the sparc64 port,
which works fine for these cases at this point in time.

---

 25-akpm/arch/sh/Kconfig          |   13 
 25-akpm/arch/sh/mm/Makefile      |    3 
 25-akpm/arch/sh/mm/hugetlbpage.c |  519 +++++++++++++++++++++++++++++++++++++++
 25-akpm/arch/sh/mm/tlb-sh4.c     |    2 
 25-akpm/fs/Kconfig               |    2 
 25-akpm/include/asm-sh/page.h    |   12 
 25-akpm/include/asm-sh/pgtable.h |   27 +-
 7 files changed, 566 insertions(+), 12 deletions(-)
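For anyone wanting to poke at this: huge pages are reserved at boot with the
"hugepages=" option (or adjusted later via /proc/sys/vm/nr_hugepages), and
are handed out through hugetlbfs.  A minimal userspace test would look
something like the sketch below -- the mount point, file name, and mapping
length are made up for illustration, and HPAGE_SIZE must match the size
selected in Kconfig:

	/*
	 * Illustrative only: assumes hugetlbfs is mounted at /mnt/huge
	 * and the kernel was built with HUGETLB_PAGE_SIZE_1MB.
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#define HPAGE_SIZE	(1UL << 20)	/* 1MB, to match the config */

	int main(void)
	{
		int fd = open("/mnt/huge/test", O_CREAT | O_RDWR, 0644);
		char *addr;

		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* Length must be a multiple of HPAGE_SIZE. */
		addr = mmap(NULL, 4 * HPAGE_SIZE, PROT_READ | PROT_WRITE,
			    MAP_SHARED, fd, 0);
		if (addr == MAP_FAILED) {
			perror("mmap");
			close(fd);
			return 1;
		}

		addr[0] = 1;	/* touch it; pages were prefaulted at mmap time */

		munmap(addr, 4 * HPAGE_SIZE);
		close(fd);
		return 0;
	}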
diff -puN arch/sh/Kconfig~sh-05-hugetlb arch/sh/Kconfig
--- 25/arch/sh/Kconfig~sh-05-hugetlb	2004-03-23 02:05:26.400090072 -0800
+++ 25-akpm/arch/sh/Kconfig	2004-03-23 02:05:41.859739848 -0800
@@ -244,6 +244,19 @@ config MMU
 	  turning this off will boot the kernel on these machines with the
 	  MMU implicitly switched off.
 
+choice
+	prompt "HugeTLB page size"
+	depends on HUGETLB_PAGE && CPU_SH4 && MMU
+	default HUGETLB_PAGE_SIZE_64K
+
+config HUGETLB_PAGE_SIZE_64K
+	bool "64K"
+
+config HUGETLB_PAGE_SIZE_1MB
+	bool "1MB"
+
+endchoice
+
 config CMDLINE_BOOL
 	bool "Default bootloader kernel arguments"
 
diff -puN /dev/null arch/sh/mm/hugetlbpage.c
--- /dev/null	2003-09-15 06:40:47.000000000 -0700
+++ 25-akpm/arch/sh/mm/hugetlbpage.c	2004-03-23 02:05:26.412088248 -0800
@@ -0,0 +1,519 @@
+/*
+ * arch/sh/mm/hugetlbpage.c
+ *
+ * SuperH HugeTLB page support.
+ *
+ * Cloned from sparc64 by Paul Mundt.
+ *
+ * Copyright (C) 2002, 2003 David S. Miller (davem@redhat.com)
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+
+#include <asm/mman.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/cacheflush.h>
+
+static long	htlbpagemem;
+int		htlbpage_max;
+static long	htlbzone_pages;
+
+static struct list_head hugepage_freelists[MAX_NUMNODES];
+static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
+
+static void enqueue_huge_page(struct page *page)
+{
+	list_add(&page->list,
+		 &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
+}
+
+static struct page *dequeue_huge_page(void)
+{
+	int nid = numa_node_id();
+	struct page *page = NULL;
+
+	if (list_empty(&hugepage_freelists[nid])) {
+		for (nid = 0; nid < MAX_NUMNODES; ++nid)
+			if (!list_empty(&hugepage_freelists[nid]))
+				break;
+	}
+	if (nid >= 0 && nid < MAX_NUMNODES &&
+	    !list_empty(&hugepage_freelists[nid])) {
+		page = list_entry(hugepage_freelists[nid].next,
+				  struct page, list);
+		list_del(&page->list);
+	}
+	return page;
+}
+
+static struct page *alloc_fresh_huge_page(void)
+{
+	static int nid = 0;
+	struct page *page;
+	page = alloc_pages_node(nid, GFP_HIGHUSER, HUGETLB_PAGE_ORDER);
+	nid = (nid + 1) % numnodes;
+	return page;
+}
+
+static void free_huge_page(struct page *page);
+
+static struct page *alloc_hugetlb_page(void)
+{
+	struct page *page;
+
+	spin_lock(&htlbpage_lock);
+	page = dequeue_huge_page();
+	if (!page) {
+		spin_unlock(&htlbpage_lock);
+		return NULL;
+	}
+	htlbpagemem--;
+	spin_unlock(&htlbpage_lock);
+	set_page_count(page, 1);
+	page->lru.prev = (void *)free_huge_page;
+	memset(page_address(page), 0, HPAGE_SIZE);
+	return page;
+}
+
+static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte = NULL;
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd) {
+		pmd = pmd_alloc(mm, pgd, addr);
+		if (pmd)
+			pte = pte_alloc_map(mm, pmd, addr);
+	}
+	return pte;
+}
+
+static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte = NULL;
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd) {
+		pmd = pmd_offset(pgd, addr);
+		if (pmd)
+			pte = pte_offset_map(pmd, addr);
+	}
+	return pte;
+}
+
+#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0)
+
+static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+			 struct page *page, pte_t * page_table, int write_access)
+{
+	unsigned long i;
+	pte_t entry;
+
+	mm->rss += (HPAGE_SIZE / PAGE_SIZE);
+
+	if (write_access)
+		entry = pte_mkwrite(pte_mkdirty(mk_pte(page,
+						       vma->vm_page_prot)));
+	else
+		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+	entry = pte_mkyoung(entry);
+	mk_pte_huge(entry);
+
+	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
+		set_pte(page_table, entry);
+		page_table++;
+
+		pte_val(entry) += PAGE_SIZE;
+	}
+}
+
+/*
+ * This function checks for proper alignment of input addr and len parameters.
+ */
+int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
+{
+	if (len & ~HPAGE_MASK)
+		return -EINVAL;
+	if (addr & ~HPAGE_MASK)
+		return -EINVAL;
+	return 0;
+}
+
+int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+			    struct vm_area_struct *vma)
+{
+	pte_t *src_pte, *dst_pte, entry;
+	struct page *ptepage;
+	unsigned long addr = vma->vm_start;
+	unsigned long end = vma->vm_end;
+	int i;
+
+	while (addr < end) {
+		dst_pte = huge_pte_alloc(dst, addr);
+		if (!dst_pte)
+			goto nomem;
+		src_pte = huge_pte_offset(src, addr);
+		BUG_ON(!src_pte || pte_none(*src_pte));
+		entry = *src_pte;
+		ptepage = pte_page(entry);
+		get_page(ptepage);
+		for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
+			set_pte(dst_pte, entry);
+			pte_val(entry) += PAGE_SIZE;
+			dst_pte++;
+		}
+		dst->rss += (HPAGE_SIZE / PAGE_SIZE);
+		addr += HPAGE_SIZE;
+	}
+	return 0;
+
+nomem:
+	return -ENOMEM;
+}
+
+int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			struct page **pages, struct vm_area_struct **vmas,
+			unsigned long *position, int *length, int i)
+{
+	unsigned long vaddr = *position;
+	int remainder = *length;
+
+	WARN_ON(!is_vm_hugetlb_page(vma));
+
+	while (vaddr < vma->vm_end && remainder) {
+		if (pages) {
+			pte_t *pte;
+			struct page *page;
+
+			pte = huge_pte_offset(mm, vaddr);
+
+			/* hugetlb should be locked, and hence, prefaulted */
+			BUG_ON(!pte || pte_none(*pte));
+
+			page = pte_page(*pte);
+
+			WARN_ON(!PageCompound(page));
+
+			get_page(page);
+			pages[i] = page;
+		}
+
+		if (vmas)
+			vmas[i] = vma;
+
+		vaddr += PAGE_SIZE;
+		--remainder;
+		++i;
+	}
+
+	*length = remainder;
+	*position = vaddr;
+
+	return i;
+}
+
+struct page *follow_huge_addr(struct mm_struct *mm,
+			      struct vm_area_struct *vma,
+			      unsigned long address, int write)
+{
+	return NULL;
+}
+
+struct vm_area_struct *hugepage_vma(struct mm_struct *mm, unsigned long addr)
+{
+	return NULL;
+}
+
+int pmd_huge(pmd_t pmd)
+{
+	return 0;
+}
+
+struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+			     pmd_t *pmd, int write)
+{
+	return NULL;
+}
+
+static void free_huge_page(struct page *page)
+{
+	BUG_ON(page_count(page));
+	BUG_ON(page->mapping);
+
+	INIT_LIST_HEAD(&page->list);
+
+	spin_lock(&htlbpage_lock);
+	enqueue_huge_page(page);
+	htlbpagemem++;
+	spin_unlock(&htlbpage_lock);
+}
+
+void huge_page_release(struct page *page)
+{
+	if (!put_page_testzero(page))
+		return;
+
+	free_huge_page(page);
+}
+
+void unmap_hugepage_range(struct vm_area_struct *vma,
+			  unsigned long start, unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pte_t *pte;
+	struct page *page;
+	int i;
+
+	BUG_ON(start & (HPAGE_SIZE - 1));
+	BUG_ON(end & (HPAGE_SIZE - 1));
+
+	for (address = start; address < end; address += HPAGE_SIZE) {
+		pte = huge_pte_offset(mm, address);
+		BUG_ON(!pte);
+		if (pte_none(*pte))
+			continue;
+		page = pte_page(*pte);
+		huge_page_release(page);
+		for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
+			pte_clear(pte);
+			pte++;
+		}
+	}
+	mm->rss -= (end - start) >> PAGE_SHIFT;
+	flush_tlb_range(vma, start, end);
+}
+
+void zap_hugepage_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long length)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	spin_lock(&mm->page_table_lock);
+	unmap_hugepage_range(vma, start, start + length);
+	spin_unlock(&mm->page_table_lock);
+}
+
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr;
+	int ret = 0;
+
+	BUG_ON(vma->vm_start & ~HPAGE_MASK);
+	BUG_ON(vma->vm_end & ~HPAGE_MASK);
+
+	spin_lock(&mm->page_table_lock);
+	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+		unsigned long idx;
+		pte_t *pte = huge_pte_alloc(mm, addr);
+		struct page *page;
+
+		if (!pte) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		if (!pte_none(*pte))
+			continue;
+
+		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+		page = find_get_page(mapping, idx);
+		if (!page) {
+			/* charge the fs quota first */
+			if (hugetlb_get_quota(mapping)) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			page = alloc_hugetlb_page();
+			if (!page) {
+				hugetlb_put_quota(mapping);
+				ret = -ENOMEM;
+				goto out;
+			}
+			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+			unlock_page(page);
+			if (ret) {
+				hugetlb_put_quota(mapping);
+				free_huge_page(page);
+				goto out;
+			}
+		}
+		set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
+	}
+out:
+	spin_unlock(&mm->page_table_lock);
+	return ret;
+}
+
+static void update_and_free_page(struct page *page)
+{
+	int j;
+	struct page *map;
+
+	map = page;
+	htlbzone_pages--;
+	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
+		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
+				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
+				1 << PG_private | 1 << PG_writeback);
+		set_page_count(map, 0);
+		map++;
+	}
+	set_page_count(page, 1);
+	__free_pages(page, HUGETLB_PAGE_ORDER);
+}
+
+static int try_to_free_low(int count)
+{
+	struct list_head *p;
+	struct page *page, *map;
+
+	map = NULL;
+	spin_lock(&htlbpage_lock);
+	/* all lowmem is on node 0 */
+	list_for_each(p, &hugepage_freelists[0]) {
+		if (map) {
+			list_del(&map->list);
+			update_and_free_page(map);
+			htlbpagemem--;
+			map = NULL;
+			if (++count == 0)
+				break;
+		}
+		page = list_entry(p, struct page, list);
+		if (!PageHighMem(page))
+			map = page;
+	}
+	if (map) {
+		list_del(&map->list);
+		update_and_free_page(map);
+		htlbpagemem--;
+		count++;
+	}
+	spin_unlock(&htlbpage_lock);
+	return count;
+}
+
+static int set_hugetlb_mem_size(int count)
+{
+	int lcount;
+	struct page *page;
+
+	if (count < 0)
+		lcount = count;
+	else
+		lcount = count - htlbzone_pages;
+
+	if (lcount == 0)
+		return (int)htlbzone_pages;
+	if (lcount > 0) {	/* Increase the mem size. */
+		while (lcount--) {
+			page = alloc_fresh_huge_page();
+			if (page == NULL)
+				break;
+			spin_lock(&htlbpage_lock);
+			enqueue_huge_page(page);
+			htlbpagemem++;
+			htlbzone_pages++;
+			spin_unlock(&htlbpage_lock);
+		}
+		return (int) htlbzone_pages;
+	}
+	/* Shrink the memory size. */
+	lcount = try_to_free_low(lcount);
+	while (lcount++) {
+		page = alloc_hugetlb_page();
+		if (page == NULL)
+			break;
+		spin_lock(&htlbpage_lock);
+		update_and_free_page(page);
+		spin_unlock(&htlbpage_lock);
+	}
+	return (int) htlbzone_pages;
+}
+
+int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+			   struct file *file, void *buffer, size_t *length)
+{
+	proc_dointvec(table, write, file, buffer, length);
+	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
+	return 0;
+}
+
+static int __init hugetlb_setup(char *s)
+{
+	if (sscanf(s, "%d", &htlbpage_max) <= 0)
+		htlbpage_max = 0;
+	return 1;
+}
+__setup("hugepages=", hugetlb_setup);
+
+static int __init hugetlb_init(void)
+{
+	int i;
+	struct page *page;
+
+	for (i = 0; i < MAX_NUMNODES; ++i)
+		INIT_LIST_HEAD(&hugepage_freelists[i]);
+
+	for (i = 0; i < htlbpage_max; ++i) {
+		page = alloc_fresh_huge_page();
+		if (!page)
+			break;
+		spin_lock(&htlbpage_lock);
+		enqueue_huge_page(page);
+		spin_unlock(&htlbpage_lock);
+	}
+	htlbpage_max = htlbpagemem = htlbzone_pages = i;
+	printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem);
+	return 0;
+}
+module_init(hugetlb_init);
+
+int hugetlb_report_meminfo(char *buf)
+{
+	return sprintf(buf,
+		       "HugePages_Total: %5lu\n"
+		       "HugePages_Free:  %5lu\n"
+		       "Hugepagesize:    %5lu kB\n",
+		       htlbzone_pages,
+		       htlbpagemem,
+		       HPAGE_SIZE/1024);
+}
+
+int is_hugepage_mem_enough(size_t size)
+{
+	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem;
+}
+
+/*
+ * We cannot handle pagefaults against hugetlb pages at all.  They cause
+ * handle_mm_fault() to try to instantiate regular-sized pages in the
+ * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we
+ * get this far.
+ */
+static struct page *hugetlb_nopage(struct vm_area_struct *vma,
+				   unsigned long address, int *unused)
+{
+	BUG();
+	return NULL;
+}
+
+struct vm_operations_struct hugetlb_vm_ops = {
+	.nopage = hugetlb_nopage,
+};
diff -puN arch/sh/mm/Makefile~sh-05-hugetlb arch/sh/mm/Makefile
--- 25/arch/sh/mm/Makefile~sh-05-hugetlb	2004-03-23 02:05:26.401089920 -0800
+++ 25-akpm/arch/sh/mm/Makefile	2004-03-23 02:05:26.412088248 -0800
@@ -2,13 +2,14 @@
 # Makefile for the Linux SuperH-specific parts of the memory manager.
 #
 
-obj-y			:= init.o extable.o
+obj-y			:= init.o extable.o consistent.o
 
 obj-$(CONFIG_CPU_SH2)	+= cache-sh2.o
 obj-$(CONFIG_CPU_SH3)	+= cache-sh3.o
 obj-$(CONFIG_CPU_SH4)	+= cache-sh4.o pg-sh4.o
 
 obj-$(CONFIG_DMA_PAGE_OPS)	+= pg-dma.o
+obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 
 mmu-y			:= fault-nommu.o tlb-nommu.o pg-nommu.o
 mmu-$(CONFIG_MMU)	:= fault.o clear_page.o copy_page.o
diff -puN arch/sh/mm/tlb-sh4.c~sh-05-hugetlb arch/sh/mm/tlb-sh4.c
--- 25/arch/sh/mm/tlb-sh4.c~sh-05-hugetlb	2004-03-23 02:05:26.402089768 -0800
+++ 25-akpm/arch/sh/mm/tlb-sh4.c	2004-03-23 02:05:26.413088096 -0800
@@ -68,7 +68,7 @@ void update_mmu_cache(struct vm_area_str
 	/* Set PTEL register */
 	pteval &= _PAGE_FLAGS_HARDWARE_MASK; /* drop software flags */
 #ifdef CONFIG_SH_WRITETHROUGH
-	pteval |= 1;
+	pteval |= _PAGE_WT;
 #endif
 	/* conveniently, we want all the software flags to be 0 anyway */
 	ctrl_outl(pteval, MMU_PTEL);
diff -puN fs/Kconfig~sh-05-hugetlb fs/Kconfig
--- 25/fs/Kconfig~sh-05-hugetlb	2004-03-23 02:05:26.404089464 -0800
+++ 25-akpm/fs/Kconfig	2004-03-23 02:05:26.414087944 -0800
@@ -866,7 +866,7 @@ config TMPFS
 
 config HUGETLBFS
 	bool "HugeTLB file system support"
-	depends X86 || IA64 || PPC64 || SPARC64 || X86_64 || BROKEN
+	depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || X86_64 || BROKEN
 
 config HUGETLB_PAGE
 	def_bool HUGETLBFS
diff -puN include/asm-sh/page.h~sh-05-hugetlb include/asm-sh/page.h
--- 25/include/asm-sh/page.h~sh-05-hugetlb	2004-03-23 02:05:26.405089312 -0800
+++ 25-akpm/include/asm-sh/page.h	2004-03-23 02:05:26.415087792 -0800
@@ -21,6 +21,18 @@
 #define PAGE_MASK	(~(PAGE_SIZE-1))
 #define PTE_MASK	PAGE_MASK
 
+#if defined(CONFIG_HUGETLB_PAGE_SIZE_64K)
+#define HPAGE_SHIFT	16
+#elif defined(CONFIG_HUGETLB_PAGE_SIZE_1MB)
+#define HPAGE_SHIFT	20
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+#define HPAGE_SIZE		(1UL << HPAGE_SHIFT)
+#define HPAGE_MASK		(~(HPAGE_SIZE-1))
+#define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT-PAGE_SHIFT)
+#endif
+
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
diff -puN include/asm-sh/pgtable.h~sh-05-hugetlb include/asm-sh/pgtable.h
--- 25/include/asm-sh/pgtable.h~sh-05-hugetlb	2004-03-23 02:05:26.407089008 -0800
+++ 25-akpm/include/asm-sh/pgtable.h	2004-03-23 02:05:41.875737416 -0800
@@ -53,20 +53,20 @@ extern unsigned long empty_zero_page[102
 #define VMALLOC_START	(P3SEG+0x00100000)
 #define VMALLOC_END	P4SEG
 
-/*			0x001	   WT-bit on SH-4, 0 on SH-3 */
+#define _PAGE_WT	0x001  /* WT-bit on SH-4, 0 on SH-3 */
 #define _PAGE_HW_SHARED	0x002  /* SH-bit : page is shared among processes */
 #define _PAGE_DIRTY	0x004  /* D-bit  : page changed */
 #define _PAGE_CACHABLE	0x008  /* C-bit  : cachable */
-/*			0x010	   SZ0-bit : Size of page */
+#define _PAGE_SZ0	0x010  /* SZ0-bit : Size of page */
 #define _PAGE_RW	0x020  /* PR0-bit : write access allowed */
 #define _PAGE_USER	0x040  /* PR1-bit : user space access allowed */
-/*			0x080	   SZ1-bit : Size of page (on SH-4) */
+#define _PAGE_SZ1	0x080  /* SZ1-bit : Size of page (on SH-4) */
 #define _PAGE_PRESENT	0x100  /* V-bit   : page is valid */
 #define _PAGE_PROTNONE	0x200  /* software: if not present  */
 #define _PAGE_ACCESSED	0x400  /* software: page referenced */
 #define _PAGE_U0_SHARED	0x800  /* software: page is shared in user space */
 
-#define _PAGE_FILE	0x080	/* software: pagecache or swap? */
+#define _PAGE_FILE	_PAGE_WT /* software: pagecache or swap? */
 
 /* software: moves to PTEA.TC (Timing Control) */
 #define _PAGE_PCC_AREA5	0x00000000	/* use BSC registers for area5 */
@@ -83,20 +83,29 @@ extern unsigned long empty_zero_page[102
 
 /* Mask which drop software flags
- * We also drop SZ1 bit since it is always 0 and used for _PAGE_FILE
+ * We also drop WT bit since it is used for _PAGE_FILE
  * bit in this implementation.
  */
+#define _PAGE_CLEAR_FLAGS	(_PAGE_WT | _PAGE_PROTNONE | _PAGE_ACCESSED | _PAGE_U0_SHARED)
+
 #if defined(CONFIG_CPU_SH3)
 /*
  * MMU on SH-3 has bug on SH-bit: We can't use it if MMUCR.IX=1.
  * Work around: Just drop SH-bit.
  */
-#define _PAGE_FLAGS_HARDWARE_MASK	0x1ffff17c
+#define _PAGE_FLAGS_HARDWARE_MASK	(0x1fffffff & ~(_PAGE_CLEAR_FLAGS | _PAGE_HW_SHARED))
 #else
-#define _PAGE_FLAGS_HARDWARE_MASK	0x1ffff17e
+#define _PAGE_FLAGS_HARDWARE_MASK	(0x1fffffff & ~(_PAGE_CLEAR_FLAGS))
+#endif
+
+/* Hardware flags: SZ0=1 (4k-byte) */
+#define _PAGE_FLAGS_HARD	_PAGE_SZ0
+
+#if defined(CONFIG_HUGETLB_PAGE_SIZE_64K)
+#define _PAGE_SZHUGE	(_PAGE_SZ1)
+#elif defined(CONFIG_HUGETLB_PAGE_SIZE_1MB)
+#define _PAGE_SZHUGE	(_PAGE_SZ0 | _PAGE_SZ1)
 #endif
 
-/* Hardware flags: SZ=1 (4k-byte) */
-#define _PAGE_FLAGS_HARD	0x00000010
 
 #define _PAGE_SHARED	_PAGE_U0_SHARED
_
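
A note for reviewers on the _PAGE_SZHUGE values above: per my reading of the
SH7750 series hardware manual, the PTEL SZ1/SZ0 bits encode the page size as
shown below, which is where the 64k (SZ1) and 1MB (SZ1|SZ0) encodings come
from.  A reference-only sketch (PTEL_SZ_* names are made up, _PAGE_SZ0 and
_PAGE_SZ1 are the bits the patch defines), not part of the patch:

	/* SH-4 PTEL page-size encoding (SZ1:SZ0), for reference only. */
	#define PTEL_SZ_1K	0			/* SZ1=0, SZ0=0 */
	#define PTEL_SZ_4K	_PAGE_SZ0		/* SZ1=0, SZ0=1 */
	#define PTEL_SZ_64K	_PAGE_SZ1		/* SZ1=1, SZ0=0 */
	#define PTEL_SZ_1MB	(_PAGE_SZ1 | _PAGE_SZ0)	/* SZ1=1, SZ0=1 */

This is also why _PAGE_FLAGS_HARD becomes _PAGE_SZ0: the base kernel page
size stays at 4k, and only hugetlb mappings OR in _PAGE_SZHUGE.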