Index: linux/arch/ia64/mm/hugetlbpage.c
===================================================================
--- linux.orig/arch/ia64/mm/hugetlbpage.c	2004-09-18 11:30:22.000000000 -0700
+++ linux/arch/ia64/mm/hugetlbpage.c	2004-09-21 12:37:51.000000000 -0700
@@ -43,13 +43,23 @@
 static struct page *dequeue_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
         int nid = mpol_first_node(vma, addr);
+        int tid, nid2;
         struct page *page = NULL;
 
         if (list_empty(&hugepage_freelists[nid])) {
-                for (nid = 0; nid < MAX_NUMNODES; ++nid)
-                        if (mpol_node_valid(nid, vma, addr) &&
-                            !list_empty(&hugepage_freelists[nid]))
-                                break;
+                /* Prefer the neighboring nodes for hugepage allocation */
+                for (tid = 1; tid < MAX_NUMNODES; tid++) {
+                        nid2 = (nid + tid) % MAX_NUMNODES;
+                        if (mpol_node_valid(nid2, vma, addr) &&
+                            !list_empty(&hugepage_freelists[nid2]))
+                                break;
+                        if (tid > nid) continue;
+                        nid2 = nid - tid;
+                        if (mpol_node_valid(nid2, vma, addr) &&
+                            !list_empty(&hugepage_freelists[nid2]))
+                                break;
+                }
+                nid = nid2;
         }
         if (nid >= 0 && nid < MAX_NUMNODES &&
             !list_empty(&hugepage_freelists[nid])) {
@@ -87,6 +97,27 @@
         return page;
 }
 
+/* Variation on the above; acquires htlbpage_lock as in 2.6.9-rc2. */
+static struct page *__alloc_hugetlb_page2(struct vm_area_struct *vma, unsigned long addr)
+{
+        int i;
+        struct page *page;
+
+        spin_lock(&htlbpage_lock);
+        page = dequeue_huge_page(vma, addr);
+        if (!page) {
+                spin_unlock(&htlbpage_lock);
+                return NULL;
+        }
+        htlbpagemem[page_zone(page)->zone_pgdat->node_id]--;
+        spin_unlock(&htlbpage_lock);
+        set_page_count(page, 1);
+        page->lru.prev = (void *)free_huge_page;
+        for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
+                clear_highpage(&page[i]);
+        return page;
+}
+
 static pte_t *
 huge_pte_alloc (struct mm_struct *mm, unsigned long addr)
 {
@@ -659,6 +690,7 @@
         ret = VM_FAULT_MINOR;
         if (unlikely(!pte_none(*pte)))
                 goto out;
+        spin_unlock(&mm->page_table_lock);
 
         mapping = vma->vm_file->f_mapping;
         idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
@@ -666,22 +698,19 @@
 retry:
         page = find_get_page(mapping, idx);
         if (!page) {
-                spin_lock(&htlbpage_lock);
 
                 /* Should do this at prefault time, but that gets us into
                    trouble with freeing right now.  We do a quick overcommit
                    check instead. */
                 ret = hugetlb_get_quota(mapping);
                 if (ret) {
-                        spin_unlock(&htlbpage_lock);
                         ret = VM_FAULT_OOM;
-                        goto out;
+                        goto out2;
                 }
 
-                page = __alloc_hugetlb_page(vma, addr);
+                page = __alloc_hugetlb_page2(vma, addr);
                 if (!page) {
                         hugetlb_put_quota(mapping);
-                        spin_unlock(&htlbpage_lock);
 
                         /* Instead of OOMing here could just transparently use
                            small pages. */
@@ -689,7 +718,7 @@
                                current->comm, current->pid);
 
                         ret = VM_FAULT_OOM;
-                        goto out;
+                        goto out2;
                 }
                 ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
                 if (likely(!ret))
@@ -698,21 +727,24 @@
                         hugetlb_put_quota(mapping);
                         if (put_page_testzero(page))
                                 __free_huge_page(page);
-                        spin_unlock(&htlbpage_lock);
 
                         if (ret == -EEXIST)
                                 goto retry;
                         ret = VM_FAULT_SIGBUS;
-                        goto out;
+                        goto out2;
                 }
-                spin_unlock(&htlbpage_lock);
                 ret = VM_FAULT_MAJOR;
         } else
                 ret = VM_FAULT_MINOR;
-
-        set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
+
+        spin_lock(&mm->page_table_lock);
+        if (pte_none(*pte))
+                set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
+        else
+                page_cache_release(page);
 out:
         spin_unlock(&mm->page_table_lock);
+out2:
         return ret;
 }
 
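
Two illustrative sketches follow; neither is part of the patch.

The fallback loop added to dequeue_huge_page() above walks outward from the
faulting node: it tries nid+1, nid-1, nid+2, nid-2, and so on, wrapping
forward modulo MAX_NUMNODES but never wrapping backward past node 0.  The
userspace sketch below (not kernel code; NODE_COUNT is an arbitrary stand-in
for MAX_NUMNODES) prints that search order so the preference for nearby nodes
is easy to see:

/*
 * Userspace sketch of the node search order implemented by the loop in
 * dequeue_huge_page() above.  Once the forward scan wraps, already-tried
 * nodes repeat; in the kernel loop that is harmless because it breaks out
 * on the first node with free huge pages.
 */
#include <stdio.h>

#define NODE_COUNT 8            /* stand-in for MAX_NUMNODES */

static void print_search_order(int nid)
{
        int tid;

        printf("node %d first", nid);
        for (tid = 1; tid < NODE_COUNT; tid++) {
                printf(", %d", (nid + tid) % NODE_COUNT);   /* forward neighbor */
                if (tid > nid)
                        continue;                            /* no backward wrap */
                printf(", %d", nid - tid);                   /* backward neighbor */
        }
        printf("\n");
}

int main(void)
{
        print_search_order(2);  /* prints: node 2 first, 3, 1, 4, 0, 5, 6, 7, 0, 1 */
        return 0;
}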
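
The fault-path changes (dropping mm->page_table_lock before the page-cache
lookup and allocation, then retaking it and re-checking pte_none() before
set_huge_pte()) follow the usual drop-the-lock-for-the-slow-path pattern:
whoever loses the race releases its page instead of installing it.  Below is
a minimal userspace sketch of that pattern, not kernel code; slot, slot_lock
and slow_alloc() are made-up names standing in for the huge PTE,
page_table_lock and __alloc_hugetlb_page2():

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;
static void *slot;                      /* plays the role of the huge PTE */

static void *slow_alloc(void)
{
        return malloc(64);              /* stands in for the huge page allocation */
}

static void fault(void)
{
        void *new;

        pthread_mutex_lock(&slot_lock);
        if (slot != NULL) {             /* already populated: nothing to do */
                pthread_mutex_unlock(&slot_lock);
                return;
        }
        pthread_mutex_unlock(&slot_lock);   /* drop the lock for the slow path */

        new = slow_alloc();                 /* may block; done without the lock */

        pthread_mutex_lock(&slot_lock);
        if (slot == NULL)                   /* re-check: did someone beat us? */
                slot = new;                 /* no: install our object */
        else
                free(new);                  /* yes: drop the duplicate */
        pthread_mutex_unlock(&slot_lock);
}

int main(void)
{
        fault();
        fault();        /* second call finds the slot populated and returns early */
        printf("slot installed: %s\n", slot ? "yes" : "no");
        return 0;
}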