From: Christoph Lameter

Do not use the page_table_lock in do_anonymous_page.  This significantly
increases the parallelism of the page fault handler on SMP systems.  The
patch also modifies the definitions of the *_mm_counter macros so that rss
and anon_rss become atomic (using atomic64_t if available).
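For reference, the write path below installs the new pte with ptep_cmpxchg()
(provided by earlier patches in this series) instead of taking the
page_table_lock.  A minimal sketch of the intended semantics, assuming a
single-word pte and leaving out the lock-based fallback for architectures
without atomic page table operations, would be:

	/*
	 * Illustration only, not the series' actual definition: install
	 * newval at *ptep only if it still contains oldval.  A nonzero
	 * return means the update was performed.  mm and addr are unused
	 * here but kept to match the call sites in do_anonymous_page().
	 */
	static inline int ptep_cmpxchg(struct mm_struct *mm, unsigned long addr,
				       pte_t *ptep, pte_t oldval, pte_t newval)
	{
		/* assumes a single-word pte; two-word (e.g. PAE) ptes need more care */
		return cmpxchg((unsigned long *)ptep, pte_val(oldval), pte_val(newval))
				== pte_val(oldval);
	}

If the cmpxchg fails, another CPU has concurrently instantiated a pte at that
address, so do_anonymous_page() releases the freshly allocated page, counts a
cmpxchg_fail_anon_write event and returns VM_FAULT_MINOR.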
Signed-off-by: Christoph Lameter
Signed-off-by: Hugh Dickins
Signed-off-by: Andrew Morton
---

 include/linux/sched.h |   31 +++++++++++++++++++++++++++++++
 mm/memory.c           |   14 +++++---------
 2 files changed, 36 insertions(+), 9 deletions(-)

diff -puN include/linux/sched.h~page-fault-patches-no-pagetable-lock-in-do_anon_page include/linux/sched.h
--- 25/include/linux/sched.h~page-fault-patches-no-pagetable-lock-in-do_anon_page	Wed Aug 17 15:10:30 2005
+++ 25-akpm/include/linux/sched.h	Wed Aug 17 15:10:45 2005
@@ -204,12 +204,43 @@ arch_get_unmapped_area_topdown(struct fi
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 
+#ifdef CONFIG_ATOMIC_TABLE_OPS
+/*
+ * No spinlock is held during atomic page table operations. The
+ * counters are not protected anymore and must also be
+ * incremented atomically.
+ */
+#ifdef ATOMIC64_INIT
+#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member)
+typedef atomic64_t mm_counter_t;
+#else
+/*
+ * This may limit process memory to 2^31 * PAGE_SIZE which may be around 8TB
+ * if using 4KB page size
+ */
+#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member)
+typedef atomic_t mm_counter_t;
+#endif
+#else
+/*
+ * No atomic page table operations. Counters are protected by
+ * the page table lock
+ */
 #define set_mm_counter(mm, member, value) (mm)->_##member = (value)
 #define get_mm_counter(mm, member) ((mm)->_##member)
 #define add_mm_counter(mm, member, value) (mm)->_##member += (value)
 #define inc_mm_counter(mm, member) (mm)->_##member++
 #define dec_mm_counter(mm, member) (mm)->_##member--
 typedef unsigned long mm_counter_t;
+#endif
 
 struct mm_struct {
 	struct vm_area_struct * mmap;		/* list of VMAs */
diff -puN mm/memory.c~page-fault-patches-no-pagetable-lock-in-do_anon_page mm/memory.c
--- 25/mm/memory.c~page-fault-patches-no-pagetable-lock-in-do_anon_page	Wed Aug 17 15:10:30 2005
+++ 25-akpm/mm/memory.c	Wed Aug 17 15:10:48 2005
@@ -1772,12 +1772,12 @@ do_anonymous_page(struct mm_struct *mm,
 		} else {
 			inc_page_state(cmpxchg_fail_anon_read);
 		}
-		pte_unmap(page_table);
 		goto minor_fault;
 	}
 
 	/* This leaves the write case */
 	page_table_atomic_stop(mm);
+	pte_unmap(page_table);
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
@@ -1788,13 +1788,13 @@ do_anonymous_page(struct mm_struct *mm,
 	entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
 						 vma->vm_page_prot)),
 			      vma);
-	spin_lock(&mm->page_table_lock);
+	page_table = pte_offset_map(pmd, addr);
+	page_table_atomic_start(mm);
 
 	if (!ptep_cmpxchg(mm, addr, page_table, orig_entry, entry)) {
-		pte_unmap(page_table);
 		page_cache_release(page);
 		inc_page_state(cmpxchg_fail_anon_write);
-		goto minor_fault_atomic;
+		goto minor_fault;
 	}
 
 	/*
@@ -1805,16 +1805,12 @@ do_anonymous_page(struct mm_struct *mm,
 	page_add_anon_rmap(page, vma, addr);
 	lru_cache_add_active(page);
 	inc_mm_counter(mm, rss);
-	pte_unmap(page_table);
 	update_mmu_cache(vma, addr, entry);
 	lazy_mmu_prot_update(entry);
 
 minor_fault:
-	spin_unlock(&mm->page_table_lock);
-	return VM_FAULT_MINOR;
-
-minor_fault_atomic:
 	page_table_atomic_stop(mm);
+	pte_unmap(page_table);
 	return VM_FAULT_MINOR;
 
 oom:
_