From: Hugh Dickins

Let's lighten the global spinlock mmlist_lock.  What's it for?

1. Its original role is to guard mmlist.
2. It later got a second role, to prevent get_task_mm from raising mm_users
   from the dead, just after it went down to 0.

Firstly consider the second: __exit_mm sets tsk->mm NULL while holding
task_lock before calling mmput; so mmlist_lock only guards against the
exceptional case, of get_task_mm on a kernel workthread which did AIO's
use_mm (which transiently sets its tsk->mm without raising mm_users) on an
mm now exiting.

Well, I don't think get_task_mm should succeed at all on use_mm tasks.  It's
mainly used by /proc/pid and ptrace, and it seems at best confusing for those
to present the kernel thread as having a user mm, which it won't have a
moment later.  Define PF_BORROWED_MM, set it in use_mm, clear it in unuse_mm
(though we could just leave it), and have get_task_mm return NULL when it is
set.

Secondly consider the first: what is mmlist for?

1. Its original role was for swap_out to scan: rmap ended that in 2.5.27.
2. In 2.4.10 it got a second role, for try_to_unuse to scan for swapoff.

So, make mmlist a list of mms which may have pages on swap: add the mm to
mmlist when its first swap entry is assigned in try_to_unmap_one (pageout)
or in copy_page_range (fork); mmput still removes it from mmlist, but the
list is usually empty and then there is no need to take the lock (an
illustrative userspace sketch of this pattern follows the patch).
drain_mmlist is added to swapoff, to empty the mmlist if no swap is then
in use.

mmput leaves the mm on mmlist until after its exit_mmap, so try_to_unmap_one
can still add the mm to mmlist without worrying about the mm_users 0 case;
but try_to_unuse must avoid the mm_users 0 case (when an mm might be removed
from mmlist, and freed, while it's down in unuse_process): use
atomic_inc_return, now that all architectures support it (a second sketch
after the patch illustrates this check).

Some of the detailed comments in try_to_unuse have grown out of date:
updated and trimmed some, but SWAP_MAP_MAX is left for another occasion.

Signed-off-by: Hugh Dickins
Signed-off-by: Andrew Morton
---

 25-akpm/arch/i386/mm/pgtable.c |    6 +----
 25-akpm/fs/aio.c               |    2 +
 25-akpm/fs/exec.c              |    6 -----
 25-akpm/include/linux/sched.h  |    5 +---
 25-akpm/kernel/fork.c          |   37 +++++++++---------------------
 25-akpm/mm/memory.c            |    9 ++++++-
 25-akpm/mm/rmap.c              |    6 +++++
 25-akpm/mm/swapfile.c          |   49 +++++++++++++++++++++++++++--------------
 8 files changed, 64 insertions(+), 56 deletions(-)

diff -puN arch/i386/mm/pgtable.c~lighten-mmlist_lock arch/i386/mm/pgtable.c
--- 25/arch/i386/mm/pgtable.c~lighten-mmlist_lock 2004-10-03 16:43:32.051458992 -0700
+++ 25-akpm/arch/i386/mm/pgtable.c 2004-10-03 16:43:32.066456712 -0700
@@ -165,10 +165,8 @@ void pmd_ctor(void *pmd, kmem_cache_t *c
  * against pageattr.c; it is the unique case in which a valid change
  * of kernel pagetables can't be lazily synchronized by vmalloc faults.
  * vmalloc faults work because attached pagetables are never freed.
- * If the locking proves to be non-performant, a ticketing scheme with
- * checks at dup_mmap(), exec(), and other mmlist addition points
- * could be used. The locking scheme was chosen on the basis of
- * manfred's recommendations and having no core impact whatsoever.
+ * The locking scheme was chosen on the basis of manfred's
+ * recommendations and having no core impact whatsoever.
  * -- wli
  */
 spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED;
diff -puN fs/aio.c~lighten-mmlist_lock fs/aio.c
--- 25/fs/aio.c~lighten-mmlist_lock 2004-10-03 16:43:32.053458688 -0700
+++ 25-akpm/fs/aio.c 2004-10-03 16:43:32.067456560 -0700
@@ -572,6 +572,7 @@ void use_mm(struct mm_struct *mm)
         struct task_struct *tsk = current;

         task_lock(tsk);
+        tsk->flags |= PF_BORROWED_MM;
         active_mm = tsk->active_mm;
         atomic_inc(&mm->mm_count);
         tsk->mm = mm;
@@ -598,6 +599,7 @@ void unuse_mm(struct mm_struct *mm)
         struct task_struct *tsk = current;

         task_lock(tsk);
+        tsk->flags &= ~PF_BORROWED_MM;
         tsk->mm = NULL;
         /* active_mm is still 'mm' */
         enter_lazy_tlb(mm, tsk);
diff -puN fs/exec.c~lighten-mmlist_lock fs/exec.c
--- 25/fs/exec.c~lighten-mmlist_lock 2004-10-03 16:43:32.054458536 -0700
+++ 25-akpm/fs/exec.c 2004-10-03 16:43:32.068456408 -0700
@@ -531,12 +531,6 @@ static int exec_mmap(struct mm_struct *m
         struct task_struct *tsk;
         struct mm_struct * old_mm, *active_mm;

-        /* Add it to the list of mm's */
-        spin_lock(&mmlist_lock);
-        list_add(&mm->mmlist, &init_mm.mmlist);
-        mmlist_nr++;
-        spin_unlock(&mmlist_lock);
-
         /* Notify parent that we're no longer interested in the old VM */
         tsk = current;
         old_mm = current->mm;
diff -puN include/linux/sched.h~lighten-mmlist_lock include/linux/sched.h
--- 25/include/linux/sched.h~lighten-mmlist_lock 2004-10-03 16:43:32.056458232 -0700
+++ 25-akpm/include/linux/sched.h 2004-10-03 16:43:32.070456104 -0700
@@ -219,7 +219,7 @@ struct mm_struct {
         struct rw_semaphore mmap_sem;
         spinlock_t page_table_lock;     /* Protects task page tables and mm->rss */

-        struct list_head mmlist;        /* List of all active mm's. These are globally strung
+        struct list_head mmlist;        /* List of maybe swapped mm's. These are globally strung
                                          * together off init_mm.mmlist, and are protected
                                          * by mmlist_lock
                                          */
@@ -253,8 +253,6 @@ struct mm_struct {
         struct kioctx default_kioctx;
 };

-extern int mmlist_nr;
-
 struct sighand_struct {
         atomic_t count;
         struct k_sigaction action[_NSIG];
@@ -770,6 +768,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_UISLEEP      0x00400000      /* Uninterruptible sleep */
 #define PF_SINBINNED    0x00800000      /* I am sinbinned */
 #define PF_UNPRIV_RT    0x01000000      /* I wanted to be RT but had insufficient privilege*/
+#define PF_BORROWED_MM  0x02000000      /* I am a kthread doing use_mm */

 /*
  * Scheduling statistics for a task/thread
diff -puN kernel/fork.c~lighten-mmlist_lock kernel/fork.c
--- 25/kernel/fork.c~lighten-mmlist_lock 2004-10-03 16:43:32.058457928 -0700
+++ 25-akpm/kernel/fork.c 2004-10-03 16:43:32.072455800 -0700
@@ -184,17 +184,6 @@ static inline int dup_mmap(struct mm_str
         rb_parent = NULL;
         pprev = &mm->mmap;

-        /*
-         * Add it to the mmlist after the parent.
-         * Doing it this way means that we can order the list,
-         * and fork() won't mess up the ordering significantly.
-         * Add it first so that swapoff can see any swap entries.
-         */
-        spin_lock(&mmlist_lock);
-        list_add(&mm->mmlist, &current->mm->mmlist);
-        mmlist_nr++;
-        spin_unlock(&mmlist_lock);
-
         for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
                 struct file *file;

@@ -295,7 +284,6 @@ static inline void mm_free_pgd(struct mm
 #endif /* CONFIG_MMU */

 spinlock_t mmlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-int mmlist_nr;

 #define allocate_mm()   (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
 #define free_mm(mm)     (kmem_cache_free(mm_cachep, (mm)))
@@ -307,6 +295,7 @@ static struct mm_struct * mm_init(struct
         atomic_set(&mm->mm_users, 1);
         atomic_set(&mm->mm_count, 1);
         init_rwsem(&mm->mmap_sem);
+        INIT_LIST_HEAD(&mm->mmlist);
         mm->core_waiters = 0;
         mm->page_table_lock = SPIN_LOCK_UNLOCKED;
         mm->ioctx_list_lock = RW_LOCK_UNLOCKED;
@@ -355,12 +344,14 @@ void fastcall __mmdrop(struct mm_struct
  */
 void mmput(struct mm_struct *mm)
 {
-        if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) {
-                list_del(&mm->mmlist);
-                mmlist_nr--;
-                spin_unlock(&mmlist_lock);
+        if (atomic_dec_and_test(&mm->mm_users)) {
                 exit_aio(mm);
                 exit_mmap(mm);
+                if (!list_empty(&mm->mmlist)) {
+                        spin_lock(&mmlist_lock);
+                        list_del(&mm->mmlist);
+                        spin_unlock(&mmlist_lock);
+                }
                 put_swap_token(mm);
                 mmdrop(mm);
         }
@@ -370,15 +361,11 @@ EXPORT_SYMBOL_GPL(mmput);
 /**
  * get_task_mm - acquire a reference to the task's mm
  *
- * Returns %NULL if the task has no mm. Checks if the use count
- * of the mm is non-zero and if so returns a reference to it, after
+ * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning
+ * this kernel workthread has transiently adopted a user mm with use_mm,
+ * to do its AIO) is not set and if so returns a reference to it, after
  * bumping up the use count. User must release the mm via mmput()
  * after use. Typically used by /proc and ptrace.
- *
- * If the use count is zero, it means that this mm is going away,
- * so return %NULL. This only happens in the case of an AIO daemon
- * which has temporarily adopted an mm (see use_mm), in the course
- * of its final mmput, before exit_aio has completed.
  */
 struct mm_struct *get_task_mm(struct task_struct *task)
 {
@@ -387,12 +374,10 @@ struct mm_struct *get_task_mm(struct tas
         task_lock(task);
         mm = task->mm;
         if (mm) {
-                spin_lock(&mmlist_lock);
-                if (!atomic_read(&mm->mm_users))
+                if (task->flags & PF_BORROWED_MM)
                         mm = NULL;
                 else
                         atomic_inc(&mm->mm_users);
-                spin_unlock(&mmlist_lock);
         }
         task_unlock(task);
         return mm;
diff -puN mm/memory.c~lighten-mmlist_lock mm/memory.c
--- 25/mm/memory.c~lighten-mmlist_lock 2004-10-03 16:43:32.059457776 -0700
+++ 25-akpm/mm/memory.c 2004-10-03 16:43:32.074455496 -0700
@@ -288,8 +288,15 @@ skip_copy_pte_range:
                                 goto cont_copy_pte_range_noset;
                         /* pte contains position in swap, so copy. */
                         if (!pte_present(pte)) {
-                                if (!pte_file(pte))
+                                if (!pte_file(pte)) {
                                         swap_duplicate(pte_to_swp_entry(pte));
+                                        if (list_empty(&dst->mmlist)) {
+                                                spin_lock(&mmlist_lock);
+                                                list_add(&dst->mmlist,
+                                                         &src->mmlist);
+                                                spin_unlock(&mmlist_lock);
+                                        }
+                                }
                                 set_pte(dst_pte, pte);
                                 goto cont_copy_pte_range_noset;
                         }
diff -puN mm/rmap.c~lighten-mmlist_lock mm/rmap.c
--- 25/mm/rmap.c~lighten-mmlist_lock 2004-10-03 16:43:32.061457472 -0700
+++ 25-akpm/mm/rmap.c 2004-10-03 16:43:32.075455344 -0700
@@ -35,6 +35,7 @@
  *   mm->page_table_lock
  *     zone->lru_lock (in mark_page_accessed)
  *     swap_list_lock (in swap_free etc's swap_info_get)
+ *       mmlist_lock (in mmput, drain_mmlist and others)
  *       swap_device_lock (in swap_duplicate, swap_info_get)
  *       mapping->private_lock (in __set_page_dirty_buffers)
  *       inode_lock (in set_page_dirty's __mark_inode_dirty)
@@ -569,6 +570,11 @@ static int try_to_unmap_one(struct page
                  */
                 BUG_ON(!PageSwapCache(page));
                 swap_duplicate(entry);
+                if (list_empty(&mm->mmlist)) {
+                        spin_lock(&mmlist_lock);
+                        list_add(&mm->mmlist, &init_mm.mmlist);
+                        spin_unlock(&mmlist_lock);
+                }
                 set_pte(pte, swp_entry_to_pte(entry));
                 BUG_ON(pte_file(*pte));
         }
diff -puN mm/swapfile.c~lighten-mmlist_lock mm/swapfile.c
--- 25/mm/swapfile.c~lighten-mmlist_lock 2004-10-03 16:43:32.062457320 -0700
+++ 25-akpm/mm/swapfile.c 2004-10-03 16:43:32.077455040 -0700
@@ -648,11 +648,12 @@ static int try_to_unuse(unsigned int typ
          *
          * A simpler strategy would be to start at the last mm we
          * freed the previous entry from; but that would take less
-         * advantage of mmlist ordering (now preserved by swap_out()),
-         * which clusters forked address spaces together, most recent
-         * child immediately after parent. If we race with dup_mmap(),
-         * we very much want to resolve parent before child, otherwise
-         * we may miss some entries: using last mm would invert that.
+         * advantage of mmlist ordering, which clusters forked mms
+         * together, child after parent. If we race with dup_mmap(), we
+         * prefer to resolve parent before child, lest we miss entries
+         * duplicated after we scanned child: using last mm would invert
+         * that. Though it's only a serious concern when an overflowed
+         * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
          */
         start_mm = &init_mm;
         atomic_inc(&init_mm.mm_users);
@@ -660,15 +661,7 @@ static int try_to_unuse(unsigned int typ
         /*
          * Keep on scanning until all entries have gone. Usually,
          * one pass through swap_map is enough, but not necessarily:
-         * mmput() removes mm from mmlist before exit_mmap() and its
-         * zap_page_range(). That's not too bad, those entries are
-         * on their way out, and handled faster there than here.
-         * do_munmap() behaves similarly, taking the range out of mm's
-         * vma list before zap_page_range(). But unfortunately, when
-         * unmapping a part of a vma, it takes the whole out first,
-         * then reinserts what's left after (might even reschedule if
-         * open() method called) - so swap entries may be invisible
-         * to swapoff for a while, then reappear - but that is rare.
+         * there are races when an instance of an entry might be missed.
          */
         while ((i = find_next_to_unuse(si, i)) != 0) {
                 if (signal_pending(current)) {
@@ -720,7 +713,7 @@ static int try_to_unuse(unsigned int typ
                 wait_on_page_writeback(page);

                 /*
-                 * Remove all references to entry, without blocking.
+                 * Remove all references to entry.
                  * Whenever we reach init_mm, there's no address space
                  * to search, but use it as a reminder to search shmem.
                  */
@@ -745,7 +738,10 @@ static int try_to_unuse(unsigned int typ
                 while (*swap_map > 1 && !retval &&
                                 (p = p->next) != &start_mm->mmlist) {
                         mm = list_entry(p, struct mm_struct, mmlist);
-                        atomic_inc(&mm->mm_users);
+                        if (atomic_inc_return(&mm->mm_users) == 1) {
+                                atomic_dec(&mm->mm_users);
+                                continue;
+                        }
                         spin_unlock(&mmlist_lock);
                         mmput(prev_mm);
                         prev_mm = mm;
@@ -859,6 +855,26 @@ static int try_to_unuse(unsigned int typ
 }

 /*
+ * After a successful try_to_unuse, if no swap is now in use, we know we
+ * can empty the mmlist. swap_list_lock must be held on entry and exit.
+ * Note that mmlist_lock nests inside swap_list_lock, and an mm must be
+ * added to the mmlist just after page_duplicate - before would be racy.
+ */
+static void drain_mmlist(void)
+{
+        struct list_head *p, *next;
+        unsigned int i;
+
+        for (i = 0; i < nr_swapfiles; i++)
+                if (swap_info[i].inuse_pages)
+                        return;
+        spin_lock(&mmlist_lock);
+        list_for_each_safe(p, next, &init_mm.mmlist)
+                list_del_init(p);
+        spin_unlock(&mmlist_lock);
+}
+
+/*
  * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
  * corresponds to page offset `offset'.
  */
@@ -1172,6 +1188,7 @@ asmlinkage long sys_swapoff(const char _
         }
         down(&swapon_sem);
         swap_list_lock();
+        drain_mmlist();
         swap_device_lock(p);
         swap_file = p->swap_file;
         p->swap_file = NULL;
_
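
First sketch: the lazy mmlist handling above is spread across three files
(mm/rmap.c, mm/memory.c, kernel/fork.c for the add; kernel/fork.c mmput for
the remove), so here is a small illustrative userspace analogue, not kernel
code.  All names (struct mm_sketch, global_list_lock, assign_first_swap_entry,
release_mm) are invented for the example.  The point is the shape of the
pattern: link the mm onto the global list only when its first swap entry is
assigned, do the unlocked emptiness test from a path already serialized by the
mm's own lock, and at teardown take the global lock only if the mm was ever
linked.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct list_node { struct list_node *prev, *next; };

static struct list_node global_list = { &global_list, &global_list };
static pthread_mutex_t global_list_lock = PTHREAD_MUTEX_INITIALIZER;

struct mm_sketch {
        pthread_mutex_t page_table_lock;  /* serializes swap entry assignment */
        struct list_node node;            /* self-linked while off the list */
};

static bool node_empty(const struct list_node *n) { return n->next == n; }
static void node_init(struct list_node *n) { n->prev = n->next = n; }

static void node_add(struct list_node *n, struct list_node *head)
{
        n->next = head->next;
        n->prev = head;
        head->next->prev = n;
        head->next = n;
}

static void node_del_init(struct list_node *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        node_init(n);
}

/* Caller holds mm->page_table_lock, as try_to_unmap_one() does. */
static void assign_first_swap_entry(struct mm_sketch *mm)
{
        /*
         * Unlocked emptiness test: only this path, serialized by the mm's
         * own lock, ever links this node, so the test cannot flicker.
         */
        if (node_empty(&mm->node)) {
                pthread_mutex_lock(&global_list_lock);
                node_add(&mm->node, &global_list);
                pthread_mutex_unlock(&global_list_lock);
        }
}

/*
 * Teardown, like mmput(): most mms never touched swap, so most of the
 * time the global lock is never taken at all.
 */
static void release_mm(struct mm_sketch *mm)
{
        if (!node_empty(&mm->node)) {
                pthread_mutex_lock(&global_list_lock);
                node_del_init(&mm->node);
                pthread_mutex_unlock(&global_list_lock);
        }
}

int main(void)
{
        struct mm_sketch mm = { .page_table_lock = PTHREAD_MUTEX_INITIALIZER };

        node_init(&mm.node);
        pthread_mutex_lock(&mm.page_table_lock);
        assign_first_swap_entry(&mm);   /* first swap entry: link it */
        assign_first_swap_entry(&mm);   /* second entry: already linked, no lock */
        pthread_mutex_unlock(&mm.page_table_lock);
        release_mm(&mm);
        printf("global list is %sempty\n", node_empty(&global_list) ? "" : "not ");
        return 0;
}

The corresponding kernel sites are try_to_unmap_one() and copy_page_range()
for the add, and mmput() for the remove.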
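
Second sketch: the atomic_inc_return() test in try_to_unuse() is easy to
misread, so here is a minimal C11 rendering of just that check, again with
invented names (struct mm_ref, try_grab_mm).  A use count that has already
reached zero marks an mm that is on its way off the list; an increment that
lands on 1 must be undone and the mm skipped, never treated as a fresh
reference.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct mm_ref {
        atomic_int mm_users;    /* 0 means the mm is already being torn down */
};

/*
 * Same check as the patch's
 *      if (atomic_inc_return(&mm->mm_users) == 1) { atomic_dec(...); continue; }
 * C11's atomic_fetch_add() returns the *old* value, so "old == 0" here is
 * the same condition as "new == 1" in the kernel's atomic_inc_return() form.
 */
static bool try_grab_mm(struct mm_ref *mm)
{
        if (atomic_fetch_add(&mm->mm_users, 1) == 0) {
                atomic_fetch_sub(&mm->mm_users, 1);     /* don't raise it from the dead */
                return false;                           /* skip this mm */
        }
        return true;    /* we now hold a reference; drop it when done */
}

int main(void)
{
        struct mm_ref live = { 2 };     /* still has users: safe to grab */
        struct mm_ref dying = { 0 };    /* mm_users already fell to 0 */

        printf("live mm:  %s\n", try_grab_mm(&live) ? "grabbed" : "skipped");
        printf("dying mm: %s\n", try_grab_mm(&dying) ? "grabbed" : "skipped");
        return 0;
}

In the kernel the walk over init_mm.mmlist is done under mmlist_lock, and a
skipped mm is simply left for its own mmput() to unlink.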