| author | Hugh Dickins <hugh@veritas.com> | 2004-10-19 18:17:12 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2004-10-19 18:17:12 -0700 |
| commit | 1b46884a372f362e5edb1bb406bf0c5ca98bade7 (patch) | |
| tree | cdfc9bfc76b98cc06f270e0a19f1da04005ee26d /mm | |
| parent | 5dfd31d1133be6b1c697c082900cc0e50d878639 (diff) | |
| download | history-1b46884a372f362e5edb1bb406bf0c5ca98bade7.tar.gz | |
[PATCH] lighten mmlist_lock
Let's lighten the global spinlock mmlist_lock.
What's it for?
1. Its original role is to guard mmlist.
2. It later got a second role, to prevent get_task_mm from raising
mm_users from the dead, just after it went down to 0.
First consider the second role: __exit_mm sets tsk->mm to NULL while holding
task_lock, before calling mmput; so mmlist_lock only guards against the
exceptional case of get_task_mm on a kernel thread which did AIO's
use_mm (which transiently sets its tsk->mm without raising mm_users) on an
mm that is now exiting.
Well, I don't think get_task_mm should succeed at all on use_mm tasks.
It's mainly used by /proc/pid and ptrace, and it seems at best confusing for
those to present the kernel thread as having a user mm which it won't have a
moment later. So define PF_BORROWED_MM: set it in use_mm, clear it in
unuse_mm (though we could just leave it set), and have get_task_mm return
NULL when it is set, as sketched below.
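
For illustration, since the get_task_mm side lives in kernel/fork.c and is
outside the mm/ diffstat below, here is a sketch reconstructed from the
description above, not quoted from the patch:

```c
struct mm_struct *get_task_mm(struct task_struct *task)
{
	struct mm_struct *mm;

	task_lock(task);
	mm = task->mm;
	if (mm) {
		/* An mm merely borrowed via use_mm() is not really this
		 * task's: report no user mm rather than raise mm_users. */
		if (task->flags & PF_BORROWED_MM)
			mm = NULL;
		else
			atomic_inc(&mm->mm_users);
	}
	task_unlock(task);
	return mm;
}
```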
Second, consider the first role: what is mmlist itself for?
1. Its original role was for swap_out to scan: rmap ended that in 2.5.27.
2. In 2.4.10 it got a second role, for try_to_unuse to scan for swapoff.
So make mmlist a list of mms which may have pages on swap: add an mm to
mmlist when its first swap entry is assigned, in try_to_unmap_one (pageout)
or in copy_page_range (fork); and let mmput remove it from mmlist as before,
except that the list is usually empty and there is then no need to take the
lock (a sketch of that fast path follows below). drain_mmlist is added to
swapoff, to empty out the mmlist once no swap remains in use.
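
Again for illustration (mmput is in kernel/fork.c, outside this diffstat), a
sketch of the resulting mmput shape, assuming the usual exit_aio/exit_mmap
ordering; the point is that list_empty is checked before ever touching
mmlist_lock, and the mm stays on mmlist until after exit_mmap:

```c
void mmput(struct mm_struct *mm)
{
	if (atomic_dec_and_test(&mm->mm_users)) {
		exit_aio(mm);
		exit_mmap(mm);
		/* Usually list_empty: take mmlist_lock only if this mm
		 * was ever given a swap entry and so put on mmlist. */
		if (!list_empty(&mm->mmlist)) {
			spin_lock(&mmlist_lock);
			list_del(&mm->mmlist);
			spin_unlock(&mmlist_lock);
		}
		mmdrop(mm);
	}
}
```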
mmput leaves the mm on mmlist until after its exit_mmap, so try_to_unmap_one
can still add an mm to mmlist without worrying about the mm_users 0 case; but
try_to_unuse must avoid that case (when an mm might be removed from mmlist,
and freed, while it's down in unuse_process): use atomic_inc_return, now that
all architectures support it (see the excerpt just below).
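
Extracted from the mm/swapfile.c hunk further down, with comments added, the
guard in try_to_unuse's mmlist walk:

```c
/* Under mmlist_lock, stepping round the mmlist from start_mm: */
mm = list_entry(p, struct mm_struct, mmlist);
if (atomic_inc_return(&mm->mm_users) == 1) {
	/* It was 0: the mm is already exiting and may be freed once
	 * we drop mmlist_lock - undo the increment and skip it,
	 * rather than raising mm_users from the dead. */
	atomic_dec(&mm->mm_users);
	continue;
}
```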
Some of the detailed comments in try_to_unuse have grown out of date: I've
updated and trimmed some, but left the SWAP_MAP_MAX case for another occasion.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/memory.c | 9 |
|---|---|---|
| -rw-r--r-- | mm/rmap.c | 6 |
| -rw-r--r-- | mm/swapfile.c | 49 |

3 files changed, 47 insertions(+), 17 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 52f96233a13cfe..c43881bbd00d28 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -289,8 +289,15 @@ skip_copy_pte_range:
 				goto cont_copy_pte_range_noset;
 			/* pte contains position in swap, so copy. */
 			if (!pte_present(pte)) {
-				if (!pte_file(pte))
+				if (!pte_file(pte)) {
 					swap_duplicate(pte_to_swp_entry(pte));
+					if (list_empty(&dst->mmlist)) {
+						spin_lock(&mmlist_lock);
+						list_add(&dst->mmlist,
+							 &src->mmlist);
+						spin_unlock(&mmlist_lock);
+					}
+				}
 				set_pte(dst_pte, pte);
 				goto cont_copy_pte_range_noset;
 			}
diff --git a/mm/rmap.c b/mm/rmap.c
index f1c40c722fe6f8..d4a9a6a50fc87b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -35,6 +35,7 @@
  *   mm->page_table_lock
  *     zone->lru_lock (in mark_page_accessed)
  *     swap_list_lock (in swap_free etc's swap_info_get)
+ *       mmlist_lock (in mmput, drain_mmlist and others)
  *       swap_device_lock (in swap_duplicate, swap_info_get)
  *       mapping->private_lock (in __set_page_dirty_buffers)
  *       inode_lock (in set_page_dirty's __mark_inode_dirty)
@@ -576,6 +577,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 	 */
 	BUG_ON(!PageSwapCache(page));
 	swap_duplicate(entry);
+	if (list_empty(&mm->mmlist)) {
+		spin_lock(&mmlist_lock);
+		list_add(&mm->mmlist, &init_mm.mmlist);
+		spin_unlock(&mmlist_lock);
+	}
 	set_pte(pte, swp_entry_to_pte(entry));
 	BUG_ON(pte_file(*pte));
 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 55bb488aa10f33..446b6fd9871e19 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -648,11 +648,12 @@ static int try_to_unuse(unsigned int type)
 	 *
 	 * A simpler strategy would be to start at the last mm we
 	 * freed the previous entry from; but that would take less
-	 * advantage of mmlist ordering (now preserved by swap_out()),
-	 * which clusters forked address spaces together, most recent
-	 * child immediately after parent. If we race with dup_mmap(),
-	 * we very much want to resolve parent before child, otherwise
-	 * we may miss some entries: using last mm would invert that.
+	 * advantage of mmlist ordering, which clusters forked mms
+	 * together, child after parent. If we race with dup_mmap(), we
+	 * prefer to resolve parent before child, lest we miss entries
+	 * duplicated after we scanned child: using last mm would invert
+	 * that. Though it's only a serious concern when an overflowed
+	 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
 	 */
 	start_mm = &init_mm;
 	atomic_inc(&init_mm.mm_users);
@@ -660,15 +661,7 @@
 	/*
 	 * Keep on scanning until all entries have gone. Usually,
 	 * one pass through swap_map is enough, but not necessarily:
-	 * mmput() removes mm from mmlist before exit_mmap() and its
-	 * zap_page_range(). That's not too bad, those entries are
-	 * on their way out, and handled faster there than here.
-	 * do_munmap() behaves similarly, taking the range out of mm's
-	 * vma list before zap_page_range(). But unfortunately, when
-	 * unmapping a part of a vma, it takes the whole out first,
-	 * then reinserts what's left after (might even reschedule if
-	 * open() method called) - so swap entries may be invisible
-	 * to swapoff for a while, then reappear - but that is rare.
+	 * there are races when an instance of an entry might be missed.
 	 */
 	while ((i = find_next_to_unuse(si, i)) != 0) {
 		if (signal_pending(current)) {
@@ -720,7 +713,7 @@
 		wait_on_page_writeback(page);
 
 		/*
-		 * Remove all references to entry, without blocking.
+		 * Remove all references to entry.
 		 * Whenever we reach init_mm, there's no address space
 		 * to search, but use it as a reminder to search shmem.
 		 */
@@ -745,7 +738,10 @@
 		while (*swap_map > 1 && !retval &&
 				(p = p->next) != &start_mm->mmlist) {
 			mm = list_entry(p, struct mm_struct, mmlist);
-			atomic_inc(&mm->mm_users);
+			if (atomic_inc_return(&mm->mm_users) == 1) {
+				atomic_dec(&mm->mm_users);
+				continue;
+			}
 			spin_unlock(&mmlist_lock);
 			mmput(prev_mm);
 			prev_mm = mm;
@@ -859,6 +855,26 @@
 }
 
 /*
+ * After a successful try_to_unuse, if no swap is now in use, we know we
+ * can empty the mmlist.  swap_list_lock must be held on entry and exit.
+ * Note that mmlist_lock nests inside swap_list_lock, and an mm must be
+ * added to the mmlist just after page_duplicate - before would be racy.
+ */
+static void drain_mmlist(void)
+{
+	struct list_head *p, *next;
+	unsigned int i;
+
+	for (i = 0; i < nr_swapfiles; i++)
+		if (swap_info[i].inuse_pages)
+			return;
+	spin_lock(&mmlist_lock);
+	list_for_each_safe(p, next, &init_mm.mmlist)
+		list_del_init(p);
+	spin_unlock(&mmlist_lock);
+}
+
+/*
  * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
  * corresponds to page offset `offset'.
  */
@@ -1172,6 +1188,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	}
 	down(&swapon_sem);
 	swap_list_lock();
+	drain_mmlist();
 	swap_device_lock(p);
 	swap_file = p->swap_file;
 	p->swap_file = NULL;