author     Hugh Dickins <hugh@veritas.com>           2005-01-07 21:58:53 -0800
committer  Linus Torvalds <torvalds@evo.osdl.org>    2005-01-07 21:58:53 -0800
commit     3ee07371ab56b82c00847aa8a4a27e0769640a09 (patch)
tree       868c11f52d829328a6e10a02f3ac4b084f9a9626 /mm
parent     25f5906cfbf8bf8603599edb5bce4577de0d7085 (diff)
[PATCH] vmtrunc: unmap_mapping dropping i_mmap_lock
vmtruncate (or more generally, unmap_mapping_range) has been observed responsible for
very high latencies: the lockbreak work in unmap_vmas is good for munmap or exit_mmap,
but no use while mapping->i_mmap_lock is held, to keep our place in the prio_tree
(or list) of a file's vmas.

Extend the zap_details block with i_mmap_lock pointer, so unmap_vmas can detect if
that needs lockbreak, and break_addr so it can notify where it left off.

Add unmap_mapping_range_vma, used from both prio_tree and nonlinear list handlers.
This is what now calls zap_page_range (above unmap_vmas), but handles the lockbreak
and restart issues: letting unmap_mapping_range_tree or _list know when they need to
start over because the lock was dropped.

When restarting, of course there's a danger of never making progress.  Add
vm_truncate_count field to vm_area_struct, update that to mapping->truncate_count
once fully scanned, skip up-to-date vmas without a scan (and without dropping
i_mmap_lock).

Further danger of never making progress if a vma is very large: when breaking out,
save restart_vma and restart_addr (and restart_pgoff to confirm, in case the vma
gets reused), to help continue where we left off.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
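The heart of the change is a generation counter: each vma is stamped with
mapping->truncate_count once it has been fully unmapped, so that when the prio_tree
walk has to restart after dropping i_mmap_lock, already-completed vmas are skipped
without rescanning them.  As a rough illustration only (not part of the patch: struct
item, process_some and the scan loop below are invented user-space stand-ins for
vm_area_struct, zap_page_range and the prio_tree walk), a minimal sketch of that
skip-what-is-already-done restart pattern:

	/*
	 * Illustration only, not kernel code: user-space sketch of the
	 * restart pattern described in the commit message above.
	 */
	#include <stdio.h>

	struct item {
		unsigned int done_in;	/* generation in which this item completed */
		int work_left;		/* pretend units of work still to do */
	};

	/* Do one bounded chunk of work; return how much remains. */
	static int process_some(struct item *it)
	{
		if (it->work_left > 0)
			it->work_left--;
		return it->work_left;
	}

	int main(void)
	{
		struct item items[] = { { 0, 2 }, { 0, 1 }, { 0, 3 } };
		unsigned int generation = 1;	/* like mapping->truncate_count */
		int n = sizeof(items) / sizeof(items[0]);
		int restarts = 0;

	restart:
		for (int i = 0; i < n; i++) {
			/* Skip quickly over those we have already dealt with */
			if (items[i].done_in == generation)
				continue;
			if (process_some(&items[i]) == 0) {
				/* Fully scanned: stamp it with the current generation */
				items[i].done_in = generation;
				continue;
			}
			/*
			 * Here the kernel would drop i_mmap_lock, reschedule,
			 * retake it, and restart the prio_tree walk from the
			 * beginning; progress is still made because finished
			 * items are skipped by the test above.
			 */
			restarts++;
			goto restart;
		}
		printf("all items done after %d restarts\n", restarts);
		return 0;
	}

The patch's break_addr/restart_addr bookkeeping is the analogous trick within one very
large vma, so that after a restart the unmapping resumes where zap_page_range left off
rather than from vma->vm_start.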
Diffstat (limited to 'mm')
-rw-r--r--  mm/memory.c  166
1 file changed, 153 insertions(+), 13 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index fd761e2ab80791..30c2592b007040 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -655,7 +655,8 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
unsigned long tlb_start = 0; /* For tlb_finish_mmu */
int tlb_start_valid = 0;
int ret = 0;
- int atomic = details && details->atomic;
+ spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
+ int fullmm = tlb_is_full_mm(*tlbp);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
unsigned long start;
@@ -694,16 +695,28 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
if ((long)zap_bytes > 0)
continue;
- if (!atomic) {
- int fullmm = tlb_is_full_mm(*tlbp);
- tlb_finish_mmu(*tlbp, tlb_start, start);
- cond_resched_lock(&mm->page_table_lock);
- *tlbp = tlb_gather_mmu(mm, fullmm);
- tlb_start_valid = 0;
+ tlb_finish_mmu(*tlbp, tlb_start, start);
+
+ if (need_resched() ||
+ need_lockbreak(&mm->page_table_lock) ||
+ (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
+ if (i_mmap_lock) {
+ /* must reset count of rss freed */
+ *tlbp = tlb_gather_mmu(mm, fullmm);
+ details->break_addr = start;
+ goto out;
+ }
+ spin_unlock(&mm->page_table_lock);
+ cond_resched();
+ spin_lock(&mm->page_table_lock);
}
+
+ *tlbp = tlb_gather_mmu(mm, fullmm);
+ tlb_start_valid = 0;
zap_bytes = ZAP_BLOCK_SIZE;
}
}
+out:
return ret;
}
@@ -1355,7 +1368,114 @@ no_new_page:
/*
* Helper functions for unmap_mapping_range().
+ *
+ * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
+ *
+ * We have to restart searching the prio_tree whenever we drop the lock,
+ * since the iterator is only valid while the lock is held, and anyway
+ * a later vma might be split and reinserted earlier while lock dropped.
+ *
+ * The list of nonlinear vmas could be handled more efficiently, using
+ * a placeholder, but handle it in the same way until a need is shown.
+ * It is important to search the prio_tree before nonlinear list: a vma
+ * may become nonlinear and be shifted from prio_tree to nonlinear list
+ * while the lock is dropped; but never shifted from list to prio_tree.
+ *
+ * In order to make forward progress despite restarting the search,
+ * vm_truncate_count is used to mark a vma as now dealt with, so we can
+ * quickly skip it next time around. Since the prio_tree search only
+ * shows us those vmas affected by unmapping the range in question, we
+ * can't efficiently keep all vmas in step with mapping->truncate_count:
+ * so instead reset them all whenever it wraps back to 0 (then go to 1).
+ * mapping->truncate_count and vma->vm_truncate_count are protected by
+ * i_mmap_lock.
+ *
+ * In order to make forward progress despite repeatedly restarting some
+ * large vma, note the break_addr set by unmap_vmas when it breaks out:
+ * and restart from that address when we reach that vma again (so long
+ * as another such was not inserted earlier while the lock was dropped).
+ *
+ * There is no guarantee that the restart_vma will remain intact when
+ * the lock is regained: but if it has been freed to some other use,
+ * it cannot then appear in the tree or list of vmas, so no harm done;
+ * if it has been reused for a new vma of the same mapping, nopage
+ * checks on i_size and truncate_count ensure it cannot be mapping any
+ * of the truncated pages, so the area below restart_addr is still safe
+ * to skip - but we must check pgoff to prevent spurious unmapping; or
+ * restart_vma may have been split or merged, shrunk or extended - but
+ * never shifted, so restart_addr and restart_pgoff remain in synch
+ * (even in the case of mremap move, which makes a copy of the vma).
*/
+
+static void reset_vma_truncate_counts(struct address_space *mapping)
+{
+ struct vm_area_struct *vma;
+ struct prio_tree_iter iter;
+
+ vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
+ vma->vm_truncate_count = 0;
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+ vma->vm_truncate_count = 0;
+}
+
+static int unmap_mapping_range_vma(struct vm_area_struct *vma,
+ unsigned long start_addr, unsigned long end_addr,
+ struct zap_details *details)
+{
+ int need_break;
+
+ if (vma == details->restart_vma &&
+ start_addr < details->restart_addr) {
+ /*
+ * Be careful: this vm_area_struct may have been reused
+ * meanwhile. If pgoff matches up, it's probably the
+ * same one (perhaps shrunk or extended), but no harm is
+ * done if actually it's new. If pgoff does not match,
+ * it would likely be wrong to unmap from restart_addr,
+ * but it must be new, so we can just mark it completed.
+ */
+ start_addr = details->restart_addr;
+ if (linear_page_index(vma, start_addr) !=
+ details->restart_pgoff || start_addr >= end_addr) {
+ vma->vm_truncate_count = details->truncate_count;
+ return 0;
+ }
+ }
+again:
+ details->break_addr = end_addr;
+ zap_page_range(vma, start_addr, end_addr - start_addr, details);
+
+ /*
+ * We cannot rely on the break test in unmap_vmas:
+ * on the one hand, we don't want to restart our loop
+ * just because that broke out for the page_table_lock;
+ * on the other hand, it does no test when vma is small.
+ */
+ need_break = need_resched() ||
+ need_lockbreak(details->i_mmap_lock);
+
+ if (details->break_addr >= end_addr) {
+ /* We have now completed this vma: mark it so */
+ vma->vm_truncate_count = details->truncate_count;
+ if (!need_break)
+ return 0;
+ } else {
+ if (!need_break) {
+ start_addr = details->break_addr;
+ goto again;
+ }
+ details->restart_vma = vma;
+ details->restart_pgoff =
+ linear_page_index(vma, details->break_addr);
+ details->restart_addr = details->break_addr;
+ }
+
+ spin_unlock(details->i_mmap_lock);
+ cond_resched();
+ spin_lock(details->i_mmap_lock);
+ return -EINTR;
+}
+
static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
struct zap_details *details)
{
@@ -1363,8 +1483,13 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
struct prio_tree_iter iter;
pgoff_t vba, vea, zba, zea;
+restart:
vma_prio_tree_foreach(vma, &iter, root,
details->first_index, details->last_index) {
+ /* Skip quickly over those we have already dealt with */
+ if (vma->vm_truncate_count == details->truncate_count)
+ continue;
+
vba = vma->vm_pgoff;
vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
@@ -1374,9 +1499,12 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
zea = details->last_index;
if (zea > vea)
zea = vea;
- zap_page_range(vma,
+
+ if (unmap_mapping_range_vma(vma,
((zba - vba) << PAGE_SHIFT) + vma->vm_start,
- (zea - zba + 1) << PAGE_SHIFT, details);
+ ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
+ details) < 0)
+ goto restart;
}
}
@@ -1391,10 +1519,15 @@ static inline void unmap_mapping_range_list(struct list_head *head,
* across *all* the pages in each nonlinear VMA, not just the pages
* whose virtual address lies outside the file truncation point.
*/
+restart:
list_for_each_entry(vma, head, shared.vm_set.list) {
+ /* Skip quickly over those we have already dealt with */
+ if (vma->vm_truncate_count == details->truncate_count)
+ continue;
details->nonlinear_vma = vma;
- zap_page_range(vma, vma->vm_start,
- vma->vm_end - vma->vm_start, details);
+ if (unmap_mapping_range_vma(vma, vma->vm_start,
+ vma->vm_end, details) < 0)
+ goto restart;
}
}
@@ -1433,13 +1566,20 @@ void unmap_mapping_range(struct address_space *mapping,
details.nonlinear_vma = NULL;
details.first_index = hba;
details.last_index = hba + hlen - 1;
- details.atomic = 1; /* A spinlock is held */
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
+ details.i_mmap_lock = &mapping->i_mmap_lock;
+ details.restart_vma = NULL;
spin_lock(&mapping->i_mmap_lock);
- /* Protect against page fault */
+
+ /* Protect against page faults, and endless unmapping loops */
mapping->truncate_count++;
+ if (unlikely(mapping->truncate_count == 0)) {
+ mapping->truncate_count++;
+ reset_vma_truncate_counts(mapping);
+ }
+ details.truncate_count = mapping->truncate_count;
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);