diff options
author | Hugh Dickins <hugh@veritas.com> | 2004-08-23 21:24:22 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2004-08-23 21:24:22 -0700 |
commit | 77631565ae40a44f23eac2e9c440cbceed8962a7 (patch) | |
tree | 8a8eba7032f859ed2f4505a2c07b55256b568b75 /mm | |
parent | edcc56dc6a7c758c4862321fc2c3a9d5a1f4dc5e (diff) | |
download | history-77631565ae40a44f23eac2e9c440cbceed8962a7.tar.gz |
[PATCH] rmaplock: SLAB_DESTROY_BY_RCU
With page_map_lock gone, how to stabilize page->mapping's anon_vma while
acquiring anon_vma->lock in page_referenced_anon and try_to_unmap_anon?
The page cannot actually be freed (vmscan holds reference), but however much
we check page_mapped (which guarantees that anon_vma is in use - or would
guarantee that if we added suitable barriers), there's no locking against page
becoming unmapped the instant after, then anon_vma freed.
It's okay to take anon_vma->lock after it's freed, so long as it remains a
struct anon_vma (its list would become empty, or perhaps reused for an
unrelated anon_vma: but no problem since we always check that the page located
is the right one); but corruption if that memory gets reused for some other
purpose.
This is not unique: it's liable to be a problem whenever the kernel tries to
approach a structure obliquely. It's generally solved with an atomic
reference count; but one advantage of anon_vma over anonmm is that it does not
have such a count, and it would be a backward step to add one.
Therefore... implement SLAB_DESTROY_BY_RCU flag, to guarantee that such a
kmem_cache_alloc'ed structure cannot get freed to other use while the
rcu_read_lock is held i.e. preempt disabled; and use that for anon_vma.
Fix concerns raised by Manfred: this flag is incompatible with poisoning and
destructor, and kmem_cache_destroy needs to synchronize_kernel.
I hope SLAB_DESTROY_BY_RCU may be useful elsewhere; but though it's safe for
little anon_vma, I'd be reluctant to use it on any caches whose immediate
shrinkage under pressure is important to the system.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/rmap.c | 50 | ||||
-rw-r--r-- | mm/slab.c | 69 |
2 files changed, 95 insertions, 24 deletions
diff --git a/mm/rmap.c b/mm/rmap.c index 175e76591602b5..088f140ec71b0c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -30,6 +30,7 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/rmap.h> +#include <linux/rcupdate.h> #include <asm/tlbflush.h> @@ -159,8 +160,31 @@ static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) void __init anon_vma_init(void) { - anon_vma_cachep = kmem_cache_create("anon_vma", - sizeof(struct anon_vma), 0, SLAB_PANIC, anon_vma_ctor, NULL); + anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), + 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL); +} + +/* + * Getting a lock on a stable anon_vma from a page off the LRU is + * tricky: page_lock_anon_vma rely on RCU to guard against the races. + */ +static struct anon_vma *page_lock_anon_vma(struct page *page) +{ + struct anon_vma *anon_vma = NULL; + unsigned long anon_mapping; + + rcu_read_lock(); + anon_mapping = (unsigned long) page->mapping; + if (!(anon_mapping & PAGE_MAPPING_ANON)) + goto out; + if (!page_mapped(page)) + goto out; + + anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + spin_lock(&anon_vma->lock); +out: + rcu_read_unlock(); + return anon_vma; } /* @@ -238,19 +262,15 @@ out: static int page_referenced_anon(struct page *page) { unsigned int mapcount; - struct anon_vma *anon_vma = (void *) page->mapping - PAGE_MAPPING_ANON; + struct anon_vma *anon_vma; struct vm_area_struct *vma; int referenced = 0; - /* - * Recheck mapcount: it is not safe to take anon_vma->lock after - * last page_remove_rmap, since struct anon_vma might be reused. 
- */ - mapcount = page_mapcount(page); - if (!mapcount) + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) return referenced; - spin_lock(&anon_vma->lock); + mapcount = page_mapcount(page); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { referenced += page_referenced_one(page, vma, &mapcount); if (!mapcount) @@ -634,18 +654,14 @@ out_unlock: static int try_to_unmap_anon(struct page *page) { - struct anon_vma *anon_vma = (void *) page->mapping - PAGE_MAPPING_ANON; + struct anon_vma *anon_vma; struct vm_area_struct *vma; int ret = SWAP_AGAIN; - /* - * Recheck mapped: it is not safe to take anon_vma->lock after - * last page_remove_rmap, since struct anon_vma might be reused. - */ - if (!page_mapped(page)) + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) return ret; - spin_lock(&anon_vma->lock); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { ret = try_to_unmap_one(page, vma); if (ret == SWAP_FAIL || !page_mapped(page)) diff --git a/mm/slab.c b/mm/slab.c index fd80d31fbab0ae..887bec4dde632d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -91,6 +91,7 @@ #include <linux/cpu.h> #include <linux/sysctl.h> #include <linux/module.h> +#include <linux/rcupdate.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -139,11 +140,13 @@ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ SLAB_NO_REAP | SLAB_CACHE_DMA | \ SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ - SLAB_RECLAIM_ACCOUNT | SLAB_PANIC) + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ - SLAB_RECLAIM_ACCOUNT | SLAB_PANIC) + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU) #endif /* @@ -190,6 +193,28 @@ struct slab { }; /* + * struct slab_rcu + * + * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to + * arrange for kmem_freepages to be called via RCU. 
This is useful if + * we need to approach a kernel structure obliquely, from its address + * obtained without the usual locking. We can lock the structure to + * stabilize it and check it's still at the given address, only if we + * can be sure that the memory has not been meanwhile reused for some + * other kind of object (which our subsystem's lock might corrupt). + * + * rcu_read_lock before reading the address, then rcu_read_unlock after + * taking the spinlock within the structure expected at that address. + * + * We assume struct slab_rcu can overlay struct slab when destroying. + */ +struct slab_rcu { + struct rcu_head head; + kmem_cache_t *cachep; + void *addr; +}; + +/* * struct array_cache * * Per cpu structures @@ -873,6 +898,16 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr) atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages); } +static void kmem_rcu_free(struct rcu_head *head) +{ + struct slab_rcu *slab_rcu = (struct slab_rcu *) head; + kmem_cache_t *cachep = slab_rcu->cachep; + + kmem_freepages(cachep, slab_rcu->addr); + if (OFF_SLAB(cachep)) + kmem_cache_free(cachep->slabp_cache, slab_rcu); +} + #if DEBUG #ifdef CONFIG_DEBUG_PAGEALLOC @@ -1026,6 +1061,8 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) */ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) { + void *addr = slabp->s_mem - slabp->colouroff; + #if DEBUG int i; for (i = 0; i < cachep->num; i++) { @@ -1061,10 +1098,19 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) } } #endif - - kmem_freepages(cachep, slabp->s_mem-slabp->colouroff); - if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->slabp_cache, slabp); + + if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { + struct slab_rcu *slab_rcu; + + slab_rcu = (struct slab_rcu *) slabp; + slab_rcu->cachep = cachep; + slab_rcu->addr = addr; + call_rcu(&slab_rcu->head, kmem_rcu_free); + } else { + kmem_freepages(cachep, addr); + if (OFF_SLAB(cachep)) + 
kmem_cache_free(cachep->slabp_cache, slabp); + } } /** @@ -1139,9 +1185,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))) flags |= SLAB_RED_ZONE|SLAB_STORE_USER; - flags |= SLAB_POISON; + if (!(flags & SLAB_DESTROY_BY_RCU)) + flags |= SLAB_POISON; #endif + if (flags & SLAB_DESTROY_BY_RCU) + BUG_ON(flags & SLAB_POISON); #endif + if (flags & SLAB_DESTROY_BY_RCU) + BUG_ON(dtor); + /* * Always checks flags, a caller might be expecting debug * support which isn't available. @@ -1553,6 +1605,9 @@ int kmem_cache_destroy (kmem_cache_t * cachep) return 1; } + if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) + synchronize_kernel(); + /* no cpu_online check required here since we clear the percpu * array on cpu offline and set this to NULL. */ |