From fecfaf03f489b00f3d6aeb9aff44774746e549ac Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Fri, 3 Jul 2009 08:29:37 -0500
Subject: [PATCH] mm: page_alloc: rt-friendly per-cpu pages

commit ff3fd6afd788760c846a2f4449487debb6c4b0ac in tip.

rt-friendly per-cpu pages: convert the irqs-off per-cpu locking method
into a preemptible, explicit-per-cpu-locks method.

Contains fixes from:

	Peter Zijlstra
	Thomas Gleixner

[PG: upstream 99dcc3e5a94e muddies the waters of applying the original,
 for example free_zone_pagesets() is gone.]

Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
Signed-off-by: Paul Gortmaker
---
 mm/page_alloc.c |  137 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 files changed, 117 insertions(+), 20 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d03c946..d870c91 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -188,6 +188,54 @@ static unsigned long __meminitdata dma_reserve;
 EXPORT_SYMBOL(movable_zone);
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
+#ifdef CONFIG_PREEMPT_RT
+static DEFINE_PER_CPU_LOCKED(int, pcp_locks);
+#endif
+
+static inline void __lock_cpu_pcp(unsigned long *flags, int cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+	spin_lock(&__get_cpu_lock(pcp_locks, cpu));
+	flags = 0;
+#else
+	local_irq_save(*flags);
+#endif
+}
+
+static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+	(void)get_cpu_var_locked(pcp_locks, this_cpu);
+	flags = 0;
+#else
+	local_irq_save(*flags);
+	*this_cpu = smp_processor_id();
+#endif
+}
+
+static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+	put_cpu_var_locked(pcp_locks, this_cpu);
+#else
+	local_irq_restore(flags);
+#endif
+}
+
+// PG: FIXME - zone_pcp is dead (99dcc3e5a9) so kill these get/put variants?
+static struct per_cpu_pageset *
+get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu)
+{
+	lock_cpu_pcp(flags, this_cpu);
+	return per_cpu_ptr(zone->pageset, *this_cpu);
+}
+
+static void
+put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu)
+{
+	unlock_cpu_pcp(flags, this_cpu);
+}
+
 #if MAX_NUMNODES > 1
 int nr_node_ids __read_mostly = MAX_NUMNODES;
 int nr_online_nodes __read_mostly = 1;
@@ -602,8 +650,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
 	unsigned long flags;
-	int i;
-	int bad = 0;
+	int i, this_cpu, bad = 0;
 	int wasMlocked = __TestClearPageMlocked(page);
 
 	trace_mm_page_free_direct(page, order);
@@ -622,13 +669,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	arch_free_page(page, order);
 	kernel_map_pages(page, 1 << order, 0);
 
-	local_irq_save(flags);
+	lock_cpu_pcp(&flags, &this_cpu);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
-	__count_vm_events(PGFREE, 1 << order);
+	count_vm_events(PGFREE, 1 << order);
+	unlock_cpu_pcp(flags, this_cpu);
 	free_one_page(page_zone(page), page, order,
 					get_pageblock_migratetype(page));
-	local_irq_restore(flags);
 }
 
 /*
@@ -1005,15 +1052,16 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
 	unsigned long flags;
 	int to_drain;
+	int this_cpu;
 
-	local_irq_save(flags);
+	lock_cpu_pcp(&flags, &this_cpu);
 	if (pcp->count >= pcp->batch)
 		to_drain = pcp->batch;
 	else
 		to_drain = pcp->count;
 	free_pcppages_bulk(zone, to_drain, pcp);
 	pcp->count -= to_drain;
-	local_irq_restore(flags);
+	unlock_cpu_pcp(flags, this_cpu);
 }
 #endif
 
@@ -1033,13 +1081,18 @@ static void drain_pages(unsigned int cpu)
 		struct per_cpu_pageset *pset;
 		struct per_cpu_pages *pcp;
 
-		local_irq_save(flags);
+		__lock_cpu_pcp(&flags, cpu);
 		pset = per_cpu_ptr(zone->pageset, cpu);
 
+		if (!pset) {
+			unlock_cpu_pcp(flags, cpu);
+			WARN_ON(1);
+			continue;
+		}
 		pcp = &pset->pcp;
 		free_pcppages_bulk(zone, pcp->count, pcp);
 		pcp->count = 0;
-		local_irq_restore(flags);
+		unlock_cpu_pcp(flags, cpu);
 	}
 }
 
@@ -1051,12 +1104,52 @@ void drain_local_pages(void *arg)
 	drain_pages(smp_processor_id());
 }
 
+#ifdef CONFIG_PREEMPT_RT
+static void drain_local_pages_work(struct work_struct *wrk)
+{
+	drain_pages(smp_processor_id());
+}
+#endif
+
 /*
  * Spill all the per-cpu pages from all CPUs back into the buddy allocator
  */
 void drain_all_pages(void)
 {
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * HACK!!!!!
+	 * For RT we can't use IPIs to run drain_local_pages, since
+	 * that code will call spin_locks that will now sleep.
+	 * But, schedule_on_each_cpu will call kzalloc, which will
+	 * call page_alloc which was what calls this.
+	 *
+	 * Luckily, there's a condition to get here, and that is if
+	 * the order passed in to alloc_pages is greater than 0
+	 * (alloced more than a page size). The slabs only allocate
+	 * what is needed, and the allocation made by schedule_on_each_cpu
+	 * does an alloc of "sizeof(void *)*nr_cpu_ids".
+	 *
+	 * So we can safely call schedule_on_each_cpu if that number
+	 * is less than a page. Otherwise don't bother. At least warn of
+	 * this issue.
+	 *
+	 * And yes, this is one big hack. Please fix ;-)
+	 */
+	if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE)
+		schedule_on_each_cpu(drain_local_pages_work);
+	else {
+		static int once;
+		if (!once) {
+			printk(KERN_ERR "Can't drain all CPUS due to possible recursion\n");
+			once = 1;
+		}
+		drain_local_pages(NULL);
+	}
+
+#else
 	on_each_cpu(drain_local_pages, NULL, 1);
+#endif
 }
 
 #ifdef CONFIG_HIBERNATION
@@ -1102,10 +1195,11 @@ void mark_free_pages(struct zone *zone)
 void free_hot_cold_page(struct page *page, int cold)
 {
 	struct zone *zone = page_zone(page);
+	struct per_cpu_pageset *pset;
 	struct per_cpu_pages *pcp;
 	unsigned long flags;
 	int migratetype;
-	int wasMlocked = __TestClearPageMlocked(page);
+	int this_cpu, wasMlocked = __TestClearPageMlocked(page);
 
 	trace_mm_page_free_direct(page, 0);
 	kmemcheck_free_shadow(page, 0);
@@ -1122,12 +1216,13 @@ void free_hot_cold_page(struct page *page, int cold)
 	arch_free_page(page, 0);
 	kernel_map_pages(page, 1, 0);
 
+	pset = get_zone_pcp(zone, &flags, &this_cpu);
+	pcp = &pset->pcp;
 	migratetype = get_pageblock_migratetype(page);
 	set_page_private(page, migratetype);
-	local_irq_save(flags);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
-	__count_vm_event(PGFREE);
+	count_vm_event(PGFREE);
 
 	/*
 	 * We only track unmovable, reclaimable and movable on pcp lists.
@@ -1144,7 +1239,6 @@ void free_hot_cold_page(struct page *page, int cold)
 		migratetype = MIGRATE_MOVABLE;
 	}
 
-	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	if (cold)
 		list_add_tail(&page->lru, &pcp->lists[migratetype]);
 	else
@@ -1156,7 +1250,7 @@ void free_hot_cold_page(struct page *page, int cold)
 	}
 
 out:
-	local_irq_restore(flags);
+	put_zone_pcp(zone, flags, this_cpu);
 }
 
 /*
@@ -1200,15 +1294,18 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 	unsigned long flags;
 	struct page *page;
 	int cold = !!(gfp_flags & __GFP_COLD);
+	struct per_cpu_pageset *pset;
+	int this_cpu;
 
 again:
+	pset = get_zone_pcp(zone, &flags, &this_cpu);
+
 	if (likely(order == 0)) {
-		struct per_cpu_pages *pcp;
 		struct list_head *list;
+		struct per_cpu_pages *pcp = &pset->pcp;
 
-		local_irq_save(flags);
-		pcp = &this_cpu_ptr(zone->pageset)->pcp;
 		list = &pcp->lists[migratetype];
+
 		if (list_empty(list)) {
 			pcp->count += rmqueue_bulk(zone, 0,
 					pcp->batch, list,
@@ -1238,7 +1335,7 @@ again:
 		 */
 		WARN_ON_ONCE(order > 1);
 	}
-	spin_lock_irqsave(&zone->lock, flags);
+	spin_lock(&zone->lock);
 	page = __rmqueue(zone, order, migratetype);
 	spin_unlock(&zone->lock);
 	if (!page)
@@ -1248,7 +1345,7 @@ again:
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone);
-	local_irq_restore(flags);
+	put_zone_pcp(zone, flags, this_cpu);
 
 	VM_BUG_ON(bad_range(zone, page));
 	if (prep_new_page(page, order, gfp_flags))
@@ -1256,7 +1353,7 @@ again:
 	return page;
 
 failed:
-	local_irq_restore(flags);
+	put_zone_pcp(zone, flags, this_cpu);
 	return NULL;
 }
-- 
1.7.0.4
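
As a rough userspace sketch of the locking conversion the patch describes (not kernel
code, and not part of the patch itself): the idea of replacing "disable interrupts to
protect per-CPU page lists" with "take an explicit per-CPU lock" can be modeled with one
mutex per CPU slot. The names pcp_slot, lock_cpu_pcp() and unlock_cpu_pcp() only mirror
the helpers added above; the pthread mutexes and the fixed NR_CPUS are assumptions made
to keep the example self-contained.

/*
 * Illustrative userspace analogue of the PREEMPT_RT per-cpu-lock pattern.
 * Build with: gcc -O2 -pthread pcp_lock_sketch.c
 */
#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

struct pcp_slot {
	pthread_mutex_t lock;	/* plays the role of pcp_locks on RT */
	int count;		/* plays the role of pcp->count */
};

static struct pcp_slot pcp[NR_CPUS];

/* RT flavour: take the per-CPU lock; nothing is disabled, the section stays preemptible. */
static void lock_cpu_pcp(int cpu)
{
	pthread_mutex_lock(&pcp[cpu].lock);
}

static void unlock_cpu_pcp(int cpu)
{
	pthread_mutex_unlock(&pcp[cpu].lock);
}

/* Per-CPU state is only touched between lock and unlock. */
static void free_one(int cpu)
{
	lock_cpu_pcp(cpu);
	pcp[cpu].count++;
	unlock_cpu_pcp(cpu);
}

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		pthread_mutex_init(&pcp[cpu].lock, NULL);

	free_one(0);
	free_one(1);
	printf("cpu0=%d cpu1=%d\n", pcp[0].count, pcp[1].count);
	return 0;
}

The helpers in the patch additionally report which CPU's lock was taken (this_cpu), so the
matching put_zone_pcp()/unlock_cpu_pcp() can release the same lock even if the task has
since been preempted and migrated; in the non-RT build they degenerate back to
local_irq_save()/local_irq_restore().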
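
The size check in the RT branch of drain_all_pages() can also be looked at in isolation:
schedule_on_each_cpu() is only risked when its own allocation ("sizeof(void *)*nr_cpu_ids",
per the comment in the patch) fits in a single page, because only order > 0 allocations can
reach drain_all_pages() in the first place. A standalone sketch of that arithmetic follows;
the sysconf() calls are userspace stand-ins for PAGE_SIZE and nr_cpu_ids, not what the
kernel uses.

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);          /* stand-in for PAGE_SIZE */
	long nr_cpu_ids = sysconf(_SC_NPROCESSORS_CONF); /* stand-in for nr_cpu_ids */
	long alloc = (long)sizeof(void *) * nr_cpu_ids;

	if (alloc < page_size)
		printf("%ld bytes < %ld: safe to schedule_on_each_cpu()\n",
		       alloc, page_size);
	else
		printf("%ld bytes >= %ld: fall back to draining only the local CPU\n",
		       alloc, page_size);
	return 0;
}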