diff -purN -X /home/mbligh/.diff.exclude 274-percpu_real_loadavg/arch/i386/mm/hugetlbpage.c 276-per_node_rss/arch/i386/mm/hugetlbpage.c
--- 274-percpu_real_loadavg/arch/i386/mm/hugetlbpage.c	2003-11-24 16:12:27.000000000 -0800
+++ 276-per_node_rss/arch/i386/mm/hugetlbpage.c	2003-12-11 17:16:38.000000000 -0800
@@ -61,6 +61,27 @@ static struct page *alloc_fresh_huge_pag
 static void free_huge_page(struct page *page);
 
+#ifdef CONFIG_NUMA
+
+static inline void huge_inc_rss(struct mm_struct *mm, struct page *page)
+{
+	mm->rss += (HPAGE_SIZE / PAGE_SIZE);
+	mm->pernode_rss[page_to_nid(page)] += (HPAGE_SIZE / PAGE_SIZE);
+}
+
+static inline void huge_dec_rss(struct mm_struct *mm, struct page *page)
+{
+	mm->rss -= (HPAGE_SIZE / PAGE_SIZE);
+	mm->pernode_rss[page_to_nid(page)] -= (HPAGE_SIZE / PAGE_SIZE);
+}
+
+#else /* !CONFIG_NUMA */
+
+#define huge_inc_rss(mm, page)	((mm)->rss += (HPAGE_SIZE / PAGE_SIZE))
+#define huge_dec_rss(mm, page)	((mm)->rss -= (HPAGE_SIZE / PAGE_SIZE))
+
+#endif /* CONFIG_NUMA */
+
 static struct page *alloc_hugetlb_page(void)
 {
 	int i;
@@ -105,7 +126,7 @@ static void set_huge_pte(struct mm_struc
 {
 	pte_t entry;
 
-	mm->rss += (HPAGE_SIZE / PAGE_SIZE);
+	huge_inc_rss(mm, page);
 	if (write_access) {
 		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
@@ -145,7 +166,7 @@ int copy_hugetlb_page_range(struct mm_st
 		ptepage = pte_page(entry);
 		get_page(ptepage);
 		set_pte(dst_pte, entry);
-		dst->rss += (HPAGE_SIZE / PAGE_SIZE);
+		huge_inc_rss(dst, ptepage);
 		addr += HPAGE_SIZE;
 	}
 	return 0;
@@ -314,8 +335,8 @@ void unmap_hugepage_range(struct vm_area
 		page = pte_page(*pte);
 		huge_page_release(page);
 		pte_clear(pte);
+		huge_dec_rss(mm, page);
 	}
-	mm->rss -= (end - start) >> PAGE_SHIFT;
 	flush_tlb_range(vma, start, end);
 }
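The hugetlb helpers above charge a huge page as HPAGE_SIZE / PAGE_SIZE base pages (1024 on i386 with 4 KB base pages and 4 MB huge pages), all attributed to the node holding the huge page's frames. A hypothetical refactoring sketch, not part of this patch: the base-page and hugetlb paths could share one helper that adds an arbitrary number of base pages on a page's node (add_rss and npages are invented names here).

static inline void add_rss(struct mm_struct *mm, struct page *page,
			   unsigned long npages)
{
	/* both counters move together, under mm->page_table_lock */
	mm->rss += npages;
#ifdef CONFIG_NUMA
	mm->pernode_rss[page_to_nid(page)] += npages;
#endif
}

#define inc_rss(mm, page)	add_rss(mm, page, 1)
#define huge_inc_rss(mm, page)	add_rss(mm, page, HPAGE_SIZE / PAGE_SIZE)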
diff -purN -X /home/mbligh/.diff.exclude 274-percpu_real_loadavg/fs/binfmt_aout.c 276-per_node_rss/fs/binfmt_aout.c
--- 274-percpu_real_loadavg/fs/binfmt_aout.c	2003-10-01 11:48:15.000000000 -0700
+++ 276-per_node_rss/fs/binfmt_aout.c	2003-12-11 17:16:38.000000000 -0800
@@ -309,7 +309,7 @@ static int load_aout_binary(struct linux
 		(current->mm->start_brk = N_BSSADDR(ex));
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
-	current->mm->rss = 0;
+	zero_rss(current->mm);
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
 	current->flags &= ~PF_FORKNOEXEC;
diff -purN -X /home/mbligh/.diff.exclude 274-percpu_real_loadavg/fs/binfmt_elf.c 276-per_node_rss/fs/binfmt_elf.c
--- 274-percpu_real_loadavg/fs/binfmt_elf.c	2003-10-27 10:41:13.000000000 -0800
+++ 276-per_node_rss/fs/binfmt_elf.c	2003-12-11 17:16:38.000000000 -0800
@@ -644,7 +644,7 @@ static int load_elf_binary(struct linux_
 	/* Do this so that we can load the interpreter, if need be.  We will
 	   change some of these later */
-	current->mm->rss = 0;
+	zero_rss(current->mm);
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
 	retval = setup_arg_pages(bprm);
 	if (retval < 0) {
diff -purN -X /home/mbligh/.diff.exclude 274-percpu_real_loadavg/fs/binfmt_flat.c 276-per_node_rss/fs/binfmt_flat.c
--- 274-percpu_real_loadavg/fs/binfmt_flat.c	2003-10-01 11:35:23.000000000 -0700
+++ 276-per_node_rss/fs/binfmt_flat.c	2003-12-11 17:16:39.000000000 -0800
@@ -643,7 +643,7 @@ static int load_flat_file(struct linux_b
 		current->mm->start_brk = datapos + data_len + bss_len;
 		current->mm->brk = (current->mm->start_brk + 3) & ~3;
 		current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len;
-		current->mm->rss = 0;
+		zero_rss(current->mm);
 	}
 
 	if (flags & FLAT_FLAG_KTRACE)
diff -purN -X /home/mbligh/.diff.exclude 274-percpu_real_loadavg/fs/binfmt_som.c 276-per_node_rss/fs/binfmt_som.c
--- 274-percpu_real_loadavg/fs/binfmt_som.c	2003-02-13 16:36:36.000000000 -0800
+++ 276-per_node_rss/fs/binfmt_som.c	2003-12-11 17:16:39.000000000 -0800
@@ -259,7 +259,7 @@ load_som_binary(struct linux_binprm * bp
 	create_som_tables(bprm);
 
 	current->mm->start_stack = bprm->p;
-	current->mm->rss = 0;
+	zero_rss(current->mm);
 
 #if 0
 	printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);
diff -purN -X /home/mbligh/.diff.exclude 274-percpu_real_loadavg/fs/exec.c 276-per_node_rss/fs/exec.c
--- 274-percpu_real_loadavg/fs/exec.c	2003-12-11 17:16:05.000000000 -0800
+++ 276-per_node_rss/fs/exec.c	2003-12-11 17:16:39.000000000 -0800
@@ -327,7 +327,7 @@ void put_dirty_page(struct task_struct *
 	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot))));
 	pte_chain = page_add_rmap(page, pte, pte_chain);
 	pte_unmap(pte);
-	tsk->mm->rss++;
+	inc_rss(tsk->mm, page);
 	spin_unlock(&tsk->mm->page_table_lock);
 
 	/* no need for flush_tlb */
diff -purN -X /home/mbligh/.diff.exclude 274-percpu_real_loadavg/fs/proc/task_mmu.c 276-per_node_rss/fs/proc/task_mmu.c
--- 274-percpu_real_loadavg/fs/proc/task_mmu.c	2003-10-01 11:47:04.000000000 -0700
+++ 276-per_node_rss/fs/proc/task_mmu.c	2003-12-11 17:16:39.000000000 -0800
@@ -3,6 +3,22 @@
 #include
 #include
 
+#ifdef CONFIG_NUMA
+char *task_mem_pernode(struct mm_struct *mm, char *buffer)
+{
+	int nid;
+
+	for (nid = 0; nid < MAX_NUMNODES; nid++){
+		buffer += sprintf(buffer, "VmRSS-node_%d:\t%8lu kb\n",
+				nid, mm->pernode_rss[nid] << (PAGE_SHIFT-10));
+	}
+
+	return buffer;
+}
+#else /* !CONFIG_NUMA */
+#define task_mem_pernode(mm, buffer) (buffer)
+#endif /* CONFIG_NUMA */
+
 char *task_mem(struct mm_struct *mm, char *buffer)
 {
 	unsigned long data = 0, stack = 0, exec = 0, lib = 0;
@@ -39,6 +55,7 @@ char *task_mem(struct mm_struct *mm, cha
 		mm->rss << (PAGE_SHIFT-10),
 		data - stack, stack,
 		exec - lib, lib);
+	buffer = task_mem_pernode(mm, buffer);
 	up_read(&mm->mmap_sem);
 	return buffer;
 }
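With CONFIG_NUMA set, task_mem_pernode() above appends one VmRSS-node_<nid> line per possible node to /proc/<pid>/status. A minimal userspace sketch (assuming a kernel with this patch applied) that picks out the RSS lines:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* matches both the existing "VmRSS:" line and the
	 * "VmRSS-node_%d" lines task_mem_pernode() emits */
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "VmRSS", 5))
			fputs(line, stdout);
	fclose(f);
	return 0;
}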
diff -purN -X /home/mbligh/.diff.exclude 274-percpu_real_loadavg/include/asm-generic/tlb.h 276-per_node_rss/include/asm-generic/tlb.h
--- 274-percpu_real_loadavg/include/asm-generic/tlb.h	2003-10-01 11:41:15.000000000 -0700
+++ 276-per_node_rss/include/asm-generic/tlb.h	2003-12-11 17:16:39.000000000 -0800
@@ -39,7 +39,6 @@ struct mmu_gather {
 	unsigned int nr;	/* set to ~0U means fast mode */
 	unsigned int need_flush;/* Really unmapped some ptes? */
 	unsigned int fullmm;	/* non-zero means full mm flush */
-	unsigned long freed;
 	struct page * pages[FREE_PTE_NR];
 };
 
@@ -60,7 +59,6 @@ tlb_gather_mmu(struct mm_struct *mm, uns
 	tlb->nr = num_online_cpus() > 1 ? 0U : ~0U;
 
 	tlb->fullmm = full_mm_flush;
-	tlb->freed = 0;
 
 	return tlb;
 }
@@ -85,13 +83,6 @@ tlb_flush_mmu(struct mmu_gather *tlb, un
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-	int freed = tlb->freed;
-	struct mm_struct *mm = tlb->mm;
-	int rss = mm->rss;
-
-	if (rss < freed)
-		freed = rss;
-	mm->rss = rss - freed;
 	tlb_flush_mmu(tlb, start, end);
 
 	/* keep the page table cache within bounds */
diff -purN -X /home/mbligh/.diff.exclude 274-percpu_real_loadavg/include/linux/mm.h 276-per_node_rss/include/linux/mm.h
--- 274-percpu_real_loadavg/include/linux/mm.h	2003-12-11 17:16:05.000000000 -0800
+++ 276-per_node_rss/include/linux/mm.h	2003-12-11 17:16:39.000000000 -0800
@@ -616,6 +616,39 @@ extern struct page * follow_page(struct
 extern int remap_page_range(struct vm_area_struct *vma, unsigned long from,
 		unsigned long to, unsigned long size, pgprot_t prot);
 
+/*
+ * Given a struct page, determine which node's memory it is from.
+ * TODO: There's probably a more efficient way to do this...
+ */
+static inline int page_to_nid(struct page *page)
+{
+	return pfn_to_nid(page_to_pfn(page));
+}
+
+#ifdef CONFIG_NUMA
+static inline void zero_rss(struct mm_struct *mm)
+{
+	mm->rss = 0;
+	memset(mm->pernode_rss, 0, MAX_NUMNODES * sizeof(*mm->pernode_rss));
+}
+
+static inline void inc_rss(struct mm_struct *mm, struct page *page)
+{
+	mm->rss++;
+	mm->pernode_rss[page_to_nid(page)]++;
+}
+
+static inline void dec_rss(struct mm_struct *mm, struct page *page)
+{
+	mm->rss--;
+	mm->pernode_rss[page_to_nid(page)]--;
+}
+#else /* !CONFIG_NUMA */
+#define zero_rss(mm)		((mm)->rss = 0)
+#define inc_rss(mm, page)	((mm)->rss++)
+#define dec_rss(mm, page)	((mm)->rss--)
+#endif /* CONFIG_NUMA */
+
 #ifndef CONFIG_DEBUG_PAGEALLOC
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable)
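On the TODO in page_to_nid() above: one possible answer in 2.6-era kernels, offered only as a sketch and not part of the patch, is to go through the page's zone instead of round-tripping through the pfn, since each zone already points at its pg_data_t and that carries the node id:

static inline int page_to_nid(struct page *page)
{
	/* page_zone() resolves the zone from page->flags; the
	 * zone's pg_data_t knows its node, so no pfn lookup needed */
	return page_zone(page)->zone_pgdat->node_id;
}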
diff -purN -X /home/mbligh/.diff.exclude 274-percpu_real_loadavg/include/linux/sched.h 276-per_node_rss/include/linux/sched.h
--- 274-percpu_real_loadavg/include/linux/sched.h	2003-12-11 17:16:31.000000000 -0800
+++ 276-per_node_rss/include/linux/sched.h	2003-12-11 17:16:39.000000000 -0800
@@ -197,7 +197,7 @@ struct mm_struct {
 	atomic_t mm_count;		/* How many references to "struct mm_struct" (users count as 1) */
 	int map_count;			/* number of VMAs */
 	struct rw_semaphore mmap_sem;
-	spinlock_t page_table_lock;	/* Protects task page tables and mm->rss */
+	spinlock_t page_table_lock;	/* Protects task page tables and RSS data */
 	struct list_head mmlist;	/* List of all active mm's.  These are globally strung
 					 * together off init_mm.mmlist, and are protected
@@ -207,7 +207,11 @@ struct mm_struct {
 	unsigned long start_code, end_code, start_data, end_data;
 	unsigned long start_brk, brk, start_stack;
 	unsigned long arg_start, arg_end, env_start, env_end;
-	unsigned long rss, total_vm, locked_vm;
+	unsigned long total_vm, locked_vm;
+	unsigned long rss;
+#ifdef CONFIG_NUMA
+	unsigned long pernode_rss[MAX_NUMNODES];
+#endif
 	unsigned long def_flags;
 	cpumask_t cpu_vm_mask;
 	unsigned long swap_address;
diff -purN -X /home/mbligh/.diff.exclude 274-percpu_real_loadavg/kernel/fork.c 276-per_node_rss/kernel/fork.c
--- 274-percpu_real_loadavg/kernel/fork.c	2003-12-01 17:08:35.000000000 -0800
+++ 276-per_node_rss/kernel/fork.c	2003-12-11 17:16:39.000000000 -0800
@@ -248,7 +248,7 @@ static inline int dup_mmap(struct mm_str
 	mm->mmap_cache = NULL;
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->map_count = 0;
-	mm->rss = 0;
+	zero_rss(mm);
 	cpus_clear(mm->cpu_vm_mask);
 	pprev = &mm->mmap;
diff -purN -X /home/mbligh/.diff.exclude 274-percpu_real_loadavg/mm/fremap.c 276-per_node_rss/mm/fremap.c
--- 274-percpu_real_loadavg/mm/fremap.c	2003-12-11 17:16:05.000000000 -0800
+++ 276-per_node_rss/mm/fremap.c	2003-12-11 17:16:39.000000000 -0800
@@ -38,7 +38,7 @@ static inline int zap_pte(struct mm_stru
 					set_page_dirty(page);
 				page_remove_rmap(page, ptep);
 				page_cache_release(page);
-				mm->rss--;
+				dec_rss(mm, page);
 			}
 		}
 	return 1;
@@ -96,7 +96,7 @@ int install_page(struct mm_struct *mm, s
 
 	flush = zap_pte(mm, vma, addr, pte);
 
-	mm->rss++;
+	inc_rss(mm, page);
 	flush_icache_page(vma, page);
 	set_pte(pte, mk_pte(page, prot));
 	pte_chain = page_add_rmap(page, pte, pte_chain);
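Note the comment change on page_table_lock above: pernode_rss[] inherits the same locking rule as rss, which is why inc_rss()/dec_rss() need no atomics of their own. An illustrative caller pattern, modelled on put_dirty_page() in fs/exec.c earlier in this patch (the function name here is invented):

static void account_one_page(struct mm_struct *mm, struct page *page)
{
	spin_lock(&mm->page_table_lock);
	/* rss and pernode_rss[] always move together under the lock */
	inc_rss(mm, page);
	spin_unlock(&mm->page_table_lock);
}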
diff -purN -X /home/mbligh/.diff.exclude 274-percpu_real_loadavg/mm/memory.c 276-per_node_rss/mm/memory.c
--- 274-percpu_real_loadavg/mm/memory.c	2003-12-11 17:16:05.000000000 -0800
+++ 276-per_node_rss/mm/memory.c	2003-12-11 17:16:39.000000000 -0800
@@ -333,7 +333,7 @@ skip_copy_pte_range:
 				pte = pte_mkclean(pte);
 				pte = pte_mkold(pte);
 				get_page(page);
-				dst->rss++;
+				inc_rss(dst, page);
 				set_pte(dst_pte, pte);
 				pte_chain = page_add_rmap(page, dst_pte,
@@ -425,7 +425,14 @@ zap_pte_range(struct mmu_gather *tlb, pm
 				if (page->mapping && pte_young(pte) &&
 						!PageSwapCache(page))
 					mark_page_accessed(page);
-				tlb->freed++;
+				/*
+				 * While we have the page that is being
+				 * freed handy, make sure we decrement
+				 * the mm's RSS accordingly.  This is
+				 * only important for NUMA per-node
+				 * RSS accounting.
+				 */
+				dec_rss(tlb->mm, page);
 				page_remove_rmap(page, ptep);
 				tlb_remove_page(tlb, page);
 			}
@@ -1060,7 +1067,7 @@ static int do_wp_page(struct mm_struct *
 	page_table = pte_offset_map(pmd, address);
 	if (pte_same(*page_table, pte)) {
 		if (PageReserved(old_page))
-			++mm->rss;
+			inc_rss(mm, new_page);
 		page_remove_rmap(old_page, page_table);
 		break_cow(vma, new_page, address, page_table);
 		SetPageAnon(new_page);
@@ -1295,7 +1302,7 @@ static int do_swap_page(struct mm_struct
 	if (vm_swap_full())
 		remove_exclusive_swap_page(page);
 
-	mm->rss++;
+	inc_rss(mm, page);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if (write_access && can_share_swap_page(page))
 		pte = pte_mkdirty(pte_mkwrite(pte));
@@ -1365,7 +1372,7 @@ do_anonymous_page(struct mm_struct *mm,
 			ret = VM_FAULT_MINOR;
 			goto out;
 		}
-		mm->rss++;
+		inc_rss(mm, page);
 		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 		lru_cache_add_active(page);
 		mark_page_accessed(page);
@@ -1486,7 +1493,7 @@ retry:
 	/* Only go through if we didn't race with anybody else... */
 	if (pte_none(*page_table)) {
 		if (!PageReserved(new_page))
-			++mm->rss;
+			inc_rss(mm, new_page);
 		flush_icache_page(vma, new_page);
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		if (write_access)
diff -purN -X /home/mbligh/.diff.exclude 274-percpu_real_loadavg/mm/mmap.c 276-per_node_rss/mm/mmap.c
--- 274-percpu_real_loadavg/mm/mmap.c	2003-12-11 17:16:05.000000000 -0800
+++ 276-per_node_rss/mm/mmap.c	2003-12-11 17:16:39.000000000 -0800
@@ -1433,7 +1433,7 @@ void exit_mmap(struct mm_struct *mm)
 	vma = mm->mmap;
 	mm->mmap = mm->mmap_cache = NULL;
 	mm->mm_rb = RB_ROOT;
-	mm->rss = 0;
+	zero_rss(mm);
 	mm->total_vm = 0;
 	mm->locked_vm = 0;
diff -purN -X /home/mbligh/.diff.exclude 274-percpu_real_loadavg/mm/rmap.c 276-per_node_rss/mm/rmap.c
--- 274-percpu_real_loadavg/mm/rmap.c	2003-12-11 17:16:05.000000000 -0800
+++ 276-per_node_rss/mm/rmap.c	2003-12-11 17:16:39.000000000 -0800
@@ -623,7 +623,7 @@ static int try_to_unmap_one(struct page
 	if (pte_dirty(pte))
 		set_page_dirty(page);
 
-	mm->rss--;
+	dec_rss(mm, page);
 	page_cache_release(page);
 	ret = SWAP_SUCCESS;
diff -purN -X /home/mbligh/.diff.exclude 274-percpu_real_loadavg/mm/swapfile.c 276-per_node_rss/mm/swapfile.c
--- 274-percpu_real_loadavg/mm/swapfile.c	2003-12-11 17:16:05.000000000 -0800
+++ 276-per_node_rss/mm/swapfile.c	2003-12-11 17:16:39.000000000 -0800
@@ -387,7 +387,7 @@ static void
 unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
 	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
 {
-	vma->vm_mm->rss++;
+	inc_rss(vma->vm_mm, page);
 	get_page(page);
 	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
 	SetPageAnon(page);
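The zap_pte_range() change above is also why struct mmu_gather lost its freed counter: a single batched subtraction in tlb_finish_mmu() can no longer say which node each freed page came from. A hypothetical alternative, again only a sketch and not what this patch does, would keep the batching by accumulating per-node counts in the gather and folding them in once at finish time (all names below are invented):

struct pernode_gather {
	struct mm_struct *mm;
	unsigned long freed[MAX_NUMNODES];	/* pages freed, per node */
};

static inline void gather_page(struct pernode_gather *g, struct page *page)
{
	g->freed[page_to_nid(page)]++;
}

static inline void gather_finish(struct pernode_gather *g)
{
	int nid;

	/* fold the batch into the mm, as tlb_finish_mmu() once did */
	for (nid = 0; nid < MAX_NUMNODES; nid++) {
		g->mm->pernode_rss[nid] -= g->freed[nid];
		g->mm->rss -= g->freed[nid];
		g->freed[nid] = 0;
	}
}

The patch instead pays one dec_rss() per page, which keeps both counters exact while the page being freed is still handy, as its own comment in zap_pte_range() notes.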