Several functions in rmap.c search the ptes[] array to find the first
non-null entry.  Even though the whole array sits in L1 cache, this is
expensive, especially on 128-byte-cacheline machines.  We can encode the
index of the first non-null pte entry inside the pte_chain's `next' field
and remove those searches altogether.

This reduces the rmap CPU tax by about 25% on a P4, for a total runtime
reduction of around 5% in the bash-script-intensive test which I use.


 mm/rmap.c |   86 +++++++++++++++++++++++++++++++++++++++-----------------------
 1 files changed, 55 insertions(+), 31 deletions(-)

diff -puN mm/rmap.c~rmap-search-speedup mm/rmap.c
--- 25/mm/rmap.c~rmap-search-speedup	2003-04-08 03:16:24.000000000 -0700
+++ 25-akpm/mm/rmap.c	2003-04-08 03:16:24.000000000 -0700
@@ -46,15 +46,40 @@
  * We use an array of pte pointers in this structure to minimise cache misses
  * while traversing reverse maps.
  */
-#define NRPTE ((L1_CACHE_BYTES - sizeof(void *))/sizeof(pte_addr_t))
+#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t))
 
+/*
+ * next_and_idx encodes both the address of the next pte_chain and the
+ * offset of the highest-index used pte in ptes[].
+ */
 struct pte_chain {
-	struct pte_chain *next;
+	unsigned long next_and_idx;
 	pte_addr_t ptes[NRPTE];
 } ____cacheline_aligned;
 
 kmem_cache_t *pte_chain_cache;
 
+static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain)
+{
+	return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE);
+}
+
+static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr)
+{
+	return (struct pte_chain *)(pte_chain_addr & ~NRPTE);
+}
+
+static inline int pte_chain_idx(struct pte_chain *pte_chain)
+{
+	return pte_chain->next_and_idx & NRPTE;
+}
+
+static inline unsigned long
+pte_chain_encode(struct pte_chain *pte_chain, int idx)
+{
+	return (unsigned long)pte_chain | idx;
+}
+
 /*
  * pte_chain list management policy:
  *
@@ -89,7 +114,7 @@ kmem_cache_t *pte_chain_cache;
  */
 int page_referenced(struct page * page)
 {
-	struct pte_chain * pc;
+	struct pte_chain *pc;
 	int referenced = 0;
 
 	if (TestClearPageReferenced(page))
@@ -104,7 +129,7 @@ int page_referenced(struct page * page)
 		int nr_chains = 0;
 
 		/* Check all the page tables mapping this page. */
-		for (pc = page->pte.chain; pc; pc = pc->next) {
+		for (pc = page->pte.chain; pc; pc = pte_chain_next(pc)) {
 			int i;
 
 			for (i = NRPTE-1; i >= 0; i--) {
@@ -144,7 +169,6 @@ page_add_rmap(struct page *page, pte_t *
 {
 	pte_addr_t pte_paddr = ptep_to_paddr(ptep);
 	struct pte_chain *cur_pte_chain;
-	int i;
 
 #ifdef DEBUG_RMAP
 	if (!page || !ptep)
@@ -165,12 +189,14 @@ page_add_rmap(struct page *page, pte_t *
 	 * This stuff needs help to get up to highmem speed.
 	 */
 	{
-		struct pte_chain * pc;
+		struct pte_chain *pc;
+		int i;
+
 		if (PageDirect(page)) {
 			if (page->pte.direct == pte_paddr)
 				BUG();
 		} else {
-			for (pc = page->pte.chain; pc; pc = pc->next) {
+			for (pc = page->pte.chain; pc; pc=pte_chain_next(pc)) {
 				for (i = 0; i < NRPTE; i++) {
 					pte_addr_t p = pc->ptes[i];
 
@@ -194,6 +220,7 @@ page_add_rmap(struct page *page, pte_t *
 		ClearPageDirect(page);
 		pte_chain->ptes[NRPTE-1] = page->pte.direct;
 		pte_chain->ptes[NRPTE-2] = pte_paddr;
+		pte_chain->next_and_idx = pte_chain_encode(NULL, NRPTE-2);
 		page->pte.direct = 0;
 		page->pte.chain = pte_chain;
 		pte_chain = NULL;	/* We consumed it */
@@ -202,20 +229,15 @@ page_add_rmap(struct page *page, pte_t *
 
 	cur_pte_chain = page->pte.chain;
 	if (cur_pte_chain->ptes[0]) {	/* It's full */
-		pte_chain->next = cur_pte_chain;
+		pte_chain->next_and_idx = pte_chain_encode(cur_pte_chain,
+						NRPTE - 1);
 		page->pte.chain = pte_chain;
 		pte_chain->ptes[NRPTE-1] = pte_paddr;
 		pte_chain = NULL;	/* We consumed it */
 		goto out;
 	}
-
-	for (i = NRPTE-2; i >= 0; i--) {
-		if (!cur_pte_chain->ptes[i]) {
-			cur_pte_chain->ptes[i] = pte_paddr;
-			goto out;
-		}
-	}
-	BUG();
+	cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr;
+	cur_pte_chain->next_and_idx--;
 out:
 	pte_chain_unlock(page);
 	return pte_chain;
@@ -253,18 +275,18 @@ void page_remove_rmap(struct page * page
 		}
 	} else {
 		struct pte_chain *start = page->pte.chain;
+		struct pte_chain *next;
 		int victim_i = -1;
 
-		for (pc = start; pc; pc = pc->next) {
+		for (pc = start; pc; pc = next) {
 			int i;
 
-			if (pc->next)
-				prefetch(pc->next);
-			for (i = 0; i < NRPTE; i++) {
+			next = pte_chain_next(pc);
+			if (next)
+				prefetch(next);
+			for (i = pte_chain_idx(pc); i < NRPTE; i++) {
 				pte_addr_t pa = pc->ptes[i];
 
-				if (!pa)
-					continue;
 				if (victim_i == -1)
 					victim_i = i;
 				if (pa != pte_paddr)
@@ -273,10 +295,10 @@ void page_remove_rmap(struct page * page
 				start->ptes[victim_i] = 0;
 				if (victim_i == NRPTE-1) {
 					/* Emptied a pte_chain */
-					page->pte.chain = start->next;
+					page->pte.chain = pte_chain_next(start);
 					__pte_chain_free(start);
 				} else {
-					/* Do singleton->PageDirect here */
+					start->next_and_idx++;
 				}
 				goto out;
 			}
@@ -289,7 +311,7 @@ void page_remove_rmap(struct page * page
 	if (PageDirect(page)) {
 		printk("%llx", (u64)page->pte.direct);
 	} else {
-		for (pc = page->pte.chain; pc; pc = pc->next) {
+		for (pc = page->pte.chain; pc; pc = pte_chain_next(pc)) {
 			int i;
 			for (i = 0; i < NRPTE; i++)
 				printk(" %d:%llx", i, (u64)pc->ptes[i]);
@@ -439,10 +461,10 @@ int try_to_unmap(struct page * page)
 	for (pc = start; pc; pc = next_pc) {
 		int i;
 
-		next_pc = pc->next;
+		next_pc = pte_chain_next(pc);
 		if (next_pc)
 			prefetch(next_pc);
-		for (i = 0; i < NRPTE; i++) {
+		for (i = pte_chain_idx(pc); i < NRPTE; i++) {
 			pte_addr_t pte_paddr = pc->ptes[i];
 
 			if (!pte_paddr)
@@ -462,10 +484,12 @@ int try_to_unmap(struct page * page)
 				start->ptes[victim_i] = 0;
 				victim_i++;
 				if (victim_i == NRPTE) {
-					page->pte.chain = start->next;
+					page->pte.chain = pte_chain_next(start);
 					__pte_chain_free(start);
 					start = page->pte.chain;
 					victim_i = 0;
+				} else {
+					start->next_and_idx++;
 				}
 				break;
 			case SWAP_AGAIN:
@@ -507,8 +531,8 @@ void __pte_chain_free(struct pte_chain *
 	int cpu = get_cpu();
 	struct pte_chain **pte_chainp;
 
-	if (pte_chain->next)
-		pte_chain->next = NULL;
+	if (pte_chain->next_and_idx)
+		pte_chain->next_and_idx = 0;
 	pte_chainp = &per_cpu(local_pte_chain, cpu);
 	if (*pte_chainp)
 		kmem_cache_free(pte_chain_cache, *pte_chainp);
@@ -553,7 +577,7 @@ void __init pte_chain_init(void)
 	pte_chain_cache = kmem_cache_create(	"pte_chain",
 						sizeof(struct pte_chain),
 						0,
-						0,
+						SLAB_MUST_HWCACHE_ALIGN,
 						pte_chain_ctor,
 						NULL);
 
_
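
For anyone who hasn't seen the pointer-tagging trick before, here is a
minimal stand-alone user-space sketch of the idea (illustration only, not
kernel code): because each pte_chain is cacheline-aligned, the low bits of
its address are always zero, so a small slot index can travel in the same
word as the next-pointer.  The 64-byte alignment and the names used below
(struct chain, chain_next, chain_idx, chain_encode, chain_push) are
assumptions made up for this example.

/*
 * Illustrative user-space sketch only -- not kernel code.  Assumes a
 * 64-byte "cacheline" and invented names.
 */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define ALIGN_BYTES	64			/* assumed cacheline size */
#define IDX_MASK	(ALIGN_BYTES - 1UL)	/* low address bits are free */
#define NSLOTS		7			/* fills the rest of the line */

struct chain {
	unsigned long next_and_idx;	/* tagged pointer: next chain | index */
	unsigned long slots[NSLOTS];
} __attribute__((aligned(ALIGN_BYTES)));

static struct chain *chain_next(struct chain *c)
{
	return (struct chain *)(c->next_and_idx & ~IDX_MASK);
}

static unsigned int chain_idx(struct chain *c)
{
	return c->next_and_idx & IDX_MASK;	/* lowest occupied slot */
}

static unsigned long chain_encode(struct chain *next, unsigned int idx)
{
	assert(idx <= IDX_MASK);	/* index must fit in the spare bits */
	return (unsigned long)next | idx;
}

/*
 * Fill slots[] from the top down, the way page_add_rmap() now does: the
 * tag always names the lowest occupied slot, so no search is needed.
 */
static void chain_push(struct chain *c, unsigned long val)
{
	unsigned int idx = chain_idx(c);

	assert(idx > 0);	/* caller must hand us a non-full chain */
	c->slots[idx - 1] = val;
	c->next_and_idx--;	/* only the index bits change */
}

int main(void)
{
	struct chain *a = aligned_alloc(ALIGN_BYTES, sizeof(struct chain));
	struct chain *b = aligned_alloc(ALIGN_BYTES, sizeof(struct chain));

	if (!a || !b)
		return 1;

	/* b is empty and links to a: the index starts one past the last slot */
	b->next_and_idx = chain_encode(a, NSLOTS);

	chain_push(b, 0x1000);	/* lands in slots[NSLOTS - 1] */
	chain_push(b, 0x2000);	/* lands in slots[NSLOTS - 2] */

	printf("next ok: %d, lowest used slot: %u\n",
	       chain_next(b) == a, chain_idx(b));

	free(a);
	free(b);
	return 0;
}

The kernel helpers in the patch get away with using NRPTE itself as the
mask because, for the usual cacheline-size / pte_addr_t combinations,
NRPTE works out to 2^n - 1 (31, 15 or 7), so it doubles as the low-bit
mask.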