Move everything over to walking the radix tree via the PAGECACHE_TAG_DIRTY tag. Remove address_space.dirty_pages. --- 25-akpm/fs/buffer.c | 18 +---- 25-akpm/fs/fs-writeback.c | 15 ---- 25-akpm/fs/inode.c | 1 25-akpm/fs/mpage.c | 127 +++++++++++++++------------------------ 25-akpm/fs/xfs/linux/xfs_vnode.h | 3 25-akpm/include/linux/fs.h | 3 25-akpm/include/linux/pagemap.h | 7 +- 25-akpm/include/linux/pagevec.h | 7 +- 25-akpm/mm/filemap.c | 35 +++++++--- 25-akpm/mm/page-writeback.c | 29 +++++--- 25-akpm/mm/page_alloc.c | 2 25-akpm/mm/swap.c | 12 +++ 25-akpm/mm/swap_state.c | 3 fs/xfs/xfs_vfsops.c | 0 mm/truncate.c | 0 mm/vmscan.c | 0 16 files changed, 122 insertions(+), 140 deletions(-) diff -puN fs/buffer.c~stop-using-dirty-pages fs/buffer.c --- 25/fs/buffer.c~stop-using-dirty-pages 2004-04-03 03:00:12.216374384 -0800 +++ 25-akpm/fs/buffer.c 2004-04-03 03:00:12.242370432 -0800 @@ -825,12 +825,6 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean * page on the dirty page list. * - * There is also a small window where the page is dirty, and not on dirty_pages. - * Also a possibility that by the time the page is added to dirty_pages, it has - * been set clean. The page lists are somewhat approximate in this regard. - * It's better to have clean pages accidentally attached to dirty_pages than to - * leave dirty pages attached to clean_pages. - * * We use private_lock to lock against try_to_free_buffers while using the * page's buffer list. Also use this to protect against clean buffers being * added to the page after it was set dirty. @@ -871,8 +865,6 @@ int __set_page_dirty_buffers(struct page if (page->mapping) { /* Race with truncate? */ if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); - list_del(&page->list); - list_add(&page->list, &mapping->dirty_pages); radix_tree_tag_set(&mapping->page_tree, page->index, PAGECACHE_TAG_DIRTY); } @@ -1228,7 +1220,7 @@ __getblk_slow(struct block_device *bdev, * The relationship between dirty buffers and dirty pages: * * Whenever a page has any dirty buffers, the page's dirty bit is set, and - * the page appears on its address_space.dirty_pages list. + * the page is tagged dirty in its radix tree. * * At all times, the dirtiness of the buffers represents the dirtiness of * subsections of the page. If the page has buffers, the page dirty bit is @@ -1250,10 +1242,10 @@ __getblk_slow(struct block_device *bdev, /** * mark_buffer_dirty - mark a buffer_head as needing writeout * - * mark_buffer_dirty() will set the dirty bit against the buffer, - * then set its backing page dirty, then attach the page to its - * address_space's dirty_pages list and then attach the address_space's - * inode to its superblock's dirty inode list. + * mark_buffer_dirty() will set the dirty bit against the buffer, then set its + * backing page dirty, then tag the page as dirty in its address_space's radix + * tree and then attach the address_space's inode to its superblock's dirty + * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, * mapping->tree_lock and the global inode_lock. diff -puN fs/fs-writeback.c~stop-using-dirty-pages fs/fs-writeback.c --- 25/fs/fs-writeback.c~stop-using-dirty-pages 2004-04-03 03:00:12.217374232 -0800 +++ 25-akpm/fs/fs-writeback.c 2004-04-03 03:00:12.243370280 -0800 @@ -129,12 +129,6 @@ static void write_inode(struct inode *in * starvation of particular inodes when others are being redirtied, prevent * livelocks, etc. 
* - * So what we do is to move all pages which are to be written from dirty_pages - * onto io_pages. And keep on writing io_pages until it's empty. Refusing to - * move more pages onto io_pages until io_pages is empty. Once that point has - * been reached, we are ready to take another pass across the inode's dirty - * pages. - * * Called under inode_lock. */ static int @@ -159,10 +153,6 @@ __sync_single_inode(struct inode *inode, * read speculatively by this cpu before &= ~I_DIRTY -- mikulas */ - spin_lock_irq(&mapping->tree_lock); - if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages)) - list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock_irq(&mapping->tree_lock); spin_unlock(&inode_lock); ret = do_writepages(mapping, wbc); @@ -180,10 +170,7 @@ __sync_single_inode(struct inode *inode, spin_lock(&inode_lock); inode->i_state &= ~I_LOCK; if (!(inode->i_state & I_FREEING)) { - if (!list_empty(&mapping->io_pages)) { - /* Needs more writeback */ - inode->i_state |= I_DIRTY_PAGES; - } else if (!list_empty(&mapping->dirty_pages)) { + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* Redirtied */ inode->i_state |= I_DIRTY_PAGES; inode->dirtied_when = jiffies; diff -puN fs/inode.c~stop-using-dirty-pages fs/inode.c --- 25/fs/inode.c~stop-using-dirty-pages 2004-04-03 03:00:12.219373928 -0800 +++ 25-akpm/fs/inode.c 2004-04-03 03:00:12.244370128 -0800 @@ -179,7 +179,6 @@ void inode_init_once(struct inode *inode memset(inode, 0, sizeof(*inode)); INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_data.clean_pages); - INIT_LIST_HEAD(&inode->i_data.dirty_pages); INIT_LIST_HEAD(&inode->i_data.locked_pages); INIT_LIST_HEAD(&inode->i_data.io_pages); INIT_LIST_HEAD(&inode->i_dentry); diff -puN fs/mpage.c~stop-using-dirty-pages fs/mpage.c --- 25/fs/mpage.c~stop-using-dirty-pages 2004-04-03 03:00:12.220373776 -0800 +++ 25-akpm/fs/mpage.c 2004-04-03 03:00:12.245369976 -0800 @@ -592,28 +592,12 @@ out: * (The next two paragraphs refer to code which isn't here yet, but they * explain the presence of address_space.io_pages) * - * Pages can be moved from clean_pages or locked_pages onto dirty_pages - * at any time - it's not possible to lock against that. So pages which - * have already been added to a BIO may magically reappear on the dirty_pages - * list. And mpage_writepages() will again try to lock those pages. - * But I/O has not yet been started against the page. Thus deadlock. - * - * To avoid this, mpage_writepages() will only write pages from io_pages. The - * caller must place them there. We walk io_pages, locking the pages and - * submitting them for I/O, moving them to locked_pages. - * - * This has the added benefit of preventing a livelock which would otherwise - * occur if pages are being dirtied faster than we can write them out. - * * If a page is already under I/O, generic_writepages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() * and msync() need to guarantee that all the data which was dirty at the time * the call was made get new I/O started against them. So if called_for_sync() * is true, we must wait for existing IO to complete. - * - * It's fairly rare for PageWriteback pages to be on ->dirty_pages. It - * means that someone redirtied the page while it was under I/O. 
*/ int mpage_writepages(struct address_space *mapping, @@ -625,6 +609,9 @@ mpage_writepages(struct address_space *m int ret = 0; int done = 0; int (*writepage)(struct page *page, struct writeback_control *wbc); + struct pagevec pvec; + int nr_pages; + pgoff_t index; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; @@ -635,72 +622,58 @@ mpage_writepages(struct address_space *m if (get_block == NULL) writepage = mapping->a_ops->writepage; - spin_lock_irq(&mapping->tree_lock); - while (!list_empty(&mapping->io_pages) && !done) { - struct page *page = list_entry(mapping->io_pages.prev, - struct page, list); - list_del(&page->list); - if (PageWriteback(page) && wbc->sync_mode == WB_SYNC_NONE) { - if (PageDirty(page)) { - list_add(&page->list, &mapping->dirty_pages); - continue; - } - list_add(&page->list, &mapping->locked_pages); - continue; - } - if (!PageDirty(page)) { - list_add(&page->list, &mapping->clean_pages); - continue; - } - list_add(&page->list, &mapping->locked_pages); - - page_cache_get(page); - spin_unlock_irq(&mapping->tree_lock); - - /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file mapping. - */ - - lock_page(page); - - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (page->mapping == mapping && !PageWriteback(page) && - test_clear_page_dirty(page)) { - if (writepage) { - ret = (*writepage)(page, wbc); - if (ret) { - if (ret == -ENOSPC) - set_bit(AS_ENOSPC, - &mapping->flags); - else - set_bit(AS_EIO, - &mapping->flags); + pagevec_init(&pvec, 0); + index = 0; + while (!done && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + unsigned i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + + lock_page(page); + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (page->mapping == mapping && !PageWriteback(page) && + test_clear_page_dirty(page)) { + if (writepage) { + ret = (*writepage)(page, wbc); + if (ret) { + if (ret == -ENOSPC) + set_bit(AS_ENOSPC, + &mapping->flags); + else + set_bit(AS_EIO, + &mapping->flags); + } + } else { + bio = mpage_writepage(bio, page, + get_block, &last_block_in_bio, + &ret, wbc); + } + if (ret || (--(wbc->nr_to_write) <= 0)) + done = 1; + if (wbc->nonblocking && + bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; } } else { - bio = mpage_writepage(bio, page, get_block, - &last_block_in_bio, &ret, wbc); - } - if (ret || (--(wbc->nr_to_write) <= 0)) - done = 1; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - done = 1; + unlock_page(page); } - } else { - unlock_page(page); } - page_cache_release(page); - spin_lock_irq(&mapping->tree_lock); + pagevec_release(&pvec); } - /* - * Leave any remaining dirty pages on ->io_pages - */ - spin_unlock_irq(&mapping->tree_lock); if (bio) mpage_bio_submit(WRITE, bio); return ret; diff -puN include/linux/fs.h~stop-using-dirty-pages include/linux/fs.h --- 25/include/linux/fs.h~stop-using-dirty-pages 2004-04-03 03:00:12.221373624 -0800 +++ 25-akpm/include/linux/fs.h 2004-04-03 
03:00:12.247369672 -0800 @@ -324,7 +324,6 @@ struct address_space { struct radix_tree_root page_tree; /* radix tree of all pages */ spinlock_t tree_lock; /* and spinlock protecting it */ struct list_head clean_pages; /* list of clean pages */ - struct list_head dirty_pages; /* list of dirty pages */ struct list_head locked_pages; /* list of locked pages */ struct list_head io_pages; /* being prepared for I/O */ unsigned long nrpages; /* number of total pages */ @@ -371,6 +370,8 @@ struct block_device { #define PAGECACHE_TAG_DIRTY 0 #define PAGECACHE_TAG_WRITEBACK 1 +int mapping_tagged(struct address_space *mapping, int tag); + /* * Use sequence counter to get consistent i_size on 32-bit processors. */ diff -puN include/linux/pagemap.h~stop-using-dirty-pages include/linux/pagemap.h --- 25/include/linux/pagemap.h~stop-using-dirty-pages 2004-04-03 03:00:12.223373320 -0800 +++ 25-akpm/include/linux/pagemap.h 2004-04-03 03:00:12.248369520 -0800 @@ -69,9 +69,10 @@ extern struct page * find_trylock_page(s unsigned long index); extern struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask); -extern unsigned int find_get_pages(struct address_space *mapping, - pgoff_t start, unsigned int nr_pages, - struct page **pages); +unsigned find_get_pages(struct address_space *mapping, pgoff_t start, + unsigned int nr_pages, struct page **pages); +unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, + int tag, unsigned int nr_pages, struct page **pages); /* * Returns locked page at given index in given cache, creating it if needed. diff -puN include/linux/pagevec.h~stop-using-dirty-pages include/linux/pagevec.h --- 25/include/linux/pagevec.h~stop-using-dirty-pages 2004-04-03 03:00:12.224373168 -0800 +++ 25-akpm/include/linux/pagevec.h 2004-04-03 03:00:12.248369520 -0800 @@ -22,8 +22,11 @@ void __pagevec_free(struct pagevec *pvec void __pagevec_lru_add(struct pagevec *pvec); void __pagevec_lru_add_active(struct pagevec *pvec); void pagevec_strip(struct pagevec *pvec); -unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, - pgoff_t start, unsigned int nr_pages); +unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, + pgoff_t start, unsigned nr_pages); +unsigned pagevec_lookup_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, int tag, + unsigned nr_pages); static inline void pagevec_init(struct pagevec *pvec, int cold) { diff -puN mm/filemap.c~stop-using-dirty-pages mm/filemap.c --- 25/mm/filemap.c~stop-using-dirty-pages 2004-04-03 03:00:12.225373016 -0800 +++ 25-akpm/mm/filemap.c 2004-04-03 03:00:12.250369216 -0800 @@ -100,9 +100,7 @@ void __remove_from_page_cache(struct pag struct address_space *mapping = page->mapping; radix_tree_delete(&mapping->page_tree, page->index); - list_del(&page->list); page->mapping = NULL; - mapping->nrpages--; pagecache_acct(-1); } @@ -148,9 +146,6 @@ static int __filemap_fdatawrite(struct a if (mapping->backing_dev_info->memory_backed) return 0; - spin_lock_irq(&mapping->tree_lock); - list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock_irq(&mapping->tree_lock); ret = do_writepages(mapping, &wbc); return ret; } @@ -190,11 +185,7 @@ restart: struct page *page; page = list_entry(mapping->locked_pages.next,struct page,list); - list_del(&page->list); - if (PageDirty(page)) - list_add(&page->list, &mapping->dirty_pages); - else - list_add(&page->list, &mapping->clean_pages); + list_del_init(&page->list); if 
(!PageWriteback(page)) { if (++progress > 32) { @@ -228,7 +219,6 @@ restart: return ret; } - EXPORT_SYMBOL(filemap_fdatawait); int filemap_write_and_wait(struct address_space *mapping) @@ -539,7 +529,7 @@ EXPORT_SYMBOL(find_or_create_page); * * find_get_pages() returns the number of pages which were found. */ -unsigned int find_get_pages(struct address_space *mapping, pgoff_t start, +unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages) { unsigned int i; @@ -555,6 +545,27 @@ unsigned int find_get_pages(struct addre } /* + * Like find_get_pages, except we only return pages which are tagged with + * `tag'. We update *start to index the next page for the traversal. + */ +unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, + int tag, unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + unsigned int ret; + + spin_lock_irq(&mapping->tree_lock); + ret = radix_tree_gang_lookup_tag(&mapping->page_tree, + (void **)pages, *index, nr_pages, tag); + for (i = 0; i < ret; i++) + page_cache_get(pages[i]); + if (ret) + *index = pages[ret - 1]->index + 1; + spin_unlock_irq(&mapping->tree_lock); + return ret; +} + +/* * Same as grab_cache_page, but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should diff -puN mm/page-writeback.c~stop-using-dirty-pages mm/page-writeback.c --- 25/mm/page-writeback.c~stop-using-dirty-pages 2004-04-03 03:00:12.227372712 -0800 +++ 25-akpm/mm/page-writeback.c 2004-04-03 03:00:12.251369064 -0800 @@ -472,12 +472,8 @@ int write_one_page(struct page *page, in if (wait) wait_on_page_writeback(page); - spin_lock_irq(&mapping->tree_lock); - list_del(&page->list); if (test_clear_page_dirty(page)) { - list_add(&page->list, &mapping->locked_pages); page_cache_get(page); - spin_unlock_irq(&mapping->tree_lock); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { wait_on_page_writeback(page); @@ -486,8 +482,6 @@ int write_one_page(struct page *page, in } page_cache_release(page); } else { - list_add(&page->list, &mapping->clean_pages); - spin_unlock_irq(&mapping->tree_lock); unlock_page(page); } return ret; @@ -495,9 +489,8 @@ int write_one_page(struct page *page, in EXPORT_SYMBOL(write_one_page); /* - * For address_spaces which do not use buffers. Just set the page's dirty bit - * and move it to the dirty_pages list. Also perform space reservation if - * required. + * For address_spaces which do not use buffers. Just tag the page as dirty in + * its radix tree. * * __set_page_dirty_nobuffers() may return -ENOSPC. But if it does, the page * is still safe, as long as it actually manages to find some blocks at @@ -520,8 +513,6 @@ int __set_page_dirty_nobuffers(struct pa BUG_ON(page->mapping != mapping); if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); - list_del(&page->list); - list_add(&page->list, &mapping->dirty_pages); radix_tree_tag_set(&mapping->page_tree, page->index, PAGECACHE_TAG_DIRTY); } @@ -646,3 +637,19 @@ int test_set_page_writeback(struct page } EXPORT_SYMBOL(test_set_page_writeback); + +/* + * Return true if any of the pages in the mapping are marged with the + * passed tag. 
+ */
+int mapping_tagged(struct address_space *mapping, int tag)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&mapping->tree_lock, flags);
+	ret = radix_tree_tagged(&mapping->page_tree, tag);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(mapping_tagged);
diff -puN mm/swap.c~stop-using-dirty-pages mm/swap.c
--- 25/mm/swap.c~stop-using-dirty-pages	2004-04-03 03:00:12.228372560 -0800
+++ 25-akpm/mm/swap.c	2004-04-03 03:00:12.252368912 -0800
@@ -353,13 +353,21 @@ void pagevec_strip(struct pagevec *pvec)
  *
  * pagevec_lookup() returns the number of pages which were found.
  */
-unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
-		pgoff_t start, unsigned int nr_pages)
+unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t start, unsigned nr_pages)
 {
 	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
 	return pagevec_count(pvec);
 }
 
+unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t *index, int tag, unsigned nr_pages)
+{
+	pvec->nr = find_get_pages_tag(mapping, index, tag,
+					nr_pages, pvec->pages);
+	return pagevec_count(pvec);
+}
+
 #ifdef CONFIG_SMP
 /*
diff -puN mm/swap_state.c~stop-using-dirty-pages mm/swap_state.c
--- 25/mm/swap_state.c~stop-using-dirty-pages	2004-04-03 03:00:12.230372256 -0800
+++ 25-akpm/mm/swap_state.c	2004-04-03 03:00:12.252368912 -0800
@@ -27,7 +27,6 @@ struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
 	.tree_lock	= SPIN_LOCK_UNLOCKED,
 	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
-	.dirty_pages	= LIST_HEAD_INIT(swapper_space.dirty_pages),
 	.io_pages	= LIST_HEAD_INIT(swapper_space.io_pages),
 	.locked_pages	= LIST_HEAD_INIT(swapper_space.locked_pages),
 	.a_ops		= &swap_aops,
@@ -210,7 +209,6 @@ int move_to_swap_cache(struct page *page
 	if (!err) {
 		if (!swap_duplicate(entry))
 			BUG();
-		/* shift page from clean_pages to dirty_pages list */
 		BUG_ON(PageDirty(page));
 		set_page_dirty(page);
 		INC_CACHE_INFO(add_total);
@@ -245,7 +243,6 @@ int move_from_swap_cache(struct page *pa
 	if (!err) {
 		swap_free(entry);
-		/* shift page from clean_pages to dirty_pages list */
 		__clear_page_dirty(page);
 		set_page_dirty(page);
 	}
diff -puN mm/truncate.c~stop-using-dirty-pages mm/truncate.c
diff -puN mm/vmscan.c~stop-using-dirty-pages mm/vmscan.c
diff -puN fs/xfs/xfs_vfsops.c~stop-using-dirty-pages fs/xfs/xfs_vfsops.c
diff -puN fs/xfs/linux/xfs_vnode.h~stop-using-dirty-pages fs/xfs/linux/xfs_vnode.h
--- 25/fs/xfs/linux/xfs_vnode.h~stop-using-dirty-pages	2004-04-03 03:00:12.235371496 -0800
+++ 25-akpm/fs/xfs/linux/xfs_vnode.h	2004-04-03 03:00:12.253368760 -0800
@@ -600,7 +600,8 @@ static __inline__ void vn_flagclr(struct
 		(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \
 		(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared))))
 #define VN_CACHED(vp)	(LINVFS_GET_IP(vp)->i_mapping->nrpages)
-#define VN_DIRTY(vp)	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->dirty_pages)))
+#define VN_DIRTY(vp)	mapping_tagged(LINVFS_GET_IP(vp)->i_mapping, \
+					PAGECACHE_TAG_DIRTY)
 #define VMODIFY(vp)	VN_FLAGSET(vp, VMODIFIED)
 #define VUNMODIFY(vp)	VN_FLAGCLR(vp, VMODIFIED)
diff -puN mm/page_alloc.c~stop-using-dirty-pages mm/page_alloc.c
--- 25/mm/page_alloc.c~stop-using-dirty-pages	2004-04-03 03:00:12.237371192 -0800
+++ 25-akpm/mm/page_alloc.c	2004-04-03 03:00:12.254368608 -0800
@@ -682,6 +682,8 @@ nopage:
 	return NULL;
 got_pg:
 	kernel_map_pages(page, 1 << order, 1);
+	INIT_LIST_HEAD(&page->list);
+	INIT_LIST_HEAD(&page->lru);
 	return page;
 }
_
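
For reference, the writeback pattern this patch moves everything to: instead of splicing dirty_pages onto io_pages and walking lists, writers now repeatedly gang-look-up pages carrying PAGECACHE_TAG_DIRTY and recheck each page under its lock.  A minimal sketch of that loop, using only interfaces this patch adds or relies on (pagevec_lookup_tag() and the dirty tag); the function name example_write_dirty_pages() is made up for illustration, and the congestion/nr_to_write/sync-mode handling which the real mpage_writepages() performs is omitted:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>

/* Illustrative only: walk a mapping's dirty pages via the radix-tree tag. */
static int example_write_dirty_pages(struct address_space *mapping,
				     struct writeback_control *wbc)
{
	struct pagevec pvec;
	pgoff_t index = 0;	/* advanced by pagevec_lookup_tag() */
	unsigned nr_pages;
	unsigned i;
	int ret = 0;

	pagevec_init(&pvec, 0);
	while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
				PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) {
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * Nothing is locked at this point, so the page may
			 * have been truncated or cleaned since the lookup:
			 * recheck under the page lock before writing it out.
			 */
			lock_page(page);
			if (page->mapping == mapping && !PageWriteback(page) &&
					test_clear_page_dirty(page)) {
				/* ->writepage() unlocks the page itself */
				ret = mapping->a_ops->writepage(page, wbc);
				if (ret)
					break;
			} else {
				unlock_page(page);
			}
		}
		pagevec_release(&pvec);	/* drops the lookup's page refs */
		if (ret)
			break;
	}
	return ret;
}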
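Similarly, mapping_tagged() replaces the old "is dirty_pages non-empty?" tests (the redirty check in __sync_single_inode() and XFS's VN_DIRTY()).  A hedged sketch of how a caller might use it to skip needless work; example_flush_if_dirty() is a hypothetical helper, not something this patch adds:

#include <linux/fs.h>
#include <linux/pagemap.h>

/*
 * Illustrative only: the radix-tree tag answers "does this mapping have
 * any dirty pages?" without walking a list, so a caller can avoid the
 * writeback path entirely when there is nothing to do.
 */
static int example_flush_if_dirty(struct address_space *mapping)
{
	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		return 0;	/* no pages tagged dirty, nothing to write */

	return filemap_fdatawrite(mapping);
}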