From: Trond Myklebust

Hi Andrew,

Mind if I send this in to Linus? On my test setup it makes the difference
between an immediate hang when I run iozone with more than one thread on a
Gbyte file, and being able to run sensibly with 8 threads or more.

Please note the slight modification to nfs_writepages w.r.t. older
versions of this patch.


 25-akpm/fs/nfs/write.c             |   12 +++++++++++-
 25-akpm/include/linux/page-flags.h |    3 +++
 25-akpm/mm/page-writeback.c        |   34 ++++++++++++++++++++--------------
 25-akpm/mm/page_alloc.c            |    4 +++-
 4 files changed, 37 insertions(+), 16 deletions(-)

diff -puN fs/nfs/write.c~nfs-resource-management fs/nfs/write.c
--- 25/fs/nfs/write.c~nfs-resource-management	Mon Apr  7 12:26:00 2003
+++ 25-akpm/fs/nfs/write.c	Mon Apr  7 12:26:00 2003
@@ -283,8 +283,14 @@ nfs_writepages(struct address_space *map
 	err = nfs_flush_file(inode, NULL, 0, 0, 0);
 	if (err < 0)
 		goto out;
-	if (is_sync)
+	if (wbc->sync_mode == WB_SYNC_HOLD)
+		goto out;
+	if (is_sync && wbc->sync_mode == WB_SYNC_ALL) {
 		err = nfs_wb_all(inode);
+	} else
+		nfs_commit_file(inode, NULL, 0, 0, 0);
+	/* Avoid races. Tell upstream we've done all we were told to do */
+	wbc->nr_to_write = 0;
 out:
 	return err;
 }
@@ -371,6 +377,7 @@ nfs_mark_request_dirty(struct nfs_page *
 	spin_lock(&nfs_wreq_lock);
 	nfs_list_add_request(req, &nfsi->dirty);
 	nfsi->ndirty++;
+	inc_page_state(nr_dirty);
 	spin_unlock(&nfs_wreq_lock);
 	mark_inode_dirty(inode);
 }
@@ -398,6 +405,7 @@ nfs_mark_request_commit(struct nfs_page
 	spin_lock(&nfs_wreq_lock);
 	nfs_list_add_request(req, &nfsi->commit);
 	nfsi->ncommit++;
+	inc_page_state(nr_unstable);
 	spin_unlock(&nfs_wreq_lock);
 	mark_inode_dirty(inode);
 }
@@ -466,6 +474,7 @@ nfs_scan_dirty(struct inode *inode, stru
 	int	res;
 	res = nfs_scan_list(&nfsi->dirty, dst, file, idx_start, npages);
 	nfsi->ndirty -= res;
+	sub_page_state(nr_dirty,res);
 	if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty))
 		printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n");
 	return res;
@@ -490,6 +499,7 @@ nfs_scan_commit(struct inode *inode, str
 	int	res;
 	res = nfs_scan_list(&nfsi->commit, dst, file, idx_start, npages);
 	nfsi->ncommit -= res;
+	sub_page_state(nr_unstable,res);
 	if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
 		printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
 	return res;
diff -puN include/linux/page-flags.h~nfs-resource-management include/linux/page-flags.h
--- 25/include/linux/page-flags.h~nfs-resource-management	Mon Apr  7 12:26:00 2003
+++ 25-akpm/include/linux/page-flags.h	Mon Apr  7 12:26:00 2003
@@ -76,6 +76,7 @@
 #define PG_compound		19	/* Part of a compound page */
 #define PG_anon			20	/* Anonymous page */
 
+
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
  * allowed.
@@ -83,6 +84,7 @@ struct page_state {
 	unsigned long nr_dirty;		/* Dirty writeable pages */
 	unsigned long nr_writeback;	/* Pages under writeback */
+	unsigned long nr_unstable;	/* NFS unstable pages */
 	unsigned long nr_page_table_pages;/* Pages used for pagetables */
 	unsigned long nr_mapped;	/* mapped into pagetables */
 	unsigned long nr_slab;		/* In slab */
@@ -131,6 +133,7 @@ extern void get_full_page_state(struct p
 
 #define inc_page_state(member)	mod_page_state(member, 1UL)
 #define dec_page_state(member)	mod_page_state(member, 0UL - 1)
+#define sub_page_state(member,delta)	mod_page_state(member, 0UL - (delta))
 
 /*
diff -puN mm/page_alloc.c~nfs-resource-management mm/page_alloc.c
--- 25/mm/page_alloc.c~nfs-resource-management	Mon Apr  7 12:26:00 2003
+++ 25-akpm/mm/page_alloc.c	Mon Apr  7 12:26:00 2003
@@ -943,11 +943,12 @@ void show_free_areas(void)
 		K(nr_free_pages()),
 		K(nr_free_highpages()));
 
-	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu free:%u\n",
+	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu free:%u\n",
 		active,
 		inactive,
 		ps.nr_dirty,
 		ps.nr_writeback,
+		ps.nr_unstable,
 		nr_free_pages());
 
 	for_each_zone(zone) {
@@ -1438,6 +1439,7 @@ struct seq_operations fragmentation_op =
 static char *vmstat_text[] = {
 	"nr_dirty",
 	"nr_writeback",
+	"nr_unstable",
 	"nr_page_table_pages",
 	"nr_mapped",
 	"nr_slab",
diff -puN mm/page-writeback.c~nfs-resource-management mm/page-writeback.c
--- 25/mm/page-writeback.c~nfs-resource-management	Mon Apr  7 12:26:00 2003
+++ 25-akpm/mm/page-writeback.c	Mon Apr  7 12:26:00 2003
@@ -138,6 +138,7 @@ get_dirty_limits(struct page_state *ps,
 void balance_dirty_pages(struct address_space *mapping)
 {
 	struct page_state ps;
+	long nr_reclaimable;
 	long background_thresh;
 	long dirty_thresh;
 	unsigned long pages_written = 0;
@@ -145,8 +146,7 @@ void balance_dirty_pages(struct address_
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 
-	get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
-	while (ps.nr_dirty + ps.nr_writeback > dirty_thresh) {
+	for (;;) {
 		struct writeback_control wbc = {
 			.bdi		= bdi,
 			.sync_mode	= WB_SYNC_NONE,
@@ -154,24 +154,30 @@ void balance_dirty_pages(struct address_
 			.nr_to_write	= write_chunk,
 		};
 
+		get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
+		nr_reclaimable = ps.nr_dirty + ps.nr_unstable;
+		if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
+			break;
+
 		dirty_exceeded = 1;
 
-		if (ps.nr_dirty)
+		if (nr_reclaimable) {
 			writeback_inodes(&wbc);
-
-		get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
-		if (ps.nr_dirty + ps.nr_writeback <= dirty_thresh)
-			break;
-		pages_written += write_chunk - wbc.nr_to_write;
-		if (pages_written >= write_chunk)
-			break;		/* We've done our duty */
+			get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
+			nr_reclaimable = ps.nr_dirty + ps.nr_unstable;
+			if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
+				break;
+			pages_written += write_chunk - wbc.nr_to_write;
+			if (pages_written >= write_chunk)
+				break;	/* We've done our duty */
+		}
 		blk_congestion_wait(WRITE, HZ/10);
 	}
 
-	if (ps.nr_dirty + ps.nr_writeback <= dirty_thresh)
+	if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
 		dirty_exceeded = 0;
 
-	if (!writeback_in_progress(bdi) && ps.nr_dirty > background_thresh)
+	if (!writeback_in_progress(bdi) && nr_reclaimable > background_thresh)
 		pdflush_operation(background_writeout, 0);
 }
@@ -231,7 +237,7 @@ static void background_writeout(unsigned
 		long dirty_thresh;
 
 		get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
-		if (ps.nr_dirty < background_thresh && min_pages <= 0)
+		if (ps.nr_dirty + ps.nr_unstable < background_thresh && min_pages <= 0)
 			break;
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
@@ -302,7 +308,7 @@ static void wb_kupdate(unsigned long arg
 	oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
 	start_jif = jiffies;
 	next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
-	nr_to_write = ps.nr_dirty;
+	nr_to_write = ps.nr_dirty + ps.nr_unstable;
 	while (nr_to_write > 0) {
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
_
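
A note on the new sub_page_state() helper above: it leans on well-defined
unsigned wraparound, so mod_page_state() only ever needs a single "add this
delta" primitive to move a counter in either direction (0UL - 1 is
ULONG_MAX, and adding it is exactly a decrement modulo 2^BITS_PER_LONG).
Below is a minimal userspace sketch of that arithmetic; mod_counter() and
friends are hypothetical stand-ins, not the kernel's per-CPU
implementation, and one plain counter stands in for a struct page_state
field that the kernel would keep per CPU and sum on read.

	/*
	 * Userspace sketch only -- all names here are made up for
	 * illustration.  Shows why adding 0UL - delta to an unsigned
	 * long is equivalent to subtracting delta.
	 */
	#include <assert.h>

	static unsigned long nr_unstable;	/* stand-in for one page_state field */

	static void mod_counter(unsigned long delta)
	{
		nr_unstable += delta;	/* the one primitive, as in mod_page_state() */
	}

	#define inc_counter()		mod_counter(1UL)		/* cf. inc_page_state() */
	#define dec_counter()		mod_counter(0UL - 1)		/* cf. dec_page_state() */
	#define sub_counter(delta)	mod_counter(0UL - (delta))	/* cf. sub_page_state() */

	int main(void)
	{
		inc_counter();			/* e.g. a request marked unstable */
		inc_counter();
		inc_counter();
		sub_counter(2UL);		/* e.g. two requests scanned off a list */
		dec_counter();
		assert(nr_unstable == 0);	/* net 3 - 2 - 1, via wraparound adds */
		return 0;
	}

The attraction of this design is that increment, decrement, and bulk
subtract all funnel through one update path, which is what lets the
per-CPU accounting stay a simple unsigned add with no signed-delta
special case.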