From: Trond Myklebust

Hi Andrew,

Mind if I send this in to Linus? On my test setup it makes the
difference between an immediate hang when I run iozone with > 1 thread
on a 1-Gbyte file, and being able to run sensibly with 8 threads or
more.

Please note the slight modification to nfs_writepages() w.r.t. older
versions of this patch.

 fs/nfs/write.c             |   12 +++++++++++-
 include/linux/page-flags.h |    3 +++
 mm/page-writeback.c        |   42 ++++++++++++++++++++++++++++--------------
 mm/page_alloc.c            |    5 ++++-
 4 files changed, 46 insertions(+), 16 deletions(-)

diff -puN fs/nfs/write.c~nfs-resource-management fs/nfs/write.c
--- 25/fs/nfs/write.c~nfs-resource-management	2003-04-13 12:45:29.000000000 -0700
+++ 25-akpm/fs/nfs/write.c	2003-04-13 12:45:29.000000000 -0700
@@ -283,8 +283,14 @@ nfs_writepages(struct address_space *map
 	err = nfs_flush_file(inode, NULL, 0, 0, 0);
 	if (err < 0)
 		goto out;
-	if (is_sync)
+	if (wbc->sync_mode == WB_SYNC_HOLD)
+		goto out;
+	if (is_sync && wbc->sync_mode == WB_SYNC_ALL) {
 		err = nfs_wb_all(inode);
+	} else
+		nfs_commit_file(inode, NULL, 0, 0, 0);
+	/* Avoid races. Tell upstream we've done all we were told to do */
+	wbc->nr_to_write = 0;
 out:
 	return err;
 }
@@ -372,6 +378,7 @@ nfs_mark_request_dirty(struct nfs_page *
 	nfs_list_add_request(req, &nfsi->dirty);
 	nfsi->ndirty++;
 	spin_unlock(&nfs_wreq_lock);
+	inc_page_state(nr_dirty);
 	mark_inode_dirty(inode);
 }
@@ -399,6 +406,7 @@ nfs_mark_request_commit(struct nfs_page 
 	nfs_list_add_request(req, &nfsi->commit);
 	nfsi->ncommit++;
 	spin_unlock(&nfs_wreq_lock);
+	inc_page_state(nr_unstable);
 	mark_inode_dirty(inode);
 }
 #endif
@@ -466,6 +474,7 @@ nfs_scan_dirty(struct inode *inode, stru
 	int	res;
 	res = nfs_scan_list(&nfsi->dirty, dst, file, idx_start, npages);
 	nfsi->ndirty -= res;
+	sub_page_state(nr_dirty,res);
 	if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty))
 		printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n");
 	return res;
@@ -490,6 +499,7 @@ nfs_scan_commit(struct inode *inode, str
 	int	res;
 	res = nfs_scan_list(&nfsi->commit, dst, file, idx_start, npages);
 	nfsi->ncommit -= res;
+	sub_page_state(nr_unstable,res);
 	if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
 		printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
 	return res;
diff -puN include/linux/page-flags.h~nfs-resource-management include/linux/page-flags.h
--- 25/include/linux/page-flags.h~nfs-resource-management	2003-04-13 12:45:29.000000000 -0700
+++ 25-akpm/include/linux/page-flags.h	2003-04-13 12:45:29.000000000 -0700
@@ -75,6 +75,7 @@
 #define PG_reclaim		18	/* To be reclaimed asap */
 #define PG_compound		19	/* Part of a compound page */
 
+
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
  * allowed.
@@ -82,6 +83,7 @@ struct page_state {
 	unsigned long nr_dirty;		/* Dirty writeable pages */
 	unsigned long nr_writeback;	/* Pages under writeback */
+	unsigned long nr_unstable;	/* NFS unstable pages */
 	unsigned long nr_page_table_pages;/* Pages used for pagetables */
 	unsigned long nr_mapped;	/* mapped into pagetables */
 	unsigned long nr_slab;		/* In slab */
@@ -130,6 +132,7 @@ extern void get_full_page_state(struct p
 #define inc_page_state(member)	mod_page_state(member, 1UL)
 #define dec_page_state(member)	mod_page_state(member, 0UL - 1)
+#define sub_page_state(member,delta)	mod_page_state(member, 0UL - (delta))
 
 /*
diff -puN mm/page_alloc.c~nfs-resource-management mm/page_alloc.c
--- 25/mm/page_alloc.c~nfs-resource-management	2003-04-13 12:45:29.000000000 -0700
+++ 25-akpm/mm/page_alloc.c	2003-04-13 12:45:29.000000000 -0700
@@ -941,11 +941,13 @@ void show_free_areas(void)
 		K(nr_free_pages()),
 		K(nr_free_highpages()));
 
-	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu free:%u\n",
+	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
+		"unstable:%lu free:%u\n",
 		active,
 		inactive,
 		ps.nr_dirty,
 		ps.nr_writeback,
+		ps.nr_unstable,
 		nr_free_pages());
 
 	for_each_zone(zone) {
@@ -1444,6 +1446,7 @@ struct seq_operations fragmentation_op =
 static char *vmstat_text[] = {
 	"nr_dirty",
 	"nr_writeback",
+	"nr_unstable",
 	"nr_page_table_pages",
 	"nr_mapped",
 	"nr_slab",
diff -puN mm/page-writeback.c~nfs-resource-management mm/page-writeback.c
--- 25/mm/page-writeback.c~nfs-resource-management	2003-04-13 12:45:29.000000000 -0700
+++ 25-akpm/mm/page-writeback.c	2003-04-13 12:45:29.000000000 -0700
@@ -138,6 +138,7 @@ get_dirty_limits(struct page_state *ps, 
 void balance_dirty_pages(struct address_space *mapping)
 {
 	struct page_state ps;
+	long nr_reclaimable;
 	long background_thresh;
 	long dirty_thresh;
 	unsigned long pages_written = 0;
@@ -145,8 +146,7 @@ void balance_dirty_pages(struct address_
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 
-	get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
-	while (ps.nr_dirty + ps.nr_writeback > dirty_thresh) {
+	for (;;) {
 		struct writeback_control wbc = {
 			.bdi		= bdi,
 			.sync_mode	= WB_SYNC_NONE,
 			.nr_to_write	= write_chunk,
 		};
@@ -154,24 +154,37 @@ void balance_dirty_pages(struct address_
 
+		get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
+		nr_reclaimable = ps.nr_dirty + ps.nr_unstable;
+		if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
+			break;
+
 		dirty_exceeded = 1;
 
-		if (ps.nr_dirty)
+		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
+		 * Unstable writes are a feature of certain networked
+		 * filesystems (i.e. NFS) in which data may have been
+		 * written to the server's write cache, but has not yet
+		 * been flushed to permanent storage.
+		 */
+		if (nr_reclaimable) {
 			writeback_inodes(&wbc);
-
-		get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
-		if (ps.nr_dirty + ps.nr_writeback <= dirty_thresh)
-			break;
-		pages_written += write_chunk - wbc.nr_to_write;
-		if (pages_written >= write_chunk)
-			break;		/* We've done our duty */
+			get_dirty_limits(&ps, &background_thresh,
+					&dirty_thresh);
+			nr_reclaimable = ps.nr_dirty + ps.nr_unstable;
+			if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
+				break;
+			pages_written += write_chunk - wbc.nr_to_write;
+			if (pages_written >= write_chunk)
+				break;		/* We've done our duty */
+		}
 		blk_congestion_wait(WRITE, HZ/10);
 	}
 
-	if (ps.nr_dirty + ps.nr_writeback <= dirty_thresh)
+	if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
 		dirty_exceeded = 0;
 
-	if (!writeback_in_progress(bdi) && ps.nr_dirty > background_thresh)
+	if (!writeback_in_progress(bdi) && nr_reclaimable > background_thresh)
 		pdflush_operation(background_writeout, 0);
 }
@@ -231,7 +244,8 @@ static void background_writeout(unsigned
 	long dirty_thresh;
 
 	get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
-	if (ps.nr_dirty < background_thresh && min_pages <= 0)
+	if (ps.nr_dirty + ps.nr_unstable < background_thresh
+			&& min_pages <= 0)
 		break;
 	wbc.encountered_congestion = 0;
 	wbc.nr_to_write = MAX_WRITEBACK_PAGES;
@@ -302,7 +316,7 @@ static void wb_kupdate(unsigned long arg
 	oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
 	start_jif = jiffies;
 	next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
-	nr_to_write = ps.nr_dirty;
+	nr_to_write = ps.nr_dirty + ps.nr_unstable;
 	while (nr_to_write > 0) {
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
_
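
For anyone skimming the diff, the behavioural change in
balance_dirty_pages() comes down to a single comparison: unstable pages
now count against the dirty threshold alongside dirty pages. The
standalone sketch below is illustrative only -- it is not kernel code,
and the struct, helper names and numbers are invented for this example.
It contrasts the old check with the new one:

#include <stdio.h>

/* Stand-in for the kernel's per-CPU page accounting. */
struct page_state {
	unsigned long nr_dirty;	    /* dirty pages awaiting writeback */
	unsigned long nr_writeback; /* pages currently under writeback */
	unsigned long nr_unstable;  /* NFS pages sent but not committed */
};

/* Old check: unstable pages were invisible to the throttle, so an NFS
 * writer was never slowed down on their account. */
static int over_limit_old(const struct page_state *ps,
			  unsigned long dirty_thresh)
{
	return ps->nr_dirty + ps->nr_writeback > dirty_thresh;
}

/* New check: dirty + unstable pages together form "nr_reclaimable",
 * and both count against the threshold. */
static int over_limit_new(const struct page_state *ps,
			  unsigned long dirty_thresh)
{
	unsigned long nr_reclaimable = ps->nr_dirty + ps->nr_unstable;

	return nr_reclaimable + ps->nr_writeback > dirty_thresh;
}

int main(void)
{
	/* Invented numbers: few dirty pages, a large backlog of
	 * unstable NFS pages, a threshold of 1000 pages. */
	struct page_state ps = { .nr_dirty = 100, .nr_writeback = 50,
				 .nr_unstable = 5000 };
	unsigned long dirty_thresh = 1000;

	printf("old check throttles: %d\n",
	       over_limit_old(&ps, dirty_thresh));
	printf("new check throttles: %d\n",
	       over_limit_new(&ps, dirty_thresh));
	return 0;
}

With these numbers the old check never throttles (150 <= 1000) while
the new one does (5150 > 1000), which is plausibly the failure mode
behind the iozone hang described above: the writer kept generating
pages that the throttle could not see.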