1: Introduces two new bdflush tunables:

   ndirty

      The maximum number of buffers which bdflush will attempt to write
      out in response to a wakeup.  Previously, bdflush would write out
      the whole world.  So this limits the amount of bdflush writeout in
      response to a single wakeup_bdflush().

      NOTE: this code appears to be broken.  If nfract_stop_bdflush is
      set to zero, ndirty will not prevent bdflush from writing out all
      dirty buffers.  IOW, ndirty doesn't do anything at present.

   nfract_stop_bdflush

      In units of "percentage of total memory".  bdflush will stop
      writing back data when the amount of memory which is dirty on the
      buffer LRU falls below this threshold.  So this prevents bdflush
      from writing out *everything*: bdflush will stop, and will leave
      some dirty data behind for kupdate.

      However, `ndirty' has precedence.  So even if the amount of dirty
      data is already below nfract_stop_bdflush, bdflush will still
      attempt to write out `ndirty' buffers.

2: The mark_buffer_dirty() -> balance_dirty() path has been changed so
   that the process which is performing write(2) no longer starts some
   I/O when we're between the async and sync thresholds.  Instead, we
   just wake bdflush.

   Also, when the writer reaches the sync threshold, we no longer
   throttle the writer by waiting on some I/O.  We just start some more
   I/O, potentially asynchronously (but, in practice, usually
   blockingly, due to request queue exhaustion).

   Both these changes weaken the writer-throttling at write(2) time.
   Presumably this is acceptable because the aa-020-sync_buffers
   changes now allow memory allocators to throttle on bdflush-written
   buffers more successfully.

3: kupdate no longer throttles itself on each wakeup.  That always
   seemed rather pointless.

This code works well.  It fixes the problem where copying a large file
between two disks only exercises one disk at a time.
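To make the nfract_stop_bdflush arithmetic concrete, here is a
standalone userspace sketch (not kernel code) mirroring the
bdflush_stop() test added by this patch.  In the kernel the dirty page
count comes from size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT and the
total from nr_free_buffer_pages(); the numbers below are invented
sample values:

#include <stdio.h>

/*
 * Mirror of bdflush_stop(): compare the dirty percentage against the
 * threshold without dividing, i.e. dirty/total > nfract/100 is tested
 * as dirty * 100 > total * nfract.
 */
static int bdflush_stop(unsigned long dirty_pages,
			unsigned long total_pages,
			unsigned long nfract_stop_bdflush)
{
	if (dirty_pages * 100 > total_pages * nfract_stop_bdflush)
		return 0;	/* still too dirty: keep writing */
	return 1;		/* below threshold: bdflush may sleep */
}

int main(void)
{
	unsigned long total = 16384;	/* e.g. 64MB of 4k pages */

	/* 2560 dirty pages = 15.6%: below the default 20% threshold */
	printf("%d\n", bdflush_stop(2560, total, 20));	/* prints 1 */
	/* 4096 dirty pages = 25%: above it */
	printf("%d\n", bdflush_stop(4096, total, 20));	/* prints 0 */
	return 0;
}

So with 16384 pages of buffer memory and the default threshold of 20,
bdflush keeps writing until no more than 3276 pages remain dirty.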
=====================================

--- 2.4.19-pre4/fs/buffer.c~aa-030-writeout_scheduling	Tue Mar 26 23:11:24 2002
+++ 2.4.19-pre4-akpm/fs/buffer.c	Tue Mar 26 23:11:24 2002
@@ -103,22 +103,23 @@
 union bdflush_param {
 	struct {
 		int nfract;	/* Percentage of buffer cache dirty to activate bdflush */
-		int dummy1;	/* old "ndirty" */
+		int ndirty;	/* Maximum number of dirty blocks to write out per
+				   wake-cycle */
 		int dummy2;	/* old "nrefill" */
 		int dummy3;	/* unused */
 		int interval;	/* jiffies delay between kupdate flushes */
 		int age_buffer;	/* Time for normal buffer to age before we flush it */
 		int nfract_sync;/* Percentage of buffer cache dirty to activate bdflush synchronously */
-		int dummy4;	/* unused */
+		int nfract_stop_bdflush; /* Percentage of buffer cache dirty to stop bdflush */
 		int dummy5;	/* unused */
 	} b_un;
 	unsigned int data[N_PARAM];
-} bdf_prm = {{40, 0, 0, 0, 5*HZ, 30*HZ, 60, 0, 0}};
+} bdf_prm = {{30, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0}};
 
 /* These are the min and max parameter values that we will allow to be assigned */
-int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   0, 0, 0};
-int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 6000*HZ, 100, 0, 0};
+int bdflush_min[N_PARAM] = {  0,   1,    0,    0,  0,   1*HZ,   0, 0, 0};
+int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 10000*HZ, 100, 100, 0};
 
 void unlock_buffer(struct buffer_head *bh)
 {
@@ -236,10 +237,9 @@ static int write_some_buffers(kdev_t dev
  */
 static void write_unlocked_buffers(kdev_t dev)
 {
-	do {
+	do
 		spin_lock(&lru_list_lock);
-	} while (write_some_buffers(dev));
-	run_task_queue(&tq_disk);
+	while (write_some_buffers(dev));
 }
 
 /*
@@ -277,12 +277,6 @@ static int wait_for_buffers(kdev_t dev,
 	return 0;
 }
 
-static inline void wait_for_some_buffers(kdev_t dev)
-{
-	spin_lock(&lru_list_lock);
-	wait_for_buffers(dev, BUF_LOCKED, 1);
-}
-
 static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
 {
 	do {
@@ -1066,6 +1060,21 @@
 	return -1;
 }
 
+static int bdflush_stop(void)
+{
+	unsigned long dirty, tot, dirty_limit;
+
+	dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
+	tot = nr_free_buffer_pages();
+
+	dirty *= 100;
+	dirty_limit = tot * bdf_prm.b_un.nfract_stop_bdflush;
+
+	if (dirty > dirty_limit)
+		return 0;
+	return 1;
+}
+
 /*
  * if a new dirty buffer is created we need to balance bdflush.
  *
@@ -1080,19 +1089,16 @@ void balance_dirty(void)
 	if (state < 0)
 		return;
 
-	/* If we're getting into imbalance, start write-out */
-	spin_lock(&lru_list_lock);
-	write_some_buffers(NODEV);
+	wakeup_bdflush();
 
 	/*
 	 * And if we're _really_ out of balance, wait for
-	 * some of the dirty/locked buffers ourselves and
-	 * start bdflush.
+	 * some of the dirty/locked buffers ourselves.
 	 * This will throttle heavy writers.
 	 */
 	if (state > 0) {
-		wait_for_some_buffers(NODEV);
-		wakeup_bdflush();
+		spin_lock(&lru_list_lock);
+		write_some_buffers(NODEV);
 	}
 }
@@ -2983,14 +2989,29 @@ int bdflush(void *startup)
 	complete((struct completion *)startup);
 
+	/*
+	 * FIXME: The ndirty logic here is wrong.  It's supposed to
+	 * send bdflush back to sleep after writing ndirty buffers.
+	 * In fact, the test is wrong so bdflush will in fact
+	 * sleep when bdflush_stop() returns true.
+	 *
+	 * FIXME: If it proves useful to implement ndirty properly,
+	 * then perhaps the value of ndirty should be scaled by the
+	 * amount of memory in the machine.
+	 */
 	for (;;) {
+		int ndirty = bdf_prm.b_un.ndirty;
+
 		CHECK_EMERGENCY_SYNC
 
-		spin_lock(&lru_list_lock);
-		if (!write_some_buffers(NODEV) || balance_dirty_state() < 0) {
-			wait_for_some_buffers(NODEV);
-			interruptible_sleep_on(&bdflush_wait);
+		while (ndirty > 0) {
+			spin_lock(&lru_list_lock);
+			if (!write_some_buffers(NODEV))
+				break;
+			ndirty -= NRSYNC;
 		}
+		if (ndirty > 0 || bdflush_stop())
+			interruptible_sleep_on(&bdflush_wait);
 	}
 }
 
@@ -3019,8 +3040,6 @@ int kupdate(void *startup)
 	complete((struct completion *)startup);
 
 	for (;;) {
-		wait_for_some_buffers(NODEV);
-
 		/* update interval */
 		interval = bdf_prm.b_un.interval;
 		if (interval) {
@@ -3048,6 +3067,7 @@
 		printk(KERN_DEBUG "kupdate() activated...\n");
 #endif
 		sync_old_buffers();
+		run_task_queue(&tq_disk);
 	}
 }
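For reference, the nine fields of /proc/sys/vm/bdflush map onto
bdf_prm.data[] in declaration order, so after this patch field 2 is
ndirty and field 8 is nfract_stop_bdflush, bounded by the
bdflush_min[]/bdflush_max[] tables above.  A minimal sketch of
retuning the stop threshold from userspace on a 2.4 kernel with this
patch applied (the value 40 is only an example):

#include <stdio.h>

#define N_PARAM 9

int main(void)
{
	unsigned int p[N_PARAM];
	FILE *f;
	int i;

	/* Read the current nine parameters. */
	f = fopen("/proc/sys/vm/bdflush", "r");
	if (f == NULL)
		return 1;
	for (i = 0; i < N_PARAM; i++) {
		if (fscanf(f, "%u", &p[i]) != 1) {
			fclose(f);
			return 1;
		}
	}
	fclose(f);

	/* Field 8: stop bdflush when less than 40% of memory is dirty. */
	p[7] = 40;

	/* Write all nine back; out-of-range values are not applied. */
	f = fopen("/proc/sys/vm/bdflush", "w");
	if (f == NULL)
		return 1;
	for (i = 0; i < N_PARAM; i++)
		fprintf(f, "%u%c", p[i], i == N_PARAM - 1 ? '\n' : '\t');
	fclose(f);
	return 0;
}

Note that, per the FIXME in bdflush() above, tuning ndirty (field 2)
currently has no effect on the total amount written per wakeup.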