Various changes to the dirty buffer flushing code.

The VM wants to throttle memory allocators down to match the disk's ability to clean memory. It does this by initiating and waiting on I/O, and also by waiting on I/O which was initiated by others. In the current kernel, try_to_free_buffers() only waits on writeout which try_to_free_buffers() itself started. This patch allows try_to_free_buffers() to also throttle on kupdate/bdflush writeout.

This gives better throttling and avoids the situation where tasks (inside the page allocator) pointlessly traverse large numbers of pages before finding one which is eligible for throttling. It also means we no longer have to account for locked buffers in balance_dirty_state() - the "machine flooded with locked buffers" problem doesn't occur, because we can throttle on all buffers.

I'm not completely sure that this is a good change. It could tend to make all page allocators suffer because of a single writer. It probably looks good on disk-intensive workloads, but there is a risk that compute-intensive tasks which allocate modest amounts of memory will get less work done: they'll spend more time in disk wait while the CPU sits idle. That being said, testing which was specifically designed to demonstrate this effect failed to do so...
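To make the mechanism concrete, here is a small user-space model (not kernel code) of the per-buffer decision which the patched sync_page_buffers() makes on each visit by the VM. The struct, the enum and main() are invented for illustration; only the roles of the BH_* bits correspond to the patch below.

/*
 * Illustrative model only - not kernel code.  The field names mirror
 * the BH_* bits used by the patch; everything else is made up.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_bh {
	bool dirty;	/* buffer has data the disk hasn't seen yet */
	bool locked;	/* BH_Lock: buffer is locked (e.g. I/O in progress) */
	bool wait_io;	/* BH_Wait_IO: seen once already; write it next time */
	bool launder;	/* BH_Launder: I/O really is in flight; safe to wait */
};

/* What the VM decides to do with one buffer on one visit to its page */
enum action { ACT_SKIP, ACT_DEFER, ACT_WRITE, ACT_WAIT };

static enum action visit(struct fake_bh *bh)
{
	if (!bh->dirty && !bh->locked)
		return ACT_SKIP;		/* clean and idle: freeable */

	if (!bh->wait_io) {
		bh->wait_io = true;		/* first visit: just mark it */
		return ACT_DEFER;		/* don't start I/O yet */
	}

	if (bh->locked) {
		if (bh->launder)
			return ACT_WAIT;	/* I/O in flight: throttle here */
		return ACT_DEFER;		/* locked, but no I/O submitted */
	}

	/* Second visit: start writeout; submit_bh() would set BH_Launder */
	bh->locked = true;
	bh->launder = true;
	bh->dirty = false;
	return ACT_WRITE;
}

int main(void)
{
	static const char *names[] = { "skip", "defer", "start writeout", "wait" };
	struct fake_bh bh = { .dirty = true };

	for (int pass = 1; pass <= 3; pass++)
		printf("visit %d: %s\n", pass, names[visit(&bh)]);
	return 0;
}

Running it shows the three visits which the comment block in the patch describes: defer, start writeout, then wait.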
=====================================

--- 2.4.19-pre4/fs/buffer.c~aa-020-sync_buffers	Tue Mar 26 23:11:23 2002
+++ 2.4.19-pre4-akpm/fs/buffer.c	Tue Mar 26 23:11:23 2002
@@ -47,6 +47,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -122,7 +123,14 @@ int bdflush_max[N_PARAM] = {100,50000, 2
 void unlock_buffer(struct buffer_head *bh)
 {
 	clear_bit(BH_Wait_IO, &bh->b_state);
-	clear_bit(BH_launder, &bh->b_state);
+	clear_bit(BH_Launder, &bh->b_state);
+	/*
+	 * When a locked buffer is visible to the I/O layer BH_Launder
+	 * is set. This means before unlocking we must clear BH_Launder,
+	 * mb() on alpha and then clear BH_Lock, so no reader can see
+	 * BH_Launder set on an unlocked buffer and then risk to deadlock.
+	 */
+	smp_mb__after_clear_bit();
 	clear_bit(BH_Lock, &bh->b_state);
 	smp_mb__after_clear_bit();
 	if (waitqueue_active(&bh->b_wait))
@@ -130,13 +138,9 @@ void unlock_buffer(struct buffer_head *b
 }
 
 /*
- * Rewrote the wait-routines to use the "new" wait-queue functionality,
- * and getting rid of the cli-sti pairs. The wait-queue routines still
- * need cli-sti, but now it's just a couple of 386 instructions or so.
- *
  * Note that the real wait_on_buffer() is an inline function that checks
- * if 'b_wait' is set before calling this, so that the queues aren't set
- * up unnecessarily.
+ * that the buffer is locked before calling this, so that unnecessary disk
+ * unplugging does not occur.
  */
 void __wait_on_buffer(struct buffer_head * bh)
 {
@@ -1046,7 +1050,6 @@ static int balance_dirty_state(void)
 	unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
 
 	dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
-	dirty += size_buffers_type[BUF_LOCKED] >> PAGE_SHIFT;
 	tot = nr_free_buffer_pages();
 
 	dirty *= 100;
@@ -2592,23 +2595,58 @@ static int grow_buffers(kdev_t dev, unsi
 	return 1;
 }
 
+/*
+ * The first time the VM inspects a page which has locked buffers, it
+ * will just mark it as needing waiting upon on the scan of the page LRU.
+ * BH_Wait_IO is used for this.
+ *
+ * The second time the VM visits the page, if it still has locked
+ * buffers, it is time to start writing them out. (BH_Wait_IO was set).
+ *
+ * The third time the VM visits the page, if the I/O hasn't completed
+ * then it's time to wait upon writeout. BH_Lock and BH_Launder are
+ * used for this.
+ *
+ * There is also the case of buffers which were locked by someone else
+ * - write(2) callers, bdflush, etc. There can be a huge number of these
+ * and we don't want to just skip them all and fail the page allocation.
+ * We want to be able to wait on these buffers as well.
+ *
+ * The BH_Launder bit is set in submit_bh() to indicate that I/O is
+ * underway against the buffer, doesn't matter who started it - we know
+ * that the buffer will eventually come unlocked, and so it's safe to
+ * wait on it.
+ *
+ * The caller holds the page lock and the caller will free this page
+ * into current->local_page, so by waiting on the page's buffers the
+ * caller is guaranteed to obtain this page.
+ *
+ * sync_page_buffers() will sort-of return true if all the buffers
+ * against this page are freeable, so try_to_free_buffers() should
+ * try to free the page's buffers a second time. This is a bit
+ * broken for blocksize < PAGE_CACHE_SIZE, but not very importantly.
+ */
 static int sync_page_buffers(struct buffer_head *head)
 {
 	struct buffer_head * bh = head;
-	int tryagain = 0;
+	int tryagain = 1;
 
 	do {
 		if (!buffer_dirty(bh) && !buffer_locked(bh))
 			continue;
 
 		/* Don't start IO first time around.. */
-		if (!test_and_set_bit(BH_Wait_IO, &bh->b_state))
+		if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) {
+			tryagain = 0;
 			continue;
+		}
 
 		/* Second time through we start actively writing out.. */
 		if (test_and_set_bit(BH_Lock, &bh->b_state)) {
-			if (!test_bit(BH_launder, &bh->b_state))
+			if (unlikely(!buffer_launder(bh))) {
+				tryagain = 0;
 				continue;
+			}
 			wait_on_buffer(bh);
 			tryagain = 1;
 			continue;
@@ -2621,7 +2659,6 @@ static int sync_page_buff
 
 		__mark_buffer_clean(bh);
 		get_bh(bh);
-		set_bit(BH_launder, &bh->b_state);
 		bh->b_end_io = end_buffer_io_sync;
 		submit_bh(WRITE, bh);
 		tryagain = 0;
--- 2.4.19-pre4/include/linux/fs.h~aa-020-sync_buffers	Tue Mar 26 23:11:23 2002
+++ 2.4.19-pre4-akpm/include/linux/fs.h	Tue Mar 26 23:11:23 2002
@@ -217,7 +217,7 @@
 	BH_New,		/* 1 if the buffer is new and not yet written out */
 	BH_Async,	/* 1 if the buffer is under end_buffer_io_async I/O */
 	BH_Wait_IO,	/* 1 if we should write out this buffer */
-	BH_launder,	/* 1 if we should throttle on this buffer */
+	BH_Launder,	/* 1 if we can throttle on this buffer */
 	BH_JBD,		/* 1 if it has an attached journal_head */
 
 	BH_PrivateStart,/* not a state bit, but the first bit available
@@ -279,6 +279,7 @@ void init_buffer(struct buffer_head *, b
 #define buffer_mapped(bh)	__buffer_state(bh,Mapped)
 #define buffer_new(bh)		__buffer_state(bh,New)
 #define buffer_async(bh)	__buffer_state(bh,Async)
+#define buffer_launder(bh)	__buffer_state(bh,Launder)
 
 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
 
@@ -1110,7 +1111,7 @@ extern struct file_operations rdwr_pipe_
 
 extern int fs_may_remount_ro(struct super_block *);
 
-extern int try_to_free_buffers(struct page *, unsigned int);
+extern int FASTCALL(try_to_free_buffers(struct page *, unsigned int));
 extern void refile_buffer(struct buffer_head * buf);
 extern void create_empty_buffers(struct page *, kdev_t, unsigned long);
 extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate);
--- 2.4.19-pre4/drivers/block/ll_rw_blk.c~aa-020-sync_buffers	Tue Mar 26 23:11:23 2002
+++ 2.4.19-pre4-akpm/drivers/block/ll_rw_blk.c	Tue Mar 26 23:11:23 2002
@@ -958,6 +958,7 @@ void submit_bh(int rw, struct buffer_hea
 		BUG();
 
 	set_bit(BH_Req, &bh->b_state);
+	set_bit(BH_Launder, &bh->b_state);
 
 	/*
 	 * First step, 'identity mapping' - RAID or LVM might
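As an aside, here is a stand-alone sketch (again, not kernel code) of the ordering rule which the new comment in unlock_buffer() documents: clear BH_Launder, issue a barrier, and only then clear BH_Lock, so that a throttler which finds the buffer locked and still sees BH_Launder set knows that real I/O will eventually unlock it. The C11 atomics and the fake_* names are illustrative stand-ins for clear_bit()/smp_mb__after_clear_bit() and the real buffer_head.

/*
 * Illustrative only - not kernel code.  "launder" stands in for
 * BH_Launder, "locked" for BH_Lock, and the release fence for
 * smp_mb__after_clear_bit().
 */
#include <stdatomic.h>
#include <stdbool.h>

struct fake_bh {
	atomic_bool locked;		/* BH_Lock */
	atomic_bool launder;		/* BH_Launder: I/O is in flight */
};

/* I/O completion side: the order matters, as in the patched unlock_buffer() */
static void fake_unlock_buffer(struct fake_bh *bh)
{
	/* 1. Withdraw the "safe to throttle on this buffer" hint... */
	atomic_store_explicit(&bh->launder, false, memory_order_relaxed);
	/* 2. ...make that store visible... */
	atomic_thread_fence(memory_order_release);
	/* 3. ...before the buffer can be seen as unlocked. */
	atomic_store_explicit(&bh->locked, false, memory_order_release);
	/* (the real unlock_buffer() then wakes the bh->b_wait waiters) */
}

/*
 * Throttling side, as in sync_page_buffers(): only wait on a buffer we
 * failed to lock when BH_Launder promises that an I/O completion will
 * eventually unlock it.
 */
static bool fake_should_wait(struct fake_bh *bh)
{
	if (!atomic_load_explicit(&bh->locked, memory_order_acquire))
		return false;		/* not locked: nothing to wait for */
	return atomic_load_explicit(&bh->launder, memory_order_relaxed);
}

int main(void)
{
	struct fake_bh bh = { true, true };	/* locked, I/O submitted */

	bool before = fake_should_wait(&bh);	/* true: OK to throttle */
	fake_unlock_buffer(&bh);
	bool after = fake_should_wait(&bh);	/* false: nothing in flight */

	return (before && !after) ? 0 : 1;
}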