From: Jens Axboe, Chris Mason, me, others. The global unplug list causes horrid spinlock contention on many-disk many-CPU setups - throughput is worse than halved. The other problem with the global unplugging is of course that it will cause the unplugging of queues which are unrelated to the I/O upon which the caller is about to wait. So what we do to solve these problems is to remove the global unplug and set up the infrastructure under which the VFS can tell the block layer to unplug only those queues which are relevant to the page or buffer_head which is about to be waited upon. We do this via the very appropriate address_space->backing_dev_info structure. Most of the complexity is in devicemapper, MD and swapper_space, because for these backing devices, multiple queues may need to be unplugged to complete a page/buffer I/O. In each case we ensure that data structures are in place to permit us to identify all the lower-level queues which contribute to the higher-level backing_dev_info. Each contributing queue is told to unplug in response to a higher-level unplug. To simplify things in various places we also introduce the concept of a "synchronous BIO": it is tagged with BIO_RW_SYNC. The block layer will perform an immediate unplug when it sees one of these go past.
--- 25-akpm/drivers/block/ll_rw_blk.c | 97 +++++++++++------------------------ 25-akpm/drivers/block/loop.c | 15 ++++- 25-akpm/drivers/block/rd.c | 1 25-akpm/drivers/block/umem.c | 3 - 25-akpm/drivers/md/dm-crypt.c | 2 25-akpm/drivers/md/dm-table.c | 16 +++++ 25-akpm/drivers/md/dm.c | 23 +++++++- 25-akpm/drivers/md/dm.h | 1 25-akpm/drivers/md/md.c | 32 ++++++++++- 25-akpm/drivers/md/raid1.c | 3 + 25-akpm/drivers/md/raid5.c | 4 - 25-akpm/drivers/md/raid6main.c | 3 - 25-akpm/drivers/mtd/devices/blkmtd.c | 6 -- 25-akpm/fs/buffer.c | 12 +++- 25-akpm/fs/direct-io.c | 4 - 25-akpm/fs/jfs/jfs_logmgr.c | 6 -- 25-akpm/fs/ntfs/compress.c | 3 - 25-akpm/fs/ufs/truncate.c | 3 - 25-akpm/fs/xfs/linux/xfs_buf.c | 24 ++------ 25-akpm/include/linux/backing-dev.h | 3 + 25-akpm/include/linux/bio.h | 3 + 25-akpm/include/linux/blkdev.h | 23 ++++++-- 25-akpm/include/linux/fs.h | 2 25-akpm/include/linux/raid/md.h | 1 25-akpm/include/linux/raid/md_k.h | 26 --------- 25-akpm/include/linux/swap.h | 2 25-akpm/kernel/power/disk.c | 1 25-akpm/kernel/power/pmdisk.c | 3 - 25-akpm/kernel/power/swsusp.c | 5 - 25-akpm/mm/filemap.c | 4 + 25-akpm/mm/mempool.c | 2 25-akpm/mm/readahead.c | 8 ++ 25-akpm/mm/shmem.c | 1 25-akpm/mm/swap_state.c | 1 25-akpm/mm/swapfile.c | 65 +++++++++++++++++++++++ 35 files changed, 250 insertions(+), 158 deletions(-) diff -puN drivers/block/ll_rw_blk.c~per-backing_dev-unplugging drivers/block/ll_rw_blk.c --- 25/drivers/block/ll_rw_blk.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/drivers/block/ll_rw_blk.c Thu Apr 8 15:17:56 2004 @@ -42,12 +42,6 @@ static void blk_unplug_timeout(unsigned */ static kmem_cache_t *request_cachep; -/* - * plug management - */ -static LIST_HEAD(blk_plug_list); -static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; - static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) @@ -251,8 +245,6 @@ void 
blk_queue_make_request(request_queu */ blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); - INIT_LIST_HEAD(&q->plug_list); - blk_queue_activity_fn(q, NULL, NULL); } @@ -1104,13 +1096,11 @@ void blk_plug_device(request_queue_t *q) * don't plug a stopped queue, it must be paired with blk_start_queue() * which will restart the queueing */ - if (!blk_queue_plugged(q) - && !test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) { - spin_lock(&blk_plug_lock); - list_add_tail(&q->plug_list, &blk_plug_list); + if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) + return; + + if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); - spin_unlock(&blk_plug_lock); - } } EXPORT_SYMBOL(blk_plug_device); @@ -1122,15 +1112,12 @@ EXPORT_SYMBOL(blk_plug_device); int blk_remove_plug(request_queue_t *q) { WARN_ON(!irqs_disabled()); - if (blk_queue_plugged(q)) { - spin_lock(&blk_plug_lock); - list_del_init(&q->plug_list); - del_timer(&q->unplug_timer); - spin_unlock(&blk_plug_lock); - return 1; - } - return 0; + if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) + return 0; + + del_timer(&q->unplug_timer); + return 1; } EXPORT_SYMBOL(blk_remove_plug); @@ -1161,14 +1148,11 @@ static inline void __generic_unplug_devi * Linux uses plugging to build bigger requests queues before letting * the device have at them. If a queue is plugged, the I/O scheduler * is still adding and merging requests on the queue. Once the queue - * gets unplugged (either by manually calling this function, or by - * calling blk_run_queues()), the request_fn defined for the - * queue is invoked and transfers started. + * gets unplugged, the request_fn defined for the queue is invoked and + * transfers started. 
**/ -void generic_unplug_device(void *data) +void generic_unplug_device(request_queue_t *q) { - request_queue_t *q = data; - spin_lock_irq(q->queue_lock); __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); @@ -1176,9 +1160,23 @@ void generic_unplug_device(void *data) EXPORT_SYMBOL(generic_unplug_device); +static inline void blk_backing_dev_unplug(struct backing_dev_info *bdi) +{ + request_queue_t *q = bdi->unplug_io_data; + + /* + * devices don't necessarily have an ->unplug_fn defined + */ + if (q->unplug_fn) + q->unplug_fn(q); +} + +EXPORT_SYMBOL(blk_backing_dev_unplug); + static void blk_unplug_work(void *data) { request_queue_t *q = data; + q->unplug_fn(q); } @@ -1256,42 +1254,6 @@ void blk_run_queue(struct request_queue EXPORT_SYMBOL(blk_run_queue); /** - * blk_run_queues - fire all plugged queues - * - * Description: - * Start I/O on all plugged queues known to the block layer. Queues that - * are currently stopped are ignored. This is equivalent to the older - * tq_disk task queue run. 
- **/ -#define blk_plug_entry(entry) list_entry((entry), request_queue_t, plug_list) -void blk_run_queues(void) -{ - LIST_HEAD(local_plug_list); - - spin_lock_irq(&blk_plug_lock); - - /* - * this will happen fairly often - */ - if (list_empty(&blk_plug_list)) - goto out; - - list_splice_init(&blk_plug_list, &local_plug_list); - - while (!list_empty(&local_plug_list)) { - request_queue_t *q = blk_plug_entry(local_plug_list.next); - - spin_unlock_irq(&blk_plug_lock); - q->unplug_fn(q); - spin_lock_irq(&blk_plug_lock); - } -out: - spin_unlock_irq(&blk_plug_lock); -} - -EXPORT_SYMBOL(blk_run_queues); - -/** * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed * @q: the request queue to be released * @@ -1390,6 +1352,10 @@ request_queue_t *blk_alloc_queue(int gfp memset(q, 0, sizeof(*q)); init_timer(&q->unplug_timer); atomic_set(&q->refcnt, 1); + + q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; + q->backing_dev_info.unplug_io_data = q; + return q; } @@ -2050,7 +2016,6 @@ long blk_congestion_wait(int rw, long ti DEFINE_WAIT(wait); wait_queue_head_t *wqh = &congestion_wqh[rw]; - blk_run_queues(); prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); finish_wait(wqh, &wait); @@ -2315,7 +2280,7 @@ out: if (blk_queue_plugged(q)) { int nr_queued = q->rq.count[READ] + q->rq.count[WRITE]; - if (nr_queued == q->unplug_thresh) + if (nr_queued == q->unplug_thresh || bio_sync(bio)) __generic_unplug_device(q); } spin_unlock_irq(q->queue_lock); diff -puN drivers/block/loop.c~per-backing_dev-unplugging drivers/block/loop.c --- 25/drivers/block/loop.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/drivers/block/loop.c Thu Apr 8 15:07:53 2004 @@ -434,6 +434,17 @@ inactive: goto out; } +/* + * kick off io on the underlying address space + */ +static void loop_unplug(request_queue_t *q) +{ + struct loop_device *lo = q->queuedata; + + clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags); + 
blk_run_address_space(lo->lo_backing_file->f_mapping); +} + struct switch_request { struct file *file; struct completion wait; @@ -614,7 +625,6 @@ static int loop_set_fd(struct loop_devic { struct file *file; struct inode *inode; - struct block_device *lo_device = NULL; struct address_space *mapping; unsigned lo_blocksize; int lo_flags = 0; @@ -671,7 +681,7 @@ static int loop_set_fd(struct loop_devic set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); lo->lo_blocksize = lo_blocksize; - lo->lo_device = lo_device; + lo->lo_device = bdev; lo->lo_flags = lo_flags; lo->lo_backing_file = file; lo->transfer = NULL; @@ -688,6 +698,7 @@ static int loop_set_fd(struct loop_devic */ blk_queue_make_request(lo->lo_queue, loop_make_request); lo->lo_queue->queuedata = lo; + lo->lo_queue->unplug_fn = loop_unplug; set_capacity(disks[lo->lo_number], size); bd_set_size(bdev, size << 9); diff -puN drivers/block/umem.c~per-backing_dev-unplugging drivers/block/umem.c --- 25/drivers/block/umem.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/drivers/block/umem.c Thu Apr 8 15:07:53 2004 @@ -368,9 +368,8 @@ static inline void reset_page(struct mm_ page->biotail = & page->bio; } -static void mm_unplug_device(void *data) +static void mm_unplug_device(request_queue_t *q) { - request_queue_t *q = data; struct cardinfo *card = q->queuedata; unsigned long flags; diff -puN drivers/md/dm.c~per-backing_dev-unplugging drivers/md/dm.c --- 25/drivers/md/dm.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/drivers/md/dm.c Thu Apr 8 15:08:08 2004 @@ -575,6 +575,17 @@ static int dm_request(request_queue_t *q return 0; } +static void dm_unplug_all(request_queue_t *q) +{ + struct mapped_device *md = q->queuedata; + struct dm_table *map = dm_get_table(md); + + if (map) { + dm_table_unplug_all(map); + dm_table_put(map); + } +} + static int dm_any_congested(void *congested_data, int bdi_bits) { int r; @@ -672,6 +683,7 @@ static struct mapped_device *alloc_dev(u 
md->queue->backing_dev_info.congested_fn = dm_any_congested; md->queue->backing_dev_info.congested_data = md; blk_queue_make_request(md->queue, dm_request); + md->queue->unplug_fn = dm_unplug_all; md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, mempool_free_slab, _io_cache); @@ -896,11 +908,17 @@ int dm_suspend(struct mapped_device *md) add_wait_queue(&md->wait, &wait); up_write(&md->lock); + /* unplug */ + map = dm_get_table(md); + if (map) { + dm_table_unplug_all(map); + dm_table_put(map); + } + /* * Then we wait for the already mapped ios to * complete. */ - blk_run_queues(); while (1) { set_current_state(TASK_INTERRUPTIBLE); @@ -945,10 +963,9 @@ int dm_resume(struct mapped_device *md) def = bio_list_get(&md->deferred); __flush_deferred_io(md, def); up_write(&md->lock); + dm_table_unplug_all(map); dm_table_put(map); - blk_run_queues(); - return 0; } diff -puN drivers/md/dm-crypt.c~per-backing_dev-unplugging drivers/md/dm-crypt.c --- 25/drivers/md/dm-crypt.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/drivers/md/dm-crypt.c Thu Apr 8 15:07:53 2004 @@ -668,7 +668,7 @@ static int crypt_map(struct dm_target *t /* out of memory -> run queues */ if (remaining) - blk_run_queues(); + blk_congestion_wait(bio_data_dir(clone), HZ/100); } /* drop reference, clones could have returned before we reach this */ diff -puN drivers/md/dm.h~per-backing_dev-unplugging drivers/md/dm.h --- 25/drivers/md/dm.h~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/drivers/md/dm.h Thu Apr 8 15:07:53 2004 @@ -116,6 +116,7 @@ int dm_table_get_mode(struct dm_table *t void dm_table_suspend_targets(struct dm_table *t); void dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); +void dm_table_unplug_all(struct dm_table *t); /*----------------------------------------------------------------- * A registry of target types. 
diff -puN drivers/md/dm-table.c~per-backing_dev-unplugging drivers/md/dm-table.c --- 25/drivers/md/dm-table.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/drivers/md/dm-table.c Thu Apr 8 15:07:53 2004 @@ -885,8 +885,24 @@ int dm_table_any_congested(struct dm_tab return r; } +void dm_table_unplug_all(struct dm_table *t) +{ + struct list_head *d, *devices = dm_table_get_devices(t); + + for (d = devices->next; d != devices; d = d->next) { + struct dm_dev *dd = list_entry(d, struct dm_dev, list); + request_queue_t *q = bdev_get_queue(dd->bdev); + + if (q->unplug_fn) + q->unplug_fn(q); + } +} + EXPORT_SYMBOL(dm_vcalloc); EXPORT_SYMBOL(dm_get_device); EXPORT_SYMBOL(dm_put_device); EXPORT_SYMBOL(dm_table_event); EXPORT_SYMBOL(dm_table_get_mode); +EXPORT_SYMBOL(dm_table_put); +EXPORT_SYMBOL(dm_table_get); +EXPORT_SYMBOL(dm_table_unplug_all); diff -puN drivers/md/md.c~per-backing_dev-unplugging drivers/md/md.c --- 25/drivers/md/md.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/drivers/md/md.c Thu Apr 8 15:07:53 2004 @@ -160,6 +160,30 @@ static int md_fail_request (request_queu return 0; } +void md_unplug_mddev(mddev_t *mddev) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + /* + * this list iteration is done without any locking in md?! 
+ */ + ITERATE_RDEV(mddev, rdev, tmp) { + request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + + if (r_queue->unplug_fn) + r_queue->unplug_fn(r_queue); + } +} +EXPORT_SYMBOL(md_unplug_mddev); + +static void md_unplug_all(request_queue_t *q) +{ + mddev_t *mddev = q->queuedata; + + md_unplug_mddev(mddev); +} + static inline mddev_t *mddev_get(mddev_t *mddev) { atomic_inc(&mddev->active); @@ -335,6 +359,8 @@ static int sync_page_io(struct block_dev struct bio_vec vec; struct completion event; + rw |= (1 << BIO_RW_SYNC); + bio_init(&bio); bio.bi_io_vec = &vec; vec.bv_page = page; @@ -349,7 +375,6 @@ static int sync_page_io(struct block_dev bio.bi_private = &event; bio.bi_end_io = bi_complete; submit_bio(rw, &bio); - blk_run_queues(); wait_for_completion(&event); return test_bit(BIO_UPTODATE, &bio.bi_flags); @@ -1644,6 +1669,7 @@ static int do_md_run(mddev_t * mddev) */ mddev->queue->queuedata = mddev; mddev->queue->make_request_fn = mddev->pers->make_request; + mddev->queue->unplug_fn = md_unplug_all; mddev->changed = 1; return 0; @@ -2718,7 +2744,7 @@ int md_thread(void * arg) run = thread->run; if (run) { run(thread->mddev); - blk_run_queues(); + md_unplug_mddev(thread->mddev); } if (signal_pending(current)) flush_signals(current); @@ -3287,7 +3313,7 @@ static void md_do_sync(mddev_t *mddev) test_bit(MD_RECOVERY_ERR, &mddev->recovery)) break; - blk_run_queues(); + md_unplug_mddev(mddev); repeat: if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { diff -puN drivers/md/raid1.c~per-backing_dev-unplugging drivers/md/raid1.c --- 25/drivers/md/raid1.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/drivers/md/raid1.c Thu Apr 8 15:07:53 2004 @@ -451,6 +451,7 @@ rb_out: static void device_barrier(conf_t *conf, sector_t sect) { + md_unplug_mddev(conf->mddev); spin_lock_irq(&conf->resync_lock); wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), conf->resync_lock); @@ -478,6 +479,7 @@ static int make_request(request_queue_t * thread 
has put up a bar for new requests. * Continue immediately if no resync is active currently. */ + md_unplug_mddev(conf->mddev); spin_lock_irq(&conf->resync_lock); wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock); conf->nr_pending++; @@ -644,6 +646,7 @@ static void print_conf(conf_t *conf) static void close_sync(conf_t *conf) { + md_unplug_mddev(conf->mddev); spin_lock_irq(&conf->resync_lock); wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock); spin_unlock_irq(&conf->resync_lock); diff -puN drivers/md/raid5.c~per-backing_dev-unplugging drivers/md/raid5.c --- 25/drivers/md/raid5.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/drivers/md/raid5.c Thu Apr 8 15:07:53 2004 @@ -249,6 +249,7 @@ static struct stripe_head *get_active_st break; if (!sh) { conf->inactive_blocked = 1; + md_unplug_mddev(conf->mddev); wait_event_lock_irq(conf->wait_for_stripe, !list_empty(&conf->inactive_list) && (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4) @@ -1292,9 +1293,8 @@ static inline void raid5_activate_delaye } } } -static void raid5_unplug_device(void *data) +static void raid5_unplug_device(request_queue_t *q) { - request_queue_t *q = data; mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev_to_conf(mddev); unsigned long flags; diff -puN drivers/md/raid6main.c~per-backing_dev-unplugging drivers/md/raid6main.c --- 25/drivers/md/raid6main.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/drivers/md/raid6main.c Thu Apr 8 15:07:53 2004 @@ -1454,9 +1454,8 @@ static inline void raid6_activate_delaye } } } -static void raid6_unplug_device(void *data) +static void raid6_unplug_device(request_queue_t *q) { - request_queue_t *q = data; mddev_t *mddev = q->queuedata; raid6_conf_t *conf = mddev_to_conf(mddev); unsigned long flags; diff -puN drivers/mtd/devices/blkmtd.c~per-backing_dev-unplugging drivers/mtd/devices/blkmtd.c --- 25/drivers/mtd/devices/blkmtd.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 
2004 +++ 25-akpm/drivers/mtd/devices/blkmtd.c Thu Apr 8 15:07:53 2004 @@ -147,8 +147,7 @@ static int blkmtd_readpage(struct blkmtd bio->bi_private = &event; bio->bi_end_io = bi_read_complete; if(bio_add_page(bio, page, PAGE_SIZE, 0) == PAGE_SIZE) { - submit_bio(READ, bio); - blk_run_queues(); + submit_bio(READ_SYNC, bio); wait_for_completion(&event); err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO; bio_put(bio); @@ -179,8 +178,7 @@ static int blkmtd_write_out(struct bio * init_completion(&event); bio->bi_private = &event; bio->bi_end_io = bi_write_complete; - submit_bio(WRITE, bio); - blk_run_queues(); + submit_bio(WRITE_SYNC, bio); wait_for_completion(&event); DEBUG(3, "submit_bio completed, bi_vcnt = %d\n", bio->bi_vcnt); err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO; diff -puN fs/buffer.c~per-backing_dev-unplugging fs/buffer.c --- 25/fs/buffer.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/fs/buffer.c Thu Apr 8 15:07:53 2004 @@ -132,7 +132,11 @@ void __wait_on_buffer(struct buffer_head do { prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); if (buffer_locked(bh)) { - blk_run_queues(); + struct block_device *bd; + smp_mb(); + bd = bh->b_bdev; + if (bd) + blk_run_address_space(bd->bd_inode->i_mapping); io_schedule(); } } while (buffer_locked(bh)); @@ -492,7 +496,6 @@ static void free_more_memory(void) pg_data_t *pgdat; wakeup_bdflush(1024); - blk_run_queues(); yield(); for_each_pgdat(pgdat) { @@ -2927,7 +2930,10 @@ EXPORT_SYMBOL(try_to_free_buffers); int block_sync_page(struct page *page) { - blk_run_queues(); + struct address_space *mapping; + smp_mb(); + mapping = page->mapping; + blk_run_address_space(mapping); return 0; } diff -puN fs/direct-io.c~per-backing_dev-unplugging fs/direct-io.c --- 25/fs/direct-io.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/fs/direct-io.c Thu Apr 8 15:07:53 2004 @@ -364,7 +364,7 @@ static struct bio *dio_await_one(struct if (dio->bio_list == NULL) { dio->waiter = current; 
spin_unlock_irqrestore(&dio->bio_lock, flags); - blk_run_queues(); + blk_run_address_space(dio->inode->i_mapping); io_schedule(); spin_lock_irqsave(&dio->bio_lock, flags); dio->waiter = NULL; @@ -1035,7 +1035,7 @@ direct_io_worker(int rw, struct kiocb *i if (ret == 0) ret = dio->result; finished_one_bio(dio); /* This can free the dio */ - blk_run_queues(); + blk_run_address_space(inode->i_mapping); if (should_wait) { unsigned long flags; /* diff -puN fs/jfs/jfs_logmgr.c~per-backing_dev-unplugging fs/jfs/jfs_logmgr.c --- 25/fs/jfs/jfs_logmgr.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/fs/jfs/jfs_logmgr.c Thu Apr 8 15:07:53 2004 @@ -1975,8 +1975,7 @@ static int lbmRead(struct jfs_log * log, bio->bi_end_io = lbmIODone; bio->bi_private = bp; - submit_bio(READ, bio); - blk_run_queues(); + submit_bio(READ_SYNC, bio); wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); @@ -2120,9 +2119,8 @@ static void lbmStartIO(struct lbuf * bp) /* check if journaling to disk has been disabled */ if (!log->no_integrity) { - submit_bio(WRITE, bio); + submit_bio(WRITE_SYNC, bio); INCREMENT(lmStat.submitted); - blk_run_queues(); } else { bio->bi_size = 0; diff -puN fs/ntfs/compress.c~per-backing_dev-unplugging fs/ntfs/compress.c --- 25/fs/ntfs/compress.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/fs/ntfs/compress.c Thu Apr 8 15:07:53 2004 @@ -23,6 +23,7 @@ #include #include +#include #include "ntfs.h" @@ -668,7 +669,7 @@ lock_retry_remap: "uptodate! 
Unplugging the disk queue " "and rescheduling."); get_bh(tbh); - blk_run_queues(); + blk_run_address_space(mapping); schedule(); put_bh(tbh); if (unlikely(!buffer_uptodate(tbh))) diff -puN fs/ufs/truncate.c~per-backing_dev-unplugging fs/ufs/truncate.c --- 25/fs/ufs/truncate.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/fs/ufs/truncate.c Thu Apr 8 15:07:53 2004 @@ -38,6 +38,7 @@ #include #include #include +#include #include #include "swab.h" @@ -456,7 +457,7 @@ void ufs_truncate (struct inode * inode) break; if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) ufs_sync_inode (inode); - blk_run_queues(); + blk_run_address_space(inode->i_mapping); yield(); } offset = inode->i_size & uspi->s_fshift; diff -puN fs/xfs/linux/xfs_buf.c~per-backing_dev-unplugging fs/xfs/linux/xfs_buf.c --- 25/fs/xfs/linux/xfs_buf.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/fs/xfs/linux/xfs_buf.c Thu Apr 8 15:07:53 2004 @@ -1013,7 +1013,7 @@ pagebuf_lock( { PB_TRACE(pb, "lock", 0); if (atomic_read(&pb->pb_io_remaining)) - blk_run_queues(); + blk_run_address_space(pb->pb_target->pbr_mapping); down(&pb->pb_sema); PB_SET_OWNER(pb); PB_TRACE(pb, "locked", 0); @@ -1109,7 +1109,7 @@ _pagebuf_wait_unpin( if (atomic_read(&pb->pb_pin_count) == 0) break; if (atomic_read(&pb->pb_io_remaining)) - blk_run_queues(); + blk_run_address_space(pb->pb_target->pbr_mapping); schedule(); } remove_wait_queue(&pb->pb_waiters, &wait); @@ -1407,7 +1407,7 @@ submit_io: if (pb->pb_flags & PBF_RUN_QUEUES) { pb->pb_flags &= ~PBF_RUN_QUEUES; if (atomic_read(&pb->pb_io_remaining) > 1) - blk_run_queues(); + blk_run_address_space(pb->pb_target->pbr_mapping); } } @@ -1471,7 +1471,7 @@ pagebuf_iowait( { PB_TRACE(pb, "iowait", 0); if (atomic_read(&pb->pb_io_remaining)) - blk_run_queues(); + blk_run_address_space(pb->pb_target->pbr_mapping); down(&pb->pb_iodonesema); PB_TRACE(pb, "iowaited", (long)pb->pb_error); return pb->pb_error; @@ -1617,7 +1617,6 @@ STATIC int pagebuf_daemon( void *data) 
{ - int count; page_buf_t *pb; struct list_head *curr, *next, tmp; @@ -1640,7 +1639,6 @@ pagebuf_daemon( spin_lock(&pbd_delwrite_lock); - count = 0; list_for_each_safe(curr, next, &pbd_delwrite_queue) { pb = list_entry(curr, page_buf_t, pb_list); @@ -1657,7 +1655,6 @@ pagebuf_daemon( pb->pb_flags &= ~PBF_DELWRI; pb->pb_flags |= PBF_WRITE; list_move(&pb->pb_list, &tmp); - count++; } } @@ -1667,12 +1664,11 @@ pagebuf_daemon( list_del_init(&pb->pb_list); pagebuf_iostrategy(pb); + blk_run_address_space(pb->pb_target->pbr_mapping); } if (as_list_len > 0) purge_addresses(); - if (count) - blk_run_queues(); force_flush = 0; } while (pagebuf_daemon_active); @@ -1689,7 +1685,6 @@ pagebuf_delwri_flush( page_buf_t *pb; struct list_head *curr, *next, tmp; int pincount = 0; - int flush_cnt = 0; pagebuf_runall_queues(pagebuf_dataio_workqueue); pagebuf_runall_queues(pagebuf_logio_workqueue); @@ -1733,14 +1728,8 @@ pagebuf_delwri_flush( pagebuf_lock(pb); pagebuf_iostrategy(pb); - if (++flush_cnt > 32) { - blk_run_queues(); - flush_cnt = 0; - } } - blk_run_queues(); - while (!list_empty(&tmp)) { pb = list_entry(tmp.next, page_buf_t, pb_list); @@ -1751,6 +1740,9 @@ pagebuf_delwri_flush( pagebuf_rele(pb); } + if (flags & PBDF_WAIT) + blk_run_address_space(target->pbr_mapping); + if (pinptr) *pinptr = pincount; } diff -puN include/linux/backing-dev.h~per-backing_dev-unplugging include/linux/backing-dev.h --- 25/include/linux/backing-dev.h~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/include/linux/backing-dev.h Thu Apr 8 15:07:53 2004 @@ -28,9 +28,12 @@ struct backing_dev_info { int memory_backed; /* Cannot clean pages with writepage */ congested_fn *congested_fn; /* Function pointer if device is md/dm */ void *congested_data; /* Pointer to aux data for congested func */ + void (*unplug_io_fn)(struct backing_dev_info *); + void *unplug_io_data; }; extern struct backing_dev_info default_backing_dev_info; +void default_unplug_io_fn(struct backing_dev_info *bdi); int 
writeback_acquire(struct backing_dev_info *bdi); int writeback_in_progress(struct backing_dev_info *bdi); diff -puN include/linux/bio.h~per-backing_dev-unplugging include/linux/bio.h --- 25/include/linux/bio.h~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/include/linux/bio.h Thu Apr 8 15:07:53 2004 @@ -119,11 +119,13 @@ struct bio { * bit 1 -- rw-ahead when set * bit 2 -- barrier * bit 3 -- fail fast, don't want low level driver retries + * bit 4 -- synchronous I/O hint: the block layer will unplug immediately */ #define BIO_RW 0 #define BIO_RW_AHEAD 1 #define BIO_RW_BARRIER 2 #define BIO_RW_FAILFAST 3 +#define BIO_RW_SYNC 4 /* * various member access, note that bio_data should of course not be used @@ -138,6 +140,7 @@ struct bio { #define bio_cur_sectors(bio) (bio_iovec(bio)->bv_len >> 9) #define bio_data(bio) (page_address(bio_page((bio))) + bio_offset((bio))) #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) +#define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC)) /* * will die diff -puN include/linux/blkdev.h~per-backing_dev-unplugging include/linux/blkdev.h --- 25/include/linux/blkdev.h~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/include/linux/blkdev.h Thu Apr 8 15:07:53 2004 @@ -243,7 +243,7 @@ typedef int (merge_requests_fn) (request typedef void (request_fn_proc) (request_queue_t *q); typedef int (make_request_fn) (request_queue_t *q, struct bio *bio); typedef int (prep_rq_fn) (request_queue_t *, struct request *); -typedef void (unplug_fn) (void *q); +typedef void (unplug_fn) (request_queue_t *); struct bio_vec; typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *); @@ -315,8 +315,6 @@ struct request_queue unsigned long bounce_pfn; int bounce_gfp; - struct list_head plug_list; - /* * various queue flags, see QUEUE_* below */ @@ -370,8 +368,9 @@ struct request_queue #define QUEUE_FLAG_WRITEFULL 4 /* read queue has been filled */ #define QUEUE_FLAG_DEAD 5 /* queue being torn down 
*/ #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */ +#define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */ -#define blk_queue_plugged(q) !list_empty(&(q)->plug_list) +#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) @@ -515,7 +514,7 @@ extern int scsi_cmd_ioctl(struct gendisk extern void blk_start_queue(request_queue_t *q); extern void blk_stop_queue(request_queue_t *q); extern void __blk_stop_queue(request_queue_t *q); -extern void blk_run_queue(request_queue_t *q); +extern void blk_run_queue(request_queue_t *); extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *); extern struct request *blk_rq_map_user(request_queue_t *, int, void __user *, unsigned int); extern int blk_rq_unmap_user(struct request *, void __user *, unsigned int); @@ -526,6 +525,18 @@ static inline request_queue_t *bdev_get_ return bdev->bd_disk->queue; } +static inline void blk_run_backing_dev(struct backing_dev_info *bdi) +{ + if (bdi && bdi->unplug_io_fn) + bdi->unplug_io_fn(bdi); +} + +static inline void blk_run_address_space(struct address_space *mapping) +{ + if (mapping) + blk_run_backing_dev(mapping->backing_dev_info); +} + /* * end_request() and friends. Must be called with the request queue spinlock * acquired. All functions called within end_request() _must_be_ atomic. 
@@ -572,7 +583,7 @@ extern struct backing_dev_info *blk_get_ extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); -extern void generic_unplug_device(void *); +extern void generic_unplug_device(request_queue_t *); extern long nr_blockdev_pages(void); int blk_get_queue(request_queue_t *); diff -puN include/linux/fs.h~per-backing_dev-unplugging include/linux/fs.h --- 25/include/linux/fs.h~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/include/linux/fs.h Thu Apr 8 15:07:53 2004 @@ -83,6 +83,8 @@ extern int leases_enable, dir_notify_ena #define WRITE 1 #define READA 2 /* read-ahead - don't block if no resources */ #define SPECIAL 4 /* For non-blockdevice requests in request queue */ +#define READ_SYNC (READ | (1 << BIO_RW_SYNC)) +#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC)) #define SEL_IN 1 #define SEL_OUT 2 diff -puN include/linux/raid/md_k.h~per-backing_dev-unplugging include/linux/raid/md_k.h --- 25/include/linux/raid/md_k.h~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/include/linux/raid/md_k.h Thu Apr 8 15:07:53 2004 @@ -326,7 +326,6 @@ do { \ if (condition) \ break; \ spin_unlock_irq(&lock); \ - blk_run_queues(); \ schedule(); \ spin_lock_irq(&lock); \ } \ @@ -341,30 +340,5 @@ do { \ __wait_event_lock_irq(wq, condition, lock); \ } while (0) - -#define __wait_disk_event(wq, condition) \ -do { \ - wait_queue_t __wait; \ - init_waitqueue_entry(&__wait, current); \ - \ - add_wait_queue(&wq, &__wait); \ - for (;;) { \ - set_current_state(TASK_UNINTERRUPTIBLE); \ - if (condition) \ - break; \ - blk_run_queues(); \ - schedule(); \ - } \ - current->state = TASK_RUNNING; \ - remove_wait_queue(&wq, &__wait); \ -} while (0) - -#define wait_disk_event(wq, condition) \ -do { \ - if (condition) \ - break; \ - __wait_disk_event(wq, condition); \ -} while (0) - #endif diff -puN kernel/power/disk.c~per-backing_dev-unplugging kernel/power/disk.c --- 
25/kernel/power/disk.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/kernel/power/disk.c Thu Apr 8 15:07:53 2004 @@ -84,7 +84,6 @@ static void free_some_memory(void) while (shrink_all_memory(10000)) printk("."); printk("|\n"); - blk_run_queues(); } diff -puN kernel/power/pmdisk.c~per-backing_dev-unplugging kernel/power/pmdisk.c --- 25/kernel/power/pmdisk.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/kernel/power/pmdisk.c Thu Apr 8 15:07:53 2004 @@ -859,7 +859,6 @@ static int end_io(struct bio * bio, unsi static void wait_io(void) { - blk_run_queues(); while(atomic_read(&io_done)) io_schedule(); } @@ -898,7 +897,7 @@ static int submit(int rw, pgoff_t page_o if (rw == WRITE) bio_set_pages_dirty(bio); start_io(); - submit_bio(rw,bio); + submit_bio(rw | (1 << BIO_RW_SYNC), bio); wait_io(); Done: bio_put(bio); diff -puN kernel/power/swsusp.c~per-backing_dev-unplugging kernel/power/swsusp.c --- 25/kernel/power/swsusp.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/kernel/power/swsusp.c Thu Apr 8 15:07:53 2004 @@ -707,11 +707,6 @@ int software_suspend(void) free_some_memory(); - /* No need to invalidate any vfsmnt list -- - * they will be valid after resume, anyway. - */ - blk_run_queues(); - /* Save state of all device drivers, and stop them. 
*/ if ((res = device_suspend(4))==0) /* If stopping device drivers worked, we proceed basically into diff -puN mm/mempool.c~per-backing_dev-unplugging mm/mempool.c --- 25/mm/mempool.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/mm/mempool.c Thu Apr 8 15:07:53 2004 @@ -234,8 +234,6 @@ repeat_alloc: if (!(gfp_mask & __GFP_WAIT)) return NULL; - blk_run_queues(); - prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); mb(); if (!pool->curr_nr) diff -puN mm/readahead.c~per-backing_dev-unplugging mm/readahead.c --- 25/mm/readahead.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/mm/readahead.c Thu Apr 8 15:07:53 2004 @@ -15,11 +15,16 @@ #include #include +void default_unplug_io_fn(struct backing_dev_info *bdi) +{ +} +EXPORT_SYMBOL(default_unplug_io_fn); + struct backing_dev_info default_backing_dev_info = { .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, .state = 0, + .unplug_io_fn = default_unplug_io_fn, }; - EXPORT_SYMBOL_GPL(default_backing_dev_info); /* @@ -32,7 +37,6 @@ file_ra_state_init(struct file_ra_state ra->ra_pages = mapping->backing_dev_info->ra_pages; ra->average = ra->ra_pages / 2; } - EXPORT_SYMBOL(file_ra_state_init); /* diff -puN mm/filemap.c~per-backing_dev-unplugging mm/filemap.c --- 25/mm/filemap.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/mm/filemap.c Thu Apr 8 15:07:53 2004 @@ -119,8 +119,10 @@ void remove_from_page_cache(struct page static inline int sync_page(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping; + smp_mb(); + mapping = page->mapping; if (mapping && mapping->a_ops && mapping->a_ops->sync_page) return mapping->a_ops->sync_page(page); return 0; diff -puN include/linux/raid/md.h~per-backing_dev-unplugging include/linux/raid/md.h --- 25/include/linux/raid/md.h~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/include/linux/raid/md.h Thu Apr 8 15:07:53 2004 @@ -76,6 +76,7 @@ extern void 
md_handle_safemode(mddev_t * extern void md_done_sync(mddev_t *mddev, int blocks, int ok); extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors); extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev); +extern void md_unplug_mddev(mddev_t *mddev); extern void md_print_devices (void); diff -puN include/linux/swap.h~per-backing_dev-unplugging include/linux/swap.h --- 25/include/linux/swap.h~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/include/linux/swap.h Thu Apr 8 15:07:53 2004 @@ -232,6 +232,8 @@ extern sector_t map_swap_page(struct swa extern struct swap_info_struct *get_swap_info_struct(unsigned); extern int can_share_swap_page(struct page *); extern int remove_exclusive_swap_page(struct page *); +struct backing_dev_info; +extern void swap_unplug_io_fn(struct backing_dev_info *); extern struct swap_list_t swap_list; extern spinlock_t swaplock; diff -puN mm/swapfile.c~per-backing_dev-unplugging mm/swapfile.c --- 25/mm/swapfile.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/mm/swapfile.c Thu Apr 8 15:07:53 2004 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -44,8 +45,64 @@ struct swap_list_t swap_list = {-1, -1}; struct swap_info_struct swap_info[MAX_SWAPFILES]; +/* + * Array of backing blockdevs, for swap_unplug_fn. We need this because the + * bdev->unplug_fn can sleep and we cannot hold swap_list_lock while calling + * the unplug_fn. And swap_list_lock cannot be turned into a semaphore. 
+ */ +static DECLARE_MUTEX(swap_bdevs_sem); +static struct block_device *swap_bdevs[MAX_SWAPFILES]; + #define SWAPFILE_CLUSTER 256 +/* + * Caller holds swap_bdevs_sem + */ +static void install_swap_bdev(struct block_device *bdev) +{ + int i; + + for (i = 0; i < MAX_SWAPFILES; i++) { + if (swap_bdevs[i] == NULL) { + swap_bdevs[i] = bdev; + return; + } + } + BUG(); +} + +static void remove_swap_bdev(struct block_device *bdev) +{ + int i; + + for (i = 0; i < MAX_SWAPFILES; i++) { + if (swap_bdevs[i] == bdev) { /* close the gap so the array stays packed; ranges overlap, hence memmove */ + memmove(&swap_bdevs[i], &swap_bdevs[i + 1], + (MAX_SWAPFILES - i - 1) * sizeof(*swap_bdevs)); + swap_bdevs[MAX_SWAPFILES - 1] = NULL; + return; + } + } + BUG(); +} + +void swap_unplug_io_fn(struct backing_dev_info *unused_bdi) +{ + int i; + + down(&swap_bdevs_sem); + for (i = 0; i < MAX_SWAPFILES; i++) { + struct block_device *bdev = swap_bdevs[i]; + struct backing_dev_info *bdi; + + if (bdev == NULL) + break; + bdi = bdev->bd_inode->i_mapping->backing_dev_info; + (*bdi->unplug_io_fn)(bdi); + } + up(&swap_bdevs_sem); +} + static inline int scan_swap_map(struct swap_info_struct *si) { unsigned long offset; @@ -1088,6 +1145,7 @@ asmlinkage long sys_swapoff(const char _ swap_list_unlock(); goto out_dput; } + down(&swap_bdevs_sem); swap_list_lock(); swap_device_lock(p); swap_file = p->swap_file; @@ -1099,6 +1157,8 @@ asmlinkage long sys_swapoff(const char _ destroy_swap_extents(p); swap_device_unlock(p); swap_list_unlock(); + remove_swap_bdev(p->bdev); + up(&swap_bdevs_sem); vfree(swap_map); if (S_ISBLK(mapping->host->i_mode)) { struct block_device *bdev = I_BDEV(mapping->host); @@ -1440,6 +1500,7 @@ asmlinkage long sys_swapon(const char __ if (error) goto bad_swap; + down(&swap_bdevs_sem); swap_list_lock(); swap_device_lock(p); p->flags = SWP_ACTIVE; @@ -1465,6 +1526,8 @@ asmlinkage long sys_swapon(const char __ } swap_device_unlock(p); swap_list_unlock(); + install_swap_bdev(p->bdev); + up(&swap_bdevs_sem); error = 0; goto out; bad_swap: @@ -1484,7 +1547,7 @@ 
bad_swap_2: destroy_swap_extents(p); if (swap_map) vfree(swap_map); - if (swap_file && !IS_ERR(swap_file)) + if (swap_file) filp_close(swap_file, NULL); out: if (page && !IS_ERR(page)) { diff -puN mm/swap_state.c~per-backing_dev-unplugging mm/swap_state.c --- 25/mm/swap_state.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/mm/swap_state.c Thu Apr 8 15:07:53 2004 @@ -19,6 +19,7 @@ static struct backing_dev_info swap_backing_dev_info = { .ra_pages = 0, /* No readahead */ .memory_backed = 1, /* Does not contribute to dirty memory */ + .unplug_io_fn = swap_unplug_io_fn, }; extern struct address_space_operations swap_aops; diff -puN drivers/block/rd.c~per-backing_dev-unplugging drivers/block/rd.c --- 25/drivers/block/rd.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/drivers/block/rd.c Thu Apr 8 15:07:53 2004 @@ -271,6 +271,7 @@ static int rd_ioctl(struct inode *inode, static struct backing_dev_info rd_backing_dev_info = { .ra_pages = 0, /* No readahead */ .memory_backed = 1, /* Does not contribute to dirty memory */ + .unplug_io_fn = default_unplug_io_fn, }; static int rd_open(struct inode *inode, struct file *filp) diff -puN mm/shmem.c~per-backing_dev-unplugging mm/shmem.c --- 25/mm/shmem.c~per-backing_dev-unplugging Thu Apr 8 15:07:53 2004 +++ 25-akpm/mm/shmem.c Thu Apr 8 15:07:53 2004 @@ -169,6 +169,7 @@ static struct vm_operations_struct shmem static struct backing_dev_info shmem_backing_dev_info = { .ra_pages = 0, /* No readahead */ .memory_backed = 1, /* Does not contribute to dirty memory */ + .unplug_io_fn = default_unplug_io_fn, }; LIST_HEAD(shmem_inodes); _