From: Miquel van Smoorenburg

This implements the queue congestion callout for DM stacks, so that
bdi_read_congested() and bdi_write_congested() return correct information.

DESC
devicemapper: use rwlock for map alterations
EDESC
From: Miquel van Smoorenburg

This patch adds a rwlock_t maplock to the struct mapped_device.  The ->map
member is only updated with that lock held.  dm_any_congested takes the
lock, and so can be sure that the table under ->map won't change.

This patch is much smaller and simpler than dm-rwlock.patch.

DESC
Another DM maplock implementation
EDESC
From: Joe Thornber

Miquel,

On Sat, Mar 06, 2004 at 02:40:18PM +0100, Miquel van Smoorenburg wrote:
> dm-maplock.patch
>
> This patch adds a rwlock_t maplock to the struct mapped_device.
> The ->map member is only updated with that lock held. dm_any_congested
> takes the lock, and so can be sure that the table under ->map won't change.
>
> This patch is much smaller and simpler than dm-rwlock.patch.

The locking rules for this patch are still rather complicated:

- md->lock protects all fields in md

- md->map_lock also protects md->map

- if you want to read md->map you must read lock _either_ md->lock or
  md->map_lock.

- if you want to write md->map you must lock _both_ md->lock and
  md->map_lock

The patch below (applies to 2.6.4-rc2-mm1) is larger than your patch, but I
think the locking semantics are more intuitive:

- md->lock protects all fields in md _except_ md->map

- md->map_lock protects md->map

- Anyone who wants to read md->map should use dm_get_table(), which
  increments the table's reference count.

This means the spin lock is now only held for the duration of a reference
count increment.

dm.c: protect md->map with a rw spin lock rather than the md->lock
semaphore.  Also ensure that everyone accesses md->map through
dm_get_table(), rather than directly.

---

 25-akpm/drivers/md/dm-table.c |   17 +++++++
 25-akpm/drivers/md/dm.c       |   97 ++++++++++++++++++++++++++++++------------
 25-akpm/drivers/md/dm.h       |    1 
 3 files changed, 89 insertions(+), 26 deletions(-)

diff -puN drivers/md/dm.c~queue-congestion-dm-implementation drivers/md/dm.c
--- 25/drivers/md/dm.c~queue-congestion-dm-implementation	2004-03-18 16:57:14.007470040 -0800
+++ 25-akpm/drivers/md/dm.c	2004-03-18 18:53:17.308887320 -0800
@@ -49,6 +49,7 @@ struct target_io {
 
 struct mapped_device {
 	struct rw_semaphore lock;
+	rwlock_t map_lock;
 	atomic_t holders;
 
 	unsigned long flags;
@@ -237,6 +238,24 @@ static int queue_io(struct mapped_device
 	return 0;		/* deferred successfully */
 }
 
+/*
+ * Everyone (including functions in this file), should use this
+ * function to access the md->map field, and make sure they call
+ * dm_table_put() when finished.
+ */
+struct dm_table *dm_get_table(struct mapped_device *md)
+{
+	struct dm_table *t;
+
+	read_lock(&md->map_lock);
+	t = md->map;
+	if (t)
+		dm_table_get(t);
+	read_unlock(&md->map_lock);
+
+	return t;
+}
+
 /*-----------------------------------------------------------------
  * CRUD START:
  * A more elegant soln is in the works that uses the queue
@@ -345,6 +364,7 @@ static void __map_bio(struct dm_target *
 
 struct clone_info {
 	struct mapped_device *md;
+	struct dm_table *map;
 	struct bio *bio;
 	struct dm_io *io;
 	sector_t sector;
@@ -398,7 +418,7 @@ static struct bio *clone_bio(struct bio 
 
 static void __clone_and_map(struct clone_info *ci)
 {
 	struct bio *clone, *bio = ci->bio;
-	struct dm_target *ti = dm_table_find_target(ci->md->map, ci->sector);
+	struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
 	sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
 	struct target_io *tio;
@@ -459,7 +479,7 @@ static void __clone_and_map(struct clone
 		ci->sector += max;
 		ci->sector_count -= max;
 
-		ti = dm_table_find_target(ci->md->map, ci->sector);
+		ti = dm_table_find_target(ci->map, ci->sector);
 
 		len = to_sector(bv->bv_len) - max;
 		clone = split_bvec(bio, ci->sector, ci->idx,
@@ -484,6 +504,7 @@ static void __split_bio(struct mapped_de
 	struct clone_info ci;
 
 	ci.md = md;
+	ci.map = dm_get_table(md);
 	ci.bio = bio;
 	ci.io = alloc_io(md);
 	ci.io->error = 0;
@@ -500,6 +521,7 @@ static void __split_bio(struct mapped_de
 
 	/* drop the extra reference count */
 	dec_pending(ci.io, 0);
+	dm_table_put(ci.map);
 }
 /*-----------------------------------------------------------------
  * CRUD END
@@ -559,6 +581,22 @@ static int dm_request(request_queue_t *q
 	return 0;
 }
 
+static int dm_any_congested(void *congested_data, int bdi_bits)
+{
+	int r;
+	struct mapped_device *md = (struct mapped_device *) congested_data;
+	struct dm_table *map = dm_get_table(md);
+
+	if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
+		/* FIXME: shouldn't suspended count a congested ? */
+		r = bdi_bits;
+	else
+		r = dm_table_any_congested(map, bdi_bits);
+
+	dm_table_put(map);
+	return r;
+}
+
 /*-----------------------------------------------------------------
  * A bitset is used to keep track of allocated minor numbers.
  *---------------------------------------------------------------*/
@@ -630,6 +668,7 @@ static struct mapped_device *alloc_dev(u
 
 	memset(md, 0, sizeof(*md));
 	init_rwsem(&md->lock);
+	rwlock_init(&md->map_lock);
 	atomic_set(&md->holders, 1);
 
 	md->queue = blk_alloc_queue(GFP_KERNEL);
@@ -637,6 +676,8 @@ static struct mapped_device *alloc_dev(u
 		goto bad1;
 
 	md->queue->queuedata = md;
+	md->queue->backing_dev_info.congested_fn = dm_any_congested;
+	md->queue->backing_dev_info.congested_data = md;
 	blk_queue_make_request(md->queue, dm_request);
 
 	md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
@@ -727,22 +768,28 @@ static int __bind(struct mapped_device *
 	if (size == 0)
 		return 0;
 
+	write_lock(&md->map_lock);
 	md->map = t;
-	dm_table_event_callback(md->map, event_callback, md);
+	write_unlock(&md->map_lock);
 
 	dm_table_get(t);
+	dm_table_event_callback(md->map, event_callback, md);
 	dm_table_set_restrictions(t, q);
 
 	return 0;
 }
 
 static void __unbind(struct mapped_device *md)
 {
-	if (!md->map)
+	struct dm_table *map = md->map;
+
+	if (!map)
 		return;
 
-	dm_table_event_callback(md->map, NULL, NULL);
-	dm_table_put(md->map);
+	dm_table_event_callback(map, NULL, NULL);
+	write_lock(&md->map_lock);
 	md->map = NULL;
+	write_unlock(&md->map_lock);
+	dm_table_put(map);
 }
 
@@ -778,12 +825,16 @@ void dm_get(struct mapped_device *md)
 
 void dm_put(struct mapped_device *md)
 {
+	struct dm_table *map = dm_get_table(md);
+
 	if (atomic_dec_and_test(&md->holders)) {
-		if (!test_bit(DMF_SUSPENDED, &md->flags) && md->map)
-			dm_table_suspend_targets(md->map);
+		if (!test_bit(DMF_SUSPENDED, &md->flags) && map)
+			dm_table_suspend_targets(map);
 		__unbind(md);
 		free_dev(md);
 	}
+
+	dm_table_put(map);
 }
 
 /*
@@ -834,6 +885,7 @@ int dm_swap_table(struct mapped_device *
  */
 int dm_suspend(struct mapped_device *md)
 {
+	struct dm_table *map;
 	DECLARE_WAITQUEUE(wait, current);
 
 	down_write(&md->lock);
@@ -869,8 +921,11 @@ int dm_suspend(struct mapped_device *md)
 	down_write(&md->lock);
 	remove_wait_queue(&md->wait, &wait);
 	set_bit(DMF_SUSPENDED, &md->flags);
-	if (md->map)
-		dm_table_suspend_targets(md->map);
+
+	map = dm_get_table(md);
+	if (map)
+		dm_table_suspend_targets(map);
+	dm_table_put(map);
 	up_write(&md->lock);
 
 	return 0;
@@ -879,22 +934,25 @@ int dm_suspend(struct mapped_device *md)
 
 int dm_resume(struct mapped_device *md)
 {
 	struct bio *def;
+	struct dm_table *map = dm_get_table(md);
 
 	down_write(&md->lock);
-	if (!md->map ||
+	if (!map ||
 	    !test_bit(DMF_SUSPENDED, &md->flags) ||
-	    !dm_table_get_size(md->map)) {
+	    !dm_table_get_size(map)) {
 		up_write(&md->lock);
+		dm_table_put(map);
 		return -EINVAL;
 	}
 
-	dm_table_resume_targets(md->map);
+	dm_table_resume_targets(map);
 	clear_bit(DMF_SUSPENDED, &md->flags);
 	clear_bit(DMF_BLOCK_IO, &md->flags);
 
 	def = bio_list_get(&md->deferred);
 	__flush_deferred_io(md, def);
 	up_write(&md->lock);
+	dm_table_put(map);
 
 	blk_run_queues();
@@ -946,19 +1004,6 @@ struct gendisk *dm_disk(struct mapped_de
 	return md->disk;
 }
 
-struct dm_table *dm_get_table(struct mapped_device *md)
-{
-	struct dm_table *t;
-
-	down_read(&md->lock);
-	t = md->map;
-	if (t)
-		dm_table_get(t);
-	up_read(&md->lock);
-
-	return t;
-}
-
 int dm_suspended(struct mapped_device *md)
 {
 	return test_bit(DMF_SUSPENDED, &md->flags);
diff -puN drivers/md/dm.h~queue-congestion-dm-implementation drivers/md/dm.h
--- 25/drivers/md/dm.h~queue-congestion-dm-implementation	2004-03-18 16:57:14.008469888 -0800
+++ 25-akpm/drivers/md/dm.h	2004-03-18 18:48:35.815680792 -0800
@@ -115,6 +115,7 @@ struct list_head *dm_table_get_devices(s
 int dm_table_get_mode(struct dm_table *t);
 void dm_table_suspend_targets(struct dm_table *t);
 void dm_table_resume_targets(struct dm_table *t);
+int dm_table_any_congested(struct dm_table *t, int bdi_bits);
 
 /*-----------------------------------------------------------------
  * A registry of target types.
diff -puN drivers/md/dm-table.c~queue-congestion-dm-implementation drivers/md/dm-table.c
--- 25/drivers/md/dm-table.c~queue-congestion-dm-implementation	2004-03-18 16:57:14.009469736 -0800
+++ 25-akpm/drivers/md/dm-table.c	2004-03-18 18:53:17.309887168 -0800
@@ -279,6 +279,9 @@ void dm_table_get(struct dm_table *t)
 
 void dm_table_put(struct dm_table *t)
 {
+	if (!t)
+		return;
+
 	if (atomic_dec_and_test(&t->holders))
 		table_destroy(t);
 }
@@ -867,6 +870,20 @@ void dm_table_resume_targets(struct dm_t
 	}
 }
 
+int dm_table_any_congested(struct dm_table *t, int bdi_bits)
+{
+	struct list_head *d, *devices;
+	int r = 0;
+
+	devices = dm_table_get_devices(t);
+	for (d = devices->next; d != devices; d = d->next) {
+		struct dm_dev *dd = list_entry(d, struct dm_dev, list);
+		request_queue_t *q = bdev_get_queue(dd->bdev);
+		r |= bdi_congested(&q->backing_dev_info, bdi_bits);
+	}
+
+	return r;
+}
 EXPORT_SYMBOL(dm_vcalloc);
 EXPORT_SYMBOL(dm_get_device);
_
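
For readers following the locking discussion above, here is a minimal sketch
(not part of the patch) of the reader-side pattern the patch establishes.
The function name example_map_size() is made up for illustration and assumes
a valid struct mapped_device obtained elsewhere:

	/*
	 * Illustrative only: how a reader of md->map is expected to behave
	 * after this patch.  dm_get_table() takes md->map_lock briefly and
	 * returns the table with its reference count raised (or NULL), so
	 * the table cannot be swapped or freed underneath us while in use.
	 */
	static sector_t example_map_size(struct mapped_device *md)
	{
		struct dm_table *map = dm_get_table(md);
		sector_t size = 0;

		if (map)
			size = dm_table_get_size(map);

		dm_table_put(map);	/* NULL-safe with the dm-table.c hunk above */
		return size;
	}

The point of the design is that md->map_lock is only held for the
reference-count manipulation inside dm_get_table(), so callers such as
dm_any_congested() never do real work under the spin lock.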