From: NeilBrown Signed-off-by: Neil Brown Signed-off-by: Andrew Morton --- 25-akpm/drivers/md/raid1.c | 182 +++++++++++++++++++++++++++++++------ 25-akpm/include/linux/raid/raid1.h | 16 ++- 2 files changed, 168 insertions(+), 30 deletions(-) diff -puN drivers/md/raid1.c~md-raid1-support-for-bitmap-intent-logging drivers/md/raid1.c --- 25/drivers/md/raid1.c~md-raid1-support-for-bitmap-intent-logging 2005-02-17 18:00:59.000000000 -0800 +++ 25-akpm/drivers/md/raid1.c 2005-02-17 18:00:59.000000000 -0800 @@ -12,6 +12,15 @@ * Fixes to reconstruction by Jakob Østergaard" * Various fixes by Neil Brown * + * Changes by Peter T. Breuer 31/1/2003 to support + * bitmapped intelligence in resync: + * + * - bitmap marked during normal i/o + * - bitmap used to skip nondirty blocks during sync + * + * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology: + * - persistent bitmap code + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) @@ -22,7 +31,16 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include "dm-bio-list.h" #include +#include + +#define DEBUG 0 +#if DEBUG +#define PRINTK(x...) printk(x) +#else +#define PRINTK(x...) +#endif /* * Number of guaranteed r1bios in case of extreme VM load: @@ -287,9 +305,11 @@ static int raid1_end_write_request(struc /* * this branch is our 'one mirror IO has finished' event handler: */ - if (!uptodate) + if (!uptodate) { md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); - else + /* an I/O failed, we can't clear the bitmap */ + set_bit(R1BIO_Degraded, &r1_bio->state); + } else /* * Set R1BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher @@ -309,6 +329,10 @@ static int raid1_end_write_request(struc * already. */ if (atomic_dec_and_test(&r1_bio->remaining)) { + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, + r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state)); md_write_end(r1_bio->mddev); raid_end_bio_io(r1_bio); } @@ -458,7 +482,10 @@ static void unplug_slaves(mddev_t *mddev static void raid1_unplug(request_queue_t *q) { - unplug_slaves(q->queuedata); + mddev_t *mddev = q->queuedata; + + unplug_slaves(mddev); + md_wakeup_thread(mddev->thread); } static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, @@ -501,16 +528,16 @@ static void device_barrier(conf_t *conf, { spin_lock_irq(&conf->resync_lock); wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), - conf->resync_lock, unplug_slaves(conf->mddev)); + conf->resync_lock, raid1_unplug(conf->mddev->queue)); if (!conf->barrier++) { wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, - conf->resync_lock, unplug_slaves(conf->mddev)); + conf->resync_lock, raid1_unplug(conf->mddev->queue)); if (conf->nr_pending) BUG(); } wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, - conf->resync_lock, unplug_slaves(conf->mddev)); + conf->resync_lock, raid1_unplug(conf->mddev->queue)); conf->next_resync = sect; spin_unlock_irq(&conf->resync_lock); } @@ -522,8 +549,12 @@ static int make_request(request_queue_t mirror_info_t *mirror; r1bio_t *r1_bio; struct bio *read_bio; - int i, disks; + int i, targets = 0, disks; mdk_rdev_t *rdev; + struct bitmap *bitmap = mddev->bitmap; + unsigned long flags; + struct bio_list bl; + /* * Register the new request and wait if the reconstruction @@ -554,7 +585,7 @@ static int make_request(request_queue_t r1_bio->master_bio = bio; r1_bio->sectors = bio->bi_size >> 9; - + r1_bio->state = 0; r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector; @@ -597,6 +628,13 @@ static int make_request(request_queue_t * bios[x] to bio */ disks = conf->raid_disks; +#if 0 + { static int first=1; + if (first) printk("First Write sector %llu disks %d\n", + (unsigned long long)r1_bio->sector, disks); + first = 0; + } +#endif rcu_read_lock(); for (i = 0; i < disks; i++) { if ((rdev=conf->mirrors[i].rdev) != NULL && @@ -607,13 +645,21 @@ static int make_request(request_queue_t r1_bio->bios[i] = NULL; } else r1_bio->bios[i] = bio; + targets++; } else r1_bio->bios[i] = NULL; } rcu_read_unlock(); - atomic_set(&r1_bio->remaining, 1); + if (targets < conf->raid_disks) { + /* array is degraded, we will not clear the bitmap + * on I/O completion (see raid1_end_write_request) */ + set_bit(R1BIO_Degraded, &r1_bio->state); + } + + atomic_set(&r1_bio->remaining, 0); + bio_list_init(&bl); for (i = 0; i < disks; i++) { struct bio *mbio; if (!r1_bio->bios[i]) @@ -629,14 +675,23 @@ static int make_request(request_queue_t mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); - generic_make_request(mbio); - } - if (atomic_dec_and_test(&r1_bio->remaining)) { - md_write_end(mddev); - raid_end_bio_io(r1_bio); + bio_list_add(&bl, mbio); } + bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors); + spin_lock_irqsave(&conf->device_lock, flags); + bio_list_merge(&conf->pending_bio_list, &bl); + bio_list_init(&bl); + + blk_plug_device(mddev->queue); + spin_unlock_irqrestore(&conf->device_lock, flags); + +#if 0 + while ((bio = bio_list_pop(&bl)) != NULL) + generic_make_request(bio); +#endif + return 0; } @@ -716,7 +771,7 @@ static void close_sync(conf_t *conf) { spin_lock_irq(&conf->resync_lock); wait_event_lock_irq(conf->wait_resume, !conf->barrier, - conf->resync_lock, unplug_slaves(conf->mddev)); + conf->resync_lock, raid1_unplug(conf->mddev->queue)); spin_unlock_irq(&conf->resync_lock); if (conf->barrier) BUG(); @@ -830,10 +885,11 @@ static int end_sync_read(struct bio *bio * or re-read if the read failed. * We don't do much here, just schedule handling by raid1d */ - if (!uptodate) + if (!uptodate) { md_error(r1_bio->mddev, conf->mirrors[r1_bio->read_disk].rdev); - else + set_bit(R1BIO_Degraded, &r1_bio->state); + } else set_bit(R1BIO_Uptodate, &r1_bio->state); rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); reschedule_retry(r1_bio); @@ -857,8 +913,10 @@ static int end_sync_write(struct bio *bi mirror = i; break; } - if (!uptodate) + if (!uptodate) { md_error(mddev, conf->mirrors[mirror].rdev); + set_bit(R1BIO_Degraded, &r1_bio->state); + } update_head_pos(mirror, r1_bio); if (atomic_dec_and_test(&r1_bio->remaining)) { @@ -878,6 +936,9 @@ static void sync_request_write(mddev_t * bio = r1_bio->bios[r1_bio->read_disk]; +/* + if (r1_bio->sector == 0) printk("First sync write startss\n"); +*/ /* * schedule writes */ @@ -905,10 +966,12 @@ static void sync_request_write(mddev_t * atomic_inc(&conf->mirrors[i].rdev->nr_pending); atomic_inc(&r1_bio->remaining); md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); + generic_make_request(wbio); } if (atomic_dec_and_test(&r1_bio->remaining)) { + /* if we're here, all write(s) have completed, so clean up */ md_done_sync(mddev, r1_bio->sectors, 1); put_buf(r1_bio); } @@ -937,6 +1000,26 @@ static void raid1d(mddev_t *mddev) for (;;) { char b[BDEVNAME_SIZE]; spin_lock_irqsave(&conf->device_lock, flags); + + if (conf->pending_bio_list.head) { + bio = bio_list_get(&conf->pending_bio_list); + blk_remove_plug(mddev->queue); + spin_unlock_irqrestore(&conf->device_lock, flags); + /* flush any pending bitmap writes to disk before proceeding w/ I/O */ + if (bitmap_unplug(mddev->bitmap) != 0) + printk("%s: bitmap file write failed!\n", mdname(mddev)); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + unplug = 1; + + continue; + } + if (list_empty(head)) break; r1_bio = list_entry(head->prev, r1bio_t, retry_list); @@ -1020,17 +1103,43 @@ static sector_t sync_request(mddev_t *md int disk; int i; int write_targets = 0; + int sync_blocks; if (!conf->r1buf_pool) + { +/* + printk("sync start - bitmap %p\n", mddev->bitmap); +*/ if (init_resync(conf)) return 0; + } max_sector = mddev->size << 1; if (sector_nr >= max_sector) { + /* If we aborted, we need to abort the + * sync on the 'current' bitmap chunk (there will + * only be one in raid1 resync. + * We can find the current addess in mddev->curr_resync + */ + if (!conf->fullsync) { + if (mddev->curr_resync < max_sector) + bitmap_end_sync(mddev->bitmap, + mddev->curr_resync, + &sync_blocks, 1); + bitmap_close_sync(mddev->bitmap); + } + if (mddev->curr_resync >= max_sector) + conf->fullsync = 0; close_sync(conf); return 0; } + if (!conf->fullsync && + !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks)) { + /* We can skip this block, and probably several more */ + *skipped = 1; + return sync_blocks; + } /* * If there is non-resync activity waiting for us then * put in a delay to throttle resync. @@ -1069,6 +1178,7 @@ static sector_t sync_request(mddev_t *md r1_bio->mddev = mddev; r1_bio->sector = sector_nr; + r1_bio->state = 0; set_bit(R1BIO_IsSync, &r1_bio->state); r1_bio->read_disk = disk; @@ -1103,6 +1213,11 @@ static sector_t sync_request(mddev_t *md bio->bi_bdev = conf->mirrors[i].rdev->bdev; bio->bi_private = r1_bio; } + + if (write_targets + 1 < conf->raid_disks) + /* array degraded, can't clear bitmap */ + set_bit(R1BIO_Degraded, &r1_bio->state); + if (write_targets == 0) { /* There is nowhere to write, so all non-sync * drives must be failed - so we are finished @@ -1122,6 +1237,14 @@ static sector_t sync_request(mddev_t *md len = (max_sector - sector_nr) << 9; if (len == 0) break; + if (!conf->fullsync && sync_blocks == 0) + if (!bitmap_start_sync(mddev->bitmap, + sector_nr, &sync_blocks)) + break; + if (sync_blocks < (PAGE_SIZE>>9)) + BUG(); + if (len > (sync_blocks<<9)) len = sync_blocks<<9; + for (i=0 ; i < conf->raid_disks; i++) { bio = r1_bio->bios[i]; if (bio->bi_end_io) { @@ -1144,6 +1267,7 @@ static sector_t sync_request(mddev_t *md } nr_sectors += len>>9; sector_nr += len>>9; + sync_blocks -= (len>>9); } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); bio_full: bio = r1_bio->bios[disk]; @@ -1236,6 +1360,9 @@ static int run(mddev_t *mddev) init_waitqueue_head(&conf->wait_idle); init_waitqueue_head(&conf->wait_resume); + bio_list_init(&conf->pending_bio_list); + bio_list_init(&conf->flushing_bio_list); + if (!conf->working_disks) { printk(KERN_ERR "raid1: no operational mirrors for %s\n", mdname(mddev)); @@ -1264,16 +1391,15 @@ static int run(mddev_t *mddev) conf->last_used = j; - - { - mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); - if (!mddev->thread) { - printk(KERN_ERR - "raid1: couldn't allocate thread for %s\n", - mdname(mddev)); - goto out_free_conf; - } + mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); + if (!mddev->thread) { + printk(KERN_ERR + "raid1: couldn't allocate thread for %s\n", + mdname(mddev)); + goto out_free_conf; } + if (mddev->bitmap) mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; + printk(KERN_INFO "raid1: raid set %s active with %d out of %d mirrors\n", mdname(mddev), mddev->raid_disks - mddev->degraded, @@ -1386,7 +1512,7 @@ static int raid1_reshape(mddev_t *mddev, spin_lock_irq(&conf->resync_lock); conf->barrier++; wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, - conf->resync_lock, unplug_slaves(mddev)); + conf->resync_lock, raid1_unplug(mddev->queue)); spin_unlock_irq(&conf->resync_lock); /* ok, everything is stopped */ diff -puN include/linux/raid/raid1.h~md-raid1-support-for-bitmap-intent-logging include/linux/raid/raid1.h --- 25/include/linux/raid/raid1.h~md-raid1-support-for-bitmap-intent-logging 2005-02-17 18:00:59.000000000 -0800 +++ 25-akpm/include/linux/raid/raid1.h 2005-02-17 18:00:59.000000000 -0800 @@ -36,12 +36,21 @@ struct r1_private_data_s { spinlock_t device_lock; struct list_head retry_list; + /* queue pending writes and submit them on unplug */ + struct bio_list pending_bio_list; + /* queue of writes that have been unplugged */ + struct bio_list flushing_bio_list; + /* for use when syncing mirrors: */ spinlock_t resync_lock; - int nr_pending; - int barrier; + int nr_pending; + int barrier; sector_t next_resync; + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ wait_queue_head_t wait_idle; wait_queue_head_t wait_resume; @@ -85,14 +94,17 @@ struct r1bio_s { int read_disk; struct list_head retry_list; + struct bitmap_update *bitmap_update; /* * if the IO is in WRITE direction, then multiple bios are used. * We choose the number when they are allocated. */ struct bio *bios[0]; + /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ }; /* bits for r1bio.state */ #define R1BIO_Uptodate 0 #define R1BIO_IsSync 1 +#define R1BIO_Degraded 2 #endif _