From: Ben Slusky

Modifying loop_copy_bio not to throw away pages fixes the deadlock under
high I/O.  The new loop_recycle_buffer subroutine allows the loop driver
to complete requests using fewer pages than it wanted.

 drivers/block/loop.c |  137 +++++++++++++++++++++++++++++++++++++++++----------
 1 files changed, 112 insertions(+), 25 deletions(-)

diff -puN drivers/block/loop.c~loop-recycle drivers/block/loop.c
--- 25/drivers/block/loop.c~loop-recycle	2003-12-29 22:33:22.000000000 -0800
+++ 25-akpm/drivers/block/loop.c	2003-12-29 22:33:22.000000000 -0800
@@ -426,7 +426,7 @@ static int loop_end_io_transfer(struct b
 	if (bio->bi_size)
 		return 1;
 
-	if (err || bio_rw(bio) == WRITE) {
+	if (err || (bio_rw(bio) == WRITE && bio->bi_vcnt == rbh->bi_vcnt)) {
 		bio_endio(rbh, rbh->bi_size, err);
 		if (atomic_dec_and_test(&lo->lo_pending))
 			up(&lo->lo_bh_mutex);
@@ -444,11 +444,11 @@ static struct bio *loop_copy_bio(struct
 	int i;
 
 	if (bio_rw(rbh) != WRITE) {
-		bio = bio_clone(rbh, __GFP_NOWARN);
+		bio = bio_clone(rbh, GFP_NOIO);
 		return bio;
 	}
 
-	bio = bio_alloc(__GFP_NOWARN, rbh->bi_vcnt);
+	bio = bio_alloc(GFP_NOIO, rbh->bi_vcnt);
 	if (!bio)
 		return NULL;
 
@@ -458,27 +458,26 @@ static struct bio *loop_copy_bio(struct
 	bio_for_each_segment(bv, rbh, i) {
 		struct bio_vec *bbv = &bio->bi_io_vec[i];
 
-		bbv->bv_page = alloc_page(__GFP_NOWARN|__GFP_HIGHMEM);
+		/* We need one page; the rest we can live without */
+		bbv->bv_page = alloc_page((bio->bi_vcnt ? __GFP_NOWARN : GFP_NOIO) | __GFP_HIGHMEM);
 		if (bbv->bv_page == NULL)
-			goto oom;
+			break;
 
-		bbv->bv_len = bv->bv_len;
 		bbv->bv_offset = bv->bv_offset;
+		bio->bi_size += (bbv->bv_len = bv->bv_len);
+		bio->bi_vcnt++;
+	}
+
+	/* Can't get anything done if we didn't get any pages */
+	if (unlikely(!bio->bi_vcnt)) {
+		bio_put(bio);
+		return NULL;
 	}
 
-	bio->bi_idx = rbh->bi_idx;
-	bio->bi_vcnt = rbh->bi_vcnt;
-	bio->bi_size = rbh->bi_size;
+	bio->bi_vcnt += (bio->bi_idx = rbh->bi_idx);
 	bio->bi_rw = rbh->bi_rw;
 
 	return bio;
-
-oom:
-	while (--i >= 0)
-		__free_page(bio->bi_io_vec[i].bv_page);
-
-	bio_put(bio);
-	return NULL;
 }
 
 static struct bio *loop_get_buffer(struct loop_device *lo, struct bio *rbh)
@@ -499,8 +498,10 @@ static struct bio *loop_get_buffer(struc
 		if (flags & PF_MEMALLOC)
 			current->flags |= PF_MEMALLOC;
 
-		if (bio == NULL)
+		if (unlikely(bio == NULL)) {
+			printk("loop: alloc failed\n");
 			blk_congestion_wait(WRITE, HZ/10);
+		}
 	} while (bio == NULL);
 
 	bio->bi_end_io = loop_end_io_transfer;
@@ -511,6 +512,71 @@ static struct bio *loop_get_buffer(struc
 	return bio;
 }
 
+static void loop_recycle_buffer(struct loop_device *lo, struct bio *bio)
+{
+	struct bio *rbh = bio->bi_private;
+	struct bio_vec *bv, *bbv, *rbv;
+	int i, flags, nvecs = bio->bi_vcnt - bio->bi_idx;
+
+	/*
+	 * Comments in ll_rw_blk.c reserve for generic_make_request the
+	 * right to "change bi_dev and bi_sector for remaps as it sees
+	 * fit."  Doh!  Workaround: reset the bi_bdev and recompute the
+	 * starting sector for the next write.
+	 */
+	bio->bi_bdev = lo->lo_device;
+	bio->bi_sector = rbh->bi_sector + (lo->lo_offset >> 9);
+	/* Clean up the flags too */
+	bio->bi_flags &= (~(BIO_POOL_MASK - 1) | (1 << BIO_UPTODATE));
+
+	/*
+	 * Move the allocated pages into position to transfer more data.
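+	 * Pages from the chunk that just finished (bi_idx .. bi_vcnt-1)
+	 * are shifted forward by nvecs so they line up with the next
+	 * unserved segments of rbh; any page that would land past the
+	 * end of rbh is freed instead.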
+	 */
+	__bio_for_each_segment(bv, bio, i, rbh->bi_idx) {
+		rbv = &rbh->bi_io_vec[i];
+		bbv = bv + nvecs;
+
+		/* Workaround -- see note above */
+		bio->bi_sector += rbv->bv_len >> 9;
+		if (i < bio->bi_idx)
+			continue;
+
+		if (i + nvecs < rbh->bi_vcnt) {
+			bbv->bv_page = bv->bv_page;
+			bbv->bv_offset = rbv->bv_offset;
+			bio->bi_size += (bbv->bv_len = rbv->bv_len);
+		} else
+			__free_page(bv->bv_page);
+		memset(bv, 0, sizeof(*bv));
+	}
+
+	bio->bi_idx = bio->bi_vcnt;
+	bio->bi_vcnt += nvecs;
+	bio->bi_vcnt = min(bio->bi_vcnt, rbh->bi_vcnt);
+
+	/*
+	 * If we need more pages, try to get some.
+	 * Clear PF_MEMALLOC to avoid consuming all available memory.
+	 */
+	flags = current->flags;
+	current->flags &= ~PF_MEMALLOC;
+
+	__bio_for_each_segment(rbv, rbh, i, bio->bi_vcnt) {
+		bv = &bio->bi_io_vec[i];
+
+		bv->bv_page = alloc_page(__GFP_NOWARN|__GFP_HIGHMEM);
+		if (bv->bv_page == NULL)
+			break;
+
+		bv->bv_offset = rbv->bv_offset;
+		bio->bi_size += (bv->bv_len = rbv->bv_len);
+		bio->bi_vcnt++;
+	}
+
+	if (flags & PF_MEMALLOC)
+		current->flags |= PF_MEMALLOC;
+}
+
 static int loop_transfer_bio(struct loop_device *lo,
 			     struct bio *to_bio, struct bio *from_bio)
 {
@@ -520,12 +586,14 @@ static int loop_transfer_bio(struct loop
 
 	IV = from_bio->bi_sector + (lo->lo_offset >> 9);
 
-	bio_for_each_segment(from_bvec, from_bio, i) {
-		to_bvec = &to_bio->bi_io_vec[i];
-		ret |= lo_do_transfer(lo, bio_data_dir(to_bio),
+	__bio_for_each_segment(to_bvec, to_bio, i, from_bio->bi_idx) {
+		from_bvec = &from_bio->bi_io_vec[i];
+		if (i >= to_bio->bi_idx) {
+			ret |= lo_do_transfer(lo, bio_data_dir(to_bio),
 					to_bvec->bv_page, to_bvec->bv_offset,
 					from_bvec->bv_page, from_bvec->bv_offset,
 					from_bvec->bv_len, IV);
+		}
 		IV += from_bvec->bv_len >> 9;
 	}
 
@@ -592,16 +660,30 @@ inactive:
 static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
 {
 	int ret;
+	struct bio *rbh;
 
-	/*
-	 * For block backed loop, we know this is a READ
-	 */
 	if (lo->lo_flags & LO_FLAGS_DO_BMAP) {
 		ret = do_bio_filebacked(lo, bio);
 		bio_endio(bio, bio->bi_size, ret);
-	} else {
-		struct bio *rbh = bio->bi_private;
+	} else if (bio_rw(bio) == WRITE) {
+		/*
+		 * Write complete, but more pages remain;
+		 * encrypt and write some more pages
+		 */
+		loop_recycle_buffer(lo, bio);
+
+		rbh = bio->bi_private;
+		if ((ret = loop_transfer_bio(lo, bio, rbh))) {
+			bio_endio(bio, bio->bi_size, ret);
+			return;
+		}
+		generic_make_request(bio);
+	} else {
+		/*
+		 * Read complete; do decryption now
+		 */
+		rbh = bio->bi_private;
 
 		ret = loop_transfer_bio(lo, bio, rbh);
 
 		bio_endio(rbh, rbh->bi_size, ret);
@@ -619,6 +701,7 @@ static int loop_thread(void *data)
 {
 	struct loop_device *lo = data;
 	struct bio *bio;
+	int x;
 
 	daemonize("loop%d", lo->lo_number);
 
@@ -653,7 +736,11 @@ static int loop_thread(void *data)
 			printk("loop: missing bio\n");
 			continue;
 		}
+
+		x = (lo->lo_flags & LO_FLAGS_DO_BMAP) || bio_rw(bio) != WRITE;
 		loop_handle_bio(lo, bio);
+		if (!x)
+			continue;
 
 		/*
 		 * upped both for pending work and tear-down, lo_pending
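
For illustration only -- not part of the patch, and every name in it is
invented: a minimal userspace sketch of the same recycling strategy, in
which a transfer of `total' bytes completes through however many bounce
pages the allocator grants, reusing each page as soon as its chunk has
been consumed rather than failing (or deadlocking) when fewer pages than
requested are available.

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define PAGE_SIZE 4096
	#define NPAGES    16	/* pages we would *like* to use */

	/* Copy `total' bytes via a bounce buffer of at most NPAGES pages. */
	static size_t recycle_copy(char *dst, const char *src, size_t total)
	{
		void *pages[NPAGES];
		int got = 0;
		size_t done = 0;

		/* Take as many pages as we can get; one is enough to proceed. */
		while (got < NPAGES && (pages[got] = malloc(PAGE_SIZE)) != NULL)
			got++;
		if (got == 0)
			return 0;	/* genuinely out of memory */

		/* Recycle the same pages until the whole request is served. */
		while (done < total) {
			for (int i = 0; i < got && done < total; i++) {
				size_t n = total - done;

				if (n > PAGE_SIZE)
					n = PAGE_SIZE;
				/* "transform" into the bounce page, then "write" it out */
				memcpy(pages[i], src + done, n);
				memcpy(dst + done, pages[i], n);
				done += n;
			}
		}

		while (got--)
			free(pages[got]);
		return done;
	}

	int main(void)
	{
		size_t len = 100 * PAGE_SIZE + 123;
		char *src = malloc(len), *dst = malloc(len);

		if (src == NULL || dst == NULL)
			return 1;
		memset(src, 0xab, len);
		printf("copied %zu of %zu bytes\n",
		       recycle_copy(dst, src, len), len);
		free(src);
		free(dst);
		return 0;
	}

As in the patch, an allocation shortfall costs extra round trips over
fewer pages instead of stalling the writeout path forever.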