From: Jens Axboe

blk_rq_map_user() is a bit of a hack currently, since it drops back to
kmalloc() if bio_map_user() fails.  This is unfortunate since it means we
do no real segment or size checking (and the request segment counts
contain crap, already found one bug in a scsi lld).  It's also pretty
nasty for > PAGE_SIZE requests, as we attempt to do higher order page
allocations.  Even worse still, ide-cd will drop back to PIO for
non-sg/bio requests.  All in all, very suboptimal.

This patch adds bio_copy_user() which simply sets up a bio with kernel
pages and copies data as needed for reads and writes.  It also changes
bio_map_user() to return an error pointer like bio_copy_user(), so we can
return something sane to the user instead of always -ENOMEM.

Signed-off-by: Jens Axboe
Signed-off-by: Andrew Morton
---
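
For reference, a caller drives the reworked interface roughly like this.
This is only a sketch, not code from the patch: q, disk, rw, ubuf and len
are assumed to be supplied by the caller, and error handling is trimmed.

        struct request *rq;
        struct bio *bio;
        int err;

        /*
         * maps the user buffer for direct dma if it satisfies the queue
         * dma alignment, else bounces it through kernel pages
         */
        rq = blk_rq_map_user(q, rw, ubuf, len);
        if (IS_ERR(rq))
                return PTR_ERR(rq);

        /*
         * remember the bio, blk_rq_unmap_user() wants it back
         */
        bio = rq->bio;

        err = blk_execute_rq(q, disk, rq);

        /*
         * unmaps user pages, or copies bounced data back for a read;
         * also drops the request
         */
        if (blk_rq_unmap_user(rq, bio, len))
                err = -EFAULT;

        return err;

A matching sketch of the bio_copy_user()/bio_uncopy_user() pairing itself
follows the diff.
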
 25-akpm/drivers/block/ll_rw_blk.c  |   76 +++++++---------
 25-akpm/drivers/block/scsi_ioctl.c |    2 
 25-akpm/drivers/cdrom/cdrom.c      |    2 
 25-akpm/drivers/ide/ide-cd.c       |   18 ++--
 25-akpm/fs/bio.c                   |  166 +++++++++++++++++++++++++++++++------
 25-akpm/include/linux/bio.h        |    5 -
 25-akpm/include/linux/blkdev.h     |    2 
 7 files changed, 191 insertions(+), 80 deletions(-)

diff -puN drivers/block/ll_rw_blk.c~bio_copy_user-cleanups drivers/block/ll_rw_blk.c
--- 25/drivers/block/ll_rw_blk.c~bio_copy_user-cleanups	2004-07-29 22:56:31.468623992 -0700
+++ 25-akpm/drivers/block/ll_rw_blk.c	2004-07-29 22:56:31.487621104 -0700
@@ -1863,50 +1863,43 @@ EXPORT_SYMBOL(blk_insert_request);
 struct request *blk_rq_map_user(request_queue_t *q, int rw, void __user *ubuf,
                                 unsigned int len)
 {
-        struct request *rq = NULL;
-        char *buf = NULL;
+        unsigned long uaddr;
+        struct request *rq;
         struct bio *bio;
-        int ret;
+
+        if (len > (q->max_sectors << 9))
+                return ERR_PTR(-EINVAL);
+        if ((!len && ubuf) || (len && !ubuf))
+                return ERR_PTR(-EINVAL);
 
         rq = blk_get_request(q, rw, __GFP_WAIT);
         if (!rq)
                 return ERR_PTR(-ENOMEM);
 
-        bio = bio_map_user(q, NULL, (unsigned long) ubuf, len, rw == READ);
-        if (!bio) {
-                int bytes = (len + 511) & ~511;
-
-                buf = kmalloc(bytes, q->bounce_gfp | GFP_USER);
-                if (!buf) {
-                        ret = -ENOMEM;
-                        goto fault;
-                }
-
-                if (rw == WRITE) {
-                        if (copy_from_user(buf, ubuf, len)) {
-                                ret = -EFAULT;
-                                goto fault;
-                        }
-                } else
-                        memset(buf, 0, len);
-        }
+        /*
+         * if alignment requirement is satisfied, map in user pages for
+         * direct dma. else, set up kernel bounce buffers
+         */
+        uaddr = (unsigned long) ubuf;
+        if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q)))
+                bio = bio_map_user(q, NULL, uaddr, len, rw == READ);
+        else
+                bio = bio_copy_user(q, uaddr, len, rw == READ);
 
-        rq->bio = rq->biotail = bio;
-        if (rq->bio)
+        if (!IS_ERR(bio)) {
+                rq->bio = rq->biotail = bio;
                 blk_rq_bio_prep(q, rq, bio);
 
-        rq->buffer = rq->data = buf;
-        rq->data_len = len;
-        return rq;
-fault:
-        if (buf)
-                kfree(buf);
-        if (bio)
-                bio_unmap_user(bio, 1);
-        if (rq)
-                blk_put_request(rq);
+                rq->buffer = rq->data = NULL;
+                rq->data_len = len;
+                return rq;
+        }
 
-        return ERR_PTR(ret);
+        /*
+         * bio is the err-ptr
+         */
+        blk_put_request(rq);
+        return (struct request *) bio;
 }
 
 EXPORT_SYMBOL(blk_rq_map_user);
@@ -1920,18 +1913,15 @@ EXPORT_SYMBOL(blk_rq_map_user);
  * Description:
  *    Unmap a request previously mapped by blk_rq_map_user().
  */
-int blk_rq_unmap_user(struct request *rq, void __user *ubuf, struct bio *bio,
-                      unsigned int ulen)
+int blk_rq_unmap_user(struct request *rq, struct bio *bio, unsigned int ulen)
 {
-        const int read = rq_data_dir(rq) == READ;
         int ret = 0;
 
-        if (bio)
-                bio_unmap_user(bio, read);
-        if (rq->buffer) {
-                if (read && copy_to_user(ubuf, rq->buffer, ulen))
-                        ret = -EFAULT;
-                kfree(rq->buffer);
+        if (bio) {
+                if (bio_flagged(bio, BIO_USER_MAPPED))
+                        bio_unmap_user(bio);
+                else
+                        ret = bio_uncopy_user(bio);
         }
 
         blk_put_request(rq);
diff -puN drivers/block/scsi_ioctl.c~bio_copy_user-cleanups drivers/block/scsi_ioctl.c
--- 25/drivers/block/scsi_ioctl.c~bio_copy_user-cleanups	2004-07-29 22:56:31.469623840 -0700
+++ 25-akpm/drivers/block/scsi_ioctl.c	2004-07-29 22:56:31.488620952 -0700
@@ -211,7 +211,7 @@ static int sg_io(request_queue_t *q, str
                 hdr->sb_len_wr = len;
         }
 
-        if (blk_rq_unmap_user(rq, hdr->dxferp, bio, hdr->dxfer_len))
+        if (blk_rq_unmap_user(rq, bio, hdr->dxfer_len))
                 return -EFAULT;
 
         /* may not have succeeded, but output values written to control
diff -puN drivers/cdrom/cdrom.c~bio_copy_user-cleanups drivers/cdrom/cdrom.c
--- 25/drivers/cdrom/cdrom.c~bio_copy_user-cleanups	2004-07-29 22:56:31.471623536 -0700
+++ 25-akpm/drivers/cdrom/cdrom.c	2004-07-29 22:56:31.493620192 -0700
@@ -2091,7 +2091,7 @@ static int cdrom_read_cdda_bpc(struct cd
                         cdi->last_sense = s->sense_key;
         }
 
-        if (blk_rq_unmap_user(rq, ubuf, bio, len))
+        if (blk_rq_unmap_user(rq, bio, len))
                 ret = -EFAULT;
 
         if (ret)
diff -puN drivers/ide/ide-cd.c~bio_copy_user-cleanups drivers/ide/ide-cd.c
--- 25/drivers/ide/ide-cd.c~bio_copy_user-cleanups	2004-07-29 22:56:31.473623232 -0700
+++ 25-akpm/drivers/ide/ide-cd.c	2004-07-29 22:56:31.498619432 -0700
@@ -1961,13 +1961,17 @@ static ide_startstop_t cdrom_do_block_pc
          * sg request
          */
         if (rq->bio) {
-                if (rq->data_len & 3) {
-                        printk("%s: block pc not aligned, len=%d\n", drive->name, rq->data_len);
-                        cdrom_end_request(drive, 0);
-                        return ide_stopped;
-                }
-                info->dma = drive->using_dma;
+                int mask = drive->queue->dma_alignment;
+                unsigned long addr = (unsigned long) page_address(bio_page(rq->bio));
+
                 info->cmd = rq_data_dir(rq);
+                info->dma = drive->using_dma;
+
+                /*
+                 * check if dma is safe
+                 */
+                if ((rq->data_len & mask) || (addr & mask))
+                        info->dma = 0;
         }
 
         /* Start sending the command to the drive. */
@@ -3137,7 +3141,7 @@ int ide_cdrom_setup (ide_drive_t *drive)
         int nslots;
 
         blk_queue_prep_rq(drive->queue, ide_cdrom_prep_fn);
-        blk_queue_dma_alignment(drive->queue, 3);
+        blk_queue_dma_alignment(drive->queue, 31);
         drive->queue->unplug_delay = (1 * HZ) / 1000;
         if (!drive->queue->unplug_delay)
                 drive->queue->unplug_delay = 1;
diff -puN fs/bio.c~bio_copy_user-cleanups fs/bio.c
--- 25/fs/bio.c~bio_copy_user-cleanups	2004-07-29 22:56:31.474623080 -0700
+++ 25-akpm/fs/bio.c	2004-07-29 22:57:23.200759512 -0700
@@ -376,6 +376,115 @@ int bio_add_page(struct bio *bio, struct
                               len, offset);
 }
 
+/**
+ * bio_uncopy_user - finish previously mapped bio
+ * @bio: bio being terminated
+ *
+ * Free pages allocated from bio_copy_user() and write back data
+ * to user space in case of a read.
+ */
+int bio_uncopy_user(struct bio *bio)
+{
+        struct bio_vec *bvec;
+        int i, ret = 0;
+
+        if (bio_data_dir(bio) == READ) {
+                char *uaddr = bio->bi_private;
+
+                __bio_for_each_segment(bvec, bio, i, 0) {
+                        char *addr = page_address(bvec->bv_page);
+
+                        if (!ret && copy_to_user(uaddr, addr, bvec->bv_len))
+                                ret = -EFAULT;
+
+                        __free_page(bvec->bv_page);
+                        uaddr += bvec->bv_len;
+                }
+        }
+
+        bio_put(bio);
+        return ret;
+}
+
+/**
+ * bio_copy_user - copy user data to bio
+ * @q: destination block queue
+ * @uaddr: start of user address
+ * @len: length in bytes
+ * @write_to_vm: bool indicating writing to pages or not
+ *
+ * Prepares and returns a bio for indirect user io, bouncing data
+ * to/from kernel pages as necessary. Must be paired with a call to
+ * bio_uncopy_user() on io completion.
+ */
+struct bio *bio_copy_user(request_queue_t *q, unsigned long uaddr,
+                          unsigned int len, int write_to_vm)
+{
+        unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+        unsigned long start = uaddr >> PAGE_SHIFT;
+        struct bio_vec *bvec;
+        struct page *page;
+        struct bio *bio;
+        int i, ret;
+
+        bio = bio_alloc(GFP_KERNEL, end - start);
+        if (!bio)
+                return ERR_PTR(-ENOMEM);
+
+        ret = 0;
+        while (len) {
+                unsigned int bytes = PAGE_SIZE;
+
+                if (bytes > len)
+                        bytes = len;
+
+                page = alloc_page(q->bounce_gfp | GFP_KERNEL);
+                if (!page) {
+                        ret = -ENOMEM;
+                        break;
+                }
+
+                if (__bio_add_page(q, bio, page, bytes, 0) < bytes) {
+                        ret = -EINVAL;
+                        break;
+                }
+
+                len -= bytes;
+        }
+
+        /*
+         * success
+         */
+        if (!ret) {
+                if (!write_to_vm) {
+                        bio->bi_rw |= (1 << BIO_RW);
+                        /*
+                         * for a write, copy in data to kernel pages
+                         */
+                        ret = -EFAULT;
+                        bio_for_each_segment(bvec, bio, i) {
+                                char *addr = page_address(bvec->bv_page);
+
+                                if (copy_from_user(addr, (char *) uaddr, bvec->bv_len))
+                                        goto cleanup;
+                        }
+                }
+
+                bio->bi_private = (void *) uaddr;
+                return bio;
+        }
+
+        /*
+         * cleanup
+         */
+cleanup:
+        bio_for_each_segment(bvec, bio, i)
+                __free_page(bvec->bv_page);
+
+        bio_put(bio);
+        return ERR_PTR(ret);
+}
+
 static struct bio *__bio_map_user(request_queue_t *q, struct block_device *bdev,
                                   unsigned long uaddr, unsigned int len,
                                   int write_to_vm)
@@ -392,12 +501,13 @@ static struct bio *__bio_map_user(reques
          * size for now, in the future we can relax this restriction
          */
         if ((uaddr & queue_dma_alignment(q)) || (len & queue_dma_alignment(q)))
-                return NULL;
+                return ERR_PTR(-EINVAL);
 
         bio = bio_alloc(GFP_KERNEL, nr_pages);
         if (!bio)
-                return NULL;
+                return ERR_PTR(-ENOMEM);
 
+        ret = -ENOMEM;
         pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
         if (!pages)
                 goto out;
@@ -446,11 +556,12 @@ static struct bio *__bio_map_user(reques
         if (!write_to_vm)
                 bio->bi_rw |= (1 << BIO_RW);
 
+        bio->bi_flags |= (1 << BIO_USER_MAPPED);
         return bio;
 out:
         kfree(pages);
         bio_put(bio);
-        return NULL;
+        return ERR_PTR(ret);
 }
 
 /**
@@ -461,7 +572,7 @@ out:
  * @write_to_vm: bool indicating writing to pages or not
  *
  * Map the user space address into a bio suitable for io to a block
- * device.
+ * device. Returns an error pointer in case of error.
  */
 struct bio *bio_map_user(request_queue_t *q, struct block_device *bdev,
                          unsigned long uaddr, unsigned int len, int write_to_vm)
@@ -470,26 +581,29 @@ struct bio *bio_map_user(request_queue_t
 
         bio = __bio_map_user(q, bdev, uaddr, len, write_to_vm);
 
-        if (bio) {
-                /*
-                 * subtle -- if __bio_map_user() ended up bouncing a bio,
-                 * it would normally disappear when its bi_end_io is run.
-                 * however, we need it for the unmap, so grab an extra
-                 * reference to it
-                 */
-                bio_get(bio);
+        if (IS_ERR(bio))
+                return bio;
 
-                if (bio->bi_size < len) {
-                        bio_endio(bio, bio->bi_size, 0);
-                        bio_unmap_user(bio, 0);
-                        return NULL;
-                }
-        }
+        /*
+         * subtle -- if __bio_map_user() ended up bouncing a bio,
+         * it would normally disappear when its bi_end_io is run.
+         * however, we need it for the unmap, so grab an extra
+         * reference to it
+         */
+        bio_get(bio);
 
-        return bio;
+        if (bio->bi_size == len)
+                return bio;
+
+        /*
+         * don't support partial mappings
+         */
+        bio_endio(bio, bio->bi_size, 0);
+        bio_unmap_user(bio);
+        return ERR_PTR(-EINVAL);
 }
 
-static void __bio_unmap_user(struct bio *bio, int write_to_vm)
+static void __bio_unmap_user(struct bio *bio)
 {
         struct bio_vec *bvec;
         int i;
 
@@ -510,7 +624,7 @@ static void __bio_unmap_user(struct bio
          * make sure we dirty pages we wrote to
          */
         __bio_for_each_segment(bvec, bio, i, 0) {
-                if (write_to_vm)
+                if (bio_data_dir(bio) == READ)
                         set_page_dirty_lock(bvec->bv_page);
 
                 page_cache_release(bvec->bv_page);
@@ -522,17 +636,15 @@ static void __bio_unmap_user(struct bio
 /**
  * bio_unmap_user - unmap a bio
  * @bio:        the bio being unmapped
- * @write_to_vm:        bool indicating whether pages were written to
  *
- * Unmap a bio previously mapped by bio_map_user(). The @write_to_vm
- * must be the same as passed into bio_map_user(). Must be called with
+ * Unmap a bio previously mapped by bio_map_user(). Must be called with
  * a process context.
  *
  * bio_unmap_user() may sleep.
  */
-void bio_unmap_user(struct bio *bio, int write_to_vm)
+void bio_unmap_user(struct bio *bio)
 {
-        __bio_unmap_user(bio, write_to_vm);
+        __bio_unmap_user(bio);
         bio_put(bio);
 }
 
@@ -863,3 +975,5 @@ EXPORT_SYMBOL(bio_unmap_user);
 EXPORT_SYMBOL(bio_pair_release);
 EXPORT_SYMBOL(bio_split);
 EXPORT_SYMBOL(bio_split_pool);
+EXPORT_SYMBOL(bio_copy_user);
+EXPORT_SYMBOL(bio_uncopy_user);
diff -puN include/linux/bio.h~bio_copy_user-cleanups include/linux/bio.h
--- 25/include/linux/bio.h~bio_copy_user-cleanups	2004-07-29 22:56:31.476622776 -0700
+++ 25-akpm/include/linux/bio.h	2004-07-29 22:56:31.502618824 -0700
@@ -121,6 +121,7 @@ struct bio {
 #define BIO_CLONED      4       /* doesn't own data */
 #define BIO_BOUNCED     5       /* bio is a bounce bio */
 #define BIO_EOPNOTSUPP  6       /* not supported */
+#define BIO_USER_MAPPED 7       /* contains user pages */
 #define bio_flagged(bio, flag)  ((bio)->bi_flags & (1 << (flag)))
 
 /*
@@ -267,9 +268,11 @@ extern int bio_add_page(struct bio *, st
 extern int bio_get_nr_vecs(struct block_device *);
 extern struct bio *bio_map_user(struct request_queue *, struct block_device *,
                                 unsigned long, unsigned int, int);
-extern void bio_unmap_user(struct bio *, int);
+extern void bio_unmap_user(struct bio *);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
+extern struct bio *bio_copy_user(struct request_queue *, unsigned long, unsigned int, int);
+extern int bio_uncopy_user(struct bio *);
 
 #ifdef CONFIG_HIGHMEM
 /*
diff -puN include/linux/blkdev.h~bio_copy_user-cleanups include/linux/blkdev.h
--- 25/include/linux/blkdev.h~bio_copy_user-cleanups	2004-07-29 22:56:31.477622624 -0700
+++ 25-akpm/include/linux/blkdev.h	2004-07-29 22:56:31.503618672 -0700
@@ -535,7 +535,7 @@ extern void __blk_stop_queue(request_que
 extern void blk_run_queue(request_queue_t *);
 extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *);
 extern struct request *blk_rq_map_user(request_queue_t *, int, void __user *, unsigned int);
-extern int blk_rq_unmap_user(struct request *, void __user *, struct bio *, unsigned int);
+extern int blk_rq_unmap_user(struct request *, struct bio *, unsigned int);
 extern int blk_execute_rq(request_queue_t *, struct gendisk *,
                           struct request *);
 
 static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
_
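
As promised above, a minimal sketch of the bio_copy_user()/bio_uncopy_user()
pairing for code that builds its own bios. Again not code from the patch:
q, uaddr, len and rw are assumed caller-supplied, and bio submission and
completion are elided.

        struct bio *bio;

        /*
         * bounce the (possibly misaligned) user buffer into freshly
         * allocated kernel pages; for a write, the user data is
         * copied in here
         */
        bio = bio_copy_user(q, uaddr, len, rw == READ);
        if (IS_ERR(bio))
                return PTR_ERR(bio);

        /* ... submit bio and wait for completion ... */

        /*
         * for a read, copies the data back to user space and frees
         * the bounce pages; finally drops the bio reference
         */
        return bio_uncopy_user(bio);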