diff --exclude=net -urN v2.4.13-ac6/MAINTAINERS aio-v2.4.13-ac6.diff/MAINTAINERS
--- v2.4.13-ac6/MAINTAINERS	Fri Nov 2 12:56:03 2001
+++ aio-v2.4.13-ac6.diff/MAINTAINERS	Fri Nov 2 13:21:12 2001
@@ -220,6 +220,12 @@
 L:	linux-net@vger.kernel.org
 S:	Maintained
 
+ASYNC IO
+P:	Benjamin LaHaise
+M:	bcrl@redhat.com
+L:	linux-aio@kvack.org
+S:	Maintained
+
 AX.25 NETWORK LAYER
 P:	Matthias Welwarsky
 M:	dg2fef@afthd.tu-darmstadt.de
diff --exclude=net -urN v2.4.13-ac6/Makefile aio-v2.4.13-ac6.diff/Makefile
--- v2.4.13-ac6/Makefile	Fri Nov 2 12:56:03 2001
+++ aio-v2.4.13-ac6.diff/Makefile	Fri Nov 2 13:21:35 2001
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 4
 SUBLEVEL = 13
-EXTRAVERSION = -ac6
+EXTRAVERSION = -ac6-acio
 
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 
@@ -437,6 +437,7 @@
 		\( -name '*.orig' -o -name '*.rej' -o -name '*~' \
 		-o -name '*.bak' -o -name '#*#' -o -name '.*.orig' \
 		-o -name '.*.rej' -o -name '.SUMS' -o -size 0 \) -type f -print` TAGS tags
+	rm -f drivers/scsi/53c700-mem.c
 
 backup: mrproper
 	cd .. && tar cf - linux/ | gzip -9 > backup.gz
diff --exclude=net -urN v2.4.13-ac6/arch/i386/kernel/entry.S aio-v2.4.13-ac6.diff/arch/i386/kernel/entry.S
--- v2.4.13-ac6/arch/i386/kernel/entry.S	Fri Nov 2 12:56:03 2001
+++ aio-v2.4.13-ac6.diff/arch/i386/kernel/entry.S	Fri Nov 2 14:06:55 2001
@@ -632,6 +632,30 @@
 	.long SYMBOL_NAME(sys_ni_syscall)	/* Reserved for Security */
 	.long SYMBOL_NAME(sys_gettid)
 	.long SYMBOL_NAME(sys_readahead)	/* 225 */
+	.long SYMBOL_NAME(sys_ni_syscall)
+	.long SYMBOL_NAME(sys_ni_syscall)
+	.long SYMBOL_NAME(sys_ni_syscall)
+	.long SYMBOL_NAME(sys_ni_syscall)
+	.long SYMBOL_NAME(sys_ni_syscall)	/* 230 */
+	.long SYMBOL_NAME(sys_ni_syscall)
+	.long SYMBOL_NAME(sys_ni_syscall)
+	.long SYMBOL_NAME(sys_ni_syscall)
+	.long SYMBOL_NAME(sys_ni_syscall)
+	.long SYMBOL_NAME(sys_ni_syscall)	/* 235 */
+	.long SYMBOL_NAME(sys_ni_syscall)
+	.long SYMBOL_NAME(sys_ni_syscall)
+	.long SYMBOL_NAME(sys_ni_syscall)
+	.long SYMBOL_NAME(sys_ni_syscall)
+	.long SYMBOL_NAME(sys_ni_syscall)	/* 240 */
+	.long SYMBOL_NAME(sys_ni_syscall)
+	.long SYMBOL_NAME(sys_ni_syscall)
+	.long SYMBOL_NAME(sys_ni_syscall)	/* 243 */
+	.long SYMBOL_NAME(sys___io_setup)	/* 244 */
+	.long SYMBOL_NAME(sys___io_destroy)
+	.long SYMBOL_NAME(sys___io_submit)
+	.long SYMBOL_NAME(sys___io_cancel)
+	.long SYMBOL_NAME(sys___io_wait)
+	.long SYMBOL_NAME(sys___io_getevents)
 
 	.rept NR_syscalls-(.-sys_call_table)/4
 		.long SYMBOL_NAME(sys_ni_syscall)
diff --exclude=net -urN v2.4.13-ac6/drivers/block/loop.c aio-v2.4.13-ac6.diff/drivers/block/loop.c
--- v2.4.13-ac6/drivers/block/loop.c	Fri Nov 2 12:56:05 2001
+++ aio-v2.4.13-ac6.diff/drivers/block/loop.c	Fri Nov 2 13:21:12 2001
@@ -283,7 +283,7 @@
 	spin_lock_irq(&lo->lo_lock);
 	file = lo->lo_backing_file;
 	spin_unlock_irq(&lo->lo_lock);
-	do_generic_file_read(file, &pos, &desc, lo_read_actor);
+	do_generic_file_read(file, &pos, &desc, lo_read_actor, 0);
 	return desc.error;
 }
diff --exclude=net -urN v2.4.13-ac6/drivers/char/raw.c aio-v2.4.13-ac6.diff/drivers/char/raw.c
--- v2.4.13-ac6/drivers/char/raw.c	Fri Nov 2 12:56:06 2001
+++ aio-v2.4.13-ac6.diff/drivers/char/raw.c	Wed Nov 14 22:51:52 2001
@@ -16,6 +16,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #define dprintk(x...) 
@@ -36,13 +38,18 @@
 int raw_open(struct inode *, struct file *);
 int raw_release(struct inode *, struct file *);
 int raw_ctl_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
-
+int raw_kvec_read(struct file *filp, kvec_cb_t cb, size_t size, loff_t pos);
+int raw_kvec_write(struct file *filp, kvec_cb_t cb, size_t size, loff_t pos);
 
 static struct file_operations raw_fops = {
 	read:		raw_read,
 	write:		raw_write,
 	open:		raw_open,
 	release:	raw_release,
+	aio_read:	generic_file_aio_read,
+	aio_write:	generic_file_aio_write,
+	kvec_read:	raw_kvec_read,
+	kvec_write:	raw_kvec_write,
 };
 
 static struct file_operations raw_ctl_fops = {
@@ -261,7 +268,6 @@
 }
 
-
 ssize_t	raw_read(struct file *filp, char * buf, size_t size, loff_t *offp)
 {
@@ -394,3 +400,100 @@
 out:
 	return err;
 }
+
+static int raw_kvec_rw(struct file *filp, int rw, kvec_cb_t cb, size_t size, loff_t pos);
+int raw_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	return raw_kvec_rw(file, READ, cb, size, pos);
+}
+
+int raw_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	return raw_kvec_rw(file, WRITE, cb, size, pos);
+}
+
+int raw_kvec_rw(struct file *filp, int rw, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	int		err;
+	unsigned	minor;
+	kdev_t		dev;
+	unsigned long	limit, blocknr, blocks;
+
+	unsigned	sector_size, sector_bits, sector_mask;
+	unsigned	max_sectors;
+	unsigned	i;
+
+	pr_debug("raw_kvec_rw: %p %d %d %p %d %d %Lu\n", filp, rw, nr, kiovec, flags, size, pos);
+	/*
+	 * First, a few checks on device size limits
+	 */
+
+	minor = MINOR(filp->f_dentry->d_inode->i_rdev);
+	dev = to_kdev_t(raw_devices[minor].binding->bd_dev);
+	sector_size = raw_devices[minor].sector_size;
+	sector_bits = raw_devices[minor].sector_bits;
+	sector_mask = sector_size- 1;
+	max_sectors = 25000; //KIO_MAX_SECTORS >> (sector_bits - 9);
+
+	if (blk_size[MAJOR(dev)])
+		limit = (((loff_t) blk_size[MAJOR(dev)][MINOR(dev)]) << BLOCK_SIZE_BITS) >> sector_bits;
+	else
+		limit = INT_MAX;
+	pr_debug ("raw_kvec_rw: dev %d:%d (+%d)\n",
+		 MAJOR(dev), MINOR(dev), limit);
+
+	/* EOF at the end */
+	err = -EEOF;
+	if (!size || (pos >> sector_bits) == limit) {
+		pr_debug("raw_kvec_rw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits);
+		goto out;
+	}
+
+	/* ENXIO for io beyond the end */
+	err = -ENXIO;
+	if ((pos >> sector_bits) >= limit) {
+		pr_debug("raw_kvec_rw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits);
+		goto out;
+	}
+
+	err = -EINVAL;
+	if ((pos < 0) || (pos & sector_mask) || (size & sector_mask)) {
+		pr_debug("pos(%Ld)/size(%lu) wrong(%d)\n", pos, size, sector_mask);
+		goto out;
+	}
+
+	/* Verify that the scatter-gather list is sector aligned. */
+	for (i=0; i<cb.vec->nr; i++)
+		if ((cb.vec->veclet[i].offset & sector_mask) ||
+		    (cb.vec->veclet[i].length & sector_mask)) {
+			pr_debug("veclet offset/length wrong");
+			goto out;
+		}
+
+	/*
+	 * Split the IO into KIO_MAX_SECTORS chunks, mapping and
+	 * unmapping the single kiobuf as we go to perform each chunk of
+	 * IO.
+ */ + + blocknr = pos >> sector_bits; + blocks = size >> sector_bits; + if (blocks > max_sectors) + blocks = max_sectors; + if (blocks > limit - blocknr) + blocks = limit - blocknr; + err = -ENXIO; + if (!blocks) { + pr_debug("raw: !blocks %d %ld %ld\n", max_sectors, limit, blocknr); + goto out; + } + + err = brw_kvec_async(rw, cb, dev, blocks, blocknr, sector_bits); + if (err) + printk("raw_kvec_rw: %d\n", err); + +out: + pr_debug("raw_kvec_rw: ret is %d\n", err); + return err; +} + diff --exclude=net -urN v2.4.13-ac6/drivers/pnp/isapnp.c aio-v2.4.13-ac6.diff/drivers/pnp/isapnp.c --- v2.4.13-ac6/drivers/pnp/isapnp.c Thu Nov 1 16:40:00 2001 +++ aio-v2.4.13-ac6.diff/drivers/pnp/isapnp.c Thu Nov 15 12:23:09 2001 @@ -1707,7 +1707,7 @@ { /* IRQ priority: this table is good for i386 */ static unsigned short xtab[16] = { - 5, 10, 11, 12, 9, 14, 15, 7, 3, 4, 13, 0, 1, 6, 8, 2 + 5, 10, 11, 9, 14, 15, 7, 3, 4, 13, 0, 1, 6, 8, 2, 12 }; int err, i; unsigned long *value1, *value2; diff --exclude=net -urN v2.4.13-ac6/drivers/scsi/aic7xxx_old.c aio-v2.4.13-ac6.diff/drivers/scsi/aic7xxx_old.c --- v2.4.13-ac6/drivers/scsi/aic7xxx_old.c Thu Nov 1 16:40:00 2001 +++ aio-v2.4.13-ac6.diff/drivers/scsi/aic7xxx_old.c Fri Nov 2 13:21:12 2001 @@ -11956,7 +11956,7 @@ #include "aic7xxx_old/aic7xxx_proc.c" -MODULE_LICENSE("Dual BSD/GPL"); +//MODULE_LICENSE("Dual BSD/GPL"); /* Eventually this will go into an include file, but this will be later */ diff --exclude=net -urN v2.4.13-ac6/fs/Makefile aio-v2.4.13-ac6.diff/fs/Makefile --- v2.4.13-ac6/fs/Makefile Fri Nov 2 12:56:10 2001 +++ aio-v2.4.13-ac6.diff/fs/Makefile Fri Nov 2 13:21:12 2001 @@ -12,7 +12,7 @@ obj-y := open.o read_write.o devices.o file_table.o buffer.o \ super.o block_dev.o char_dev.o stat.o exec.o pipe.o namei.o \ - fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ + fcntl.o ioctl.o readdir.o select.o fifo.o locks.o aio.o \ dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \ filesystems.o jbd-kernel.o namespace.o diff --exclude=net -urN v2.4.13-ac6/fs/aio.c aio-v2.4.13-ac6.diff/fs/aio.c --- v2.4.13-ac6/fs/aio.c Wed Dec 31 19:00:00 1969 +++ aio-v2.4.13-ac6.diff/fs/aio.c Thu Nov 15 21:23:35 2001 @@ -0,0 +1,821 @@ +/* fs/aio.c + * An async IO implementation for Linux + * Written by Benjamin LaHaise + * + * Implements an efficient asynchronous io interface. + * + * Copyright 2000, 2001 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +//#define DEBUG 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define MAX_IOCTXS 0x800 +#if DEBUG > 1 +#define dprintk printk +#else +#define dprintk(x...) 
do { ; } while (0)
+#endif
+
+static spinlock_t aio_read_lock = SPIN_LOCK_UNLOCKED;
+static rwlock_t aio_req_lock = RW_LOCK_UNLOCKED;
+
+static kmem_cache_t	*kiocb_cachep;
+static kmem_cache_t	*kioctx_cachep;
+
+/* Needs replacement rsn. */
+static struct kioctx *ioctx_list;
+static unsigned long new_ioctx_id;
+
+/* tunable.  Needs to be added to sysctl. */
+int max_aio_reqs = 0x10000;
+
+/* Used for rare fput completion. */
+static void aio_fput_routine(void *);
+static struct tq_struct fput_tqueue = {
+	routine:	aio_fput_routine,
+};
+
+static struct kiocb *fput_iocbs;
+static spinlock_t fput_lock;
+
+/* aio_setup
+ *	Creates the slab caches used by the aio routines, panic on
+ *	failure as this is done early during the boot sequence.
+ */
+static int __init aio_setup(void)
+{
+	kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb),
+				0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!kiocb_cachep)
+		panic("unable to create kiocb cache\n");
+
+	kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx),
+				0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!kioctx_cachep)
+		panic("unable to create kioctx cache");
+
+	printk(KERN_NOTICE "aio_setup: okay!\n");
+	printk(KERN_NOTICE "aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
+
+	return 0;
+}
+
+/* ioctx_alloc
+ *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
+ */
+static struct kioctx *ioctx_alloc(unsigned nr_reqs)
+{
+	struct kioctx *ctx;
+	unsigned i;
+	long size;
+
+	/* Round off to a power of 2.  Needed for cheap mask operations */
+	for (i=1; i<nr_reqs; i<<=1)
+		;
+	nr_reqs = i;
+
+	if ((nr_reqs > (0x70000000U / sizeof(struct io_event))) ||
+	    (nr_reqs > (0x70000000U / sizeof(struct kiocb)))) {
+		pr_debug("ENOMEM: nr_reqs too high\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->max_reqs = nr_reqs;
+
+	atomic_set(&ctx->users, 1);
+	spin_lock_init(&ctx->lock);
+	init_waitqueue_head(&ctx->wait);
+
+	size = sizeof(struct kiocb) * nr_reqs;
+	ctx->reqs = kmalloc(size, GFP_KERNEL);
+	if (!ctx->reqs)
+		goto out_freectx;
+
+	memset(ctx->reqs, 0, size);
+	for (i=0; i<nr_reqs; i++) {
+		ctx->reqs[i].ctx = ctx;
+		ctx->reqs[i].user_obj = ctx->reqs + i + 1;
+	}
+	ctx->reqs[nr_reqs-1].user_obj = NULL;
+	ctx->free_req = ctx->reqs;
+
+	size = sizeof(struct aio_ring);
+	size += sizeof(struct io_event) * nr_reqs;
+	/* This limits things somewhat for now. */
+	ctx->ring = kmalloc(size, GFP_KERNEL);
+	if (!ctx->ring)
+		goto out_freereqs;
+
+	memset(ctx->ring, 0, size);
+	ctx->mm = current->mm;
+	ctx->ring_mask = nr_reqs - 1;		/* trusted copy */
+	ctx->ring->mask = ctx->ring_mask;	/* user copy */
+
+	/* now link into global list.  kludge.  FIXME */
+	write_lock(&aio_req_lock);			/* FIXME */
+	ctx->ring->id = ctx->user_id = new_ioctx_id++;	/* FIXME */
+	ctx->next = ioctx_list;				/* FIXME */
+	ioctx_list = ctx;				/* FIXME */
+	write_unlock(&aio_req_lock);			/* FIXME */
+
+	dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
+		ctx, ctx->user_id, ctx->mm, ctx->ring->mask);
+	return ctx;
+
+out_freereqs:
+	kfree(ctx->reqs);
+out_freectx:
+	kmem_cache_free(kioctx_cachep, ctx);
+	ctx = ERR_PTR(-ENOMEM);
+
+	dprintk("aio: error allocating ioctx %p\n", ctx);
+	return ctx;
+}
+
+/* __put_ioctx
+ *	Called when the last user of an aio context has gone away,
+ *	and the struct needs to be freed.
+ */ +void __put_ioctx(struct kioctx *ctx) +{ + printk("aio: free ioctx %p\n", ctx); + + kfree(ctx->ring); + kfree(ctx->reqs); + kmem_cache_free(kioctx_cachep, ctx); +} + +/* aio_get_req + * Allocate a slot for an aio request. Increments the users count + * of the kioctx so that the kioctx stays around until all requests are + * complete. Returns -EAGAIN if no requests are free. + */ +static inline struct kiocb *aio_get_req(struct kioctx *ctx) +{ + struct kiocb *req; + + /* Use cmpxchg instead of spin_lock? */ + spin_lock_irq(&ctx->lock); + req = ctx->free_req; + if (req) { + ctx->free_req = req->user_obj; + spin_unlock_irq(&ctx->lock); + req->user_obj = NULL; + + get_ioctx(ctx); + return req; + } + spin_unlock_irq(&ctx->lock); + + return NULL; +} + +static void aio_fput_routine(void *data) +{ + struct kiocb *req; + + spin_lock_irq(&fput_lock); + req = fput_iocbs; + fput_iocbs = NULL; + spin_unlock_irq(&fput_lock); + + while (req) { + struct kioctx *ctx = req->ctx; + struct kiocb *next = req->user_obj; + + /* Complete the fput */ + __fput(req->filp); + + /* Link the iocb into the context's free list */ + spin_lock_irq(&ctx->lock); + req->filp = NULL; + req->user_obj = ctx->free_req; + ctx->free_req = req; + spin_unlock_irq(&ctx->lock); + + put_ioctx(ctx); + + req = next; + } +} + +static void aio_put_req(struct kioctx *ctx, struct kiocb *req) +{ + int put = 1; + dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n", + req, atomic_read(&req->filp->f_count)); + + /* FIXME: use cmpxchg instead of spin_lock? */ + spin_lock_irq(&ctx->lock); + req->cancel = NULL; + + /* Must be done under the lock to serialise against cancellations */ + if (unlikely(atomic_dec_and_test(&req->filp->f_count))) { + spin_lock(&fput_lock); + req->user_obj = fput_iocbs; + fput_iocbs = req; + spin_unlock(&fput_lock); + put = 0; + } else { + req->filp = NULL; + req->user_obj = ctx->free_req; + ctx->free_req = req; + } + spin_unlock_irq(&ctx->lock); + + if (put) + put_ioctx(ctx); + else + schedule_task(&fput_tqueue); +} + +/* Lookup an ioctx id. ioctx_list is lockless for reads. + * FIXME: this is O(n) and is only suitable for development. + */ +static inline struct kioctx *lookup_ioctx(unsigned long ctx_id) +{ + struct kioctx *ioctx; + struct mm_struct *mm = current->mm; + + read_lock(&aio_req_lock); + for (ioctx = ioctx_list; ioctx; ioctx = ioctx->next) + if (ioctx->user_id == ctx_id && + ioctx->mm == mm && + !ioctx->dead) + break; + read_unlock(&aio_req_lock); + + if (likely(ioctx)) + get_ioctx(ioctx); + + return ioctx; +} + +/* aio_complete + * Called when the io request on the given iocb is complete. + */ +void aio_complete(struct kiocb *iocb, long res, long res2) +{ + struct kioctx *ctx = iocb->ctx; + struct aio_ring *ring = ctx->ring; + struct io_event *event; + unsigned long flags; + unsigned long tail; + + /* add a completion event to the ring buffer. + * must be done holding ctx->lock to prevent + * other code from messing with the tail + * pointer since we might be called from irq + * context. 
+ */ + spin_lock_irqsave(&ctx->lock, flags); + + tail = ring->tail; + event = &ring->io_events[tail]; + tail = (tail + 1) & ring->mask; + + event->obj = (u64)(unsigned long)iocb->user_obj; + event->data = iocb->user_data; + event->res = res; + event->res2 = res2; + + dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", + ctx, tail, iocb, iocb->user_obj, iocb->user_data, res, res2); + + /* after flagging the request as done, we + * must never even look at it again + */ + barrier(); + + ring->tail = tail; + + wmb(); + if (!ring->woke) + ring->woke = 1; + + spin_unlock_irqrestore(&ctx->lock, flags); + + pr_debug("added to ring %p at [%lu]\n", iocb, tail); +#if 0 + if (!wake) { + printk("kio_complete: should send user of %p a signal...\n", ctx); + } +#endif + + wake_up(&ctx->wait); + + /* everything turned out well, dispose of the aiocb. */ + aio_put_req(ctx, iocb); +} + +/* aio_read_evt + * Pull an event off of the ioctx's event ring. + * FIXME: make this use cmpxchg. + * TODO: make the ringbuffer user mmap()able (requires FIXME). + */ +static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) +{ + struct aio_ring *ring = ioctx->ring; + unsigned long head; + int ret = -EAGAIN; + + dprintk("in aio_read_evt h%lu t%lu m%lu\n", + (unsigned long)ring->head, (unsigned long)ring->tail, + (unsigned long)ring->mask); + barrier(); + if (ring->head == ring->tail) + goto out; + + spin_lock(&aio_read_lock); /* investigate the value of making this per-ctx */ + + head = ring->head; + if (head != ring->tail) { + *ent = ring->io_events[head]; + head = (head + 1) & ioctx->ring_mask; + barrier(); + ring->head = head; + ret = 0; + } + spin_unlock(&aio_read_lock); + +out: + dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, + (unsigned long)ring->head, (unsigned long)ring->tail); + return ret; +} + +struct timeout { + struct timer_list timer; + int timed_out; + wait_queue_head_t wait; +}; + +static void timeout_func(unsigned long data) +{ + struct timeout *to = (struct timeout *)data; + + to->timed_out = 1; + wake_up(&to->wait); +} + +static inline void init_timeout(struct timeout *to) +{ + init_timer(&to->timer); + to->timer.data = (unsigned long)to; + to->timer.function = timeout_func; + to->timed_out = 0; + init_waitqueue_head(&to->wait); +} + +static inline void set_timeout(struct timeout *to, struct timespec *ts) +{ + unsigned long how_long; + + if (!ts->tv_sec && !ts->tv_nsec) { + to->timed_out = 1; + return; + } + + how_long = ts->tv_sec * HZ; +#define HZ_NS (1000000000 / HZ) + how_long += (ts->tv_nsec + HZ_NS - 1) / HZ_NS; + + to->timer.expires = jiffies + how_long; + add_timer(&to->timer); +} + +static inline void clear_timeout(struct timeout *to) +{ + del_timer_sync(&to->timer); +} + +static int read_events(struct kioctx *ctx, int nr, struct io_event *event, + struct timespec *timeout) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + DECLARE_WAITQUEUE(to_wait, tsk); + int ret; + int i = 0; + struct io_event ent; + struct timespec ts; + struct timeout to; + + init_timeout(&to); + + if (timeout) { + ret = -EFAULT; + if (copy_from_user(&ts, timeout, sizeof(ts))) + goto out; + + set_timeout(&to, &ts); + if (to.timed_out) + timeout = 0; + } + + memset(&ent, 0, sizeof(ent)); + ret = 0; + + while (i < nr) { + ret = aio_read_evt(ctx, &ent); + if (ret) { + if (i) + break; + + ret = 0; + if (!timeout) + break; + + add_wait_queue(&ctx->wait, &wait); + add_wait_queue(&to.wait, &to_wait); + do { + set_task_state(tsk, TASK_INTERRUPTIBLE); + + ret = aio_read_evt(ctx, &ent); + if 
(!ret) + break; + ret = -ETIMEDOUT; + if (to.timed_out) + break; + schedule(); + if (to.timed_out) + break; + if (signal_pending(tsk)) { + ret = -EINTR; + break; + } + ret = aio_read_evt(ctx, &ent); + } while (ret) ; + + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + remove_wait_queue(&to.wait, &to_wait); + } + + if (ret) + break; + + dprintk("read event: %Lx %Lx %Lx %Lx\n", + ent.data, ent.obj, ent.res, ent.res2); + + /* FIXME: split checks in two */ + ret = -EFAULT; + if (copy_to_user(event, &ent, sizeof(ent))) { + printk(KERN_DEBUG "aio: lost an event due to EFAULT.\n"); + break; + } + + /* Now complete the aio request and copy the result codes to userland. */ + event ++; + i ++; + } + + if (timeout) + clear_timeout(&to); +out: + return i ? i : ret; +} + +asmlinkage long sys___io_setup(unsigned nr_reqs, aio_context_t *ctxp) +{ + struct kioctx *ioctx = NULL; + unsigned long ctx; + long ret; + + ret = get_user(ctx, ctxp); + if (ret) + goto out; + + ret = -EINVAL; + if (ctx || nr_reqs > max_aio_reqs) { + pr_debug("EINVAL: io_setup: !ctx or nr_reqs > max\n"); + goto out; + } + + ioctx = ioctx_alloc(nr_reqs); + ret = PTR_ERR(ioctx); + if (!IS_ERR(ioctx)) { + ret = put_user(ioctx->user_id, ctxp); + if (!ret) + return 0; + put_ioctx(ioctx); + } + +out: + return ret; +} + +/* aio_release + * Release the kioctx associated with the userspace handle. + */ +asmlinkage long sys___io_destroy(aio_context_t ctx) +{ + struct kioctx *ioctx = lookup_ioctx(ctx); + if (ioctx) { + ioctx->dead = 1; + dprintk("aio_release(%p)\n", ioctx); + put_ioctx(ioctx); + return 0; + } + pr_debug("EINVAL: io_destroy: invalid context id\n"); + return -EINVAL; +} + +/* sys___io_submit + * Copy an aiocb from userspace into kernel space, then convert it to + * a kiocb, submit and repeat until done. Error codes on copy/submit + * only get returned for the first aiocb copied as otherwise the size + * of aiocbs copied is returned (standard write sematics). + */ +asmlinkage long sys___io_submit(aio_context_t ctx_id, int nr, struct iocb **iocbpp) +{ + struct kioctx *ctx; + long ret = 0; + int i; + + ctx = lookup_ioctx(ctx_id); + if (!ctx) { + pr_debug("EINVAL: io_submit: invalid context id\n"); + return -EINVAL; + } + + for (i=0; ifilp = file; + tmp.aio_key = req - ctx->reqs; + ret = put_user(tmp.aio_key, &iocbp->aio_key); + if (ret) + goto out_put_req; + + req->user_obj = iocbp; + req->user_data = tmp.aio_data; + + switch (tmp.aio_lio_opcode) { + case IOCB_CMD_PREAD: op = file->f_op->aio_read; break; + case IOCB_CMD_PREADX: op = file->f_op->aio_readx; break; + case IOCB_CMD_PWRITE: op = file->f_op->aio_write; break; + case IOCB_CMD_FSYNC: op = file->f_op->aio_fsync; break; + default: op = NULL; break; + } + ret = -EINVAL; + if (!op) { + pr_debug("EINVAL: io_submit: no operation provided\n"); + goto out_put_req; + } + + ret = op(file, req, tmp); + if (!ret) + continue; + + if (ret != 0) { + if (-EEOF == ret) + ret = 0; + aio_complete(req, ret, 0); + continue; + } + + pr_debug("io_submit: op returned %ld\n", ret); + + out_put_req: + aio_put_req(ctx, req); + break; + out_fput: + fput(file); + break; + } + + put_ioctx(ctx); + run_task_queue(&tq_disk); + return i ? i : ret; +} + +static inline void generic_aio_complete_rw(int rw, void *_iocb, struct kvec *vec, ssize_t res) +{ + struct kiocb *iocb = _iocb; + ssize_t total = iocb->nr_atomic; + + if (res > 0) + total += res; + + aio_complete(iocb, total ? 
total : res, 0); + unmap_kvec(vec, rw); + free_kvec(vec); +} + +void generic_aio_complete_read(void *_iocb, struct kvec *vec, ssize_t res) +{ + generic_aio_complete_rw(1, _iocb, vec, res); +} + +void generic_aio_complete_write(void *_iocb, struct kvec *vec, ssize_t res) +{ + generic_aio_complete_rw(0, _iocb, vec, res); +} + +ssize_t generic_aio_read(struct file *file, struct kiocb *req, struct iocb iocb, size_t min_size) +{ + unsigned long buf = iocb.aio_buf; + size_t size = iocb.aio_nbytes; + ssize_t nr_read = 0; + loff_t pos = iocb.aio_offset; + kvec_cb_t cb; + + if (file->f_op->new_read) { + nr_read = file->f_op->new_read(file, (void *)buf, size, + &pos, F_ATOMIC); + dprintk("from new_read: nr_read: %ld\n", (long)nr_read); + if (-EAGAIN == nr_read) + nr_read = 0; + if ((nr_read >= min_size) || (nr_read < 0)) { + dprintk("returning nr_read: %ld\n", (long)nr_read); + return nr_read; + } + } + dprintk("nr_read: %ld\n", (long)nr_read); + + req->nr_atomic = nr_read; + size -= nr_read; + buf += nr_read; + cb.vec = map_user_kvec(READ, buf, size); + cb.fn = generic_aio_complete_read; + cb.data = req; + + dprintk("generic_aio_read: cb.vec=%p\n", cb.vec); + if (IS_ERR(cb.vec)) + return nr_read ? nr_read : PTR_ERR(cb.vec); + + return file->f_op->kvec_read(file, cb, size, pos); +} + +ssize_t generic_file_aio_read(struct file *file, struct kiocb *req, struct iocb iocb) +{ + return generic_aio_read(file, req, iocb, iocb.aio_nbytes); +} + +ssize_t generic_sock_aio_read(struct file *file, struct kiocb *req, struct iocb iocb) +{ + return generic_aio_read(file, req, iocb, 1); +} + +ssize_t generic_aio_write(struct file *file, struct kiocb *req, struct iocb iocb, size_t min_size) +{ + unsigned long buf = iocb.aio_buf; + size_t size = iocb.aio_nbytes; + loff_t pos = iocb.aio_offset; + ssize_t nr_written = 0; + kvec_cb_t cb; + long res; + + if (file->f_op->new_write) { + nr_written = file->f_op->new_write(file, (void *)buf, size, + &pos, F_ATOMIC); + pr_debug("generic_aio_write: new_write: %ld\n", (long)nr_written); + if (-EAGAIN == nr_written) + nr_written = 0; + if ((nr_written >= min_size) || (nr_written < 0)) + return nr_written; + } + + req->nr_atomic = nr_written; + size -= nr_written; + buf += nr_written; + cb.vec = map_user_kvec(WRITE, buf, size); + cb.fn = generic_aio_complete_write; + cb.data = req; + + if (IS_ERR(cb.vec)) { + pr_debug("generic_aio_write: map_user_kvec: %ld\n", PTR_ERR(cb.vec)); + return nr_written ? nr_written : PTR_ERR(cb.vec); + } + + res = file->f_op->kvec_write(file, cb, size, iocb.aio_offset); + pr_debug("generic_aio_write: kvec_write: %ld\n", res); + if (res < 0) { + if (nr_written) + res = nr_written; + } + return res; +} + +ssize_t generic_file_aio_write(struct file *file, struct kiocb *req, struct iocb iocb) +{ + return generic_aio_write(file, req, iocb, iocb.aio_nbytes); +} + +asmlinkage long sys___io_cancel(aio_context_t ctx, struct iocb *iocb) +{ + return -ENOSYS; +} + +asmlinkage long sys___io_wait(aio_context_t ctx_id, struct iocb *iocb, struct timespec *timeout) +{ +#if 0 /* FIXME. later. 
*/ + struct kioctx *ioctx; + long ret = -EINVAL; + unsigned key; + long obj = (long)iocb; + + ioctx = lookup_ioctx(ctx_id); + if (!ioctx) + goto out; + + ret = get_user(key, &iocb->aio_key); + if (ret) + goto out; + + ret = __aio_complete(ioctx, key, obj, !!timeout); + put_ioctx(ioctx); + +out: + return ret; +#endif + return -ENOSYS; +} + +asmlinkage long sys___io_getevents(int ctx_id, int nr, struct io_event *events, + struct timespec *timeout) +{ + struct kioctx *ioctx = lookup_ioctx(ctx_id); + long ret = -EINVAL; + + if (ioctx) { + ret = read_events(ioctx, nr, events, timeout); + put_ioctx(ioctx); + } + + return ret; +} + +__initcall(aio_setup); diff --exclude=net -urN v2.4.13-ac6/fs/buffer.c aio-v2.4.13-ac6.diff/fs/buffer.c --- v2.4.13-ac6/fs/buffer.c Fri Nov 2 12:56:10 2001 +++ aio-v2.4.13-ac6.diff/fs/buffer.c Wed Nov 14 17:36:10 2001 @@ -141,8 +141,7 @@ { clear_bit(BH_Lock, &bh->b_state); smp_mb__after_clear_bit(); - if (waitqueue_active(&bh->b_wait)) - wake_up(&bh->b_wait); + wake_up(&bh->b_wait); } /* @@ -2088,6 +2087,7 @@ return tmp.b_blocknr; } +#if 1 /* * IO completion routine for a buffer_head being used for kiobuf IO: we * can't dispatch the kiobuf callback until io_count reaches 0. @@ -2264,6 +2264,7 @@ return transferred; return err; } +#endif /* * Start I/O on a page. @@ -2895,3 +2896,222 @@ module_init(bdflush_init) +/* async kio interface */ +struct brw_cb { + kvec_cb_t cb; + atomic_t io_count; + int nr; + struct buffer_head *bh[1]; +}; + +static inline void brw_cb_put(struct brw_cb *brw_cb) +{ + if (atomic_dec_and_test(&brw_cb->io_count)) { + ssize_t res = 0, err = 0; + int nr; + + /* Walk the buffer heads associated with this kiobuf + * checking for errors and freeing them as we go. + */ + for (nr=0; nr < brw_cb->nr; nr++) { + struct buffer_head *bh = brw_cb->bh[nr]; + if (!err && buffer_uptodate(bh)) + res += bh->b_size; + else + err = -EIO; + kmem_cache_free(bh_cachep, bh); + } + + if (!res) + res = err; + + brw_cb->cb.fn(brw_cb->cb.data, brw_cb->cb.vec, res); + + kfree(brw_cb); + } +} + +/* + * IO completion routine for a buffer_head being used for kiobuf IO: we + * can't dispatch the kiobuf callback until io_count reaches 0. + */ + +static void end_buffer_io_kiobuf_async(struct buffer_head *bh, int uptodate) +{ + struct brw_cb *brw_cb; + + mark_buffer_uptodate(bh, uptodate); + + brw_cb = bh->b_private; + unlock_buffer(bh); + + brw_cb_put(brw_cb); +} + + +/* + * Start I/O on a physical range of kernel memory, defined by a vector + * of kiobuf structs (much like a user-space iovec list). + * + * The kiobuf must already be locked for IO. IO is submitted + * asynchronously: you need to check page->locked, page->uptodate, and + * maybe wait on page->wait. + * + * It is up to the caller to make sure that there are enough blocks + * passed in to completely map the iobufs to disk. 
+ */
+
+int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned blocks, unsigned long blknr, int sector_shift)
+{
+	struct kvec	*vec = cb.vec;
+	struct kveclet	*veclet;
+	int		err;
+	int		length;
+	unsigned	sector_size = 1 << sector_shift;
+	int		i;
+
+	struct brw_cb	*brw_cb;
+
+	if (!vec->nr)
+		BUG();
+
+	/*
+	 * First, do some alignment and validity checks
+	 */
+	length = 0;
+	for (veclet=vec->veclet, i=0; i < vec->nr; i++,veclet++) {
+		length += veclet->length;
+		if ((veclet->offset & (sector_size-1)) ||
+		    (veclet->length & (sector_size-1))) {
+			printk("brw_kiovec_async: tuple[%d]->offset=0x%x length=0x%x sector_size: 0x%x\n", i, veclet->offset, veclet->length, sector_size);
+			return -EINVAL;
+		}
+	}
+
+	if (length < (blocks << sector_shift))
+		BUG();
+
+	/*
+	 * OK to walk down the iovec doing page IO on each page we find.
+	 */
+	err = 0;
+
+	if (!blocks) {
+		printk("brw_kiovec_async: !i\n");
+		return -EINVAL;
+	}
+
+	/* FIXME: tie into userbeans here */
+	brw_cb = kmalloc(sizeof(*brw_cb) + (blocks * sizeof(struct buffer_head *)), GFP_KERNEL);
+	if (!brw_cb)
+		return -ENOMEM;
+
+	brw_cb->cb = cb;
+	brw_cb->nr = 0;
+
+	/* This is ugly.  FIXME. */
+	for (i=0, veclet=vec->veclet; i < vec->nr; i++,veclet++) {
+		struct page *page = veclet->page;
+		unsigned offset = veclet->offset;
+		unsigned length = veclet->length;
+
+		if (!page)
+			BUG();
+
+		while (length > 0) {
+			struct buffer_head *tmp;
+			tmp = kmem_cache_alloc(bh_cachep, GFP_NOIO);
+			err = -ENOMEM;
+			if (!tmp)
+				goto error;
+
+			memset(tmp, 0, sizeof(*tmp));
+			init_waitqueue_head(&tmp->b_wait);
+			tmp->b_dev = B_FREE;
+			tmp->b_size = sector_size;
+			set_bh_page(tmp, page, offset);
+			tmp->b_this_page = tmp;
+
+			init_buffer(tmp, end_buffer_io_kiobuf_async, NULL);
+			tmp->b_dev = dev;
+			tmp->b_blocknr = blknr++;
+			tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock)
+				     | (1 << BH_Req);
+			tmp->b_private = brw_cb;
+
+			if (rw == WRITE) {
+				set_bit(BH_Uptodate, &tmp->b_state);
+				clear_bit(BH_Dirty, &tmp->b_state);
+			}
+
+			brw_cb->bh[brw_cb->nr++] = tmp;
+			length -= sector_size;
+			offset += sector_size;
+
+			if (offset >= PAGE_SIZE) {
+				offset = 0;
+				break;
+			}
+
+			if (brw_cb->nr >= blocks)
+				goto submit;
+		} /* End of block loop */
+	} /* End of page loop */
+
+submit:
+	atomic_set(&brw_cb->io_count, brw_cb->nr+1);
+	/* okay, we've setup all our io requests, now fire them off! */
+	for (i=0; i < brw_cb->nr; i++)
+		submit_bh(rw, brw_cb->bh[i]);
+	brw_cb_put(brw_cb);
+
+	return 0;
+
+error:
+	/* Walk brw_cb_table freeing all the goop associated with each kiobuf */
+	if (brw_cb) {
+		/* We got an error allocating the bh'es.  Just free the current
+		   buffer_heads and exit. */
+		for (i = brw_cb->nr-1; i--; )
+			kmem_cache_free(bh_cachep, brw_cb->bh[i]);
+		kfree(brw_cb);
+	}
+
+	return err;
+}
+#if 0
+int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
+	       kdev_t dev, int nr_blocks, unsigned long b[], int sector_size)
+{
+	int i;
+	int transferred = 0;
+	int err = 0;
+
+	if (!nr)
+		return 0;
+
+	/* queue up and trigger the io */
+	err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size);
+	if (err)
+		goto out;
+
+	/* wait on the last iovec first -- it's more likely to finish last */
+	for (i=nr; --i >= 0; )
+		kiobuf_wait_for_io(iovec[i]);
+
+	run_task_queue(&tq_disk);
+
+	/* okay, how much data actually got through? */
+	for (i = 0; i < nr; i++) {
+		if (iovec[i]->errno) {
+			if (!err)
+				err = iovec[i]->errno;
+			break;
+		}
+		transferred += iovec[i]->length;
+	}
+
+out:
+	return transferred ?
transferred : err; +} +#endif diff --exclude=net -urN v2.4.13-ac6/fs/ext2/file.c aio-v2.4.13-ac6.diff/fs/ext2/file.c --- v2.4.13-ac6/fs/ext2/file.c Thu Nov 1 16:40:02 2001 +++ aio-v2.4.13-ac6.diff/fs/ext2/file.c Fri Nov 2 13:21:13 2001 @@ -41,12 +41,17 @@ struct file_operations ext2_file_operations = { llseek: generic_file_llseek, read: generic_file_read, + new_read: generic_file_new_read, write: generic_file_write, ioctl: ext2_ioctl, mmap: generic_file_mmap, open: generic_file_open, release: ext2_release_file, fsync: ext2_sync_file, + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, + kvec_read: generic_file_kvec_read, + kvec_write: generic_file_kvec_write, }; struct inode_operations ext2_file_inode_operations = { diff --exclude=net -urN v2.4.13-ac6/fs/file_table.c aio-v2.4.13-ac6.diff/fs/file_table.c --- v2.4.13-ac6/fs/file_table.c Fri Nov 2 12:56:10 2001 +++ aio-v2.4.13-ac6.diff/fs/file_table.c Tue Nov 13 20:27:36 2001 @@ -98,27 +98,31 @@ void fput(struct file * file) { + if (atomic_dec_and_test(&file->f_count)) + __fput(file); +} + +void __fput(struct file * file) +{ struct dentry * dentry = file->f_dentry; struct vfsmount * mnt = file->f_vfsmnt; struct inode * inode = dentry->d_inode; - if (atomic_dec_and_test(&file->f_count)) { - locks_remove_flock(file); - if (file->f_op && file->f_op->release) - file->f_op->release(inode, file); - fops_put(file->f_op); - if (file->f_mode & FMODE_WRITE) - put_write_access(inode); - file_list_lock(); - file->f_dentry = NULL; - file->f_vfsmnt = NULL; - list_del(&file->f_list); - list_add(&file->f_list, &free_list); - files_stat.nr_free_files++; - file_list_unlock(); - dput(dentry); - mntput(mnt); - } + locks_remove_flock(file); + if (file->f_op && file->f_op->release) + file->f_op->release(inode, file); + fops_put(file->f_op); + if (file->f_mode & FMODE_WRITE) + put_write_access(inode); + file_list_lock(); + file->f_dentry = NULL; + file->f_vfsmnt = NULL; + list_del(&file->f_list); + list_add(&file->f_list, &free_list); + files_stat.nr_free_files++; + file_list_unlock(); + dput(dentry); + mntput(mnt); } struct file * fget(unsigned int fd) diff --exclude=net -urN v2.4.13-ac6/fs/nfs/file.c aio-v2.4.13-ac6.diff/fs/nfs/file.c --- v2.4.13-ac6/fs/nfs/file.c Mon Sep 24 02:16:04 2001 +++ aio-v2.4.13-ac6.diff/fs/nfs/file.c Fri Nov 2 13:21:13 2001 @@ -50,6 +50,7 @@ release: nfs_release, fsync: nfs_fsync, lock: nfs_lock, + //rw_kiovec: generic_file_rw_kiovec, }; struct inode_operations nfs_file_inode_operations = { diff --exclude=net -urN v2.4.13-ac6/fs/select.c aio-v2.4.13-ac6.diff/fs/select.c --- v2.4.13-ac6/fs/select.c Mon Sep 24 02:16:05 2001 +++ aio-v2.4.13-ac6.diff/fs/select.c Fri Nov 2 13:21:13 2001 @@ -12,23 +12,31 @@ * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). + * June 2001 + * Added async_poll implementation. 
-ben */ +#include #include #include #include #include /* for STICKY_TIMEOUTS */ #include +#include +#include #include #define ROUND_UP(x,y) (((x)+(y)-1)/(y)) #define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM) +static kmem_cache_t *poll_table_cache; + struct poll_table_entry { - struct file * filp; - wait_queue_t wait; - wait_queue_head_t * wait_address; + wait_queue_t wait; + wait_queue_head_t *wait_address; + struct file *filp; + poll_table *p; }; struct poll_table_page { @@ -72,6 +80,72 @@ } } +void async_poll_complete(void *data) +{ + poll_table *p = data, *pwait; + struct kiocb *iocb = p->iocb; + unsigned int mask; + + pwait = p; + p->wake = 0; + wmb(); + do { + mask = iocb->filp->f_op->poll(iocb->filp, p); + mask &= p->events | POLLERR | POLLHUP; + if (mask) { + poll_freewait(p); + aio_complete(iocb, mask, 0); + return; + } + p->sync = 0; + wmb(); + } while (p->wake); + +} + +static void async_poll_waiter(wait_queue_t *wait) +{ + struct poll_table_entry *entry = (struct poll_table_entry *)wait; + poll_table *p = entry->p; + + /* avoid writes to the cacheline if possible for SMP */ + if (!p->wake) { + p->wake = 1; + /* ensure only one wake up queues the wtd */ + if (!p->sync && !test_and_set_bit(0, &p->sync)) + wtd_queue(&p->wtd); + } +} + +int async_poll(struct kiocb *iocb, int events) +{ + unsigned int mask; + poll_table *p, *pwait; + + p = kmem_cache_alloc(poll_table_cache, SLAB_KERNEL); + if (!p) + return -ENOMEM; + + poll_initwait(p); + wtd_set_action(&p->wtd, async_poll_complete, p); + p->iocb = iocb; + p->wake = 0; + p->sync = 0; + p->events = events; + pwait = p; + + mask = DEFAULT_POLLMASK; + if (iocb->filp->f_op && iocb->filp->f_op->poll) + mask = iocb->filp->f_op->poll(iocb->filp, p); + mask &= events | POLLERR | POLLHUP; + if (mask) { + poll_freewait(p); + aio_complete(iocb, mask, 0); + } + + return 0; +} + void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { struct poll_table_page *table = p->table; @@ -98,7 +172,11 @@ get_file(filp); entry->filp = filp; entry->wait_address = wait_address; - init_waitqueue_entry(&entry->wait, current); + entry->p = p; + if (p->iocb) + init_waitqueue_func_entry(&entry->wait, async_poll_waiter); + else + init_waitqueue_entry(&entry->wait, current); add_wait_queue(wait_address,&entry->wait); } } @@ -494,3 +572,14 @@ poll_freewait(&table); return err; } + +static int __init poll_init(void) +{ + poll_table_cache = kmem_cache_create("poll table", + sizeof(poll_table), 0, 0, NULL, NULL); + if (!poll_table_cache) + panic("unable to alloc poll_table_cache"); + return 0; +} + +module_init(poll_init); diff --exclude=net -urN v2.4.13-ac6/include/asm-i386/errno.h aio-v2.4.13-ac6.diff/include/asm-i386/errno.h --- v2.4.13-ac6/include/asm-i386/errno.h Mon Feb 26 10:20:14 2001 +++ aio-v2.4.13-ac6.diff/include/asm-i386/errno.h Tue Nov 6 20:31:42 2001 @@ -128,5 +128,7 @@ #define ENOMEDIUM 123 /* No medium found */ #define EMEDIUMTYPE 124 /* Wrong medium type */ +#define ENOAIO 125 /* fd does not support aio */ +#define EEOF 126 /* in kernel only: end of file */ #endif diff --exclude=net -urN v2.4.13-ac6/include/asm-i386/resource.h aio-v2.4.13-ac6.diff/include/asm-i386/resource.h --- v2.4.13-ac6/include/asm-i386/resource.h Fri Sep 22 17:21:19 2000 +++ aio-v2.4.13-ac6.diff/include/asm-i386/resource.h Fri Nov 2 13:21:13 2001 @@ -16,8 +16,9 @@ #define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ #define RLIMIT_AS 9 /* address space limit */ #define RLIMIT_LOCKS 10 /* maximum file locks held */ 
+#define RLIMIT_AIO 11 -#define RLIM_NLIMITS 11 +#define RLIM_NLIMITS 12 /* * SuS says limits have to be unsigned. @@ -40,6 +41,7 @@ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ + { INR_AIO, INR_AIO }, \ } #endif /* __KERNEL__ */ diff --exclude=net -urN v2.4.13-ac6/include/asm-i386/unistd.h aio-v2.4.13-ac6.diff/include/asm-i386/unistd.h --- v2.4.13-ac6/include/asm-i386/unistd.h Thu Nov 1 16:40:03 2001 +++ aio-v2.4.13-ac6.diff/include/asm-i386/unistd.h Fri Nov 2 13:52:27 2001 @@ -231,8 +231,18 @@ #define __NR_gettid 224 #define __NR_readahead 225 -/* user-visible error numbers are in the range -1 - -124: see */ +/* gap here for now */ +#define __NR___io_setup 244 +#define __NR___io_destroy 245 +#define __NR___io_submit 246 +#define __NR___io_cancel 247 +#define __NR___io_wait 248 +#define __NR___io_getevents 249 +/* user-visible error numbers are in the range -1 - -124: see */ +#ifdef NO_SYSCALL_ERRNO +#define __syscall_return(type, res) return (type)(res) +#else #define __syscall_return(type, res) \ do { \ if ((unsigned long)(res) >= (unsigned long)(-125)) { \ @@ -241,6 +251,7 @@ } \ return (type) (res); \ } while (0) +#endif /* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */ #define _syscall0(type,name) \ diff --exclude=net -urN v2.4.13-ac6/include/asm-ia64/resource.h aio-v2.4.13-ac6.diff/include/asm-ia64/resource.h --- v2.4.13-ac6/include/asm-ia64/resource.h Fri Sep 22 17:21:19 2000 +++ aio-v2.4.13-ac6.diff/include/asm-ia64/resource.h Fri Nov 2 13:21:13 2001 @@ -19,8 +19,9 @@ #define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ #define RLIMIT_AS 9 /* address space limit */ #define RLIMIT_LOCKS 10 /* maximum file locks held */ +#define RLIMIT_AIO 11 -#define RLIM_NLIMITS 11 +#define RLIM_NLIMITS 12 /* * SuS says limits have to be unsigned. @@ -39,10 +40,11 @@ { 0, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { 0, 0 }, \ - { INR_OPEN, INR_OPEN }, \ + { INR_OPEN, INR_OPEN }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ + { INR_AIO, INR_AIO }, \ } # endif /* __KERNEL__ */ diff --exclude=net -urN v2.4.13-ac6/include/asm-ia64/unistd.h aio-v2.4.13-ac6.diff/include/asm-ia64/unistd.h --- v2.4.13-ac6/include/asm-ia64/unistd.h Mon Aug 13 15:12:08 2001 +++ aio-v2.4.13-ac6.diff/include/asm-ia64/unistd.h Fri Nov 2 13:21:13 2001 @@ -6,6 +6,7 @@ * * Copyright (C) 1998-2000 Hewlett-Packard Co * Copyright (C) 1998-2000 David Mosberger-Tang + * Portions Copyright (C) 2001 Red Hat, Inc. */ #include @@ -206,6 +207,13 @@ #define __NR_getdents64 1214 #define __NR_getunwind 1215 +#define __NR___io_setup 1224 /* aio - create a context */ +#define __NR___io_destroy 1225 /* aio - destroy a context */ +#define __NR___io_submit 1226 /* aio - submit a list of ios */ +#define __NR___io_cancel 1227 /* aio - cancel a specific ios */ +#define __NR___io_wait 1228 /* aio - wait for a specific ios */ +#define __NR___io_getevents 1229 /* aio - retrieve events */ + #if !defined(__ASSEMBLY__) && !defined(ASSEMBLER) extern long __ia64_syscall (long a0, long a1, long a2, long a3, long a4, long nr); diff --exclude=net -urN v2.4.13-ac6/include/linux/aio.h aio-v2.4.13-ac6.diff/include/linux/aio.h --- v2.4.13-ac6/include/linux/aio.h Wed Dec 31 19:00:00 1969 +++ aio-v2.4.13-ac6.diff/include/linux/aio.h Thu Nov 15 21:28:29 2001 @@ -0,0 +1,149 @@ +/* linux/aio.h + * + * Copyright 2000,2001 Red Hat. 
+ * + * Written by Benjamin LaHaise + * + * Permission to use, copy, modify, and distribute this software and its + * documentation is hereby granted, provided that the above copyright + * notice appears in all copies. This software is provided without any + * warranty, express or implied. Red Hat makes no representations about + * the suitability of this software for any purpose. + * + * IN NO EVENT SHALL RED HAT BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, + * SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF + * THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF RED HAT HAS BEEN ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * RED HAT DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND + * RED HAT HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, + * ENHANCEMENTS, OR MODIFICATIONS. + */ +#ifndef __LINUX__AIO_H +#define __LINUX__AIO_H + +#include + +typedef unsigned long aio_context_t; + +enum { + IOCB_CMD_PREAD = 0, + IOCB_CMD_PWRITE = 1, + IOCB_CMD_FSYNC = 2, + IOCB_CMD_FDSYNC = 3, + IOCB_CMD_PREADX = 4, +}; + +/* read() from /dev/aio returns these structures. */ +struct io_event { + __u64 data; /* the data field from the iocb */ + __u64 obj; /* what iocb this event came from */ + __s64 res; /* result code for this event */ + __s64 res2; /* secondary result */ +}; + +#if defined(__LITTLE_ENDIAN) +#define PADDED(x,y) x, y +#elif defined(__BIG_ENDIAN) +#define PADDED(x,y) y, x +#else +#error edit for your odd byteorder. +#endif + +struct aio_ring { + __u32 PADDED(id, pad1); /* kernel internal index number */ + __u32 PADDED(mask, pad2); /* number of io_events - 1 */ + __u32 PADDED(head, pad3); + __u32 PADDED(tail, pad4); + + __u32 PADDED(woke, pad5); /* set when a wakeup was sent */ + + __u32 pad6[22]; /* pad out to 128 bytes */ + + struct io_event io_events[0]; +}; /* 128 bytes + ring size */ + +/* + * we always use a 64bit off_t when communicating + * with userland. its up to libraries to do the + * proper padding and aio_error abstraction + */ + +struct iocb { + /* these are internal to the kernel/libc. 
*/ + __u64 aio_data; /* data to be returned in event's data */ + __u32 PADDED(aio_key, aio_reserved1); + /* the kernel sets aio_key to the req # */ + + /* common fields */ + __u16 aio_lio_opcode; /* see IOCB_CMD_ above */ + __s16 aio_reqprio; + __u32 aio_fildes; + + __u64 aio_buf; + __u64 aio_nbytes; + __s64 aio_offset; + + /* extra parameters */ + __u64 aio_reserved2; + __u64 aio_reserved3; +}; /* 64 bytes */ + +#undef IFBIG +#undef IFLITTLE + +#ifdef __KERNEL__ +#ifndef __LINUX__KIOVEC_H +#include +#endif +#include + +#define AIO_MAXSEGS 4 +#define AIO_KIOGRP_NR_ATOMIC 8 + +struct kioctx; + +struct kiocb { + void (*cancel)(void *data, struct kioctx *ctx, int idx); + struct file *filp; + struct kioctx *ctx; + void *user_obj; + __u64 user_data; + ssize_t nr_atomic; +}; + +struct kioctx { + atomic_t users; + int dead; + + /* This needs improving */ + unsigned long user_id; + struct kioctx *next; + struct mm_struct *mm; + + wait_queue_head_t wait; + + spinlock_t lock; + + struct kiocb *reqs; + struct kiocb *free_req; + + unsigned max_reqs; + unsigned ring_mask; + struct aio_ring *ring; +}; + +extern struct file_operations aio_fops; + +extern void aio_complete(struct kiocb *iocb, long res, long res2); +extern void __put_ioctx(struct kioctx *ctx); + +#define get_ioctx(kioctx) atomic_inc(&(kioctx)->users) +#define put_ioctx(kioctx) do { if (atomic_dec_and_test(&(kioctx)->users)) __put_ioctx(kioctx); } while (0) + +#endif /*__KERNEL__*/ + +#endif /* __AIO_H__ */ + diff --exclude=net -urN v2.4.13-ac6/include/linux/brlock.h aio-v2.4.13-ac6.diff/include/linux/brlock.h --- v2.4.13-ac6/include/linux/brlock.h Tue Nov 13 00:26:54 2001 +++ aio-v2.4.13-ac6.diff/include/linux/brlock.h Tue Nov 13 20:23:53 2001 @@ -34,6 +34,7 @@ enum brlock_indices { BR_GLOBALIRQ_LOCK, BR_NETPROTO_LOCK, + BR_AIO_LOCK, __BR_END }; diff --exclude=net -urN v2.4.13-ac6/include/linux/file.h aio-v2.4.13-ac6.diff/include/linux/file.h --- v2.4.13-ac6/include/linux/file.h Wed Aug 23 14:22:26 2000 +++ aio-v2.4.13-ac6.diff/include/linux/file.h Tue Nov 13 20:14:24 2001 @@ -6,6 +6,7 @@ #define __LINUX_FILE_H extern void FASTCALL(fput(struct file *)); +extern void FASTCALL(__fput(struct file *)); extern struct file * FASTCALL(fget(unsigned int fd)); static inline int get_close_on_exec(unsigned int fd) diff --exclude=net -urN v2.4.13-ac6/include/linux/fs.h aio-v2.4.13-ac6.diff/include/linux/fs.h --- v2.4.13-ac6/include/linux/fs.h Fri Nov 2 12:56:13 2001 +++ aio-v2.4.13-ac6.diff/include/linux/fs.h Thu Nov 15 21:28:39 2001 @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -42,6 +41,7 @@ #undef NR_OPEN #define NR_OPEN (1024*1024) /* Absolute upper limit on fd num */ #define INR_OPEN 1024 /* Initial setting for nfile rlimits */ +#define INR_AIO 2048 /* initial limit on number of aio requests */ #define BLOCK_SIZE_BITS 10 #define BLOCK_SIZE (1< /* FIXME */ +#include + +#define F_ATOMIC 0x0001 +#define F_OFFSETOK 0x0002 + struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); @@ -832,6 +846,20 @@ ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + + + /* this will replace read/write ops above in 2.5 */ + ssize_t (*new_read) (struct file *, char *, size_t, loff_t *, int); + ssize_t (*new_write) (struct file *, const char *, size_t, loff_t *, int); + + ssize_t 
(*aio_read)(struct file *, struct kiocb *, struct iocb); + ssize_t (*aio_readx)(struct file *, struct kiocb *, struct iocb); + ssize_t (*aio_write)(struct file *, struct kiocb *, struct iocb); + ssize_t (*aio_fsync)(struct file *, struct kiocb *, struct iocb); + + /* in-kernel async api */ + int (*kvec_read)(struct file *, kvec_cb_t, size_t, loff_t); + int (*kvec_write)(struct file *, kvec_cb_t, size_t, loff_t); }; struct inode_operations { @@ -1422,6 +1450,13 @@ unsigned long *); extern int block_sync_page(struct page *); +extern int generic_aio_read(struct file *, struct kiocb *, struct iocb, size_t); +extern int generic_aio_write(struct file *, struct kiocb *, struct iocb, size_t); +extern int generic_sock_aio_read(struct file *, struct kiocb *, struct iocb); +extern int generic_file_aio_read(struct file *, struct kiocb *, struct iocb); +extern int generic_file_aio_write(struct file *, struct kiocb *, struct iocb); +extern int generic_file_kvec_read(struct file *, kvec_cb_t, size_t, loff_t); +extern int generic_file_kvec_write(struct file *, kvec_cb_t, size_t, loff_t); int generic_block_bmap(struct address_space *, long, get_block_t *); int generic_commit_write(struct file *, struct page *, unsigned, unsigned); int block_truncate_page(struct address_space *, loff_t, get_block_t *); @@ -1430,8 +1465,10 @@ extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *); +extern ssize_t generic_file_new_read(struct file *, char *, size_t, loff_t *, int); extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *); -extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t); +extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t, int); +extern int generic_file_rw_kiovec(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos); extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *); extern loff_t generic_file_llseek(struct file *, loff_t, int); diff --exclude=net -urN v2.4.13-ac6/include/linux/iobuf.h aio-v2.4.13-ac6.diff/include/linux/iobuf.h --- v2.4.13-ac6/include/linux/iobuf.h Fri Nov 2 12:56:13 2001 +++ aio-v2.4.13-ac6.diff/include/linux/iobuf.h Thu Nov 15 21:28:41 2001 @@ -53,8 +53,10 @@ /* Dynamic state for IO completion: */ atomic_t io_count; /* IOs still in progress */ + int transferred; /* Number of bytes of completed IO at the beginning of the buffer */ int errno; /* Status of completed IO */ void (*end_io) (struct kiobuf *); /* Completion callback */ + void *end_io_data; wait_queue_head_t wait_queue; }; @@ -80,6 +82,8 @@ /* fs/buffer.c */ +int brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int size); int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], kdev_t dev, unsigned long b[], int size); diff --exclude=net -urN v2.4.13-ac6/include/linux/kiovec.h aio-v2.4.13-ac6.diff/include/linux/kiovec.h --- v2.4.13-ac6/include/linux/kiovec.h Wed Dec 31 19:00:00 1969 +++ aio-v2.4.13-ac6.diff/include/linux/kiovec.h Thu Nov 15 19:27:01 2001 @@ -0,0 +1,107 @@ +#ifndef __LINUX__IOBUF_H +#define __LINUX__IOBUF_H + +struct page; +struct list; + +struct kveclet { + struct page *page; + unsigned offset; + unsigned length; +}; + +struct kvec { + unsigned max_nr; + unsigned nr; + struct kveclet 
veclet[0]; +}; + +struct kvec_cb { + struct kvec *vec; + void (*fn)(void *data, struct kvec *vec, ssize_t res); + void *data; +}; + +struct kvec_cb_list { + struct list_head list; + struct kvec_cb cb; +}; + +#ifndef _LINUX_TYPES_H +#include +#endif +#ifndef _LINUX_KDEV_T_H +#include +#endif +#ifndef _ASM_KMAP_TYPES_H +#include +#endif + +extern struct kvec *map_user_kvec(int rw, unsigned long va, size_t len); +extern void unmap_kvec(struct kvec *, int dirtied); +extern void free_kvec(struct kvec *); + +/* brw_kvec_async: + * Performs direct io to/from disk into cb.vec. Count is the number + * of sectors to read, sector_shift is the blocksize (which must be + * compatible with the kernel's current idea of the device's sector + * size) in log2. blknr is the starting sector offset on dev. + * + */ +extern int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned count, + unsigned long blknr, int sector_shift); + +/* Memory copy helpers usage: + * void foo(... struct kveclet *veclet...) + * + * struct kvec_dst dst; + * + * kvec_dst_map(&dst, veclet, KM_USER0) + * for (...) + * memcpy_to_kvec_dst(&dst, data, size); -- each copy appends + * kvec_dst_unmap(&dst); + * + * Note that scheduling is not permitted between kvec_dst_map() and + * kvec_dst_unmap(). This is because internally the routines make use + * of an atomic kmap. + */ +struct kvec_dst { + char *addr; + char *dst; + struct kveclet *let; + int space; + int offset; + enum km_type type; +}; + + +#define kvec_dst_map(Xdst, Xlet) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + struct kveclet *_let = (Xlet); \ + _dst->dst = _dst->addr = kmap_atomic(_let->page, _dst->type);\ + _dst->dst += _let->offset + _dst->offset; \ + _dst->space = _let->length; \ + _dst->offset = 0; \ + } while(0) + +#define kvec_dst_init(Xdst, Xlet, Xtype) \ + do { \ + (Xdst)->offset = 0; \ + (Xdst)->type = Xtype; \ + kvec_dst_map(Xdst, Xlet); \ + } while(0) + +#define kvec_dst_unmap(Xdst) \ + do { \ + kunmap_atomic((Xdst)->addr, (Xdst)->type); \ + (Xdst)->offset = (Xdst)->dst - (Xdst)->addr; \ + (Xdst)->offset -= (Xdst)->let->offset; \ + } while(0) + +extern void FASTCALL(memcpy_to_kvec_dst(struct kvec_dst *dst, + const char *from, long len)); +extern void FASTCALL(memcpy_from_kvec_dst(char *to, + struct kvec_dst *from, long len)); + +#endif diff --exclude=net -urN v2.4.13-ac6/include/linux/mm.h aio-v2.4.13-ac6.diff/include/linux/mm.h --- v2.4.13-ac6/include/linux/mm.h Fri Nov 2 12:56:13 2001 +++ aio-v2.4.13-ac6.diff/include/linux/mm.h Thu Nov 15 21:28:39 2001 @@ -321,8 +321,7 @@ smp_mb__before_clear_bit(); \ if (!test_and_clear_bit(PG_locked, &(page)->flags)) BUG(); \ smp_mb__after_clear_bit(); \ - if (waitqueue_active(&(page)->wait)) \ - wake_up(&(page)->wait); \ + wake_up(&(page)->wait); \ } while (0) #define PageError(page) test_bit(PG_error, &(page)->flags) #define SetPageError(page) set_bit(PG_error, &(page)->flags) diff --exclude=net -urN v2.4.13-ac6/include/linux/net.h aio-v2.4.13-ac6.diff/include/linux/net.h --- v2.4.13-ac6/include/linux/net.h Tue Nov 13 00:19:34 2001 +++ aio-v2.4.13-ac6.diff/include/linux/net.h Thu Nov 15 21:28:38 2001 @@ -83,6 +83,9 @@ struct scm_cookie; struct vm_area_struct; struct page; +struct iocb; +struct kioctx; +#include /* shut gcc up */ struct proto_ops { int family; @@ -110,6 +113,8 @@ int (*recvmsg) (struct socket *sock, struct msghdr *m, int total_len, int flags, struct scm_cookie *scm); int (*mmap) (struct file *file, struct socket *sock, struct vm_area_struct * vma); ssize_t (*sendpage) (struct socket *sock, struct page 
*page, int offset, size_t size, int flags); + int (*kvec_read) (struct socket *sock, kvec_cb_t cb, size_t size); + int (*kvec_write) (struct socket *sock, kvec_cb_t cb, size_t size); }; struct net_proto_family diff --exclude=net -urN v2.4.13-ac6/include/linux/poll.h aio-v2.4.13-ac6.diff/include/linux/poll.h --- v2.4.13-ac6/include/linux/poll.h Tue Nov 13 15:20:32 2001 +++ aio-v2.4.13-ac6.diff/include/linux/poll.h Thu Nov 15 21:28:41 2001 @@ -7,14 +7,25 @@ #include #include +#ifndef __LINUX__MM_H #include +#endif #include +#ifndef __LINUX__WORKTODO_H +#include +#endif struct poll_table_page; +struct kiocb; typedef struct poll_table_struct { - int error; - struct poll_table_page * table; + struct worktodo wtd; + int error; + struct poll_table_page *table; + struct kiocb *iocb; /* iocb for async poll */ + int events; /* event mask for async poll */ + int wake; + long sync; } poll_table; extern void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p); @@ -29,7 +40,9 @@ { pt->error = 0; pt->table = NULL; + pt->iocb = NULL; } + extern void poll_freewait(poll_table* pt); diff --exclude=net -urN v2.4.13-ac6/include/linux/sched.h aio-v2.4.13-ac6.diff/include/linux/sched.h --- v2.4.13-ac6/include/linux/sched.h Fri Nov 2 12:56:13 2001 +++ aio-v2.4.13-ac6.diff/include/linux/sched.h Thu Nov 15 21:28:39 2001 @@ -271,6 +271,7 @@ atomic_t __count; /* reference count */ atomic_t processes; /* How many processes does this user have? */ atomic_t files; /* How many open files does this user have? */ + atomic_t aio_reqs; /* How many aio requests does the user have? */ /* Hash table maintenance information */ struct user_struct *next, **pprev; @@ -779,6 +780,7 @@ extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); +extern void FASTCALL(add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); #define __wait_event(wq, condition) \ diff --exclude=net -urN v2.4.13-ac6/include/linux/skbuff.h aio-v2.4.13-ac6.diff/include/linux/skbuff.h --- v2.4.13-ac6/include/linux/skbuff.h Mon Nov 12 20:26:06 2001 +++ aio-v2.4.13-ac6.diff/include/linux/skbuff.h Thu Nov 15 21:28:41 2001 @@ -1126,6 +1126,15 @@ extern unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len, unsigned int csum); extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); +/* skb <-> kvec helpers */ +extern void skb_copy_datagram_kvec(const struct sk_buff *skb, int offset, + struct kvec *vec, int len); +extern int skb_copy_and_csum_datagram_kvec(const struct sk_buff *skb, + int offset, struct kvec *vec, int len); +extern int skb_kvec_recv_datagram(struct sock * sk, kvec_cb_t cb, int len, + void (*finish)(struct sock *sk, kvec_cb_t cb, int len, struct sk_buff *skb)); + + extern void skb_init(void); extern void skb_add_mtu(int mtu); diff --exclude=net -urN v2.4.13-ac6/include/linux/tqueue.h aio-v2.4.13-ac6.diff/include/linux/tqueue.h --- v2.4.13-ac6/include/linux/tqueue.h Tue Nov 13 00:19:34 2001 +++ aio-v2.4.13-ac6.diff/include/linux/tqueue.h Tue Nov 13 20:23:41 2001 @@ -67,6 +67,7 @@ #define TQ_ACTIVE(q) (!list_empty(&q)) extern task_queue tq_timer, tq_immediate, tq_disk; +extern struct tq_struct run_disk_tq; /* * To implement your own list of active bottom halfs, use the following diff --exclude=net -urN v2.4.13-ac6/include/linux/types.h 
aio-v2.4.13-ac6.diff/include/linux/types.h --- v2.4.13-ac6/include/linux/types.h Mon Nov 12 23:45:28 2001 +++ aio-v2.4.13-ac6.diff/include/linux/types.h Tue Nov 13 20:23:41 2001 @@ -127,4 +127,9 @@ char f_fpack[6]; }; +/* kernel typedefs -- they belong here. */ +#ifdef __KERNEL__ +typedef struct kvec_cb kvec_cb_t; +#endif /* __KERNEL__ */ + #endif /* _LINUX_TYPES_H */ diff --exclude=net -urN v2.4.13-ac6/include/linux/wait.h aio-v2.4.13-ac6.diff/include/linux/wait.h --- v2.4.13-ac6/include/linux/wait.h Tue Nov 13 00:19:34 2001 +++ aio-v2.4.13-ac6.diff/include/linux/wait.h Thu Nov 15 17:57:13 2001 @@ -28,17 +28,20 @@ #define WAITQUEUE_DEBUG 0 #endif +typedef struct __wait_queue wait_queue_t; +typedef void (*wait_queue_func_t)(wait_queue_t *wait); + struct __wait_queue { unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 struct task_struct * task; struct list_head task_list; + wait_queue_func_t func; #if WAITQUEUE_DEBUG long __magic; long __waker; #endif }; -typedef struct __wait_queue wait_queue_t; /* * 'dual' spinlock architecture. Can be switched between spinlock_t and @@ -137,6 +140,7 @@ #endif #define __WAITQUEUE_INITIALIZER(name, tsk) { \ + func: NULL, \ task: tsk, \ task_list: { NULL, NULL }, \ __WAITQUEUE_DEBUG_INIT(name)} @@ -174,6 +178,22 @@ #endif q->flags = 0; q->task = p; + q->func = NULL; +#if WAITQUEUE_DEBUG + q->__magic = (long)&q->__magic; +#endif +} + +static inline void init_waitqueue_func_entry(wait_queue_t *q, + wait_queue_func_t func) +{ +#if WAITQUEUE_DEBUG + if (!q || !p) + WQ_BUG(); +#endif + q->flags = 0; + q->task = NULL; + q->func = func; #if WAITQUEUE_DEBUG q->__magic = (long)&q->__magic; #endif @@ -231,6 +251,20 @@ list_del(&old->task_list); } +#define add_wait_queue_cond(q, wait, cond) \ + ({ \ + unsigned long flags; \ + int _raced = 0; \ + wq_write_lock_irqsave(&(q)->lock, flags); \ + (wait)->flags = 0; \ + if (cond) \ + __add_wait_queue((q), (wait)); \ + else \ + _raced = 1; \ + wq_write_unlock_irqrestore(&(q)->lock, flags); \ + _raced; \ + }) + #endif /* __KERNEL__ */ #endif diff --exclude=net -urN v2.4.13-ac6/include/linux/worktodo.h aio-v2.4.13-ac6.diff/include/linux/worktodo.h --- v2.4.13-ac6/include/linux/worktodo.h Wed Dec 31 19:00:00 1969 +++ aio-v2.4.13-ac6.diff/include/linux/worktodo.h Thu Nov 15 18:13:14 2001 @@ -0,0 +1,76 @@ +/* + * Written by Benjamin LaHaise. + * + * Copyright 2000-2001 Red Hat, Inc. + * + * #include "gpl.h" + * + * Basic design idea from Jeff Merkey. + * Stack based on ideas from Ingo Molnar. 
+ */ +#ifndef __LINUX__WORKTODO_H +#define __LINUX__WORKTODO_H + +#ifndef _LINUX_WAIT_H +#include +#endif +#ifndef _LINUX_TQUEUE_H +#include +#endif + +struct wtd_stack { + void (*fn)(void *data); + void *data; +}; + +struct worktodo { + wait_queue_t wait; + struct tq_struct tq; + + void *data; /* for use by the wtd_ primatives */ + + int sp; + struct wtd_stack stack[3]; +}; + +/* FIXME NOTE: factor from kernel/context.c */ +#define wtd_init(wtd, routine) do { \ + INIT_TQUEUE(&(wtd)->tq, (routine), (wtd)); \ + (wtd)->data = 0; \ + (wtd)->sp = 0; \ +} while (0) + +#define wtd_queue(wtd) schedule_task(&(wtd)->tq) + +#define wtd_push(wtd, action, wtddata) \ +do { \ + (wtd)->stack[(wtd)->sp].fn = (wtd)->tq.routine; \ + (wtd)->stack[(wtd)->sp++].data = (wtd)->tq.data;\ + (wtd)->tq.routine = action; \ + (wtd)->tq.data = wtddata; \ +} while (0) + +static inline void wtd_pop(struct worktodo *wtd) +{ + if (wtd->sp) { + wtd->sp--; + wtd->tq.routine = wtd->stack[wtd->sp].fn; + wtd->tq.data = wtd->stack[wtd->sp].data; + } +} + +#define wtd_set_action(wtd, action, wtddata) INIT_TQUEUE(&(wtd)->tq, action, wtddata) + +struct page; +extern void wtd_wait_page(struct worktodo *wtd, struct page *page); +extern void wtd_lock_page(struct worktodo *wtd, struct page *page); +struct buffer_head; +extern void wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh); + +#if 0 /* not implemented yet */ +extern void wtd_down(struct worktodo *wtd, struct semaphore *sem); +extern void wtd_down_write(struct worktodo *wtd, struct rw_semaphore *sem); +extern void wtd_down_read(struct worktodo *wtd, struct rw_semaphore *sem); +#endif + +#endif /* __LINUX__WORKTODO_H */ diff --exclude=net -urN v2.4.13-ac6/kernel/context.c aio-v2.4.13-ac6.diff/kernel/context.c --- v2.4.13-ac6/kernel/context.c Thu Nov 1 16:40:03 2001 +++ aio-v2.4.13-ac6.diff/kernel/context.c Fri Nov 2 13:21:13 2001 @@ -94,12 +94,18 @@ */ for (;;) { set_task_state(curtask, TASK_INTERRUPTIBLE); - add_wait_queue(&context_task_wq, &wait); - if (TQ_ACTIVE(tq_context)) + add_wait_queue_exclusive_lifo(&context_task_wq, &wait); + if (spin_is_locked(&tqueue_lock) || TQ_ACTIVE(tq_context)) set_task_state(curtask, TASK_RUNNING); - schedule(); + else + schedule(); remove_wait_queue(&context_task_wq, &wait); run_task_queue(&tq_context); + while (TQ_ACTIVE(tq_context)) { + if (current->need_resched) + schedule(); + run_task_queue(&tq_context); + } wake_up(&context_task_done); if (signal_pending(curtask)) { while (waitpid(-1, (unsigned int *)0, __WALL|WNOHANG) > 0) diff --exclude=net -urN v2.4.13-ac6/kernel/fork.c aio-v2.4.13-ac6.diff/kernel/fork.c --- v2.4.13-ac6/kernel/fork.c Fri Nov 2 12:56:13 2001 +++ aio-v2.4.13-ac6.diff/kernel/fork.c Fri Nov 2 13:21:13 2001 @@ -46,6 +46,16 @@ wq_write_unlock_irqrestore(&q->lock, flags); } +void add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait) +{ + unsigned long flags; + + wq_write_lock_irqsave(&q->lock, flags); + wait->flags = WQ_FLAG_EXCLUSIVE; + __add_wait_queue(q, wait); + wq_write_unlock_irqrestore(&q->lock, flags); +} + void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait) { unsigned long flags; diff --exclude=net -urN v2.4.13-ac6/kernel/ksyms.c aio-v2.4.13-ac6.diff/kernel/ksyms.c --- v2.4.13-ac6/kernel/ksyms.c Fri Nov 2 12:56:13 2001 +++ aio-v2.4.13-ac6.diff/kernel/ksyms.c Fri Nov 2 13:21:13 2001 @@ -213,6 +213,11 @@ EXPORT_SYMBOL(do_generic_file_read); EXPORT_SYMBOL(generic_file_write); EXPORT_SYMBOL(generic_file_mmap); +EXPORT_SYMBOL(generic_file_new_read); 
+EXPORT_SYMBOL(generic_file_aio_read); +EXPORT_SYMBOL(generic_file_aio_write); +EXPORT_SYMBOL(generic_file_kvec_read); +EXPORT_SYMBOL(generic_file_kvec_write); EXPORT_SYMBOL(generic_ro_fops); EXPORT_SYMBOL(generic_buffer_fdatasync); EXPORT_SYMBOL(page_hash_bits); diff --exclude=net -urN v2.4.13-ac6/kernel/sched.c aio-v2.4.13-ac6.diff/kernel/sched.c --- v2.4.13-ac6/kernel/sched.c Fri Nov 2 12:56:13 2001 +++ aio-v2.4.13-ac6.diff/kernel/sched.c Fri Nov 2 13:21:13 2001 @@ -714,13 +714,13 @@ } /* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything - * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the - * non-exclusive tasks and one exclusive task. + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small + * +ve number) then we wake all the non-exclusive tasks and one exclusive task. * * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero - * in this (rare) case, and we handle it by contonuing to scan the queue. + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by contonuing to scan the queue. */ static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode, int nr_exclusive, const int sync) @@ -733,14 +733,25 @@ list_for_each(tmp,&q->task_list) { unsigned int state; - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + wait_queue_func_t func; CHECK_MAGIC(curr->__magic); + func = curr->func; + if (func) { + unsigned flags = curr->flags; + func(curr); + if ((flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + continue; + } p = curr->task; state = p->state; if (state & mode) { WQ_NOTE_WAKER(curr); - if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + if (try_to_wake_up(p, sync) && + (curr->flags & WQ_FLAG_EXCLUSIVE) && + !--nr_exclusive) break; } } diff --exclude=net -urN v2.4.13-ac6/kernel/softirq.c aio-v2.4.13-ac6.diff/kernel/softirq.c --- v2.4.13-ac6/kernel/softirq.c Mon Sep 24 02:16:05 2001 +++ aio-v2.4.13-ac6.diff/kernel/softirq.c Fri Nov 2 13:21:13 2001 @@ -354,6 +354,7 @@ data = p->data; wmb(); p->sync = 0; + smp_mb(); if (f) f(data); } diff --exclude=net -urN v2.4.13-ac6/kernel/user.c aio-v2.4.13-ac6.diff/kernel/user.c --- v2.4.13-ac6/kernel/user.c Wed Nov 29 01:43:39 2000 +++ aio-v2.4.13-ac6.diff/kernel/user.c Fri Nov 2 13:21:13 2001 @@ -29,7 +29,8 @@ struct user_struct root_user = { __count: ATOMIC_INIT(1), processes: ATOMIC_INIT(1), - files: ATOMIC_INIT(0) + files: ATOMIC_INIT(0), + aio_reqs: ATOMIC_INIT(0), }; /* Binary files v2.4.13-ac6/mm/.filemap.c.swp and aio-v2.4.13-ac6.diff/mm/.filemap.c.swp differ diff --exclude=net -urN v2.4.13-ac6/mm/filemap.c aio-v2.4.13-ac6.diff/mm/filemap.c --- v2.4.13-ac6/mm/filemap.c Fri Nov 2 12:56:14 2001 +++ aio-v2.4.13-ac6.diff/mm/filemap.c Thu Nov 15 15:12:41 2001 @@ -22,12 +22,14 @@ #include #include #include +#include #include #include #include #include +#include /* * Shared mappings implemented 30.11.1994. 
It's not fully working yet, @@ -934,7 +936,7 @@ static void generic_file_readahead(int reada_ok, struct file * filp, struct inode * inode, - struct page * page) + struct page * page, int flags) { unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; unsigned long index = page->index; @@ -1049,7 +1051,7 @@ * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. */ -void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor) +void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor, int flags) { struct inode *inode = filp->f_dentry->d_inode; struct address_space *mapping = inode->i_mapping; @@ -1107,13 +1109,17 @@ unsigned long end_index, nr, ret; end_index = inode->i_size >> PAGE_CACHE_SHIFT; - if (index > end_index) + if (index > end_index) { + desc->error = -EEOF; break; + } nr = PAGE_CACHE_SIZE; if (index == end_index) { nr = inode->i_size & ~PAGE_CACHE_MASK; - if (nr <= offset) + if (nr <= offset) { + desc->error = -EEOF; break; + } } nr = nr - offset; @@ -1134,7 +1140,7 @@ if (!Page_Uptodate(page)) goto page_not_up_to_date; - generic_file_readahead(reada_ok, filp, inode, page); + generic_file_readahead(reada_ok, filp, inode, page, flags); page_ok: /* If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing @@ -1167,13 +1173,24 @@ * Ok, the page was not immediately readable, so let's try to read ahead while we're at it.. */ page_not_up_to_date: - generic_file_readahead(reada_ok, filp, inode, page); + generic_file_readahead(reada_ok, filp, inode, page, flags); if (Page_Uptodate(page)) goto page_ok; /* Get exclusive access to the page ... */ - lock_page(page); + if (flags & F_ATOMIC) { + if (TryLockPage(page)) { + if (Page_Uptodate(page)) + goto page_ok; + printk("page_not_up_to_date: -EAGAIN\n"); + desc->error = -EAGAIN; + page_cache_release(page); + break; + } + printk("page_not_up_to_date: atomic trylock succeeded\n"); + } else + lock_page(page); /* Did it get unhashed before we got the lock? */ if (!page->mapping) { @@ -1197,11 +1214,12 @@ goto page_ok; /* Again, try some read-ahead while waiting for the page to finish.. */ - generic_file_readahead(reada_ok, filp, inode, page); - wait_on_page(page); + generic_file_readahead(reada_ok, filp, inode, page, flags); + if (!(flags & F_ATOMIC)) + wait_on_page(page); if (Page_Uptodate(page)) goto page_ok; - error = -EIO; + error = (flags & F_ATOMIC) ? -EAGAIN : -EIO; } /* UHHUH! A synchronous read error occurred. Report it */ @@ -1278,7 +1296,7 @@ * This is the "read()" routine for all filesystems * that can use the page cache directly. 
*/ -ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos) +ssize_t generic_file_new_read(struct file * filp, char * buf, size_t count, loff_t *ppos, int flags) { ssize_t retval; @@ -1293,16 +1311,25 @@ desc.count = count; desc.buf = buf; desc.error = 0; - do_generic_file_read(filp, ppos, &desc, file_read_actor); + do_generic_file_read(filp, ppos, &desc, + file_read_actor, flags); retval = desc.written; - if (!retval) + if (!retval) { retval = desc.error; + if (retval == -EEOF && !(flags & F_ATOMIC)) + retval = 0; + } } } return retval; } +ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos) +{ + return generic_file_new_read(filp, buf, count, ppos, 0); +} + static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size) { ssize_t written; @@ -1396,7 +1423,7 @@ desc.count = count; desc.buf = (char *) out_file; desc.error = 0; - do_generic_file_read(in_file, ppos, &desc, file_send_actor); + do_generic_file_read(in_file, ppos, &desc, file_send_actor, 0); retval = desc.written; if (!retval) @@ -2789,3 +2816,712 @@ panic("Failed to allocate page hash table\n"); memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *)); } + +/* address_space_map + * Maps a series of pages from the page cache into the given array. + */ +static int address_space_map(struct address_space *as, unsigned long index, + int nr, struct page **pages, + int *nr_newp, struct page **new_pages) +{ + struct page *cached_page = NULL; + int nr_new = 0; + int ret; + + ret = -EINVAL; + if (nr <= 0) + goto out; + + ret = 0; + + spin_lock(&pagecache_lock); + + while (nr > 0) { + struct page **hash = page_hash(as, index); + struct page *page; + + page = __find_page_nolock(as, index, *hash); + if (page) { + page_cache_get(page); +got_page: + pages[ret++] = page; + index++; + nr--; + continue; + } + + if (cached_page) { + __add_to_page_cache(cached_page, as, index, hash); + nr_new++; + *new_pages++ = page = cached_page; + cached_page = NULL; + goto got_page; + } + spin_unlock(&pagecache_lock); + + cached_page = page_cache_alloc(as); + if (!cached_page) + goto out; + + /* Okay, we now have an allocated page. Retry + * the search and add. */ + spin_lock(&pagecache_lock); + } + + spin_unlock(&pagecache_lock); + +out: + if (cached_page) + page_cache_free(cached_page); + + *nr_newp = nr_new; + return ret ? ret : -ENOMEM; +} + +struct iodesc { + struct worktodo wtd; + + struct page *good_page; /* the highest Uptodate page */ + int good_idx; + int err; + int did_read; + int rw; + + struct page **pages; + struct page **new_pages; + struct page **cur_pagep; + int nr_pages; + int nr_new_pages; + + struct address_space *as; + struct file *file; + kvec_cb_t cb; + + size_t size; + unsigned long transferred; + unsigned offset; + struct kveclet *veclet; + + int sync; + +#define READDESC_NR_DEF 3 + struct page *def_pages[READDESC_NR_DEF]; + struct page *def_new_pages[READDESC_NR_DEF]; +}; + +static void __iodesc_free(struct iodesc *io, int unlock) +{ + kvec_cb_t cb; + ssize_t res; + + if (unlock) { + unsigned i; + for (i=0; inr_pages; i++) { + struct page *page = io->pages[i]; + UnlockPage(page); + deactivate_page(page); + page_cache_release(page); + } + } else { + unsigned i; + for (i=0; inr_pages; i++) + page_cache_release(io->pages[i]); + } + + if (io->new_pages != io->def_new_pages) + kfree(io->new_pages); + if (io->pages != io->def_pages) + kfree(io->pages); + + cb = io->cb; + res = io->transferred ? 
io->transferred : io->err; + kfree(io); + + cb.fn(cb.data, cb.vec, res); +} + +/* By the time this function is called, all of the pages prior to + * the current good_idx have been released appropriately. The remaining + * duties are to release any remaining pages and to honour O_SYNC. + */ +static void __iodesc_finish_write(struct iodesc *io) +{ + pr_debug("__iodesc_finish_write(%p)\n", io); + + __iodesc_free(io, WRITE == io->rw); +} + +/* This is mostly ripped from generic_file_write */ +static int __iodesc_write_page(struct iodesc *io, struct page *page) +{ + unsigned long bytes; + unsigned long offset, src_offset; + struct page *src_page; + long status; + char *kaddr; + int src_bytes; + char *src; + int done = 0; + unsigned left; + + src_page = io->veclet->page; + src_bytes = io->veclet->length; + src_offset = io->veclet->offset; + src = kmap(src_page) + src_offset; + + offset = io->offset; + kaddr = kmap(page); + kaddr += offset; + + bytes = PAGE_CACHE_SIZE - offset; + if (io->size < bytes) + bytes = io->size; + + pr_debug("__iodesc_write_page(%p (%lu), %lu %lu %lu)\n", page, page->index, offset, bytes, src_offset); + + io->err = io->as->a_ops->prepare_write(io->file, page, + offset, offset + bytes); + if (io->err) { + printk("prepare_write: %d\n", io->err); + goto unlock; + } + + left = bytes; + for (;;) { + unsigned this = src_bytes; + if (left < this) + this = left; + + memcpy(kaddr, src, this); + kaddr += this; + src += this; + left -= this; + src_bytes -= this; + src_offset += this; + + if (left <= 0) + break; + + if (!src_bytes) { + io->veclet++; + kunmap(src_page); + src_page = io->veclet->page; + src_bytes = io->veclet->length; + src_offset = io->veclet->offset; + src = kmap(src_page) + src_offset; + } + } + flush_dcache_page(page); + status = io->as->a_ops->commit_write(io->file, page, + offset, offset+bytes); + + /* We don't handle short writes */ + if (status > 0 && status != bytes) + done = 1; + + if (!status) + status = bytes; + else + printk("commit_write: %ld\n", status); + + if (status > 0) { + io->transferred += status; + io->size -= status; + io->offset = (offset + status) & (PAGE_CACHE_SIZE - 1); + + if (io->offset) + done = 1; + + src_offset += status; + src_offset &= PAGE_CACHE_SIZE - 1; + } else { + io->err = status; + done = 1; + } + +unlock: + kunmap(page); + kunmap(src_page); + + //UnlockPage(page); + //deactivate_page(page); + //page_cache_release(page); + + return done; +} + +void __iodesc_sync_wait_page(void *data) +{ + struct iodesc *io = data; + + do { + struct buffer_head *bh, *head = io->pages[io->good_idx]->buffers; + + if (!head) + continue; + + bh = head; + do { + if (buffer_locked(bh)) { + pr_debug("waiting on bh=%pi io=%p\n", bh, io); + wtd_wait_on_buffer(&io->wtd, bh); + return; + } + if (buffer_req(bh) && !buffer_uptodate(bh)) { + pr_debug("io err bh=%p (%p)\n", bh, io); + io->err = -EIO; + break; + } + } while ((bh = bh->b_this_page) != head); + } while (!io->err && ++io->good_idx < io->nr_pages) ; + + pr_debug("finish_write(%p)\n", io); + __iodesc_finish_write(io); +} + +static void __iodesc_do_write(void *data) +{ + struct iodesc *io = data; + unsigned i; + + up(&io->file->f_dentry->d_inode->i_sem); + + for (i=0; inr_pages; i++) + if (__iodesc_write_page(io, io->pages[i])) + break; + + if (io->sync) { + io->good_idx = 0; + + pr_debug("writing out pages(%p)\n", io); + for (i=0; inr_pages; i++) { + if (io->pages[i]->buffers) + writeout_one_page(io->pages[i]); + } + + pr_debug("calling __iodesc_sync_wait_page(%p)\n", io); + 
wtd_set_action(&io->wtd, __iodesc_sync_wait_page, io); + __iodesc_sync_wait_page(io); + return; + } + + __iodesc_finish_write(io); +} + +static void __iodesc_write_lock_next_page(void *data) +{ + struct iodesc *io = data; + pr_debug("__iodesc_write_next_page(%p)\n", io); + + while (io->good_idx < io->nr_pages) { + io->good_page = io->pages[io->good_idx++]; + if (io->good_page == *io->cur_pagep) + io->cur_pagep++; + else { + wtd_lock_page(&io->wtd, io->good_page); + return; + } + } + + //Is this faster? __iodesc_do_write(io); + wtd_set_action(&io->wtd, __iodesc_do_write, io); + wtd_queue(&io->wtd); +} + +static void __generic_file_write_iodesc(struct iodesc *io) +{ + struct inode *inode = io->file->f_dentry->d_inode; + time_t now = CURRENT_TIME; + + remove_suid(inode); + if (inode->i_ctime != now || inode->i_mtime != now) { + inode->i_ctime = inode->i_mtime = now; + mark_inode_dirty_sync(inode); + } + + wtd_set_action(&io->wtd, __iodesc_write_lock_next_page, io); + io->sync = !!(io->file->f_flags & O_SYNC); + io->good_idx = 0; + io->cur_pagep = io->new_pages; + __iodesc_write_lock_next_page(io); +} + +static void __iodesc_read_finish(struct iodesc *io) +{ + struct page **src_pagep; + char *dst_addr, *src_addr; + int src_off; + size_t size; + size_t valid; + + struct kveclet *veclet = io->veclet; + struct page *dst_page = veclet->page; + int dst_len = veclet->length; + int dst_off = veclet->offset; + + + pr_debug("__iodesc_read_finish: good_idx = %d\n", io->good_idx); + if (io->good_idx <= 0) + goto no_data; + + size = io->size; + src_off = io->offset; + src_pagep = io->pages; + src_addr = kmap(*src_pagep); + + valid = (size_t)io->good_idx << PAGE_CACHE_SHIFT; + valid -= src_off; + pr_debug("size=%d valid=%d src_off=%d\n", size, valid, src_off); + + if (valid < size) + size = valid; + + dst_addr = kmap(veclet->page); + + while (size > 0) { + int this = PAGE_CACHE_SIZE - src_off; + if ((PAGE_SIZE - dst_off) < this) + this = PAGE_SIZE - dst_off; + if (size < this) + this = size; + pr_debug("this=%d src_off=%d dst_off=%d dst_len=%d\n", + this, src_off, dst_off, dst_len); + memcpy(dst_addr + dst_off, src_addr + src_off, this); + + src_off += this; + dst_off += this; + dst_len -= this; + size -= this; + io->transferred += this; + pr_debug("read_finish: this=%d transferred=%d\n", + this, io->transferred); + + if (size <= 0) + break; + + if (dst_len <= 0) { + kunmap(dst_page); + veclet++; + dst_page = veclet->page; + dst_off = veclet->offset; + dst_len = veclet->length; + dst_addr = kmap(dst_page); + } + + if (src_off >= PAGE_SIZE) { /* FIXME: PAGE_CACHE_SIZE */ + kunmap(*src_pagep); + pr_debug("page(%lu)->count = %d\n", + (*src_pagep)->index, + atomic_read(&(*src_pagep)->count)); + src_pagep++; + src_addr = kmap(*src_pagep); + src_off = 0; + } + } + kunmap(dst_page); + kunmap(*src_pagep); +no_data: + __iodesc_free(io, 0); +} + +static void __iodesc_make_uptodate(void *data) +{ + struct iodesc *io = data; + struct page *page = io->good_page; + int locked = 1; + + pr_debug("__iodesc_make_uptodate: io=%p index=%lu\n", io, page->index); + while (Page_Uptodate(page)) { +again: + pr_debug("page index %lu uptodate\n", page->index); + if (locked) { + UnlockPage(page); + locked = 0; + } + io->did_read = 0; + io->good_idx++; + if (io->good_idx >= io->nr_pages) { + __iodesc_read_finish(io); + return; + } + page = io->good_page = io->pages[io->good_idx]; + pr_debug("__iodesc_make_uptodate: index=%lu\n", page->index); + } + + if (!locked) { + wtd_lock_page(&io->wtd, page); + return; + } + + if (!io->did_read) 
{ + /* We haven't tried reading this page before, give it a go. */ + printk("attempting to read %lu\n", page->index); + io->did_read = 1; + io->err = page->mapping->a_ops->readpage(io->file, page); + if (!io->err) { + if (Page_Uptodate(page)) + goto again; + wtd_lock_page(&io->wtd, page); + return; + } + } + + if (locked) + UnlockPage(page); + + /* We've already read this page before. Set err to EIO and quite */ + if (!io->err) + io->err = -EIO; + __iodesc_read_finish(io); +} + +static void __wtdgeneric_file_read_iodesc(void *data); + +static void __generic_file_read_iodesc(struct iodesc *io, int mayblock) +{ + int (*readpage)(struct file *, struct page *); + int i; + + wtd_set_action(&io->wtd, __iodesc_make_uptodate, io); + readpage = io->as->a_ops->readpage; + for (i=0; inr_new_pages; i++) { + int ret; + if (!mayblock) { + static int zoo; if (zoo++ < 5) printk("read sleep\n"); + wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io); + wtd_queue(&io->wtd); + return; + } + ret = readpage(io->file, io->new_pages[i]); + if (ret) + printk(KERN_DEBUG "__generic_file_read_kiovec: readpage(%lu) = %d\n", io->new_pages[i]->index, ret); + } + + for (i=0; inr_pages; i++) { + struct page *page = io->pages[i]; + if (Page_Uptodate(page)) { + pr_debug("__generic_file_read_iodesc: %lu is uptodate\n", page->index); + continue; + } + + if (!mayblock) { + static int zoo; if (zoo++ < 5) printk("read sleep\n"); + wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io); + wtd_queue(&io->wtd); + return; + } + if (!TryLockPage(page)) { + int ret = readpage(io->file, page); + if (ret) + printk(KERN_DEBUG "__generic_file_read_iodesc: readpage(%lu): %d\n", page->index, ret); + } + + if (!Page_Uptodate(page) && io->good_idx == -1) { + pr_debug("first good_idx=%d (%lu)\n", i, page->index); + io->good_idx = i; + io->good_page = page; + } + } + + /* Whee, all the pages are uptodate! 
*/ + if (!io->good_page) { + static int zoo; if (!mayblock && zoo++ < 5) printk("all uptodate\n"); + pr_debug("all pages uptodate!\n"); + io->good_idx = io->nr_pages; + __iodesc_read_finish(io); + return; + } + + pr_debug("locking good_page\n"); + wtd_lock_page(&io->wtd, io->good_page); + return; +} + +static void __wtdgeneric_file_read_iodesc(void *data) +{ + struct iodesc *io = data; + __generic_file_read_iodesc(io, 1); +} + +static int generic_file_rw_kvec(struct file *file, int rw, kvec_cb_t cb, + size_t size, loff_t pos); + +int generic_file_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + return generic_file_rw_kvec(file, READ, cb, size, pos); +} + +int generic_file_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + return generic_file_rw_kvec(file, WRITE, cb, size, pos); +} + +int generic_file_rw_kvec(struct file *file, int rw, kvec_cb_t cb, + size_t size, loff_t pos) +{ + struct inode *inode = file->f_dentry->d_inode; + struct address_space *as = inode->i_mapping; + unsigned long index; + unsigned long eindex; + unsigned long nr_pages; + struct iodesc *io = NULL; + int ret; + + ret = -EINVAL; + if (rw != READ && rw != WRITE) + goto out; + + ret = -ENOMEM; + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) + goto out; + + memset(io, 0, sizeof(*io)); + io->size = size; + + if (READ == rw) { + pr_debug("pos=%Ld i_size=%Ld\n", pos, inode->i_size); + + if (pos > inode->i_size) + size = 0; + else if ((pos + size) > inode->i_size) + size = inode->i_size - pos; + + if (io->size < size) + size = io->size; + else if (size < io->size) + io->size = size; + + pr_debug("io->size=%d size=%d\n", io->size, size); + } + + index = pos >> PAGE_CACHE_SHIFT; + eindex = (pos + size - 1) >> PAGE_CACHE_SHIFT; + nr_pages = eindex - index + 1; + + pr_debug("nr_pages: %lu\n", nr_pages); + + io->good_idx = -1; + io->good_page = NULL; + io->did_read = 0; + io->err = 0; + io->rw = rw; + io->as = as; + io->offset = (unsigned long)pos & (PAGE_CACHE_SIZE - 1); + io->file = file; + io->cb = cb; + io->veclet = cb.vec->veclet; + if (nr_pages < READDESC_NR_DEF) { + io->pages = io->def_pages; + io->new_pages = io->def_new_pages; + } else { + io->pages = kmalloc(sizeof(*io->pages) * (nr_pages + 1), GFP_KERNEL); + if (!io->pages) + goto out_io; + + io->new_pages = kmalloc(sizeof(*io->new_pages) * (nr_pages + 1), GFP_KERNEL); + if (!io->new_pages) + goto out_pages; + } + + /* FIXME: make the down a WTD_op */ + if (rw == WRITE) + down(&io->file->f_dentry->d_inode->i_sem); + + ret = address_space_map(as, index, nr_pages, io->pages, + &io->nr_new_pages, io->new_pages); + pr_debug("as_map: %d (%d new)\n", ret, io->nr_new_pages); + if (ret <= 0) + goto out_new_pages; + + io->nr_pages = ret; + io->pages[io->nr_pages] = NULL; + io->new_pages[io->nr_new_pages] = NULL; + + if (rw == READ) + __generic_file_read_iodesc(io, 0); + else if (rw == WRITE) + __generic_file_write_iodesc(io); + + return 0; + +out_new_pages: + if (io->new_pages != io->def_new_pages) + kfree(io->new_pages); +out_pages: + if (io->pages != io->def_pages) + kfree(io->pages); +out_io: + kfree(io); +out: + return ret; +} + +static void __wtd_lock_page_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct page *page = (struct page *)wtd->data; + + if (!TryLockPage(page)) { + __remove_wait_queue(&page->wait, &wtd->wait); + wtd_queue(wtd); + } else + schedule_task(&run_disk_tq); +} + +void wtd_lock_page(struct worktodo *wtd, struct page *page) +{ + if (TryLockPage(page)) { + wtd->data = 
page; + init_waitqueue_func_entry(&wtd->wait, __wtd_lock_page_waiter); + + /* Wakeups may race with TryLockPage, so try again within the wait + * queue spinlock. + */ + if (!add_wait_queue_cond(&page->wait, &wtd->wait, TryLockPage(page))) { + /* Page is still locked. Kick the disk queue... */ + run_task_queue(&tq_disk); + return; + } + } + + wtd->tq.routine(wtd->tq.data); +} + +static void __wtd_bh_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct buffer_head *bh = (struct buffer_head *)wtd->data; + + if (!buffer_locked(bh)) { + __remove_wait_queue(&bh->b_wait, &wtd->wait); + wtd_queue(wtd); + } else { + schedule_task(&run_disk_tq); + } +} + +void wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh) +{ + if (!buffer_locked(bh)) { + wtd->tq.routine(wtd->tq.data); + return; + } + wtd->data = bh; + init_waitqueue_func_entry(&wtd->wait, __wtd_bh_waiter); + if (add_wait_queue_cond(&bh->b_wait, &wtd->wait, buffer_locked(bh))) + wtd->tq.routine(wtd->tq.data); + else + run_task_queue(&tq_disk); +} + +void do_run_tq_disk(void *data) +{ + run_task_queue(&tq_disk); +} + +struct tq_struct run_disk_tq = { + routine: do_run_tq_disk, + data: NULL +}; + diff --exclude=net -urN v2.4.13-ac6/mm/memory.c aio-v2.4.13-ac6.diff/mm/memory.c --- v2.4.13-ac6/mm/memory.c Fri Nov 2 12:56:14 2001 +++ aio-v2.4.13-ac6.diff/mm/memory.c Thu Nov 15 18:54:53 2001 @@ -44,6 +44,8 @@ #include #include #include +#include +#include #include #include @@ -1465,3 +1467,203 @@ } while (addr < end); return 0; } + +/* + * Force in an entire range of pages from the current process's user VA, + * and pin them in physical memory. + * FIXME: some architectures need to flush the cache based on user addresses + * here. Someone please provide a better macro than flush_cache_page. + */ + +#define dprintk(x...) +struct kvec *map_user_kvec(int rw, unsigned long ptr, size_t len) +{ + struct kvec *vec; + struct kveclet *veclet; + unsigned long end; + int err; + struct mm_struct * mm; + struct vm_area_struct * vma = 0; + int i; + int datain = (rw == READ); + unsigned nr_pages; + + end = ptr + len; + if (end < ptr) + return ERR_PTR(-EINVAL); + + nr_pages = (ptr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + nr_pages -= ptr >> PAGE_SHIFT; + nr_pages ++; + vec = kmalloc(sizeof(struct kvec) + nr_pages * sizeof(struct kveclet), + GFP_KERNEL); + if (!vec) + return ERR_PTR(-ENOMEM); + vec->nr = 0; + vec->max_nr = nr_pages; + veclet = vec->veclet; + + /* Make sure the iobuf is not already mapped somewhere. 
*/ + mm = current->mm; + dprintk ("map_user_kiobuf: begin\n"); + + down_read(&mm->mmap_sem); + + err = -EFAULT; + + i = 0; + + /* + * First of all, try to fault in all of the necessary pages + */ + while (ptr < end) { + struct page *map; + veclet->offset = ptr & ~PAGE_MASK; + veclet->length = PAGE_SIZE - veclet->offset; + if (len < veclet->length) + veclet->length = len; + ptr &= PAGE_MASK; + len -= veclet->length; + + if (!vma || ptr >= vma->vm_end) { + vma = find_vma(current->mm, ptr); + if (!vma) + goto out_unlock; + if (vma->vm_start > ptr) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_unlock; + if (expand_stack(vma, ptr)) + goto out_unlock; + } + if (((datain) && (!(vma->vm_flags & VM_WRITE))) || + (!(vma->vm_flags & VM_READ))) { + err = -EACCES; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + while (!(map = follow_page(ptr, datain))) { + int ret; + + spin_unlock(&mm->page_table_lock); + ret = handle_mm_fault(current->mm, vma, ptr, datain); + if (ret <= 0) { + if (!ret) + goto out_unlock; + else { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + } + map = get_page_map(map); + if (map) { + flush_dcache_page(map); + atomic_inc(&map->count); + } else + printk (KERN_INFO "Mapped page missing [%d]\n", i); + spin_unlock(&mm->page_table_lock); + veclet->page = map; + veclet++; + + ptr += PAGE_SIZE; + vec->nr = ++i; + } + + veclet->page = NULL; /* dummy for the prefetch in free_kvec */ + veclet->length = 0; /* bug checking ;-) */ + + up_read(&mm->mmap_sem); + dprintk ("map_user_kiobuf: end OK\n"); + return vec; + + out_unlock: + up_read(&mm->mmap_sem); + unmap_kvec(vec, 0); + printk(KERN_DEBUG "map_user_kvec: err(%d)\n", err); + kfree(vec); + return ERR_PTR(err); +} + +/* + * Unmap all of the pages referenced by a kiobuf. We release the pages, + * and unlock them if they were locked. + */ + +void unmap_kvec (struct kvec *vec, int dirtied) +{ + struct kveclet *veclet = vec->veclet; + struct kveclet *end = vec->veclet + vec->nr; + struct page *map = veclet->page; + + prefetchw(map); + for (; vecletpage) { + prefetchw(veclet[1].page); + if (likely(map != NULL) && !PageReserved(map)) { + if (dirtied) { + SetPageDirty(map); + flush_dcache_page(map); /* FIXME */ + } + __free_page(map); + } + } + + vec->nr = 0; +} + +void free_kvec(struct kvec *vec) +{ + kfree(vec); +} + +/* kvec memory copy helper: appends len bytes in from to dst. + */ +void memcpy_to_kvec_dst(struct kvec_dst *dst, const char *from, long len) +{ + if (unlikely(len < 0)) + BUG(); + do { + int cnt = len; + if (dst->space < cnt) + cnt = dst->space; + + memcpy(dst->dst, from, cnt); + from += cnt; + dst->space -= cnt; + dst->dst += cnt; + len -= cnt; + if (!dst->space && len) { + kvec_dst_unmap(dst); + kvec_dst_map(dst, dst->let + 1); + if (unlikely(!dst->space)) + BUG(); + } + } while (len); +} + +/* kvec memory copy helper: copies and consumes len bytes in from to dst. + */ +void memcpy_from_kvec_dst(char *to, struct kvec_dst *from, long len) +{ + if (unlikely(len < 0)) + BUG(); + do { + int cnt = len; + if (from->space < cnt) + cnt = from->space; + + memcpy(to, from->dst, cnt); + to += cnt; + from->space -= cnt; + from->dst += cnt; + len -= cnt; + if (unlikely(!from->space && len)) { + kvec_dst_unmap(from); + kvec_dst_map(from, from->let + 1); + if (unlikely(!from->space)) + BUG(); + } + } while (len); +} +
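
Some usage sketches follow; none of the code below is part of the patch itself, it only illustrates how the new interfaces are meant to fit together. First, the brw_kvec_async() path described in the header comment above: pin a user buffer with map_user_kvec(), hang a completion function off the kvec_cb, and tear the mapping down from the callback. The sketch_* names and the completion are illustrative, and the cleanup on a failed submit assumes that brw_kvec_async() does not invoke the callback when it returns an error (generic_file_rw_kvec() behaves that way, brw_kvec_async()'s body is not shown here).

#include <linux/fs.h>		/* ERR_PTR() and friends */
#include <linux/kdev_t.h>
#include <linux/completion.h>

static void sketch_read_done(void *data, struct kvec *vec, ssize_t res)
{
	/* res is the number of bytes transferred, or a negative errno */
	unmap_kvec(vec, 1);		/* 1: the read dirtied the pages */
	free_kvec(vec);
	complete((struct completion *)data);
}

static int sketch_read_sectors(kdev_t dev, unsigned long blknr, unsigned count,
			       unsigned long user_va, struct completion *done)
{
	kvec_cb_t cb;
	int err;

	cb.vec = map_user_kvec(READ, user_va, (size_t)count << 9);
	if (IS_ERR(cb.vec))
		return PTR_ERR(cb.vec);
	cb.fn = sketch_read_done;
	cb.data = done;

	/* 512-byte sectors (sector_shift 9); must match the device's idea
	 * of its sector size, per the brw_kvec_async() comment above. */
	err = brw_kvec_async(READ, cb, dev, count, blknr, 9);
	if (err) {
		/* assumed: a failed submit never calls the callback */
		unmap_kvec(cb.vec, 0);
		free_kvec(cb.vec);
	}
	return err;
}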
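
The kvec_dst copy helpers are used as the usage comment above describes; kvec_dst_init() is the entry point that takes the km_type (the comment's three-argument kvec_dst_map() call corresponds to it). A minimal sketch, assuming process context (KM_USER0) and no scheduling between init and unmap since the helpers keep an atomic kmap; setting dst.let by hand is an assumption drawn from the helper bodies, which unmap and advance via dst->let.

#include <asm/kmap_types.h>

static void sketch_copy_to_kvec(struct kvec *vec, const char *buf, long len)
{
	struct kvec_dst dst;

	dst.let = vec->veclet;			/* unmap/advance paths read dst->let */
	kvec_dst_init(&dst, vec->veclet, KM_USER0);
	memcpy_to_kvec_dst(&dst, buf, len);	/* appends, page by page */
	kvec_dst_unmap(&dst);
}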
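
The wait.h changes let a wait queue entry carry a callback instead of a task: __wake_up_common() invokes entry->func, and add_wait_queue_cond() re-checks a condition under the queue lock so that arming cannot race with the wakeup. The sketch below mirrors the __wtd_bh_waiter()/wtd_wait_on_buffer() pattern above with illustrative names; the handler runs with the wait-queue lock held, so it only unlinks itself and defers the real work to keventd.

#include <linux/sched.h>
#include <linux/fs.h>		/* buffer_head, buffer_locked() */
#include <linux/tqueue.h>

struct sketch_waiter {
	wait_queue_t		wait;	/* must stay first: the handler casts back */
	wait_queue_head_t	*head;
	struct tq_struct	work;	/* filled in by the caller via INIT_TQUEUE() */
};

static void sketch_wake_func(wait_queue_t *wait)
{
	struct sketch_waiter *w = (struct sketch_waiter *)wait;

	__remove_wait_queue(w->head, &w->wait);	/* queue lock is already held */
	schedule_task(&w->work);		/* finish from keventd */
}

static void sketch_async_wait_buffer(struct sketch_waiter *w,
				     struct buffer_head *bh)
{
	w->head = &bh->b_wait;
	init_waitqueue_func_entry(&w->wait, sketch_wake_func);

	/* buffer_locked() is re-evaluated under the wait-queue lock; a
	 * non-zero return means the buffer was already unlocked and the
	 * entry was not queued, so run the continuation directly. */
	if (add_wait_queue_cond(&bh->b_wait, &w->wait, buffer_locked(bh)))
		schedule_task(&w->work);
}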
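
The worktodo structure strings such callbacks into a small state machine, exactly as the iodesc code above does: set the next action with wtd_set_action(), then call wtd_lock_page() (or wtd_wait_on_buffer()), which either runs the action immediately or queues it from keventd once the event fires. A sketch with an illustrative sketch_op wrapper:

#include <linux/worktodo.h>
#include <linux/mm.h>
#include <linux/slab.h>

struct sketch_op {
	struct worktodo	wtd;
	struct page	*page;
};

static void sketch_op_stage2(void *data)
{
	struct sketch_op *op = data;

	/* the page lock is held here, in process context */
	printk(KERN_DEBUG "sketch_op: page %lu locked\n", op->page->index);
	UnlockPage(op->page);
	kfree(op);
}

static int sketch_op_start(struct page *page)
{
	struct sketch_op *op = kmalloc(sizeof(*op), GFP_KERNEL);

	if (!op)
		return -ENOMEM;
	memset(op, 0, sizeof(*op));		/* clears wtd.sp and wtd.data */
	op->page = page;
	wtd_set_action(&op->wtd, sketch_op_stage2, op);
	wtd_lock_page(&op->wtd, page);		/* may run stage2 synchronously */
	return 0;
}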
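
Finally, submitting an asynchronous page-cache read through generic_file_kvec_read(): on success the callback owns the kvec and is eventually invoked with the byte count or a negative errno, while a negative return from the submit path means the callback will never run and the caller must unmap the kvec itself. Names are illustrative; a real caller (the aio core) would also complete an iocb from the callback.

static void sketch_file_read_done(void *data, struct kvec *vec, ssize_t res)
{
	/* res: bytes copied into the user pages, or a negative errno */
	unmap_kvec(vec, 1);
	free_kvec(vec);
}

static int sketch_file_read(struct file *file, unsigned long user_buf,
			    size_t len, loff_t pos)
{
	kvec_cb_t cb;
	int ret;

	cb.vec = map_user_kvec(READ, user_buf, len);
	if (IS_ERR(cb.vec))
		return PTR_ERR(cb.vec);
	cb.fn = sketch_file_read_done;
	cb.data = NULL;

	ret = generic_file_kvec_read(file, cb, len, pos);
	if (ret < 0) {
		/* not submitted: the callback will not run */
		unmap_kvec(cb.vec, 0);
		free_kvec(cb.vec);
	}
	return ret;
}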