diff -urN /md0/kernels/2.4/v2.4.9-ac9/MAINTAINERS aio-v2.4.9-ac9.diff/MAINTAINERS
--- /md0/kernels/2.4/v2.4.9-ac9/MAINTAINERS	Mon Sep 10 20:35:49 2001
+++ aio-v2.4.9-ac9.diff/MAINTAINERS	Fri Sep  7 00:15:16 2001
@@ -201,6 +201,12 @@
 L:	linux-net@vger.kernel.org
 S:	Maintained
 
+ASYNC IO
+P:	Benjamin LaHaise
+M:	bcrl@redhat.com
+L:	linux-aio@kvack.org
+S:	Maintained
+
 AX.25 NETWORK LAYER
 P:	Matthias Welwarsky
 M:	dg2fef@afthd.tu-darmstadt.de
diff -urN /md0/kernels/2.4/v2.4.9-ac9/Makefile aio-v2.4.9-ac9.diff/Makefile
--- /md0/kernels/2.4/v2.4.9-ac9/Makefile	Mon Sep 10 20:35:49 2001
+++ aio-v2.4.9-ac9.diff/Makefile	Fri Sep  7 00:15:35 2001
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 4
 SUBLEVEL = 9
-EXTRAVERSION = -ac9
+EXTRAVERSION = -ac9-aio
 
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 
diff -urN /md0/kernels/2.4/v2.4.9-ac9/arch/i386/kernel/entry.S aio-v2.4.9-ac9.diff/arch/i386/kernel/entry.S
--- /md0/kernels/2.4/v2.4.9-ac9/arch/i386/kernel/entry.S	Mon Sep 10 20:35:49 2001
+++ aio-v2.4.9-ac9.diff/arch/i386/kernel/entry.S	Sat Sep  8 01:56:10 2001
@@ -626,6 +626,12 @@
 	.long SYMBOL_NAME(sys_getdents64)	/* 220 */
 	.long SYMBOL_NAME(sys_fcntl64)
 	.long SYMBOL_NAME(sys_ni_syscall)	/* reserved for TUX */
+	.long SYMBOL_NAME(sys___io_setup)	/* 223 */
+	.long SYMBOL_NAME(sys___io_destroy)
+	.long SYMBOL_NAME(sys___io_getevents)
+	.long SYMBOL_NAME(sys___io_submit)
+	.long SYMBOL_NAME(sys___io_cancel)
+	.long SYMBOL_NAME(sys___io_wait)
 
 	.rept NR_syscalls-(.-sys_call_table)/4
 		.long SYMBOL_NAME(sys_ni_syscall)
diff -urN /md0/kernels/2.4/v2.4.9-ac9/drivers/char/raw.c aio-v2.4.9-ac9.diff/drivers/char/raw.c
--- /md0/kernels/2.4/v2.4.9-ac9/drivers/char/raw.c	Mon Sep 10 20:35:51 2001
+++ aio-v2.4.9-ac9.diff/drivers/char/raw.c	Fri Sep  7 00:15:16 2001
@@ -16,6 +16,8 @@
 #include <linux/capability.h>
 #include <linux/smp_lock.h>
 #include <asm/uaccess.h>
+#include <linux/aio.h>
+#include <linux/slab.h>
 
 #define dprintk(x...) 
 
@@ -36,13 +38,14 @@
 int	raw_open(struct inode *, struct file *);
 int	raw_release(struct inode *, struct file *);
 int	raw_ctl_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
-
+int	raw_rw_kiovec(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos);
 
 static struct file_operations raw_fops = {
 	read:		raw_read,
 	write:		raw_write,
 	open:		raw_open,
 	release:	raw_release,
+	rw_kiovec:	raw_rw_kiovec,
 };
 
 static struct file_operations raw_ctl_fops = {
@@ -130,7 +133,7 @@
 	 * the blocksize on a device which is already mounted.  
 	 */
 	
-	sector_size = 512;
+	sector_size = 2048;
 	if (is_mounted(rdev)) {
 		if (blksize_size[MAJOR(rdev)])
 			sector_size = blksize_size[MAJOR(rdev)][MINOR(rdev)];
@@ -260,7 +263,6 @@
 }
 
 
-
 ssize_t	raw_read(struct file *filp, char * buf, 
 		 size_t size, loff_t *offp)
 {
@@ -363,7 +365,7 @@
 		for (i=0; i < blocks; i++) 
 			iobuf->blocks[i] = blocknr++;
 		
-		err = brw_kiovec(rw, 1, &iobuf, dev, iobuf->blocks, sector_size);
+		err = brw_kiovec(rw, 1, &iobuf, dev, blocks, iobuf->blocks, sector_size);
 
 		if (rw == READ && err > 0)
 			mark_dirty_kiobuf(iobuf, err);
@@ -393,3 +395,92 @@
  out:	
 	return err;
 }
+
+/* raw_rw_kiovec: rw_kiovec file operation of the raw driver; queues async IO on the bound block device. */
+int	raw_rw_kiovec(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos)
+{
+	int		err;
+	unsigned long	blocknr, blocks;
+	unsigned long	__b[KIO_MAX_SECTORS];	/* NOTE(review): large on-stack array -- confirm it fits the kernel stack */
+	unsigned long	*b = __b;		/* replaced by a kmalloc'ed list for bigger requests */
+	int		i;
+	int		minor;
+	kdev_t		dev;
+	unsigned long	limit;			/* device size in sector_size units (INT_MAX if unknown) */
+	int		sector_size, sector_bits, sector_mask;
+	int		max_sectors;
+
+#if 0	/* FIXME: this is wrong. */
+	err = 0;
+	if (!size)
+		goto out_complete;
+#endif
+
+	pr_debug("raw_rw_kiovec: %p %d %d %p %d %d %Lu\n", filp, rw, nr, kiovec, flags, size, pos);
+	/*
+	 * First, a few checks on device size limits 
+	 */
+
+	minor = MINOR(filp->f_dentry->d_inode->i_rdev);
+	dev = to_kdev_t(raw_devices[minor].binding->bd_dev);
+	sector_size = raw_devices[minor].sector_size;
+	sector_bits = raw_devices[minor].sector_bits;
+	sector_mask = sector_size- 1;
+	max_sectors = 25000; //KIO_MAX_SECTORS >> (sector_bits - 9);
+	
+	if (blk_size[MAJOR(dev)])
+		limit = (((loff_t) blk_size[MAJOR(dev)][MINOR(dev)]) << BLOCK_SIZE_BITS) >> sector_bits;
+	else
+		limit = INT_MAX;
+	dprintk ("rw_raw_dev_async: dev %d:%d (+%d)\n",
+		 MAJOR(dev), MINOR(dev), limit);
+
+	err = -EINVAL;	/* both pos and size must be multiples of the sector size */
+	if ((pos < 0) || (pos & sector_mask) || (size & sector_mask)) {
+		printk("pos/size wrong\n");
+		goto out;
+	}
+
+	err = -ENXIO;	/* start position at or beyond the end of the device */
+	if ((pos >> sector_bits) >= limit) {
+		printk("raw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits);
+		goto out;
+	}
+
+	/*
+	 * Clamp the request to max_sectors blocks and to the end of the
+	 * device, then build the list of block numbers for one single
+	 * submission (the request is not split into several chunks).
+	 */
+
+	blocknr = pos >> sector_bits;
+	blocks = size >> sector_bits;
+	if (blocks > max_sectors)
+		blocks = max_sectors;
+	if (blocks > limit - blocknr)
+		blocks = limit - blocknr;
+	err = -ENXIO;
+	pr_debug("raw: !blocks %d %ld %ld\n", max_sectors, limit, blocknr);
+	if (!blocks)
+		goto out;
+
+	if (blocks > KIO_MAX_SECTORS) {	/* too many for __b: use a heap list */
+		err = -ENOMEM;
+		b = kmalloc(sizeof(*b) * blocks, GFP_KERNEL);
+		if (!b)
+			goto out;
+	}
+
+	for (i=0; i < blocks; i++) 
+		b[i] = blocknr++;	/* consecutive blocks starting at pos */
+
+	err = brw_kiovec_async(rw, nr, kiovec, dev, blocks, b, sector_size);
+	pr_debug("brw_kiovec_async: %d\n", err);
+
+	if (b != __b)	/* only the kmalloc'ed list needs freeing */
+		kfree(b);
+out:
+	pr_debug("brw_kiovec_async: ret is %d\n", err);
+	return err;
+}
+
diff -urN /md0/kernels/2.4/v2.4.9-ac9/fs/Makefile aio-v2.4.9-ac9.diff/fs/Makefile
--- /md0/kernels/2.4/v2.4.9-ac9/fs/Makefile	Mon Sep 10 20:35:52 2001
+++ aio-v2.4.9-ac9.diff/fs/Makefile	Fri Sep  7 00:15:17 2001
@@ -12,7 +12,7 @@
 
 obj-y :=	open.o read_write.o devices.o file_table.o buffer.o \
 		super.o block_dev.o char_dev.o stat.o exec.o pipe.o namei.o \
-		fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
+		fcntl.o ioctl.o readdir.o select.o fifo.o locks.o aio.o \
 		dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
 		filesystems.o jbd-kernel.o namespace.o
 
diff -urN /md0/kernels/2.4/v2.4.9-ac9/fs/aio.c aio-v2.4.9-ac9.diff/fs/aio.c
--- /md0/kernels/2.4/v2.4.9-ac9/fs/aio.c	Wed Dec 31 19:00:00 1969
+++ aio-v2.4.9-ac9.diff/fs/aio.c	Fri Sep 14 13:25:12 2001
@@ -0,0 +1,827 @@
+//#define DEBUG 1
+/* fs/aio.c
+ *	An async IO implementation for Linux
+ *	Written by Benjamin LaHaise <bcrl@redhat.com>
+ *
+ *	Implements /dev/aio, something on top of which it should be possible
+ *	to write a POSIX AIO library.
+ *
+ *	Copyright 2000, 2001 Red Hat, Inc.  All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/iobuf.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/brlock.h>
+#include <linux/aio.h>
+#include <linux/smp_lock.h>
+
+#include <asm/uaccess.h>
+
+#undef KERN_DEBUG
+#define KERN_DEBUG ""
+#define MAX_IOCTXS	0x800
+#define dprintk(x...)	do { ; } while (0)
+
+static spinlock_t aio_read_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t aio_req_lock = SPIN_LOCK_UNLOCKED;
+
+static kmem_cache_t	*kiocb_cachep;
+static kmem_cache_t	*kioctx_cachep;
+
+/* Lockless for reads.  Needs replacement rsn. */
+static struct kioctx	*ioctx_list;
+static unsigned long	new_ioctx_id;
+
+/* tunable.  Needs to be added to sysctl. */
+int max_aio_reqs = 0x10000;
+
+/* aio_setup
+ *	Creates the kiocb and kioctx slab caches used by the aio routines,
+ *	panics if either cannot be created.  Run via __initcall(); returns 0.
+ */
+static int __init aio_setup(void)
+{
+	kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb),
+				0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!kiocb_cachep)
+		panic("unable to create kiocb cache\n");
+
+	kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx),
+				0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!kioctx_cachep)
+		panic("unable to create kioctx cache");
+
+	printk(KERN_NOTICE "aio_setup: okay!\n");
+	printk(KERN_NOTICE "aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));	/* debugging aid */
+
+	return 0;
+}
+
+/* ioctx_alloc
+ *	Allocates an aioctx (nr_reqs is rounded up to a power of two) and
+ *	puts it on ioctx_list.  Returns ERR_PTR(-ENOMEM) if it failed.
+ */
+static struct kioctx *ioctx_alloc(unsigned nr_reqs)
+{
+	struct kioctx *ctx;
+	unsigned i;
+
+	for (i=1; i<nr_reqs; i<<=1)
+		;
+	nr_reqs = i;
+	ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL);
+	if (!ctx)
+		goto out_err;
+	memset(ctx, 0, sizeof(*ctx));
+	atomic_set(&ctx->users, 1);
+	spin_lock_init(&ctx->done_lock);
+	init_waitqueue_head(&ctx->wait);
+	ctx->mm = current->mm;
+	ctx->max_reqs = nr_reqs;
+	ctx->reqs = kmalloc(sizeof(struct iocb *) * ctx->max_reqs, GFP_KERNEL);
+	if (!ctx->reqs)
+		goto out_free_ctx;
+	memset(ctx->reqs, 0, sizeof(struct iocb *) * ctx->max_reqs);
+	ctx->ring = kmalloc(sizeof(*ctx->ring), GFP_KERNEL);
+	if (!ctx->ring)
+		goto out_free_reqs;
+	memset(ctx->ring, 0, sizeof(*ctx->ring));
+	/* publish only once fully set up: a failed allocation must never leave a freed ctx on ioctx_list */
+	lock_kernel();
+	ctx->user_id = new_ioctx_id++;	/* FIXME */
+	ctx->next = ioctx_list;
+	ioctx_list = ctx;
+	unlock_kernel();
+	printk("aio: allocated aioctx %p\n", ctx);
+	return ctx;
+out_free_reqs:
+	kfree(ctx->reqs);
+out_free_ctx:
+	kmem_cache_free(kioctx_cachep, ctx);
+out_err:
+	ctx = ERR_PTR(-ENOMEM);	/* never NULL: the caller only tests IS_ERR() */
+	printk("aio: error allocating aioctx %p\n", ctx);
+	return ctx;
+}
+
+/* kiocb_free: unmap and free the iocb's kiovecs, drop its file reference (if any), free the iocb. */
+void kiocb_free(struct kiocb *iocb)
+{
+	int i;
+
+	for (i=0; i<iocb->nr_kiovec; i++)
+		unmap_kiobuf(iocb->kiovec[i]);
+	free_kiovec_sz(iocb->nr_kiovec, iocb->kiovec, iocb->kiovec_sz);
+	/* filp is still NULL when aio_convert_user_aiocb bailed out after a failed fget() */
+	if (iocb->filp)
+		fput(iocb->filp);
+	kmem_cache_free(kiocb_cachep, iocb);
+}
+
+/* aio_complete
+ *	Request done: post an io_event (res, res2) to the ctx ring, wake ctx->wait.
+ */
+void aio_complete(struct kiocb *iocb, long res, long res2)
+{
+	struct kioctx	*ctx = iocb->ctx;
+	struct aio_ring	*ring = ctx->ring;
+	struct io_event	*event;
+	unsigned long	flags;
+	unsigned long	tail;
+
+	/* add a completion event to the ring buffer.
+	 * must be done holding done_lock to prevent
+	 * other code from messing with the tail
+	 * pointer since we might be called from irq
+	 * context.
+	 */
+	spin_lock_irqsave(&ctx->done_lock, flags);
+
+	tail = (ring->tail + 1) % ring->nr;	/* NOTE(review): ring->nr is never assigned in this file, and a full ring is not detected -- confirm */
+
+	event = &ring->io_events[tail];
+
+	event->key = iocb->user_key;	/* user address of the iocb, see aio_convert_user_aiocb */
+	event->data = iocb->user_data;
+	event->res = res;
+	event->res2 = res2;
+
+	/* after flagging the request as done, we
+	 * must never even look at it again
+	 */
+	barrier();
+
+	ring->tail = tail;	/* publishes the event to aio_read_evt */
+
+	wmb();
+	if (!ring->woke)
+		ring->woke = 1;
+
+	spin_unlock_irqrestore(&ctx->done_lock, flags);
+
+	pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+#if 0
+	if (!wake) {
+		printk("kio_complete: should send user of %p a signal...\n", ctx);
+	}
+#endif
+
+	wake_up(&ctx->wait);
+}
+
+/* aio_kiobuf_endio
+ *	Called when io on a given kiobuf is complete (installed as end_io).
+ */
+static void aio_kiobuf_endio(struct kiobuf *iobuf)
+{
+	struct kiocb	*iocb = iobuf->end_io_data;	/* set in aio_convert_user_aiocb */
+
+	/* TODO: possibly put the return code into the iocb
+	 * here.  This only really makes sense if it's being
+	 * put into the user's iocb, which would mean pinning
+	 * it down in memory.  Maybe.
+	 */
+	pr_debug("aio_kiobuf_endio: %p %p/%d\n", iobuf, iocb, iocb->idx);
+	aio_complete(iocb, iocb->kiovec[0]->transferred ?	/* result: bytes moved if any, else the errno */
+		iocb->kiovec[0]->transferred : iocb->kiovec[0]->errno, 0);
+}
+
+/* kio_submit:
+ *	Starts the IO through the file's rw_kiovec op.  Returns 0, or -errno after dropping one ctx->users ref.
+ */
+static inline int kio_submit(struct kiocb *iocb, struct iocb *aiocb)
+{
+	int (*rw_kiovec)(struct file *, int, int, struct kiobuf **, int, size_t, loff_t);
+	int ret = -ENOSYS;
+	struct kioctx *ctx = iocb->ctx;
+	int rw;
+
+	switch(aiocb->aio_lio_opcode) {
+	case IOCB_CMD_PWRITE:
+		rw = WRITE;
+		break;
+	case IOCB_CMD_PREAD:
+		rw = READ;
+		break;
+	default:	/* unknown opcode: fails with -ENOSYS */
+		printk("kio_submit: lio_opcode = %d\n", aiocb->aio_lio_opcode);
+		goto out;
+	}
+
+	rw_kiovec = iocb->filp->f_op->rw_kiovec;
+	if (rw_kiovec)
+		ret = rw_kiovec(iocb->filp, rw, iocb->nr_kiovec, iocb->kiovec, /*flags*/ 0, aiocb->aio_nbytes, aiocb->aio_offset);
+	else {
+		/* no async op: complete at once, -ENOSYS is delivered as the event result */
+		iocb->kiovec[0]->transferred = 0;
+		iocb->kiovec[0]->errno = -ENOSYS;
+		aio_kiobuf_endio(iocb->kiovec[0]);
+		ret = 0;
+	}
+
+out:
+	if (ret) {
+		static int count;	/* limits the message below to ten occurrences */
+		if (count < 10) {
+			count++;
+			printk("kio_submit: failed!\n");
+		}
+		atomic_dec(&ctx->users);
+		if (atomic_read(&ctx->users) <= 0)
+			BUG();
+	}
+
+	return ret;
+}
+
+/*----------------- /dev/aio interface ----------------------- */
+/* Convert the user iocb copied to *uaiocb (user address user_aiocb) into a
+ * kiocb: takes a ctx reference, looks up the file and maps the user buffer.
+ * Returns the kiocb, or an ERR_PTR with the ctx reference dropped again. */
+static inline struct kiocb *aio_convert_user_aiocb(struct kioctx *ctx,
+	struct iocb *uaiocb, struct iocb *user_aiocb)
+{
+	struct kiocb *iocb;
+	int rw = WRITE;
+	int ret = -ENOMEM;
+	int i;
+
+	iocb = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
+	if (!iocb)
+		goto out;
+	atomic_inc(&ctx->users);
+	memset(iocb, 0, sizeof(*iocb));
+
+	iocb->ctx = ctx;
+	iocb->user_key = (long)user_aiocb;
+	iocb->user_data = uaiocb->aio_data;
+	iocb->filp = fget(uaiocb->aio_fildes);
+	ret = -EBADF;
+	if (!iocb->filp)
+		goto out_err;
+
+	iocb->nr_kiovec = 1;
+	iocb->kiovec_sz[0] = 1;
+	ret = alloc_kiovec_sz(1, iocb->kiovec, iocb->kiovec_sz);
+	if (ret)
+		goto out_err;
+
+	for (i=0; i < iocb->nr_kiovec; i++) {
+		iocb->kiovec[i]->end_io = aio_kiobuf_endio;
+		iocb->kiovec[i]->end_io_data = iocb;
+	}
+
+	switch (uaiocb->aio_lio_opcode) {
+	case IOCB_CMD_PREAD:	rw = READ;	/* fall through: same mapping code */
+	case IOCB_CMD_PWRITE:
+		pr_debug("aio: map_user_kiobuf(%d, %p, %lu, %lu) = ",
+			rw, iocb->kiovec[0], (unsigned long)uaiocb->aio_buf,
+			(unsigned long)uaiocb->aio_nbytes);
+		ret = map_user_kiobuf(rw, iocb->kiovec[0],
+				(unsigned long)uaiocb->aio_buf,
+				uaiocb->aio_nbytes);
+		pr_debug("%d\n", ret);
+		if (ret)
+			goto out_err;
+		break;
+	default:
+		ret = -EINVAL;
+		printk("aio_convert_user_aiocb: lio_opcode = %d\n", uaiocb->aio_lio_opcode);
+		goto out_err;
+	}
+
+	pr_debug("kio_convert_user_aiocb: (%p, %p) / %p\n", ctx, uaiocb, iocb);
+	return iocb;
+
+out_err:
+	kiocb_free(iocb);
+	atomic_dec(&ctx->users);	/* undo the reference taken above, it used to be leaked */
+out:
+	return ERR_PTR(ret);
+}
+
+/* kiocb_get
+ *	Claim the request in slot idx: ERR_PTR(-ENOENT) if empty or key differs, -EBUSY if already claimed.
+ */
+static inline struct kiocb *kiocb_get(struct kioctx *ctx, unsigned idx, u64 key)
+{
+	struct kiocb *iocb;
+
+	spin_lock(&aio_req_lock);
+	iocb = ctx->reqs[idx];	/* idx is not range-checked here */
+	if (iocb && iocb->user_data == key) {
+		if (!iocb->locked)
+			iocb->locked = 1;	/* claimed by this caller */
+		else
+			iocb = ERR_PTR(-EBUSY);
+	} else
+		iocb = ERR_PTR(-ENOENT);
+	spin_unlock(&aio_req_lock);
+	return iocb;
+}
+
+/* __aio_complete
+ *	Claims the kiocb in slot idx of ctx (user_data must match key), sleeping
+ *	while it is held elsewhere if please_wait is set, and frees it.  0 or -errno.
+ */
+static int __aio_complete(struct kioctx *ctx, unsigned idx, u64 key, int please_wait)
+{
+	struct task_struct	*tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+	struct kiocb *iocb;
+	int ret = -EINVAL;
+
+	pr_debug("aio_complete: %p %d %p %d\n", ctx, idx, key, please_wait);
+	if (idx >= ctx->max_reqs) {
+		printk("aio_complete: idx(%d) is invalid\n", idx);
+		goto out;
+	}
+
+	ret = -EBUSY;	/* overwritten below before it is ever returned */
+
+	if (please_wait) {
+		add_wait_queue(&ctx->wait, &wait);
+
+		do {
+			set_task_state(tsk, TASK_INTERRUPTIBLE);
+			iocb = kiocb_get(ctx, idx, key);
+			if (iocb == ERR_PTR(-EBUSY)) {
+				schedule();
+
+				/* interrupted due to a signal? */
+				iocb = ERR_PTR(-EINTR);
+				if (signal_pending(tsk))
+					break;
+				iocb = kiocb_get(ctx, idx, key);
+			}
+		} while (iocb == ERR_PTR(-EBUSY));
+
+		set_task_state(tsk, TASK_RUNNING);
+		remove_wait_queue(&ctx->wait, &wait);
+	} else
+		iocb = kiocb_get(ctx, idx, key);
+
+	ret = PTR_ERR(iocb);
+	if (IS_ERR(iocb)) {
+		printk("aio_complete: ERR: %d [%u, %Lx] from %p\n", ret, idx, key, __builtin_return_address(0));
+		goto out;
+	}
+
+	pr_debug("aio_complete: [%d] = %p\n", idx, iocb);
+
+	/* everything turned out well, dispose of the aiocb. */
+	kiocb_free(iocb);	/* NOTE(review): ctx->reqs[idx] still points at the freed iocb -- confirm */
+
+	return 0;
+
+out:
+	return ret;
+}
+
+/* aio_read_evt
+ *	Pull an event off of the aioctx's event ring into *ent.
+ *	Returns 0, or -EAGAIN if the ring is empty.  FIXME: use cmpxchg.
+ *	TODO: make the ringbuffer user mmap()able (requires FIXME).
+ */
+static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
+{
+	struct aio_ring *ring = ioctx->ring;
+	unsigned long head;
+	int ret = -EAGAIN;
+
+	pr_debug("in aio_read_evt h%lu t%lu\n", ring->head, ring->tail);
+	barrier();
+	if (ring->head == ring->tail)	/* unlocked fast path: ring is empty */
+		goto out;
+
+	spin_lock(&aio_read_lock);	/* investigate the value of making this per-ctx */
+
+	head = ring->head;
+	if (head != ring->tail) {	/* re-check now that the lock is held */
+		head = (head + 1) & ioctx->ring_mask;	/* NOTE(review): aio_complete wraps with % ring->nr instead -- confirm both agree */
+		*ent = ring->io_events[head];
+		barrier();
+		ring->head = head;
+		ret = 0;
+	}
+	spin_unlock(&aio_read_lock);
+
+out:
+	pr_debug("leaving aio_read_evt: %d  h%lu t%lu\n", ret, ring->head, ring->tail);
+	return ret;
+}
+
+struct timeout {	/* timer, flag and wait queue used by read_events */
+	struct timer_list	timer;		/* runs timeout_func when it expires */
+	int			timed_out;	/* set by timeout_func, or by set_timeout for a zero timespec */
+	wait_queue_head_t	wait;		/* woken by timeout_func */
+};
+
+static void timeout_func(unsigned long data)	/* timer callback; data is the struct timeout */
+{
+	struct timeout *to = (struct timeout *)data;
+
+	to->timed_out = 1;
+	wake_up(&to->wait);	/* let the sleeper see timed_out */
+}
+
+static inline void init_timeout(struct timeout *to)	/* prepare *to; the timer is not started here */
+{
+	init_timer(&to->timer);
+	to->timer.data = (unsigned long)to;	/* handed back to timeout_func */
+	to->timer.function = timeout_func;
+	to->timed_out = 0;
+	init_waitqueue_head(&to->wait);
+}
+
+static inline void set_timeout(struct timeout *to, struct timespec *ts)	/* arm *to to fire after *ts */
+{
+	unsigned long how_long;
+
+	if (!ts->tv_sec && !ts->tv_nsec) {	/* zero timespec: already expired, no timer is added */
+		to->timed_out = 1;
+		return;
+	}
+
+	how_long = ts->tv_sec * HZ;	/* NOTE(review): *ts comes from userspace and is not range-checked here */
+#define HZ_NS (1000000000 / HZ)
+	how_long += (ts->tv_nsec + HZ_NS - 1) / HZ_NS;	/* nanoseconds, rounded up to whole jiffies */
+	
+	to->timer.expires = jiffies + how_long;
+	add_timer(&to->timer);
+}
+
+static inline void clear_timeout(struct timeout *to)	/* stop the timer of *to */
+{
+	del_timer_sync(&to->timer);
+}
+
+/* read_events: copy up to nr events to the user array; returns the count, or 0/-errno if none were copied. */
+static int read_events(struct kioctx *ctx, int nr, struct io_event *event,
+			struct timespec *timeout)
+{
+	struct task_struct	*tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+	DECLARE_WAITQUEUE(to_wait, tsk);
+	int			ret = -EINVAL;
+	int			i = 0;
+	struct io_event		ent;
+	struct timespec		ts;
+	struct timeout		to;
+
+	init_timeout(&to);
+
+	if (timeout) {
+		ret = -EFAULT;
+		if (copy_from_user(&ts, timeout, sizeof(ts)))
+			goto out;
+
+		set_timeout(&to, &ts);
+	}
+
+	memset(&ent, 0, sizeof(ent));
+	ret = 0;
+
+	while (i < nr) {
+		ret = aio_read_evt(ctx, &ent);
+		if (ret) {
+			if (i)	/* already have something to return: never sleep */
+				break;
+
+			ret = 0;
+			if (!i && !timeout)	/* a NULL timeout means "do not wait" */
+				break;
+
+			add_wait_queue(&ctx->wait, &wait);
+			add_wait_queue(&to.wait, &to_wait);
+			do {
+				set_task_state(tsk, TASK_INTERRUPTIBLE);
+				ret = aio_read_evt(ctx, &ent);
+				if (!ret)
+					break;
+				ret = -ETIMEDOUT;
+				if (to.timed_out)
+					break;
+				schedule();
+				if (to.timed_out)
+					break;
+				if (signal_pending(tsk)) {
+					ret = -EINTR;
+					break;
+				}
+				ret = aio_read_evt(ctx, &ent);
+			} while (ret) ;
+
+			set_task_state(tsk, TASK_RUNNING);
+			remove_wait_queue(&ctx->wait, &wait);
+			remove_wait_queue(&to.wait, &to_wait);
+		}
+
+		if (ret)
+			break;
+
+		/* FIXME: split checks in two */
+		ret = -EFAULT;
+		if (copy_to_user(event, &ent, sizeof(ent))) {
+			/* FIXME: we lose an event here. */
+			printk(KERN_DEBUG "aio: lost an event due to EFAULT.\n");
+			break;
+		}
+
+		/* Now release the completed aio request (the event is already in userland). */
+		/* FIXME: bob.  NOTE(review): ent.key is iocb->user_key (a user address) but is used as the slot index -- confirm */
+		ret = __aio_complete(ctx, ent.key, ent.data, 0);
+		if (ret) {
+			printk(KERN_DEBUG "aio: lost an event -- aio_complete: %d.\n", ret);
+			break;	/* FIXME: we lose an event here */
+		}
+
+		event ++;
+		i ++;
+	}
+
+	if (timeout)
+		clear_timeout(&to);
+out:
+	return i ? i : ret;
+}
+
+/* __aioctx_put
+ *	Called when the last user of an aio context has gone away,
+ *	and the struct needs to be freed.
+ */
+void __aioctx_put(struct kioctx *ctx)
+{
+	struct io_event		ent;
+	printk("aio: free aioctx %p\n", ctx);
+
+	/* release any io requests that were not reaped by the user process */
+	while (!aio_read_evt(ctx, &ent)) {
+		struct kiocb *iocb = kiocb_get(ctx, ent.key, ent.data);	/* ent.key is used as the slot index, unchecked */
+		if (!IS_ERR(iocb))
+			kiocb_free(iocb);
+	}
+	/* NOTE(review): nothing in this file unlinks ctx from ioctx_list before it is freed -- confirm */
+	kfree(ctx->ring);
+	kfree(ctx->reqs);
+	kmem_cache_free(kioctx_cachep, ctx);
+}
+
+/* iocb_setup
+ *	Install the kiocb in the request table of the given
+ *	context.  For non-negative values of idx, attempts
+ *	to install it at idx; negative means allocate a slot.
+ *	The slot (or -errno) is also stored in iocb->idx.
+ *	Returns 0 on success, -errno otherwise.
+ */
+static inline int iocb_setup(struct kioctx *ctx, struct kiocb *iocb, int idx)
+{
+	int ret;
+
+	ret = -EINVAL;
+	if (idx >= 0 && idx >= ctx->max_reqs)	/* idx < 0 means "allocate", it must pass this check */
+		goto out;
+
+	/* Get a reference to ze iogrp so that it isn't reported
+	 * as complete before we're done queuing it.
+	 */
+	atomic_inc(&ctx->users);
+
+	/* Assign the iogrp an id. */
+	/* FIXME: use cmpxchg instead of spin_lock? */
+	spin_lock(&aio_req_lock);
+	if (idx < 0) {
+		for (idx=0; (idx<ctx->max_reqs) && (ctx->reqs[idx]); idx++)
+			;
+		if (idx < ctx->max_reqs)
+			ctx->reqs[idx] = iocb;
+		else {
+			printk("iogrp_setup: -EAGAIN\n");
+			idx = -EAGAIN;
+		}
+	} else if (idx < ctx->max_reqs) {
+		if (!ctx->reqs[idx])
+			ctx->reqs[idx] = iocb;
+		else {
+			printk("iogrp_setup: -EBUSY\n");
+			idx = -EBUSY;
+		}
+	} else
+		idx = -EINVAL;
+
+	spin_unlock(&aio_req_lock);
+
+	iocb->idx = idx; /* side effect on error: kiogrp_free notices idx < 0 */
+	ret = 0;	/* success, unless the slot lookup above failed */
+	if (idx < 0) {
+		atomic_dec(&ctx->users);
+		ret = idx;
+	}
+
+out:
+	return ret;
+}
+
+/*	Look up ctx_id among the ioctxs of current->mm; NULL if not found.
+ *	ioctx_list is lockless for reads.  FIXME: O(n), development only.
+ */
+static inline struct kioctx *get_ioctx(unsigned long ctx_id)
+{
+	struct mm_struct *mm = current->mm;
+	struct kioctx *ioctx;
+
+	/* a for loop also copes with an empty ioctx_list */
+	for (ioctx = ioctx_list; ioctx; ioctx = ioctx->next) {
+		if (ioctx->user_id == ctx_id && ioctx->mm == mm)
+			return ioctx;
+	}
+
+	return NULL;
+}
+
+static inline void put_ioctx(struct kioctx *ctx)
+{
+	// FIXME!!!  Currently a no-op; get_ioctx() does not take a reference either.
+	//aioctx_put(ctx);
+}
+
+
+/* submit_io
+ *	Copy the user aiocb in, set the request up and queue it.  0 or -errno.
+ */
+static long submit_io(struct kioctx *ctx, struct iocb *iocbp)
+{
+	struct iocb	iocb;
+	long		ret;
+	struct kiocb	*kiocb;
+
+	pr_debug("aio: submit %p %p\n", iocbp, &iocb);
+	ret = -EFAULT;
+	if (copy_from_user(&iocb, iocbp, sizeof(iocb)))
+		goto out;
+
+	kiocb = aio_convert_user_aiocb(ctx, &iocb, iocbp);
+	pr_debug("aio: kiocb = %p\n", kiocb);
+	ret = PTR_ERR(kiocb);
+	if (IS_ERR(kiocb))
+		goto out;
+
+	iocb_setup(ctx, kiocb, -1);
+	ret = kiocb->idx;	/* iocb_setup leaves a negative errno here if no slot was free */
+	if (ret < 0)
+		goto out_free;
+	ret = -EFAULT;
+	if (put_user((u64)kiocb->idx, &iocbp->aio_key))
+		goto out_unhash;
+	ret = kio_submit(kiocb, &iocb);
+	if (!ret)
+		return 0;
+out_unhash:
+	spin_lock(&aio_req_lock);	/* the request table must not keep a pointer to the freed kiocb */
+	if (ctx->reqs[kiocb->idx] == kiocb)
+		ctx->reqs[kiocb->idx] = NULL;
+	spin_unlock(&aio_req_lock);
+out_free:
+	kiocb_free(kiocb);
+out:
+	return ret;
+}
+
+asmlinkage int sys___io_setup(unsigned nr_reqs, aio_context_t *ctxp)	/* create a context, store its id in *ctxp */
+{
+	struct kioctx *ioctx = NULL;
+	unsigned long ctx;
+	int ret;
+
+	ret = get_user(ctx, ctxp);
+	if (ret)
+		goto out;
+
+	ret = -EINVAL;	/* *ctxp must be zero on entry, nr_reqs within max_aio_reqs */
+	if (ctx || nr_reqs > max_aio_reqs)
+		goto out;
+
+	ioctx = ioctx_alloc(nr_reqs);
+	ret = PTR_ERR(ioctx);
+	if (!IS_ERR(ioctx)) {
+		ret = put_user(ioctx->user_id, ctxp);
+		if (!ret)
+			return 0;
+		aioctx_put(ioctx);	/* the id could not be reported: drop the new context again */
+	}
+
+out:
+	return ret;
+}
+
+/* sys___io_destroy
+ *	Drops a reference on the aioctx named by ctx via aioctx_put().  FIXME!
+ */
+asmlinkage int sys___io_destroy(aio_context_t ctx)
+{
+	struct kioctx *ioctx = get_ioctx(ctx);
+	if (ioctx) {
+		dprintk("aio_release(%p)\n", filp->private_data);	/* dprintk expands to nothing; there is no filp here */
+		aioctx_put(ioctx);
+		return 0;
+	}
+	return -EINVAL;	/* no such context for this mm */
+}
+
+/* sys___io_submit
+ *	Copy an aiocb from userspace into kernel space, then convert it to
+ *	a kiocb, submit and repeat until done.  Error codes on copy/submit
+ *	only get returned for the first aiocb copied as otherwise the number
+ *	of aiocbs submitted is returned (standard write semantics).
+ */
+asmlinkage long sys___io_submit(aio_context_t ctx_id, int nr, struct iocb **iocbpp)
+{
+	struct kioctx *ioctx;
+	long ret = 0;
+	int i;
+
+	ioctx = get_ioctx(ctx_id);
+	if (!ioctx)
+		goto out_inval;
+
+	for (i=0; i<nr; i++) {
+		struct iocb *iocbp;
+		ret = get_user(iocbp, iocbpp + i);	/* fetch the i-th user pointer */
+		if (ret)
+			break;
+
+		ret = submit_io(ioctx, iocbp);
+		if (ret)
+			break;
+	}
+
+	put_ioctx(ioctx);
+	run_task_queue(&tq_disk);	/* runs the tq_disk task queue once after the whole batch */
+	return i ? i : ret;
+
+out_inval:
+	return -EINVAL;
+}
+
+asmlinkage long sys___io_cancel(aio_context_t ctx, struct iocb *iocb)	/* cancellation is not implemented */
+{
+	return -ENOSYS;
+}
+
+asmlinkage long sys___io_wait(aio_context_t ctx_id, struct iocb *iocb, struct timespec *timeout)	/* reap one request */
+{
+	struct kioctx *ioctx;
+	int ret = -EINVAL;
+	unsigned idx;
+	long key = (long)iocb;	/* NOTE(review): kiocb_get compares key with user_data (aio_data), not with this address -- confirm */
+
+	ioctx = get_ioctx(ctx_id);
+	if (!ioctx)
+		goto out;
+
+	ret = get_user(idx, &iocb->aio_key);	/* slot index written by submit_io */
+	if (ret)
+		goto out;	/* skips put_ioctx(), which is a no-op for now */
+
+	ret = __aio_complete(ioctx, idx, key, !!timeout);	/* timeout only selects waiting; its value is never read */
+	put_ioctx(ioctx);
+
+out:
+	return ret;
+}
+
+asmlinkage long sys___io_getevents(int ctx_id, int nr, struct io_event *events,	/* ctx_id is int here, aio_context_t elsewhere */
+			struct timespec *timeout)
+{
+	struct kioctx *ioctx = get_ioctx(ctx_id);
+	int ret = -EINVAL;	/* returned when ctx_id names no context of this mm */
+
+	if (ioctx) {
+		ret = read_events(ioctx, nr, events, timeout);
+		put_ioctx(ioctx);
+	}
+
+	return ret;
+}
+
+__initcall(aio_setup);
diff -urN /md0/kernels/2.4/v2.4.9-ac9/fs/buffer.c aio-v2.4.9-ac9.diff/fs/buffer.c
--- /md0/kernels/2.4/v2.4.9-ac9/fs/buffer.c	Mon Sep 10 20:35:52 2001
+++ aio-v2.4.9-ac9.diff/fs/buffer.c	Fri Sep  7 00:15:17 2001
@@ -140,8 +140,7 @@
 {
 	clear_bit(BH_Lock, &bh->b_state);
 	smp_mb__after_clear_bit();
-	if (waitqueue_active(&bh->b_wait))
-		wake_up(&bh->b_wait);
+	wake_up(&bh->b_wait);
 }
 
 /*
@@ -2061,6 +2060,7 @@
 	return tmp.b_blocknr;
 }
 
+#if 0
 /*
  * IO completion routine for a buffer_head being used for kiobuf IO: we
  * can't dispatch the kiobuf callback until io_count reaches 0.  
@@ -2237,6 +2237,7 @@
 		return transferred;
 	return err;
 }
+#endif
 
 /*
  * Start I/O on a page.
@@ -2868,3 +2869,251 @@
 
 module_init(bdflush_init)
 
+/* async kio interface: per-kiobuf bookkeeping shared by all buffer heads of one request */
+struct brw_cb {
+	struct kiobuf		*kiobuf;	/* the kiobuf this IO belongs to */
+	int			nr;		/* number of valid entries in bh[] */
+	struct buffer_head	*bh[1];		/* really bh_nr entries, allocated past the struct by brw_kiovec_async */
+};
+
+/* Drop one io_count reference on kiobuf; the last one collects the results and frees brw_cb. */
+static inline void brw_kio_put_iobuf(struct brw_cb *brw_cb, struct kiobuf *kiobuf)
+{
+	if (atomic_dec_and_test(&kiobuf->io_count)) {
+		int nr;
+
+		/* Walk the buffer heads associated with this kiobuf
+		 * checking for errors and freeing them as we go.
+		 */
+		for (nr=0; nr < brw_cb->nr; nr++) {
+			struct buffer_head *bh = brw_cb->bh[nr];
+			if (buffer_uptodate(bh) && !kiobuf->errno)
+				kiobuf->transferred += bh->b_size;
+			else if (!kiobuf->errno)
+				kiobuf->errno = -EIO;
+			kmem_cache_free(bh_cachep, bh);
+		}
+
+		if (kiobuf->end_io)
+			kiobuf->end_io(kiobuf);
+		wake_up(&kiobuf->wait_queue);	/* NOTE(review): kiobuf is touched after end_io ran -- confirm end_io cannot lead to it being freed */
+		kfree(brw_cb);
+	}
+}
+
+/*
+ * IO completion routine for a buffer_head being used for kiobuf IO: we
+ * can't dispatch the kiobuf callback until io_count reaches 0.
+ */
+
+static void end_buffer_io_kiobuf_async(struct buffer_head *bh, int uptodate)
+{
+	struct brw_cb *brw_cb;
+	struct kiobuf *kiobuf;
+	
+	mark_buffer_uptodate(bh, uptodate);
+
+	brw_cb = bh->b_private;	/* set by brw_kiovec_async */
+	unlock_buffer(bh);
+
+	kiobuf = brw_cb->kiobuf;
+	if (!uptodate && !kiobuf->errno)
+		brw_cb->kiobuf->errno = -EIO;	/* only the first error is recorded */
+	brw_kio_put_iobuf(brw_cb, kiobuf);	/* drops the count this bh held */
+}
+
+
+/*
+ * Start I/O on a physical range of kernel memory, defined by a vector
+ * of kiobuf structs (much like a user-space iovec list).
+ *
+ * IO is only queued here: once a kiobuf's io_count drops back to zero
+ * its end_io callback runs and its wait_queue is woken.  Returns 0 if
+ * everything was queued, or -errno with nothing queued.
+ *
+ * It is up to the caller to make sure that there are enough blocks
+ * passed in to completely map the iobufs to disk.
+ */
+
+int brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], 
+	       kdev_t dev, int nr_blocks, unsigned long b[], int sector_size)
+{
+	int		err;
+	int		length;
+	int		bufind;
+	int		pageind;
+	int		bhind;
+	int		offset;
+	unsigned long	blocknr;
+	struct kiobuf *	iobuf = NULL;
+	struct page *	map;
+	struct buffer_head *tmp;
+	int		bh_nr;
+	int		i;
+
+#define MAX_KIOVEC_NR	8
+	struct brw_cb	*brw_cb_table[MAX_KIOVEC_NR];
+	struct brw_cb	*brw_cb;
+
+	if (!nr)
+		return 0;
+
+	if (nr > MAX_KIOVEC_NR) {
+		printk("kiovec too large: %d\n", nr);
+		BUG();
+	}
+
+	/* 
+	 * First, do some alignment and validity checks 
+	 */
+	for (i = 0; i < nr; i++) {
+		iobuf = iovec[i];
+		if ((iobuf->offset & (sector_size-1)) ||
+		    (iobuf->length & (sector_size-1))) {
+			printk("brw_kiovec_async: iobuf->offset=0x%x length=0x%x sector_size: 0x%x\n", iobuf->offset, iobuf->length, sector_size);
+			return -EINVAL;
+		}
+
+		if (!iobuf->nr_pages)
+			panic("brw_kiovec: iobuf not initialised");
+	}
+
+	/* 
+	 * OK to walk down the iovec doing page IO on each page we find. 
+	 */
+	bufind = bhind = err = 0;
+	for (i = 0; i < nr; i++) {
+		iobuf = iovec[i];
+		offset = iobuf->offset;
+		length = iobuf->length;
+		iobuf->errno = 0;
+		iobuf->transferred = 0;
+		brw_cb_table[i] = NULL;	/* nothing to undo for this slot yet */
+
+		bh_nr = ((iobuf->nr_pages * PAGE_SIZE) - offset) / sector_size;
+		err = -EINVAL;
+		if (!bh_nr) {
+			printk("brw_kiovec_async: !bh_nr\n");
+			goto error;	/* also releases what earlier iovecs set up */
+		}
+		/* FIXME: tie into userbeans here */
+		brw_cb = kmalloc(sizeof(*brw_cb) + (bh_nr * sizeof(struct buffer_head *)), GFP_KERNEL);
+		err = -ENOMEM;
+		if (!brw_cb)
+			goto error;
+		brw_cb_table[i] = brw_cb;
+		brw_cb->kiobuf = iobuf;
+		brw_cb->nr = 0;
+		atomic_inc(&iobuf->io_count);	/* submission reference, dropped once all bhs are queued */
+		for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
+			map  = iobuf->maplist[pageind];
+			err = -EFAULT;
+			if (!map)
+				goto error;
+
+			while (length > 0 && (bufind < nr_blocks)) {
+				blocknr = b[bufind++];
+				tmp = kmem_cache_alloc(bh_cachep, GFP_NOIO);
+				err = -ENOMEM;
+				if (!tmp)
+					goto error;
+
+				memset(tmp, 0, sizeof(*tmp));
+				init_waitqueue_head(&tmp->b_wait);
+				tmp->b_dev = B_FREE;
+				tmp->b_size = sector_size;
+				set_bh_page(tmp, map, offset);
+				tmp->b_this_page = tmp;
+
+				init_buffer(tmp, end_buffer_io_kiobuf_async, NULL);
+				tmp->b_dev = dev;
+				tmp->b_blocknr = blocknr;
+				tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
+				tmp->b_private = brw_cb;
+
+				if (rw == WRITE) {
+					set_bit(BH_Uptodate, &tmp->b_state);
+					clear_bit(BH_Dirty, &tmp->b_state);
+				}
+
+				brw_cb->bh[brw_cb->nr++] = tmp;
+				length -= sector_size;
+				offset += sector_size;
+
+				atomic_inc(&iobuf->io_count);	/* one count per buffer head */
+
+				if (offset >= PAGE_SIZE) {
+					offset = 0;
+					break;
+				}
+			} /* End of block loop */
+		} /* End of page loop */		
+	} /* End of iovec loop */
+
+	/* okay, we've setup all our io requests, now fire them off! */
+	for (i = 0; i < nr; i++) {
+		int j;
+		brw_cb = brw_cb_table[i];
+#if 1
+		for (j=0; j<brw_cb->nr; j++) 
+			submit_bh(rw, brw_cb->bh[j]);
+		//ll_rw_block(rw, brw_cb->nr, brw_cb->bh);
+#else
+		generic_make_requests(dev, rw, brw_cb->bh, brw_cb->nr);
+#endif
+		brw_kio_put_iobuf(brw_cb, brw_cb->kiobuf);
+	}
+
+	return 0;
+
+ error:
+	/* Walk brw_cb_table freeing all the goop associated with each kiobuf */
+	do {
+		brw_cb = brw_cb_table[i];
+		if (brw_cb) {
+			/* Nothing was submitted yet.  Free the buffer_heads and give
+			   back their io_count references plus the submission one. */
+			for (bhind = brw_cb->nr; bhind--; )
+				kmem_cache_free(bh_cachep, brw_cb->bh[bhind]);
+			atomic_sub(brw_cb->nr + 1, &brw_cb->kiobuf->io_count);
+			kfree(brw_cb);
+		}
+	} while (i--) ;
+
+	return err;
+}
+
+int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],	/* synchronous wrapper around brw_kiovec_async */
+		kdev_t dev, int nr_blocks, unsigned long b[], int sector_size)
+{
+	int i;
+	int transferred = 0;
+	int err = 0;
+
+	if (!nr)
+		return 0;
+
+	/* queue up and trigger the io */
+	err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size);
+	if (err)
+		goto out;
+
+	/* wait on the last iovec first -- it's more likely to finish last */
+	for (i=nr; --i >= 0; )
+		kiobuf_wait_for_io(iovec[i]);
+
+	run_task_queue(&tq_disk);
+
+	/* okay, how much data actually got through? */
+	for (i=0; i<nr; i++) {
+		if (iovec[i]->errno) {
+			if (!err)
+				err = iovec[i]->errno;
+			break;
+		}
+		transferred += iovec[i]->length;	/* counts the whole length of each error-free kiobuf */
+	}
+
+out:
+	return transferred ? transferred : err;	/* bytes from the leading error-free kiobufs, else the error */
+}
diff -urN /md0/kernels/2.4/v2.4.9-ac9/fs/ext2/file.c aio-v2.4.9-ac9.diff/fs/ext2/file.c
--- /md0/kernels/2.4/v2.4.9-ac9/fs/ext2/file.c	Mon Sep 10 20:35:52 2001
+++ aio-v2.4.9-ac9.diff/fs/ext2/file.c	Fri Sep  7 00:15:17 2001
@@ -42,6 +42,7 @@
 	llseek:		generic_file_llseek,
 	read:		generic_file_read,
 	write:		generic_file_write,
+	rw_kiovec:	generic_file_rw_kiovec,
 	ioctl:		ext2_ioctl,
 	mmap:		generic_file_mmap,
 	open:		generic_file_open,
diff -urN /md0/kernels/2.4/v2.4.9-ac9/fs/nfs/file.c aio-v2.4.9-ac9.diff/fs/nfs/file.c
--- /md0/kernels/2.4/v2.4.9-ac9/fs/nfs/file.c	Mon Sep 10 20:35:52 2001
+++ aio-v2.4.9-ac9.diff/fs/nfs/file.c	Fri Sep  7 00:15:17 2001
@@ -50,6 +50,7 @@
 	release:	nfs_release,
 	fsync:		nfs_fsync,
 	lock:		nfs_lock,
+        rw_kiovec:      generic_file_rw_kiovec,
 };
 
 struct inode_operations nfs_file_inode_operations = {
diff -urN /md0/kernels/2.4/v2.4.9-ac9/fs/select.c aio-v2.4.9-ac9.diff/fs/select.c
--- /md0/kernels/2.4/v2.4.9-ac9/fs/select.c	Thu Aug 16 16:58:49 2001
+++ aio-v2.4.9-ac9.diff/fs/select.c	Sun Sep  9 00:10:40 2001
@@ -12,22 +12,31 @@
  *  24 January 2000
  *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
  *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
+ *
+ *  June 2001
+ *	Added async_poll implementation. -ben
  */
 
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/poll.h>
 #include <linux/file.h>
+#include <linux/aio.h>
+#include <linux/init.h>
 
 #include <asm/uaccess.h>
 
 #define ROUND_UP(x,y) (((x)+(y)-1)/(y))
 #define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
 
+static kmem_cache_t *poll_table_cache;
+
 struct poll_table_entry {
-	struct file * filp;
-	wait_queue_t wait;
-	wait_queue_head_t * wait_address;
+	wait_queue_t		wait;
+	wait_queue_head_t	*wait_address;
+	struct file		*filp;
+	poll_table		*p;
 };
 
 struct poll_table_page {
@@ -71,6 +80,72 @@
 	}
 }
 
+/* Task-queue action set up by async_poll(): re-poll and complete if ready. */
+void async_poll_complete(void *data)
+{
+	poll_table	*p = data;
+	struct kiocb	*iocb = p->iocb;
+	unsigned int	mask;
+
+	p->wake = 0;
+	wmb();
+	do {
+		mask = iocb->filp->f_op->poll(iocb->filp, p);
+		mask &= p->events | POLLERR | POLLHUP;
+		if (mask) {
+			poll_freewait(p);
+			kmem_cache_free(poll_table_cache, p);	/* allocated in async_poll(); was leaked */
+			aio_complete(iocb, mask, 0);
+			return;
+		}
+		p->sync = 0;	/* lets async_poll_waiter() queue the wtd again */
+		wmb();
+	} while (p->wake);	/* a wakeup arrived while we were polling: look again */
+}
+
+static void async_poll_waiter(wait_queue_t *wait)	/* wait-queue callback installed by __pollwait() when p->iocb is set */
+{
+	struct poll_table_entry *entry = (struct poll_table_entry *)wait;	/* cast relies on 'wait' being the first member */
+	poll_table *p = entry->p;
+
+	/* avoid writes to the cacheline if possible for SMP */
+	if (!p->wake) {
+		p->wake = 1;
+		/* ensure only one wake up queues the wtd */
+		if (!p->sync && !test_and_set_bit(0, &p->sync))
+			wtd_queue(&p->wtd);	/* action is async_poll_complete(), set in async_poll() */
+	}
+}
+
+/* Start an async poll on iocb->filp; completes at once if already ready. */
+int async_poll(struct kiocb *iocb, int events)
+{
+	unsigned int mask;
+	poll_table *p;
+
+	p = kmem_cache_alloc(poll_table_cache, SLAB_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	poll_initwait(p);
+	wtd_set_action(&p->wtd, async_poll_complete, p);
+	p->iocb = iocb;	/* must follow poll_initwait(), which clears it */
+	p->wake = 0;
+	p->sync = 0;
+	p->events = events;
+
+	mask = DEFAULT_POLLMASK;
+	if (iocb->filp->f_op && iocb->filp->f_op->poll)
+		mask = iocb->filp->f_op->poll(iocb->filp, p);
+	mask &= events | POLLERR | POLLHUP;
+	if (mask) {
+		poll_freewait(p);
+		kmem_cache_free(poll_table_cache, p);	/* table no longer needed; was leaked */
+		aio_complete(iocb, mask, 0);
+	}
+	return 0;	/* otherwise p stays live until async_poll_complete() finishes it */
+}
+
 void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
 {
 	struct poll_table_page *table = p->table;
@@ -97,7 +172,11 @@
 	 	get_file(filp);
 	 	entry->filp = filp;
 		entry->wait_address = wait_address;
-		init_waitqueue_entry(&entry->wait, current);
+		entry->p = p;
+		if (p->iocb)
+			init_waitqueue_func_entry(&entry->wait, async_poll_waiter);
+		else
+			init_waitqueue_entry(&entry->wait, current);
 		add_wait_queue(wait_address,&entry->wait);
 	}
 }
@@ -493,3 +572,14 @@
 	poll_freewait(&table);
 	return err;
 }
+
+static int __init poll_init(void)	/* init hook: creates the slab cache async_poll() allocates from */
+{
+	poll_table_cache = kmem_cache_create("poll table",
+                        sizeof(poll_table), 0, 0, NULL, NULL);
+	if (!poll_table_cache)
+		panic("unable to alloc poll_table_cache");	/* no fallback: async_poll() needs the cache */
+	return 0;
+}
+
+module_init(poll_init);
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/asm-i386/errno.h aio-v2.4.9-ac9.diff/include/asm-i386/errno.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/asm-i386/errno.h	Mon Feb 26 10:20:14 2001
+++ aio-v2.4.9-ac9.diff/include/asm-i386/errno.h	Fri Sep  7 00:15:17 2001
@@ -128,5 +128,6 @@
 
 #define	ENOMEDIUM	123	/* No medium found */
 #define	EMEDIUMTYPE	124	/* Wrong medium type */
+#define	ENOCLUE		125	/* userland programmer induced race condition */
 
 #endif
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/asm-i386/uaccess.h aio-v2.4.9-ac9.diff/include/asm-i386/uaccess.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/asm-i386/uaccess.h	Mon Sep 10 20:35:52 2001
+++ aio-v2.4.9-ac9.diff/include/asm-i386/uaccess.h	Mon Sep 17 13:40:05 2001
@@ -127,6 +127,7 @@
 extern void __put_user_1(void);
 extern void __put_user_2(void);
 extern void __put_user_4(void);
+extern void __put_user_8(void);
 
 extern void __put_user_bad(void);
 
@@ -162,6 +163,13 @@
 	  case 1: __put_user_asm(x,ptr,retval,"b","b","iq"); break;	\
 	  case 2: __put_user_asm(x,ptr,retval,"w","w","ir"); break;	\
 	  case 4: __put_user_asm(x,ptr,retval,"l","","ir"); break;	\
+	  case 8: {							\
+		u32 *__put_ptr = (void*)(ptr);				\
+		u64 __put_val = (x);					\
+		__put_user_asm((u32)__put_val,__put_ptr,retval,"l","","ir"); \
+		__put_user_asm((u32)(__put_val>>32),__put_ptr+1,retval,"l","","ir"); \
+		break;							\
+	  }								\
 	  default: __put_user_bad();					\
 	}								\
 } while (0)
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/asm-i386/unistd.h aio-v2.4.9-ac9.diff/include/asm-i386/unistd.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/asm-i386/unistd.h	Fri Aug 11 17:39:23 2000
+++ aio-v2.4.9-ac9.diff/include/asm-i386/unistd.h	Fri Sep  7 00:15:17 2001
@@ -227,9 +227,18 @@
 #define __NR_madvise1		219	/* delete when C lib stub is removed */
 #define __NR_getdents64		220
 #define __NR_fcntl64		221
+/* 222 is reserved for tux; 223-228 must match sys_call_table in entry.S */
+#define __NR___io_setup		223
+#define __NR___io_destroy	224
+#define __NR___io_getevents	225
+#define __NR___io_submit	226
+#define __NR___io_cancel	227
+#define __NR___io_wait		228
 
 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
-
+#ifdef NO_SYSCALL_ERRNO
+#define __syscall_return(type, res)	return (type)(res)
+#else
 #define __syscall_return(type, res) \
 do { \
 	if ((unsigned long)(res) >= (unsigned long)(-125)) { \
@@ -238,6 +247,7 @@
 	} \
 	return (type) (res); \
 } while (0)
+#endif
 
 /* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */
 #define _syscall0(type,name) \
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/linux/aio.h aio-v2.4.9-ac9.diff/include/linux/aio.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/linux/aio.h	Wed Dec 31 19:00:00 1969
+++ aio-v2.4.9-ac9.diff/include/linux/aio.h	Mon Sep 17 14:26:57 2001
@@ -0,0 +1,131 @@
+/* linux/aio.h
+ *	Written by Benjamin LaHaise <bcrl@redhat.com>
+ */
+#ifndef __LINUX__AIO_H
+#define __LINUX__AIO_H
+
+#include <asm/byteorder.h>
+
+typedef unsigned long	aio_context_t;
+
+#define IOCB_CMD_PREAD		0
+#define IOCB_CMD_PWRITE		1
+#define IOCB_CMD_FSYNC		2
+#define IOCB_CMD_FDSYNC		3
+
+/* read() from /dev/aio returns these structures. */
+struct io_event {
+	u64		data;		/* the data field from the iocb */
+	u64		key;		/* what iocb this event came from */
+	s64		res;		/* result code for this event */
+	s64		res2;		/* secondary result */
+};
+
+struct aio_ring {
+	u32			id;	/* kernel internal index number */
+	u32			nr;	/* number of io_events */
+	u32			head;
+	u32			tail;
+
+	u32			woke;	/* set when a wakeup was sent */
+	u32			pad1;
+	u32			pad2;
+	u32			pad3;
+
+	u32			pad4[24];	/* pad out to 128 bytes */
+
+	struct io_event		io_events[0];
+}; /* 128 bytes + ring size */
+
+#if defined(__LITTLE_ENDIAN)
+#define IFLITTLE(x)	x
+#define IFBIG(x)	/**/
+#elif defined(__BIG_ENDIAN)
+#define IFLITTLE(x)	/**/
+#define IFBIG(x)	x
+#else
+#error edit for your odd byteorder.
+#endif
+
+/*
+ * we always use a 64bit off_t when communicating
+ * with userland.  its up to libraries to do the
+ * proper padding and aio_error abstraction
+ */
+
+struct iocb {
+	/* these are internal to the kernel/libc. */
+	u64	aio_data;	/* data to be returned in event's data */
+	IFBIG(u32 aio_pad1);
+	u32	aio_key;	/* the kernel sets this to ~0 if completed, */
+				/* otherwise is the request index. */
+	IFLITTLE(u32 aio_pad1);
+
+	/* common fields */
+	u16	aio_lio_opcode;
+	s16	aio_reqprio;
+	u32	aio_fildes;
+
+	u64	aio_buf;
+	s64	aio_offset;
+	u64	aio_nbytes;
+
+	/* extra parameters */
+	u64	aio_param1;
+	u64	aio_param2;
+
+}; /* 64 bytes */
+
+#undef IFBIG
+#undef IFLITTLE
+
+#ifdef __KERNEL__
+#define AIO_MAXSEGS		4
+#define AIO_KIOGRP_NR_ATOMIC	8
+
+struct kiocb {	/* kernel-side per-request state; this is what aio_complete() takes */
+	int		nr_kiovec;
+	struct kiobuf	*kiovec[AIO_MAXSEGS];
+	int		kiovec_sz[AIO_MAXSEGS];
+	//struct iocb	*obj;
+	u64		user_key;
+	u64		user_data;
+	struct file	*filp;
+	long		aio_return;
+	struct kioctx	*ctx;
+	int		idx;
+	unsigned	locked:1;	/* unsigned: a signed 1-bit field holds only 0 and -1 */
+};
+
+struct kioctx {
+	atomic_t		users;
+
+	/* This needs improving */
+	unsigned long		user_id;
+	struct kioctx		*next;
+	struct mm_struct	*mm;
+
+	wait_queue_head_t	wait;
+
+	struct kiocb		**reqs;
+
+	spinlock_t		done_lock;
+
+	unsigned		max_reqs;
+	unsigned		ring_mask;
+	int			pid;		/* pid to send wakeups to */
+	struct aio_ring		*ring;
+};
+
+extern struct file_operations aio_fops;
+
+extern void aio_complete(struct kiocb *iocb, long res, long res2);
+extern void __aioctx_put(struct kioctx *ctx);
+
+#define aioctx_get(kioctx)	atomic_inc(&(kioctx)->users)
+#define aioctx_put(kioctx)	do { if (atomic_dec_and_test(&(kioctx)->users)) __aioctx_put(kioctx); } while (0)
+
+#endif /*__KERNEL__*/
+
+#endif /* __LINUX__AIO_H */
+
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/linux/blkdev.h aio-v2.4.9-ac9.diff/include/linux/blkdev.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/linux/blkdev.h	Mon Sep 10 20:35:52 2001
+++ aio-v2.4.9-ac9.diff/include/linux/blkdev.h	Mon Sep 17 13:40:05 2001
@@ -149,7 +149,7 @@
 extern struct blk_dev_struct blk_dev[MAX_BLKDEV];
 extern void grok_partitions(struct gendisk *dev, int drive, unsigned minors, long size);
 extern void register_disk(struct gendisk *dev, kdev_t first, unsigned minors, struct block_device_operations *ops, long size);
-extern void generic_make_request(int rw, struct buffer_head * bh);
+extern void generic_make_request(int rw, struct buffer_head *bh);
 extern request_queue_t *blk_get_queue(kdev_t dev);
 extern inline request_queue_t *__blk_get_queue(kdev_t dev);
 extern void blkdev_release_request(struct request *);
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/linux/brlock.h aio-v2.4.9-ac9.diff/include/linux/brlock.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/linux/brlock.h	Mon Sep 10 20:03:53 2001
+++ aio-v2.4.9-ac9.diff/include/linux/brlock.h	Mon Sep 17 13:40:25 2001
@@ -34,6 +34,7 @@
 enum brlock_indices {
 	BR_GLOBALIRQ_LOCK,
 	BR_NETPROTO_LOCK,
+	BR_AIO_LOCK,
 
 	__BR_END
 };
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/linux/event.h aio-v2.4.9-ac9.diff/include/linux/event.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/linux/event.h	Wed Dec 31 19:00:00 1969
+++ aio-v2.4.9-ac9.diff/include/linux/event.h	Fri Sep  7 00:15:17 2001
@@ -0,0 +1,21 @@
+#ifndef _LINUX_KEVENTQ_H
+#define _LINUX_KEVENTQ_H
+
+typedef struct file *keventq_t;
+
+static inline keventq_t keventq_get(int qid);
+#define keventq_put(evq)	fput(evq)
+
+/* fget() 'qid'; return the file only if its f_op is keventq_fops, else NULL. */
+static inline keventq_t keventq_get(int qid)	/* static inline: defined in a header */
+{
+	struct file *filp = fget(qid);
+	if (filp) {
+		if (&keventq_fops == filp->f_op)
+			return filp;	/* caller releases with keventq_put() */
+		fput(filp);	/* not an event queue: drop the reference we took */
+	}
+	return NULL;
+}
+
+#endif /* _LINUX_KEVENTQ_H */
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/linux/fs.h aio-v2.4.9-ac9.diff/include/linux/fs.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/linux/fs.h	Mon Sep 10 20:35:52 2001
+++ aio-v2.4.9-ac9.diff/include/linux/fs.h	Mon Sep 17 13:40:05 2001
@@ -20,7 +20,6 @@
 #include <linux/stat.h>
 #include <linux/cache.h>
 #include <linux/stddef.h>
-#include <linux/string.h>
 
 #include <asm/atomic.h>
 #include <asm/bitops.h>
@@ -802,7 +801,13 @@
  * NOTE:
  * read, write, poll, fsync, readv, writev can be called
  *   without the big kernel lock held in all filesystems.
+ *
+ * rw_kiovec returns the number of bytes that will actually
+ * be transferred into the kiovec, or an error that occurred
+ * during queueing.
  */
+struct kiobuf;
+
 struct file_operations {
 	struct module *owner;
 	loff_t (*llseek) (struct file *, loff_t, int);
@@ -822,6 +827,7 @@
 	ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+	int (*rw_kiovec)(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos);
 };
 
 struct inode_operations {
@@ -1411,6 +1417,7 @@
 extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *);
 extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *);
 extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t);
+extern int generic_file_rw_kiovec(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos);
 
 extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *);
 extern loff_t generic_file_llseek(struct file *, loff_t, int);
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/linux/iobuf.h aio-v2.4.9-ac9.diff/include/linux/iobuf.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/linux/iobuf.h	Mon Sep 10 20:35:52 2001
+++ aio-v2.4.9-ac9.diff/include/linux/iobuf.h	Mon Sep 17 13:40:05 2001
@@ -53,8 +53,10 @@
 
 	/* Dynamic state for IO completion: */
 	atomic_t	io_count;	/* IOs still in progress */
+	int		transferred;	/* Number of bytes of completed IO at the beginning of the buffer */
 	int		errno;		/* Status of completed IO */
 	void		(*end_io) (struct kiobuf *); /* Completion callback */
+	void		*end_io_data;
 	wait_queue_head_t wait_queue;
 };
 
@@ -80,7 +82,9 @@
 
 /* fs/buffer.c */
 
+int	brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], 
+		   kdev_t dev, int nr_blocks, unsigned long b[], int size);
 int	brw_kiovec(int rw, int nr, struct kiobuf *iovec[], 
-		   kdev_t dev, unsigned long b[], int size);
+		   kdev_t dev, int nr_blocks, unsigned long b[], int size);
 
 #endif /* __LINUX_IOBUF_H */
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/linux/iovec.h aio-v2.4.9-ac9.diff/include/linux/iovec.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/linux/iovec.h	Wed Dec 31 19:00:00 1969
+++ aio-v2.4.9-ac9.diff/include/linux/iovec.h	Fri Sep 14 16:33:15 2001
@@ -0,0 +1,26 @@
+#ifndef __LINUX__IOVEC_H
+#define __LINUX__IOVEC_H
+
+struct page;
+
+struct lvec {	/* one segment: a page plus an offset/length pair */
+	struct page	*page;
+	unsigned	offset;
+	unsigned	length;
+};
+
+struct liovec {
+	unsigned	max_nr;
+	unsigned	nr;
+	struct	lvec	vec[0];	/* trailing variable-length array */
+};
+
+struct vfs_liovec {
+	struct liovec	iovec;	/* NOTE(review): iovec.vec[] starts where 'complete' is stored - confirm intent */
+	void		(*complete)(struct vfs_liovec *);
+};
+
+struct liovec *liovec_map_user(int rw, unsigned long va, size_t len);
+int map_user_lvec(int rw, unsigned nr, struct lvec *iovec, unsigned long va, size_t len);
+
+#endif /* __LINUX__IOVEC_H */
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/linux/lib_lio.h aio-v2.4.9-ac9.diff/include/linux/lib_lio.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/linux/lib_lio.h	Wed Dec 31 19:00:00 1969
+++ aio-v2.4.9-ac9.diff/include/linux/lib_lio.h	Fri Sep  7 00:15:17 2001
@@ -0,0 +1,108 @@
+#ifndef __LIB_LIO_H
+#define __LIB_LIO_H
+
+struct timespec;
+struct sockaddr;
+struct iovec;
+
+
+typedef enum lio_iocb_cmd {
+
+	LIO_CMD_PREAD,
+	LIO_CMD_PWRITE,
+	LIO_CMD_ACCEPT,
+	LIO_CMD_CONNECT,
+	LIO_CMD_SENDTO,
+	LIO_CMD_RECVFROM,
+
+	LIO_CMD_POLL,
+} lio_iocb_cmd_t;
+
+struct lio_iocb_sendto {
+	void	*msg;
+	int	len;
+	int	flags;
+	struct sockaddr	*addr;
+};
+
+struct lio_iocb_poll {
+	int	events;
+};	/* result code is the set of result flags or -'ve errno */
+
+struct lio_iocb_sockaddr {
+	struct sockaddr *addr;
+	int		len;
+};	/* result code is the length of the sockaddr, or -'ve errno */
+
+struct lio_iocb_common {
+	void		*buf;
+	long		nbytes;
+	long long	offset;
+};	/* result code is the amount read or -'ve errno */
+
+struct lio_iocb_vector {
+	const struct iovec	*vec;
+	int			nr;
+	long long		offset;
+};	/* result code is the amount read or -'ve errno */
+
+typedef struct lio_iocb {
+	long	key;		/* For use in identifying io requests */
+	void	*data;		/* Return in the io completion event */
+	int	aio_fildes;
+	short	aio_reqprio;
+	short	aio_lio_opcode;	
+
+	union {
+		struct lio_iocb_common		c;
+		struct lio_iocb_vector		v;
+		struct lio_iocb_poll		poll;
+		struct lio_iocb_sockaddr	saddr;
+	} u;
+} lio_iocb_t;
+
+typedef void (*lio_callback_t)(int qid, lio_iocb_t *iocb, long result);
+
+extern int lio_queue_init(int maxevents);
+extern int lio_queue_grow(int qid, int new_maxevents);
+extern int lio_queue_release(int qid);
+extern int lio_queue_wait(int qid, struct timespec *timeout);
+extern int lio_queue_run(int qid);
+extern int lio_submit(int qid, int nr, lio_iocb_t *ios[]);
+
+static inline void lio_prep_accept(lio_iocb_t *iocb, int s, struct sockaddr *addr, int addrlen)
+{
+	iocb->aio_fildes = s;
+	iocb->aio_lio_opcode = LIO_CMD_ACCEPT;
+	iocb->aio_reqprio = 0;
+	iocb->u.c.buf = addr;
+	iocb->u.c.nbytes = addrlen;
+	iocb->u.c.offset = 0;
+}
+
+static inline void lio_prep_pread(lio_iocb_t *iocb, int fd, void *buf, long count, long long offset)
+{
+	iocb->aio_fildes = fd;
+	iocb->aio_lio_opcode = LIO_CMD_PREAD;
+	iocb->aio_reqprio = 0;
+	iocb->u.c.buf = buf;
+	iocb->u.c.nbytes = count;
+	iocb->u.c.offset = offset;
+}
+
+static inline void lio_prep_poll(lio_iocb_t *iocb, lio_callback_t *cb, int fd, int events)
+{
+	iocb->data = cb;
+	iocb->aio_fildes = fd;
+	iocb->aio_lio_opcode = LIO_CMD_POLL;
+	iocb->aio_reqprio = 0;
+	iocb->u.poll.events = events;
+}
+
+static inline int lio_poll(int qid, lio_iocb_t *iocb, lio_callback_t *cb, int fd, int events)
+{
+	lio_prep_poll(iocb, cb, fd, events);
+	return lio_submit(qid, 1, &iocb);
+}
+
+#endif
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/linux/mm.h aio-v2.4.9-ac9.diff/include/linux/mm.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/linux/mm.h	Mon Sep 10 20:35:52 2001
+++ aio-v2.4.9-ac9.diff/include/linux/mm.h	Mon Sep 17 13:40:05 2001
@@ -322,8 +322,7 @@
 					smp_mb__before_clear_bit(); \
 					if (!test_and_clear_bit(PG_locked, &(page)->flags)) BUG(); \
 					smp_mb__after_clear_bit(); \
-					if (waitqueue_active(&(page)->wait)) \
-						wake_up(&(page)->wait); \
+					wake_up(&(page)->wait); \
 				} while (0)
 #define PageError(page)		test_bit(PG_error, &(page)->flags)
 #define SetPageError(page)	set_bit(PG_error, &(page)->flags)
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/linux/poll.h aio-v2.4.9-ac9.diff/include/linux/poll.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/linux/poll.h	Mon Sep 10 20:03:45 2001
+++ aio-v2.4.9-ac9.diff/include/linux/poll.h	Mon Sep 17 13:40:05 2001
@@ -7,14 +7,25 @@
 
 #include <linux/wait.h>
 #include <linux/string.h>
+#ifndef __LINUX__MM_H
 #include <linux/mm.h>
+#endif
 #include <asm/uaccess.h>
+#ifndef __LINUX__WORKTODO_H
+#include <linux/worktodo.h>
+#endif
 
 struct poll_table_page;
+struct kiocb;
 
 typedef struct poll_table_struct {
-	int error;
-	struct poll_table_page * table;
+	struct worktodo		wtd;
+	int			error;
+	struct poll_table_page	*table;
+	struct kiocb		*iocb;		/* iocb for async poll */
+	int			events;		/* event mask for async poll */
+	int			wake;
+	long			sync;
 } poll_table;
 
 extern void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p);
@@ -29,7 +40,9 @@
 {
 	pt->error = 0;
 	pt->table = NULL;
+	pt->iocb = NULL;
 }
+
 extern void poll_freewait(poll_table* pt);
 
 
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/linux/sched.h aio-v2.4.9-ac9.diff/include/linux/sched.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/linux/sched.h	Mon Sep 10 20:35:52 2001
+++ aio-v2.4.9-ac9.diff/include/linux/sched.h	Mon Sep 17 13:40:05 2001
@@ -763,6 +763,7 @@
 
 extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
+extern void FASTCALL(add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 
 #define __wait_event(wq, condition) 					\
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/linux/tqueue.h aio-v2.4.9-ac9.diff/include/linux/tqueue.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/linux/tqueue.h	Mon Sep 10 20:03:04 2001
+++ aio-v2.4.9-ac9.diff/include/linux/tqueue.h	Mon Sep 17 13:40:05 2001
@@ -67,6 +67,7 @@
 #define TQ_ACTIVE(q)		(!list_empty(&q))
 
 extern task_queue tq_timer, tq_immediate, tq_disk;
+extern struct tq_struct run_disk_tq;
 
 /*
  * To implement your own list of active bottom halfs, use the following
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/linux/wait.h aio-v2.4.9-ac9.diff/include/linux/wait.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/linux/wait.h	Mon Sep 10 20:35:52 2001
+++ aio-v2.4.9-ac9.diff/include/linux/wait.h	Mon Sep 17 13:40:05 2001
@@ -28,17 +28,20 @@
 #define WAITQUEUE_DEBUG 0
 #endif
 
+typedef struct __wait_queue wait_queue_t;
+typedef void (*wait_queue_func_t)(wait_queue_t *wait);
+
 struct __wait_queue {
 	unsigned int flags;
 #define WQ_FLAG_EXCLUSIVE	0x01
 	struct task_struct * task;
 	struct list_head task_list;
+	wait_queue_func_t func;
 #if WAITQUEUE_DEBUG
 	long __magic;
 	long __waker;
 #endif
 };
-typedef struct __wait_queue wait_queue_t;
 
 /*
  * 'dual' spinlock architecture. Can be switched between spinlock_t and
@@ -137,6 +140,7 @@
 #endif
 
 #define __WAITQUEUE_INITIALIZER(name, tsk) {				\
+	func:		NULL,						\
 	task:		tsk,						\
 	task_list:	{ NULL, NULL },					\
 			 __WAITQUEUE_DEBUG_INIT(name)}
@@ -174,6 +178,22 @@
 #endif
 	q->flags = 0;
 	q->task = p;
+	q->func = NULL;
+#if WAITQUEUE_DEBUG
+	q->__magic = (long)&q->__magic;
+#endif
+}
+
+static inline void init_waitqueue_func_entry(wait_queue_t *q,	/* entry that runs a callback instead of waking a task */
+					wait_queue_func_t func)
+{
+#if WAITQUEUE_DEBUG
+	if (!q || !func)	/* was '!p': no such variable in this function */
+		WQ_BUG();
+#endif
+	q->flags = 0;
+	q->task = NULL;	/* no task: __wake_up_common() calls q->func when it is set */
+	q->func = func;
 #if WAITQUEUE_DEBUG
 	q->__magic = (long)&q->__magic;
 #endif
@@ -230,6 +250,19 @@
 #endif
 	list_del(&old->task_list);
 }
+
+#define add_wait_queue_cond(q, wait, cond, fail) \
+	do {							\
+		unsigned long flags;				\
+		wq_write_lock_irqsave(&(q)->lock, flags);	\
+		(wait)->flags = 0;				\
+		if (cond)					\
+			__add_wait_queue((q), (wait));		\
+		else {						\
+			fail;					\
+		}						\
+		wq_write_unlock_irqrestore(&(q)->lock, flags);	\
+	} while (0)
 
 #endif /* __KERNEL__ */
 
diff -urN /md0/kernels/2.4/v2.4.9-ac9/include/linux/worktodo.h aio-v2.4.9-ac9.diff/include/linux/worktodo.h
--- /md0/kernels/2.4/v2.4.9-ac9/include/linux/worktodo.h	Wed Dec 31 19:00:00 1969
+++ aio-v2.4.9-ac9.diff/include/linux/worktodo.h	Mon Sep 17 13:40:05 2001
@@ -0,0 +1,39 @@
+#ifndef __LINUX__WORKTODO_H
+#define __LINUX__WORKTODO_H
+
+#ifndef _LINUX_WAIT_H
+#include <linux/wait.h>
+#endif
+#ifndef _LINUX_TQUEUE_H
+#include <linux/tqueue.h>
+#endif
+
+struct worktodo {
+	wait_queue_t		wait;
+	struct tq_struct	tq;
+
+	void *data;	/* for use by the wtd_ primitives */
+};
+
+/* FIXME NOTE: factor from kernel/context.c */
+#define wtd_queue(wtd)	schedule_task(&(wtd)->tq)
+
+#define wtd_set_action(wtd, action, wtddata)	\
+	do {					\
+		(wtd)->tq.routine = (action);	\
+		(wtd)->tq.data = (wtddata);	\
+	} while (0)
+
+struct page;
+extern void wtd_wait_page(struct worktodo *wtd, struct page *page);
+extern void wtd_lock_page(struct worktodo *wtd, struct page *page);
+struct buffer_head;
+extern void wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh);
+
+#if 0	/* not implemented yet */
+extern void wtd_down(struct worktodo *wtd, struct semaphore *sem);
+extern void wtd_down_write(struct worktodo *wtd, struct rw_semaphore *sem);
+extern void wtd_down_read(struct worktodo *wtd, struct rw_semaphore *sem);
+#endif
+
+#endif /* __LINUX__WORKTODO_H */
diff -urN /md0/kernels/2.4/v2.4.9-ac9/kernel/context.c aio-v2.4.9-ac9.diff/kernel/context.c
--- /md0/kernels/2.4/v2.4.9-ac9/kernel/context.c	Fri May 25 22:48:10 2001
+++ aio-v2.4.9-ac9.diff/kernel/context.c	Fri Sep  7 00:15:17 2001
@@ -91,12 +91,18 @@
 	 */
 	for (;;) {
 		set_task_state(curtask, TASK_INTERRUPTIBLE);
-		add_wait_queue(&context_task_wq, &wait);
-		if (TQ_ACTIVE(tq_context))
+		add_wait_queue_exclusive_lifo(&context_task_wq, &wait);
+		if (spin_is_locked(&tqueue_lock) || TQ_ACTIVE(tq_context))
 			set_task_state(curtask, TASK_RUNNING);
-		schedule();
+		else
+			schedule();
 		remove_wait_queue(&context_task_wq, &wait);
 		run_task_queue(&tq_context);
+		while (TQ_ACTIVE(tq_context)) {
+			if (current->need_resched)
+				schedule();
+			run_task_queue(&tq_context);
+		}
 		wake_up(&context_task_done);
 		if (signal_pending(curtask)) {
 			while (waitpid(-1, (unsigned int *)0, __WALL|WNOHANG) > 0)
diff -urN /md0/kernels/2.4/v2.4.9-ac9/kernel/fork.c aio-v2.4.9-ac9.diff/kernel/fork.c
--- /md0/kernels/2.4/v2.4.9-ac9/kernel/fork.c	Mon Sep 10 20:35:52 2001
+++ aio-v2.4.9-ac9.diff/kernel/fork.c	Fri Sep  7 00:15:17 2001
@@ -45,6 +45,16 @@
 	wq_write_unlock_irqrestore(&q->lock, flags);
 }
 
+void add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait)	/* queue 'wait' on 'q' as an exclusive waiter */
+{
+	unsigned long flags;
+
+	wq_write_lock_irqsave(&q->lock, flags);
+	wait->flags = WQ_FLAG_EXCLUSIVE;
+	__add_wait_queue(q, wait);	/* presumably head insertion, hence "lifo" - confirm against __add_wait_queue() */
+	wq_write_unlock_irqrestore(&q->lock, flags);
+}
+
 void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)
 {
 	unsigned long flags;
diff -urN /md0/kernels/2.4/v2.4.9-ac9/kernel/sched.c aio-v2.4.9-ac9.diff/kernel/sched.c
--- /md0/kernels/2.4/v2.4.9-ac9/kernel/sched.c	Mon Sep 10 20:35:52 2001
+++ aio-v2.4.9-ac9.diff/kernel/sched.c	Fri Sep  7 00:15:17 2001
@@ -714,13 +714,13 @@
 }
 
 /*
- * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just wake everything
- * up.  If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the
- * non-exclusive tasks and one exclusive task.
+ * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small
+ * +ve number) then we wake all the non-exclusive tasks and one exclusive task.
  *
  * There are circumstances in which we can try to wake a task which has already
- * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns zero
- * in this (rare) case, and we handle it by contonuing to scan the queue.
+ * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
 static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
 			 	     int nr_exclusive, const int sync)
@@ -733,14 +733,25 @@
 	
 	list_for_each(tmp,&q->task_list) {
 		unsigned int state;
-                wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
+		wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
+		wait_queue_func_t func;
 
 		CHECK_MAGIC(curr->__magic);
+		func = curr->func;
+		if (func) {
+			unsigned flags = curr->flags;
+			func(curr);
+			if ((flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+				break;
+			continue;
+		}
 		p = curr->task;
 		state = p->state;
 		if (state & mode) {
 			WQ_NOTE_WAKER(curr);
-			if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+			if (try_to_wake_up(p, sync) &&
+			    (curr->flags & WQ_FLAG_EXCLUSIVE) &&
+			    !--nr_exclusive)
 				break;
 		}
 	}
diff -urN /md0/kernels/2.4/v2.4.9-ac9/kernel/softirq.c aio-v2.4.9-ac9.diff/kernel/softirq.c
--- /md0/kernels/2.4/v2.4.9-ac9/kernel/softirq.c	Mon Sep 10 20:35:52 2001
+++ aio-v2.4.9-ac9.diff/kernel/softirq.c	Fri Sep  7 00:15:17 2001
@@ -354,6 +354,7 @@
 		data = p->data;
 		wmb();
 		p->sync = 0;
+		smp_mb();
 		if (f)
 			f(data);
 	}
diff -urN /md0/kernels/2.4/v2.4.9-ac9/mm/filemap.c aio-v2.4.9-ac9.diff/mm/filemap.c
--- /md0/kernels/2.4/v2.4.9-ac9/mm/filemap.c	Mon Sep 10 20:35:52 2001
+++ aio-v2.4.9-ac9.diff/mm/filemap.c	Fri Sep  7 00:15:17 2001
@@ -22,12 +22,14 @@
 #include <linux/swapctl.h>
 #include <linux/init.h>
 #include <linux/mm.h>
+#include <linux/worktodo.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
 #include <asm/mman.h>
 
 #include <linux/highmem.h>
+#include <linux/iobuf.h>
 
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
@@ -2762,3 +2764,729 @@
 		panic("Failed to allocate page hash table\n");
 	memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
 }
+
+/* address_space_map
+ *	Maps a series of pages from the page cache into the given array.  Pages
+ *	allocated here also go to new_pages[] and are counted in *nr_newp. */
+static int address_space_map(struct address_space *as, unsigned long index,
+		int nr, struct page **pages,
+		int *nr_newp, struct page **new_pages)
+{
+	struct page *cached_page = NULL;
+	int nr_new = 0;
+	int ret;
+
+	ret = -EINVAL;
+	if (nr <= 0)
+		goto out;
+
+	ret = 0;
+	/* From here on ret counts the entries stored in pages[]. */
+	spin_lock(&pagecache_lock);
+
+	while (nr > 0) {
+		struct page **hash = page_hash(as, index);
+		struct page *page;
+
+		page = __find_page_nolock(as, index, *hash);
+		if (page) {
+			page_cache_get(page);
+got_page:
+			pages[ret++] = page;
+			index++;
+			nr--;
+			continue;
+		}
+		/* Miss: insert the page allocated on the previous pass, if any. */
+		if (cached_page) {
+			__add_to_page_cache(cached_page, as, index, hash);
+			nr_new++;
+			*new_pages++ = page = cached_page;
+			cached_page = NULL;
+			goto got_page;
+		}
+		spin_unlock(&pagecache_lock);
+		/* The allocation below runs with pagecache_lock released. */
+		cached_page = page_cache_alloc(as);
+		if (!cached_page)
+			goto out;
+
+		/* Okay, we now have an allocated page.  Retry
+		 * the search and add. */
+		spin_lock(&pagecache_lock);
+	}
+
+	spin_unlock(&pagecache_lock);
+
+out:
+	if (cached_page)
+		page_cache_free(cached_page);
+	/* Pages mapped so far are still returned; -ENOMEM only if there are none. */
+	*nr_newp = nr_new;
+	return ret ? ret : -ENOMEM;
+}
+
+/* State of one generic_file_rw_kiovec() request as it moves through its wtd steps. */
+struct iodesc {
+	struct worktodo	wtd;		/* used with wtd_set_action()/wtd_queue() */
+
+	struct page	*good_page;	/* the highest Uptodate page */
+	int		good_idx;	/* index into pages[]; starts at -1 */
+	int		err;		/* error handed to the kiobuf's errno */
+	int		did_read;	/* readpage() already tried on good_page */
+	int		rw;		/* READ or WRITE */
+
+	struct page	**pages;	/* pages covering the request */
+	struct page	**new_pages;	/* those added by address_space_map() */
+	struct page	**cur_pagep;	/* write: cursor into new_pages[] */
+	struct page	**src_pagep;	/* write: cursor into kiovec[0]->maplist */
+	int		nr_pages;	/* entries in pages[] */
+	int		nr_new_pages;	/* entries in new_pages[] */
+
+	struct address_space *as;
+	struct file	*file;
+	struct kiobuf	*kiovec[8];	/* copied from the caller */
+	int		kio_nr;		/* entries used in kiovec[] */
+
+	size_t		size;		/* bytes still to be transferred */
+	unsigned long	transferred;	/* write: bytes committed so far */
+	unsigned	offset;		/* offset within the page cache page */
+	unsigned	src_offset;	/* write: offset within *src_pagep */
+	struct kiobuf	*iobuf;		/* NOTE(review): not used in the code shown */
+
+	int		sync;		/* write: file has O_SYNC set */
+#define READDESC_NR_DEF	3
+	struct page *def_pages[READDESC_NR_DEF];	/* inline storage used */
+	struct page *def_new_pages[READDESC_NR_DEF];	/* for small requests */
+};
+
+/* Drops the reference held on each page in io->pages[] and frees io. */
+static void __iodesc_free(struct iodesc *io)
+{
+	struct page **pagep = io->pages;
+
+	for (; pagep < io->pages + io->nr_pages; pagep++)
+		page_cache_release(*pagep);
+	if (io->pages != io->def_pages)
+		kfree(io->pages);
+	if (io->new_pages != io->def_new_pages)
+		kfree(io->new_pages);
+	kfree(io);
+}
+
+/* Completes a request.  For writes every page in io->pages[] is unlocked and
+ * deactivated first; the result (io->transferred, io->err) is then reported
+ * through kiovec[0] and its end_io() callback, and the descriptor is freed.
+ */
+static void __iodesc_finish_write(struct iodesc *io)
+{
+	struct kiobuf *first = io->kiovec[0];
+	int idx;
+
+	pr_debug("__iodesc_finish_write(%p)\n", io);
+
+	if (WRITE == io->rw) {
+		/* The page references are dropped later, in __iodesc_free(). */
+		for (idx = 0; idx < io->nr_pages; idx++) {
+			struct page *cur = io->pages[idx];
+
+			UnlockPage(cur);
+			deactivate_page(cur);
+		}
+	}
+
+	/* FIXME: this is buggy */
+	first->transferred = io->transferred;
+	first->errno = io->err;
+	first->end_io(first);
+
+	__iodesc_free(io);
+}
+
+/* Copies at most one page-cache page worth of data from the source kiobuf
+ * pages into @page between prepare_write() and commit_write().  This is
+ * mostly ripped from generic_file_write.  Returns non-zero when the caller
+ * must stop (error left in io->err, short write, or data ended mid-page).
+ */
+static int __iodesc_write_page(struct iodesc *io, struct page *page)
+{
+	unsigned long bytes, offset, src_offset;
+	struct page *src_page;
+	long status;
+	char *kaddr, *src;
+	int src_bytes, done = 0;
+	unsigned left;
+
+	src_bytes = PAGE_CACHE_SIZE - io->src_offset;
+	src_page = *io->src_pagep;
+	src = kmap(src_page) + io->src_offset;
+
+	offset = io->offset;
+	src_offset = io->src_offset;
+	kaddr = kmap(page);
+	kaddr += offset;
+
+	bytes = PAGE_CACHE_SIZE - offset;
+	if (io->size < bytes)
+		bytes = io->size;
+
+	pr_debug("__iodesc_write_page(%p (%lu), %lu %lu %lu)\n", page, page->index, offset, bytes, src_offset);
+
+	io->err = io->as->a_ops->prepare_write(io->file, page,
+						offset, offset + bytes);
+	if (io->err) {
+printk("prepare_write: %d\n", io->err);
+		/* Stop here: continuing would copy this page's data into the
+		 * next page and let a later success overwrite io->err. */
+		done = 1;
+		goto unlock;
+	}
+
+	left = bytes;
+	for (;;) {
+		if (left < src_bytes)
+			src_bytes = left;
+
+		memcpy(kaddr, src, src_bytes);
+		kaddr += src_bytes;
+		src += src_bytes;
+		left -= src_bytes;
+		src_offset += src_bytes;
+		src_offset &= PAGE_SIZE - 1;
+		if (!src_offset)
+			io->src_pagep++;
+
+		if (left <= 0)
+			break;
+
+		if (!src_offset) {
+			kunmap(src_page);
+			src_page = *io->src_pagep;
+			src = kmap(src_page);
+			src_bytes = PAGE_SIZE;
+		}
+	}
+	flush_dcache_page(page);
+	status = io->as->a_ops->commit_write(io->file, page,
+						offset, offset+bytes);
+
+	/* We don't handle short writes */
+	if (status > 0 && status != bytes)
+		done = 1;
+
+	if (!status)
+		status = bytes;
+else
+printk("commit_write: %ld\n", status);
+
+	if (status > 0) {
+		io->transferred += status;
+		io->size -= status;
+		io->offset = (offset + status) & (PAGE_CACHE_SIZE - 1);
+
+		if (io->offset)
+			done = 1;
+
+		io->src_offset += status;
+		io->src_offset &= PAGE_CACHE_SIZE - 1;
+	} else {
+		io->err = status;
+		done = 1;
+	}
+
+unlock:
+	kunmap(page);
+	kunmap(src_page);
+
+	return done;
+}
+
+/* O_SYNC write completion: walks the buffers of pages[good_idx..], waiting
+ * (through wtd, which re-enters this function) on any locked buffer and setting
+ * -EIO for a requested buffer that is not uptodate; then finishes the write. */
+void __iodesc_sync_wait_page(void *data)
+{
+	struct iodesc *io = data;
+
+	do {
+		struct buffer_head *bh, *head = io->pages[io->good_idx]->buffers;
+
+		if (!head)
+			continue;	/* no buffers attached: go on to the next page */
+
+		bh = head;
+		do {
+			if (buffer_locked(bh)) {
+				wtd_wait_on_buffer(&io->wtd, bh);
+				return;
+			}
+			if (buffer_req(bh) && !buffer_uptodate(bh)) {
+				io->err = -EIO;
+				break;
+			}
+		} while ((bh = bh->b_this_page) != head);
+	} while (!io->err && ++io->good_idx < io->nr_pages) ;
+
+	__iodesc_finish_write(io);
+}
+
+/* Work-queue action run once all pages are locked: copies the data into the
+ * page cache, then either finishes or, for O_SYNC, writes the pages out and waits.
+ */
+static void __iodesc_do_write(void *data)
+{
+	struct iodesc *io = data;
+	unsigned i;
+
+	for (i=0; i<io->nr_pages; i++)
+		if (__iodesc_write_page(io, io->pages[i]))
+			break;
+
+	/* i_sem was taken in generic_file_rw_kiovec(); keep it held across
+	 * prepare_write/commit_write, as generic_file_write does. */
+	up(&io->file->f_dentry->d_inode->i_sem);
+
+	if (io->sync) {
+		io->good_idx = 0;
+		for (i=0; i<io->nr_pages; i++) {
+			if (io->pages[i]->buffers)
+				writeout_one_page(io->pages[i]);
+		}
+		wtd_set_action(&io->wtd, __iodesc_sync_wait_page, io);
+		__iodesc_sync_wait_page(io);
+		return;
+	}
+	__iodesc_finish_write(io);
+}
+
+/* Locks, one at a time, each page not listed in new_pages[]; then queues the copy. */
+static void __iodesc_write_lock_next_page(void *data)
+{
+	struct iodesc *io = data;
+	pr_debug("__iodesc_write_next_page(%p)\n", io);
+	while (io->good_idx < io->nr_pages) {
+		struct page *next = io->pages[io->good_idx++];
+		io->good_page = next;
+		if (next == *io->cur_pagep) {
+			io->cur_pagep++;
+			continue;
+		}
+		wtd_lock_page(&io->wtd, next);
+		return;
+	}
+	/* All pages handled: defer the data copy to __iodesc_do_write(). */
+	wtd_set_action(&io->wtd, __iodesc_do_write, io);
+	wtd_queue(&io->wtd);
+}
+
+/* Starts a write: updates the inode, resets the cursors and begins locking pages. */
+static void __generic_file_write_iodesc(struct iodesc *io)
+{
+	struct kiobuf *src = io->kiovec[0];
+	struct inode *inode = io->file->f_dentry->d_inode;
+	time_t now = CURRENT_TIME;
+
+	remove_suid(inode);
+	if (!(inode->i_ctime == now && inode->i_mtime == now)) {
+		inode->i_ctime = inode->i_mtime = now;
+		mark_inode_dirty_sync(inode);
+	}
+	io->sync = !!(io->file->f_flags & O_SYNC);
+	io->good_idx = 0;
+	io->cur_pagep = io->new_pages;
+	io->src_offset = src->offset;
+	io->src_pagep = src->maplist;
+	wtd_set_action(&io->wtd, __iodesc_write_lock_next_page, io);
+	__iodesc_write_lock_next_page(io);
+}
+
+/* Completes a read: copies data from the first good_idx pages of io->pages[]
+ * into the kiobufs, calls end_io() on every kiobuf and frees the descriptor. */
+static void __iodesc_read_finish(struct iodesc *io)
+{
+	char *dst_addr, *src_addr;
+	int src_off, i;
+	size_t size;
+	size_t valid;
+	struct page **src_pagep;
+
+	pr_debug("__iodesc_read_finish: good_idx = %d\n", io->good_idx);
+	if (io->good_idx <= 0)
+		goto no_data;
+
+	size = io->size;
+	src_off = io->offset;
+	src_pagep = io->pages;
+	src_addr = kmap(*src_pagep);
+	/* Bytes available in the first good_idx pages, past the start offset. */
+	valid = (size_t)io->good_idx << PAGE_CACHE_SHIFT;
+	valid -= src_off;
+	pr_debug("size=%d valid=%d src_off=%d\n", size, valid, src_off);
+
+	if (valid < size)
+		size = valid;
+
+	for (i=0; i<io->kio_nr; i++) {
+		struct kiobuf *iobuf = io->kiovec[i];
+		int dst_len = iobuf->length;
+		int dst_off = iobuf->offset;
+		struct page **dst_pagep = iobuf->maplist;
+
+		dst_addr = kmap(*dst_pagep);
+		iobuf->transferred = 0;
+		/* Each chunk is bounded by the source page, the destination page and size. */
+		while (size > 0) {
+			int this = PAGE_CACHE_SIZE - src_off;
+			if ((PAGE_SIZE - dst_off) < this)
+				this = PAGE_SIZE - dst_off;
+			if (size < this)
+				this = size;
+			pr_debug("this=%d src_off=%d dst_off=%d dst_len=%d\n",
+				this, src_off, dst_off, dst_len);
+			memcpy(dst_addr + dst_off, src_addr + src_off, this);
+
+			src_off += this;
+			dst_off += this;
+			dst_len -= this;
+			size -= this;
+			iobuf->transferred += this;
+			pr_debug("read_finish: this=%d transferred=%d\n", this, iobuf->transferred);
+			/* NOTE(review): this is never clamped to dst_len -- confirm intended. */
+			if (dst_len <= 0)
+				break;
+
+			if (size <= 0)
+				break;
+
+			if (dst_off >= PAGE_SIZE) {
+				kunmap(*dst_pagep);
+				dst_pagep++;
+				dst_addr = kmap(*dst_pagep);
+				dst_off = 0;
+			}
+
+			if (src_off >= PAGE_SIZE) { /* FIXME: PAGE_CACHE_SIZE */
+				kunmap(*src_pagep);
+pr_debug("page(%lu)->count = %d\n", (*src_pagep)->index, atomic_read(&(*src_pagep)->count));
+				src_pagep++;
+				src_addr = kmap(*src_pagep);
+				src_off = 0;
+			}
+		}
+		kunmap(*dst_pagep);
+		/* With no data copied errno is io->err; past kiovec[0] an error becomes -EAGAIN. */
+		iobuf->errno = iobuf->transferred ? 0 : io->err;
+		if (iobuf->errno && i)
+			iobuf->errno = -EAGAIN;
+		iobuf->end_io(iobuf);
+	}
+
+	kunmap(*src_pagep);
+	__iodesc_free(io);
+	return;
+
+no_data:
+	io->kiovec[0]->errno = io->err;
+	io->kiovec[0]->transferred = 0;
+	io->kiovec[0]->end_io(io->kiovec[0]);
+
+	for (i=1; i<io->kio_nr; i++) {
+		struct kiobuf *iobuf = io->kiovec[i];
+
+		iobuf->errno = -EAGAIN;
+		iobuf->transferred = 0;
+		iobuf->end_io(iobuf);
+	}
+	__iodesc_free(io);
+}
+
+/* wtd action for reads, entered with good_page locked: skips uptodate pages, tries
+ * readpage() once on a page that is not, and finishes after the last page or on error. */
+static void __iodesc_make_uptodate(void *data)
+{
+	struct iodesc *io = data;
+	struct page *page = io->good_page;
+	int locked = 1;
+
+	pr_debug("__iodesc_make_uptodate: io=%p index=%lu\n", io, page->index);
+	while (Page_Uptodate(page)) {
+again:
+		pr_debug("page index %lu uptodate\n", page->index);
+		if (locked) {
+			UnlockPage(page);
+			locked = 0;
+		}
+		io->did_read = 0;
+		io->good_idx++;
+		if (io->good_idx >= io->nr_pages) {
+			__iodesc_read_finish(io);
+			return;
+		}
+		page = io->good_page = io->pages[io->good_idx];
+		pr_debug("__iodesc_make_uptodate: index=%lu\n", page->index);
+	}
+	/* page is not uptodate and we no longer hold a lock on it: lock it first. */
+	if (!locked) {
+		wtd_lock_page(&io->wtd, page);
+		return;
+	}
+	if (!io->did_read) {
+		/* We haven't tried reading this page before, give it a go. */
+		printk("attempting to read %lu\n", page->index);
+		io->did_read = 1;
+		io->err = page->mapping->a_ops->readpage(io->file, page);
+		if (!io->err) {
+			if (Page_Uptodate(page))
+				goto again;
+			wtd_lock_page(&io->wtd, page);
+			return;
+		}
+	}
+	/* Read failed, or the page was read before and is still not uptodate. */
+	if (locked)
+		UnlockPage(page);
+	/* We've already read this page before.  Set err to EIO and quit */
+	if (!io->err)
+		io->err = -EIO;
+	__iodesc_read_finish(io);
+}
+
+static void __wtdgeneric_file_read_iodesc(void *data);
+
+/* Starts the reads for io.  With mayblock == 0 the work is requeued through
+ * wtd as soon as a readpage() would be needed; otherwise readpage() is issued
+ * and the first non-uptodate page is handed to __iodesc_make_uptodate(). */
+static void __generic_file_read_iodesc(struct iodesc *io, int mayblock)
+{
+	int (*readpage)(struct file *, struct page *);
+	int i;
+
+	wtd_set_action(&io->wtd, __iodesc_make_uptodate, io);
+	readpage = io->as->a_ops->readpage;
+	for (i=0; i<io->nr_new_pages; i++) {
+		int foo;
+		if (!mayblock)
+			goto do_wtd;
+		foo = readpage(io->file, io->new_pages[i]);
+		if (foo)
+			printk(KERN_DEBUG "__generic_file_read_kiovec: readpage(%lu) = %d\n", io->new_pages[i]->index, foo);
+	}
+	/* Issue readpage() on every other page that is not uptodate and can be locked now. */
+	for (i=0; i<io->nr_pages; i++) {
+		struct page *page = io->pages[i];
+		if (Page_Uptodate(page)) {
+			pr_debug("__generic_file_read_iodesc: %lu is uptodate\n", page->index);
+			continue;
+		}
+		if (!mayblock)
+			goto do_wtd;
+		if (!TryLockPage(page)) {
+			int foo = readpage(io->file, page);
+			if (foo)
+				printk(KERN_DEBUG "__generic_file_read_iodesc: readpage(%lu): %d\n", page->index, foo);
+		}
+		if (!Page_Uptodate(page) && io->good_idx == -1) {
+			pr_debug("first good_idx=%d (%lu)\n", i, page->index);
+			io->good_idx = i;
+			io->good_page = page;
+		}
+	}
+
+	/* Whee, all the pages are uptodate! */
+	if (!io->good_page) {
+	do {static int zoo; if (!mayblock && zoo++ < 5) printk("all uptodate\n");} while(0);
+		pr_debug("all pages uptodate!\n");
+		io->good_idx = io->nr_pages;
+		__iodesc_read_finish(io);
+		return;
+	}
+	pr_debug("locking good_page\n");
+	wtd_lock_page(&io->wtd, io->good_page);
+	return;
+
+do_wtd:
+	do {static int zoo; if (zoo++ < 5) printk("read sleep\n");} while(0);
+	wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io);
+	wtd_queue(&io->wtd);
+}
+
+/* Work-queue entry point: reruns the read with blocking allowed. */
+static void __wtdgeneric_file_read_iodesc(void *data)
+{
+	__generic_file_read_iodesc((struct iodesc *)data, 1);
+}
+
+/* generic_file_rw_kiovec
+ *	Starts a page cache READ or WRITE of size bytes at pos for up to 8 kiobufs;
+ *	completion is signalled through end_io().  Returns 0 or -errno; flags unused.
+ */
+int generic_file_rw_kiovec(struct file *file, int rw,
+	int kio_nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct address_space *as = inode->i_mapping;
+	unsigned long index, eindex, nr_pages;
+	struct iodesc *io = NULL;
+	int ret;
+
+	ret = -EINVAL;
+	if (rw != READ && rw != WRITE)
+		goto out;
+
+	ret = -ENOMEM;
+	io = kmalloc(sizeof(*io), GFP_KERNEL);
+	if (!io)
+		goto out;
+
+	memset(io, 0, sizeof(*io));
+	io->size = size;
+
+	if (READ == rw) {
+		pr_debug("pos=%Ld i_size=%Ld\n", pos, inode->i_size);
+		if (pos > inode->i_size)
+			size = 0;
+		else if ((pos + size) > inode->i_size)
+			size = inode->i_size - pos;
+
+		if (io->size < size)
+			size = io->size;
+		else if (size < io->size)
+			io->size = size;
+		pr_debug("io->size=%d size=%d\n", io->size, size);
+	}
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	eindex = (pos + size - 1) >> PAGE_CACHE_SHIFT;
+	nr_pages = eindex - index + 1;
+	pr_debug("nr_pages: %lu\n", nr_pages);
+
+	io->good_idx = -1;
+	io->good_page = NULL;
+	io->did_read = 0;
+	io->err = 0;
+	io->rw = rw;
+	io->as = as;
+	io->offset = (unsigned long)pos & (PAGE_CACHE_SIZE - 1);
+	io->file = file;
+	io->kio_nr = kio_nr;
+	if (kio_nr > 8)
+		BUG();
+	memcpy(io->kiovec, kiovec, sizeof(struct kiobuf *) * kio_nr);
+	if (nr_pages < READDESC_NR_DEF) {
+		io->pages = io->def_pages;
+		io->new_pages = io->def_new_pages;
+	} else {
+		io->pages = kmalloc(sizeof(*io->pages) * (nr_pages + 1), GFP_KERNEL);
+		if (!io->pages)
+			goto out_io;
+		io->new_pages = kmalloc(sizeof(*io->new_pages) * (nr_pages + 1), GFP_KERNEL);
+		if (!io->new_pages)
+			goto out_pages;
+	}
+
+	/* FIXME: make the down a WTD_op */
+	if (rw == WRITE)
+		down(&io->file->f_dentry->d_inode->i_sem);
+
+	ret = address_space_map(as, index, nr_pages, io->pages,
+			&io->nr_new_pages, io->new_pages);
+	pr_debug("as_map: %d (%d new)\n", ret, io->nr_new_pages);
+	if (ret <= 0)
+		goto out_new_pages;
+
+	io->nr_pages = ret;
+	io->pages[io->nr_pages] = NULL;
+	io->new_pages[io->nr_new_pages] = NULL;
+
+	if (rw == READ)
+		__generic_file_read_iodesc(io, 0);
+	else if (rw == WRITE)
+		__generic_file_write_iodesc(io);
+	return 0;
+
+out_new_pages:
+	/* __iodesc_do_write() will never run to drop i_sem, so do it here. */
+	if (rw == WRITE)
+		up(&io->file->f_dentry->d_inode->i_sem);
+	if (io->new_pages != io->def_new_pages)
+		kfree(io->new_pages);
+out_pages:
+	if (io->pages != io->def_pages)
+		kfree(io->pages);
+out_io:
+	kfree(io);
+out:
+	return ret;
+}
+
+/* Wait-queue callback: retries the page lock and queues the work once it is held. */
+static void __wtd_lock_page_waiter(wait_queue_t *wait)
+{
+	struct worktodo *wtd = (struct worktodo *)wait;
+	struct page *page = (struct page *)wtd->data;
+	if (TryLockPage(page)) {
+		schedule_task(&run_disk_tq);
+		return;
+	}
+	__remove_wait_queue(&page->wait, &wtd->wait);
+	wtd_queue(wtd);
+}
+
+/* Locks page for wtd: runs the wtd action at once when the lock is obtained here
+ * (NOTE(review): raced presumably means the re-check won), else waits on page->wait. */
+void wtd_lock_page(struct worktodo *wtd, struct page *page)
+{
+	if (TryLockPage(page)) {
+		int raced = 0;
+		wtd->data = page;
+		init_waitqueue_func_entry(&wtd->wait, __wtd_lock_page_waiter);
+		add_wait_queue_cond(&page->wait, &wtd->wait, TryLockPage(page), raced = 1);
+		if (!raced) {
+			run_task_queue(&tq_disk);
+			return;
+		}
+	}
+	wtd->tq.routine(wtd->tq.data);
+}
+
+/* Wait-queue callback: once bh is unlocked, leaves its queue and queues the work. */
+static void __wtd_bh_waiter(wait_queue_t *wait)
+{
+	struct worktodo *wtd = (struct worktodo *)wait;
+	struct buffer_head *bh = (struct buffer_head *)wtd->data;
+	if (buffer_locked(bh)) {
+		schedule_task(&run_disk_tq);
+		return;
+	}
+	__remove_wait_queue(&bh->b_wait, &wtd->wait);
+	wtd_queue(wtd);
+}
+
+/* Runs the wtd action once bh is unlocked, waiting on bh->b_wait if necessary. */
+void wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh)
+{
+	int raced = 0;
+	if (!buffer_locked(bh)) {
+		wtd->tq.routine(wtd->tq.data);
+		return;
+	}
+	wtd->data = bh;
+	init_waitqueue_func_entry(&wtd->wait, __wtd_bh_waiter);
+	add_wait_queue_cond(&bh->b_wait, &wtd->wait, buffer_locked(bh), raced = 1);
+	/* NOTE(review): raced presumably means bh was unlocked before queueing. */
+	if (raced)
+		wtd->tq.routine(wtd->tq.data);
+	else
+		run_task_queue(&tq_disk);
+}
+
+void do_run_tq_disk(void *data)	/* task routine; data is unused */
+{
+	run_task_queue(&tq_disk);
+}
+/* Task the wtd waiters schedule to get tq_disk run while they keep waiting. */
+struct tq_struct run_disk_tq = {
+	routine: do_run_tq_disk,
+	data: NULL
+};
+