diff -urN /md0/kernels/2.4/v2.4.4-ac10/Makefile ac10-aio/Makefile --- /md0/kernels/2.4/v2.4.4-ac10/Makefile Thu May 17 15:25:02 2001 +++ ac10-aio/Makefile Thu May 24 17:53:00 2001 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 4 -EXTRAVERSION = -ac10 +EXTRAVERSION = -ac10-aio1 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -urN /md0/kernels/2.4/v2.4.4-ac10/arch/i386/kernel/entry.S ac10-aio/arch/i386/kernel/entry.S --- /md0/kernels/2.4/v2.4.4-ac10/arch/i386/kernel/entry.S Wed Nov 8 20:09:50 2000 +++ ac10-aio/arch/i386/kernel/entry.S Thu May 24 17:53:22 2001 @@ -646,6 +646,11 @@ .long SYMBOL_NAME(sys_getdents64) /* 220 */ .long SYMBOL_NAME(sys_fcntl64) .long SYMBOL_NAME(sys_ni_syscall) /* reserved for TUX */ + .long SYMBOL_NAME(sys_ni_syscall) /* 223 */ + .long SYMBOL_NAME(sys___io_cancel) + .long SYMBOL_NAME(sys___io_wait) + .long SYMBOL_NAME(sys___io_getevents) + .long SYMBOL_NAME(sys_submit_ios) /* * NOTE!! This doesn't have to be exact - we just have diff -urN /md0/kernels/2.4/v2.4.4-ac10/drivers/char/mem.c ac10-aio/drivers/char/mem.c --- /md0/kernels/2.4/v2.4.4-ac10/drivers/char/mem.c Thu May 17 15:25:04 2001 +++ ac10-aio/drivers/char/mem.c Thu May 24 17:53:13 2001 @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -571,6 +572,9 @@ case 9: filp->f_op = &urandom_fops; break; + case 10: + filp->f_op = &aio_fops; + break; default: return -ENXIO; } @@ -595,7 +599,8 @@ {5, "zero", S_IRUGO | S_IWUGO, &zero_fops}, {7, "full", S_IRUGO | S_IWUGO, &full_fops}, {8, "random", S_IRUGO | S_IWUSR, &random_fops}, - {9, "urandom", S_IRUGO | S_IWUSR, &urandom_fops} + {9, "urandom", S_IRUGO | S_IWUSR, &urandom_fops}, + {10,"aio", S_IRUGO | S_IWUSR, &aio_fops}, }; int i; diff -urN /md0/kernels/2.4/v2.4.4-ac10/drivers/char/raw.c ac10-aio/drivers/char/raw.c --- /md0/kernels/2.4/v2.4.4-ac10/drivers/char/raw.c Thu May 3 11:22:10 2001 +++ ac10-aio/drivers/char/raw.c Thu May 24 17:53:14 2001 @@ -16,6 +16,8 @@ #include #include #include +#include +#include #define dprintk(x...) @@ -36,13 +38,14 @@ int raw_open(struct inode *, struct file *); int raw_release(struct inode *, struct file *); int raw_ctl_ioctl(struct inode *, struct file *, unsigned int, unsigned long); - +int raw_rw_kiovec(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos); static struct file_operations raw_fops = { read: raw_read, write: raw_write, open: raw_open, release: raw_release, + rw_kiovec: raw_rw_kiovec, }; static struct file_operations raw_ctl_fops = { @@ -130,7 +133,8 @@ * the blocksize on a device which is already mounted. 
*/ - sector_size = 512; + //sector_size = 512; + sector_size = 2048; if (get_super(rdev) != NULL) { if (blksize_size[MAJOR(rdev)]) sector_size = blksize_size[MAJOR(rdev)][MINOR(rdev)]; @@ -259,7 +263,6 @@ } - ssize_t raw_read(struct file *filp, char * buf, size_t size, loff_t *offp) { @@ -360,7 +363,7 @@ for (i=0; i < blocks; i++) iobuf->blocks[i] = blocknr++; - err = brw_kiovec(rw, 1, &iobuf, dev, iobuf->blocks, sector_size); + err = brw_kiovec(rw, 1, &iobuf, dev, blocks, iobuf->blocks, sector_size); if (rw == READ && err > 0) mark_dirty_kiobuf(iobuf, err); @@ -390,3 +393,92 @@ out: return err; } + +int raw_rw_kiovec(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos) +{ + int err; + unsigned long blocknr, blocks; + unsigned long __b[KIO_MAX_SECTORS]; + unsigned long *b = __b; + int i; + int minor; + kdev_t dev; + unsigned long limit; + + int sector_size, sector_bits, sector_mask; + int max_sectors; + +#if 0 /* FIXME: this is wrong. */ + err = 0; + if (!size) + goto out_complete; +#endif + + pr_debug("raw_rw_kiovec: %p %d %d %p %d %d %Lu\n", filp, rw, nr, kiovec, flags, size, pos); + /* + * First, a few checks on device size limits + */ + + minor = MINOR(filp->f_dentry->d_inode->i_rdev); + dev = to_kdev_t(raw_devices[minor].binding->bd_dev); + sector_size = raw_devices[minor].sector_size; + sector_bits = raw_devices[minor].sector_bits; + sector_mask = sector_size- 1; + max_sectors = 25000; //KIO_MAX_SECTORS >> (sector_bits - 9); + + if (blk_size[MAJOR(dev)]) + limit = (((loff_t) blk_size[MAJOR(dev)][MINOR(dev)]) << BLOCK_SIZE_BITS) >> sector_bits; + else + limit = INT_MAX; + dprintk ("rw_raw_dev_async: dev %d:%d (+%d)\n", + MAJOR(dev), MINOR(dev), limit); + + err = -EINVAL; + if ((pos < 0) || (pos & sector_mask) || (size & sector_mask)) { + printk("pos/size wrong\n"); + goto out; + } + + err = -ENXIO; + if ((pos >> sector_bits) >= limit) { + printk("raw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits); + goto out; + } + + /* + * Split the IO into KIO_MAX_SECTORS chunks, mapping and + * unmapping the single kiobuf as we go to perform each chunk of + * IO. 
+ */ + + blocknr = pos >> sector_bits; + blocks = size >> sector_bits; + if (blocks > max_sectors) + blocks = max_sectors; + if (blocks > limit - blocknr) + blocks = limit - blocknr; + err = -ENXIO; + pr_debug("raw: !blocks %d %ld %ld\n", max_sectors, limit, blocknr); + if (!blocks) + goto out; + + if (blocks > KIO_MAX_SECTORS) { + err = -ENOMEM; + b = kmalloc(sizeof(*b) * blocks, GFP_KERNEL); + if (!b) + goto out; + } + + for (i=0; i < blocks; i++) + b[i] = blocknr++; + + err = brw_kiovec_async(rw, nr, kiovec, dev, blocks, b, sector_size); + pr_debug("brw_kiovec_async: %d\n", err); + + if (b != __b) + kfree(b); +out: + pr_debug("brw_kiovec_async: ret is %d\n", err); + return err; +} + diff -urN /md0/kernels/2.4/v2.4.4-ac10/fs/Makefile ac10-aio/fs/Makefile --- /md0/kernels/2.4/v2.4.4-ac10/fs/Makefile Thu May 17 15:25:10 2001 +++ ac10-aio/fs/Makefile Thu May 24 17:53:00 2001 @@ -12,7 +12,7 @@ obj-y := open.o read_write.o devices.o file_table.o buffer.o \ super.o block_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ - ioctl.o readdir.o select.o fifo.o locks.o \ + ioctl.o readdir.o select.o fifo.o locks.o aio.o \ dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \ filesystems.o diff -urN /md0/kernels/2.4/v2.4.4-ac10/fs/aio.c ac10-aio/fs/aio.c --- /md0/kernels/2.4/v2.4.4-ac10/fs/aio.c Wed Dec 31 19:00:00 1969 +++ ac10-aio/fs/aio.c Thu May 24 17:53:02 2001 @@ -0,0 +1,894 @@ +/* drivers/char/aio.c + * Copyright 2000 Red Hat, Inc. All Rights Reserved. + * + * An async IO implementation for Linux + * Written by Benjamin LaHaise + * + * Implements /dev/aio, something on top of which it should be possible + * to write a POSIX AIO library. + * + * Notes on interface: + * - aiocbs are submitted by doing a submit_ios syscall + * on an array of aiocbs to the /dev/aio fd + * - on completion, the aiocb, events are placed in + * a ringbuffer + * - the contents of the ring buffer can be read via the + * __io_getevents syscall. + * - each open(/dev/aio) instance provides a unique aio + * control space + */ +//#define DEBUG 1 + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#undef KERN_DEBUG +#define KERN_DEBUG "" + +static spinlock_t aio_read_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t aio_req_lock = SPIN_LOCK_UNLOCKED; + +static kmem_cache_t *kiocb_cachep; +static kmem_cache_t *kiogrp_cachep; +static kmem_cache_t *kioctx_cachep; + +/* aio_setup + * Creates the slab caches used by the aio routines, panic on + * failure as this is done early during the boot sequence. + */ +static int __init aio_setup(void) +{ + kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kiocb_cachep) + panic("unable to create kiocb cache\n"); + + kiogrp_cachep = kmem_cache_create("kiogrp", sizeof(struct kiogrp), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kiogrp_cachep) + panic("unable to create kiogrp cache\n"); + + kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kioctx_cachep) + panic("unable to create kioctx cache"); + + printk(KERN_NOTICE "aio_setup: okay!\n"); + printk(KERN_NOTICE "aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); + + return 0; +} + +/* ioctx_alloc + * Allocates and initializes an aioctx. Returns an ERR_PTR if it failed. 
+ */ +static struct kioctx *ioctx_alloc(void) +{ + struct kioctx *ctx; + + ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL); + if (ctx) { + memset(ctx, 0, sizeof(*ctx)); + atomic_set(&ctx->users, 1); + spin_lock_init(&ctx->done_lock); + init_waitqueue_head(&ctx->wait); + + ctx->max_reqs = AIO_RING_SIZE; + ctx->reqs = kmalloc(sizeof(struct iocb *) * ctx->max_reqs, GFP_KERNEL); + if (ctx->reqs) { + memset(ctx->reqs, 0, sizeof(struct iocb *) * ctx->max_reqs); + ctx->ring = kmalloc(sizeof(*ctx->ring), GFP_KERNEL); + if (ctx->ring) { + memset(ctx->ring, 0, sizeof(*ctx->ring)); + printk("aio: allocated aioctx %p\n", ctx); + return ctx; + } + kfree(ctx->reqs); + ctx->reqs = NULL; + } + kmem_cache_free(kioctx_cachep, ctx); + ctx = ERR_PTR(-ENOMEM); + } + + printk("aio: error allocating aioctx %p\n", ctx); + return ctx; +} + +struct kiogrp *kiogrp_alloc(struct kioctx *ctx) +{ + struct kiogrp *iogrp; + + iogrp = kmem_cache_alloc(kiogrp_cachep, GFP_KERNEL); + if (iogrp) { + memset(iogrp, 0, sizeof(*iogrp)); + aioctx_get(ctx); + iogrp->ctx = ctx; + iogrp->idx = -1; + } + return iogrp; +} + +void kiocb_free(struct kiocb *iocb) +{ + int i; + + for (i=0; i<iocb->nr_kiovec; i++) + unmap_kiobuf(iocb->kiovec[i]); + + free_kiovec(iocb->nr_kiovec, iocb->kiovec); + iocb->nr_kiovec = 0; + fput(iocb->filp); + iocb->filp = NULL; + kmem_cache_free(kiocb_cachep, iocb); +} + +void kiogrp_free(struct kiogrp *iogrp) +{ + struct kioctx *ctx = iogrp->ctx; + int i; + pr_debug("kio_free: %p/%d\n", iogrp, iogrp->idx); + + if ((i=atomic_read(&iogrp->count))) { + printk("kiogrp_free: %d/%p/%d still active!!!\n", i, iogrp, iogrp->idx); + return; + } + + if ((iogrp->idx >= 0) && (iogrp->idx < ctx->max_reqs)) + ctx->reqs[iogrp->idx] = NULL; + + for (i=0; i<iogrp->nr_iocbs; i++) { + kiocb_free(iogrp->iocbs[i]); + } + kmem_cache_free(kiogrp_cachep, iogrp); + aioctx_put(ctx); +} + +/* iogrp_putio + * Called when the io count on iogrp is decremented. Checks + * to see if the kiogrp the request belongs to has finished, + * and if so sends the completion notice to its context. + */ +static void iogrp_putio(struct kiogrp *iogrp) +{ + struct kioctx *ctx = iogrp->ctx; + struct aio_ring *ring = ctx->ring; + unsigned long flags; + unsigned long tail; + + /* Is this the last io to complete in the group? */ + if (!atomic_dec_and_test(&iogrp->count)) { + if (atomic_read(&iogrp->count) < 0) + BUG(); + return; + } + + /* Yes we are, go ahead with completion */ + aioctx_get(ctx); + + /* add a completion event to the ring buffer. + * must be done holding done_lock to prevent + * other code from messing with the tail + * pointer since we might be called from irq + * context. + */ + spin_lock_irqsave(&ctx->done_lock, flags); + + tail = (ring->tail + 1) % AIO_RING_SIZE; + + ring->io_events[tail].data = iogrp->user_data; + ring->io_events[tail].key = iogrp->idx; + ring->io_events[tail].type = IO_EVENT_IOCB_DONE; + + /* after flagging the request as done, we + * must never even look at it again + */ + barrier(); + + ring->tail = tail; + + wmb(); + if (!ring->woke) + ring->woke = 1; + + spin_unlock_irqrestore(&ctx->done_lock, flags); + + pr_debug("added to ring %p at [%lu]\n", iogrp, tail); +#if 0 + if (!wake) { + printk("kio_complete: should send user of %p a signal...\n", ctx); + } +#endif + + wake_up(&ctx->wait); + + aioctx_put(ctx); +} + +/* aio_kiobuf_endio + * Called when io on a given kiobuf is complete. 
+ */ +static void aio_kiobuf_endio(struct kiobuf *iobuf) +{ + struct kiogrp *iogrp = iobuf->end_io_data; + + /* TODO: possibly put the return code into the iocb + * here. This only really makes sense if it's being + * put into the user's iocb, which would mean pinning + * it down in memory. Maybe. + */ + pr_debug("aio_kiobuf_endio: %p %p/%d\n", iobuf, iogrp, iogrp->idx); + iogrp_putio(iogrp); +} + +/* kio_submit: + * Submits an actual aiocb + */ +static inline int kio_submit(struct kiogrp *iogrp, struct kiocb *iocb, + struct iocb *aiocb) +{ + int (*rw_kiovec)(struct file *, int, int, struct kiobuf **, int, size_t, loff_t); + int ret = -ENOSYS; + int rw; + + switch(aiocb->aio_lio_opcode) { + case IOCB_CMD_WRITE: + rw = WRITE; + break; + case IOCB_CMD_READ: + rw = READ; + break; + default: + printk("kio_submit: lio_opcode = %d\n", aiocb->aio_lio_opcode); + goto out; + } + + rw_kiovec = iocb->filp->f_op->rw_kiovec; + if (rw_kiovec) + ret = rw_kiovec(iocb->filp, rw, iocb->nr_kiovec, iocb->kiovec, /*flags*/ 0, aiocb->aio_nbytes, aiocb->aio_offset); + else { + iocb->kiovec[0]->transferred = 0; + iocb->kiovec[0]->errno = -ENOSYS; + aio_kiobuf_endio(iocb->kiovec[0]); + ret = 0; + } + +out: + if (ret) { + static int count; + if (count < 10) { + count++; + printk("kio_submit: failed!\n"); + } + atomic_dec(&iogrp->count); + if (atomic_read(&iogrp->count) < 0) + BUG(); + } + + return ret; +} + +/*----------------- /dev/aio interface ----------------------- */ +static inline struct kiocb *aio_convert_user_aiocb(struct kiogrp *iogrp, + struct iocb *uaiocb, struct iocb *user_aiocb) +{ + struct kiocb *iocb; + int rw = WRITE; + int ret = -ENOMEM; + int i; + + iocb = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); + if (!iocb) + goto out; + + atomic_inc(&iogrp->count); /* FIXME: should be according to number of iobufs in this iocb */ + + memset(iocb, 0, sizeof(*iocb)); + + iocb->user_aiocb = user_aiocb; + iocb->filp = fget(uaiocb->aio_fildes); + ret = -EBADF; + if (!iocb->filp) + goto out_err; + + iocb->nr_kiovec = 1; + ret = alloc_kiovec(1, iocb->kiovec); + if (ret) + goto out_err; + + for (i=0; i < iocb->nr_kiovec; i++) { + iocb->kiovec[i]->end_io = aio_kiobuf_endio; + iocb->kiovec[i]->end_io_data = iogrp; + } + + switch (uaiocb->aio_lio_opcode) { + case IOCB_CMD_READ: rw = READ; + case IOCB_CMD_WRITE: + pr_debug("aio: map_user_kiobuf(%d, %p, %lu, %lu) = ", + rw, iocb->kiovec[0], (unsigned long)uaiocb->aio_buf, + (unsigned long)uaiocb->aio_nbytes); + ret = map_user_kiobuf(rw, iocb->kiovec[0], + (unsigned long)uaiocb->aio_buf, + uaiocb->aio_nbytes); + pr_debug("%d\n", ret); + if (ret) + goto out_kiobuf_err; + break; + default: + ret = -EINVAL; + printk("aio_convert_user_aiocb: lio_opcode = %d\n", uaiocb->aio_lio_opcode); + goto out_kiobuf_err; + } + + pr_debug("kio_convert_user_aiocb: (%p, %p) / %p\n", iogrp, uaiocb, iocb); + + return iocb; + +out_kiobuf_err: +out_err: + kiocb_free(iocb); +out: + return ERR_PTR(ret); +} + +/* aio_open + * Open method for /dev/aio. Allocates an aioctx for this open()er + * and places it in the file's private_data field. Can fail because + * of memory allocation failure. + */ +int aio_open(struct inode *inode, struct file *filp) +{ + struct kioctx *ctx = ioctx_alloc(); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + filp->private_data = ctx; + ctx->filp = filp; + return 0; +} + +/* aio_release + * Free the aioctx associated with the file. FIXME! 
+ */ +int aio_release(struct inode *inode, struct file *filp) +{ + struct kioctx *ioctx = filp->private_data; + printk("aio_release(%p)\n", filp->private_data); + aioctx_put(ioctx); + filp->private_data = NULL; + return 0; +} + +/* kiocb_get + * + */ +static inline struct kiogrp *kiogrp_get(struct kioctx *ctx, int idx, void *key) +{ + struct kiogrp *iogrp; + + spin_lock(&aio_req_lock); + iogrp = ctx->reqs[idx]; + if (iogrp && iogrp->user_data == key) { + if (!iogrp->locked) + iogrp->locked = 1; + else + iogrp = ERR_PTR(-EBUSY); + } else + iogrp = ERR_PTR(-ENOENT); + spin_unlock(&aio_req_lock); + return iogrp; +} + +/* aio_complete + * Checks if the kiogrp in ctx at idx is finished. If so, copies the + * completion codes into userspace, and then releases the kiogrp. + */ +static int aio_complete(struct kioctx *ctx, int idx, void *key, int please_wait) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + struct kiogrp *iogrp; + int ret = -EINVAL; + unsigned i; + + pr_debug("aio_complete: %p %d %p %d\n", ctx, idx, key, please_wait); + if (idx < 0 || idx >= ctx->max_reqs) { + printk("aio_complete: idx(%d) is invalid\n", idx); + goto out; + } + + ret = -EBUSY; + + if (please_wait) { + add_wait_queue(&ctx->wait, &wait); + + do { + set_task_state(tsk, TASK_INTERRUPTIBLE); + iogrp = kiogrp_get(ctx, idx, key); + if (iogrp == ERR_PTR(-EBUSY)) { + schedule(); + + /* interrupted due to a signal? */ + iogrp = ERR_PTR(-EINTR); + if (signal_pending(tsk)) + break; + iogrp = kiogrp_get(ctx, idx, key); + } + } while (iogrp == ERR_PTR(-EBUSY)); + + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + } else + iogrp = kiogrp_get(ctx, idx, key); + + ret = PTR_ERR(iogrp); + if (IS_ERR(iogrp)) { + printk("aio_complete: ERR: %d [%d, %p] from %p\n", ret, idx, key, __builtin_return_address(0)); + goto out; + } + + pr_debug("aio_complete: [%d] = %p\n", idx, iogrp); + + ret = -EFAULT; + for (i=0; i<iogrp->nr_iocbs; i++) { + struct kiocb *iocb = iogrp->iocbs[i]; + + /* FIXME: decide kiovec vs iocb interaction, this is a KLUDGE */ + iocb->aio_return = iocb->kiovec[0]->transferred ? + iocb->kiovec[0]->transferred : + iocb->kiovec[0]->errno; + + if (put_user(iocb->aio_return, &iocb->user_aiocb->__aio_return)) + goto out_undo; + if (put_user(-1, &iocb->user_aiocb->__aio_key)) + goto out_undo; + } + + /* everything turned out well, dispose of the aiocb. */ + kiogrp_free(iogrp); + + return 0; + +out_undo: +printk("out_undo\n"); + /* unlock and wakeup so anyone else waiting can attempt this iocb */ + iogrp->locked = 0; + wake_up(&ctx->wait); + +out: + return ret; +} + +/* aio_read_evt + * Pull an event off of the aioctx's event ring. + * FIXME: make this use cmpxchg. + * TODO: make the ringbuffer user mmap()able (requires FIXME). 
+ */ +static int aio_read_evt(struct aio_ring *ring, struct io_event *ent) +{ + unsigned long head; + int ret = -EAGAIN; + + pr_debug("in aio_read_evt h%lu t%lu\n", ring->head, ring->tail); + barrier(); + if (ring->head == ring->tail) + goto out; + + spin_lock(&aio_read_lock); /* investigate the value of making this per-ctx */ + + head = ring->head; + if (head != ring->tail) { + head = (head + 1) % AIO_RING_SIZE; + *ent = ring->io_events[head]; + barrier(); + ring->head = head; + ret = 0; + } + spin_unlock(&aio_read_lock); + +out: + pr_debug("leaving aio_read_evt: %d h%lu t%lu\n", ret, ring->head, ring->tail); + return ret; +} + +struct timeout { + struct timer_list timer; + int timed_out; + wait_queue_head_t wait; +}; + +static void timeout_func(unsigned long data) +{ + struct timeout *to = (struct timeout *)data; + + to->timed_out = 1; + wake_up(&to->wait); +} + +static inline void init_timeout(struct timeout *to) +{ + init_timer(&to->timer); + to->timer.data = (unsigned long)to; + to->timer.function = timeout_func; + to->timed_out = 0; + init_waitqueue_head(&to->wait); +} + +static inline void set_timeout(struct timeout *to, struct timespec *ts) +{ + unsigned long how_long; + + if (!ts->tv_sec && !ts->tv_nsec) { + to->timed_out = 1; + return; + } + + how_long = ts->tv_sec * HZ; +#define HZ_NS (1000000000 / HZ) + how_long += (ts->tv_nsec + HZ_NS - 1) / HZ_NS; + + to->timer.expires = jiffies + how_long; + add_timer(&to->timer); +} + +static inline void clear_timeout(struct timeout *to) +{ + del_timer_sync(&to->timer); +} + +static int read_events(struct kioctx *ctx, struct io_event *event, int max_nr, + struct timespec *timeout) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + DECLARE_WAITQUEUE(to_wait, tsk); + int ret = -EINVAL; + int nr = 0; + struct io_event ent; + struct timespec ts; + struct timeout to; + + init_timeout(&to); + + if (timeout) { + ret = -EFAULT; + if (copy_from_user(&ts, timeout, sizeof(ts))) + goto out; + + set_timeout(&to, &ts); + } + + memset(&ent, 0, sizeof(ent)); + ret = 0; + + while (nr < max_nr) { + ret = aio_read_evt(ctx->ring, &ent); + if (ret) { + if (nr) + break; + + add_wait_queue(&ctx->wait, &wait); + add_wait_queue(&to.wait, &to_wait); + do { + set_task_state(tsk, TASK_INTERRUPTIBLE); + + ret = aio_read_evt(ctx->ring, &ent); + if (!ret) + break; + ret = -ETIMEDOUT; + if (to.timed_out) + break; + schedule(); + if (to.timed_out) + break; + if (signal_pending(tsk)) { + ret = -EINTR; + break; + } + ret = aio_read_evt(ctx->ring, &ent); + } while (ret) ; + + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + remove_wait_queue(&to.wait, &to_wait); + } + + if (ret) + break; + + /* FIXME: split checks in two */ + ret = -EFAULT; + if (copy_to_user(event, &ent, sizeof(ent))) { + /* FIXME: we lose an event here. */ + printk(KERN_DEBUG "aio: lost an event due to EFAULT.\n"); + break; + } + + /* Now complete the aio request and copy the result codes to userland. */ + ret = aio_complete(ctx, ent.key, ent.data, 0); + if (ret) { + printk(KERN_DEBUG "aio: lost an event -- aio_complete: %d.\n", ret); + break; /* FIXME: we lose an event here */ + } + + event ++; + nr ++; + } + + if (timeout) + clear_timeout(&to); +out: + return nr ? nr : ret; +} + +/* __aioctx_put + * Called when the last user of an aio context has gone away, + * and the struct needs to be freed. 
+ */ +void __aioctx_put(struct kioctx *ctx) +{ + struct io_event ent; + printk("aio: free aioctx %p\n", ctx); + + /* release any io requests that were not reaped by the user process */ + while (!aio_read_evt(ctx->ring, &ent)) { + struct kiogrp *iogrp = kiogrp_get(ctx, ent.key, ent.data); + if (!IS_ERR(iogrp)) + kiogrp_free(iogrp); + } + + kfree(ctx->ring); + kfree(ctx->reqs); + kmem_cache_free(kioctx_cachep, ctx); +} + +/* aio_read + * read() method for /dev/aio. Reads the next iogrp completion + * event off of the queue and then copies the iocb's return codes + * back into the userspace aiocbs. + * FIXME: error handling isn't complete. Bummer. + * TODO: implement O_NONBLOCK. + */ +static ssize_t aio_read(struct file *filp, char *buf, size_t size, loff_t *offp) +{ + struct kioctx *ctx; + int ret; + + if (size < 0) + return -EINVAL; + + size /= sizeof(struct io_event); + ctx = filp->private_data; + + ret = read_events(ctx, (struct io_event *)buf, size, NULL); + + return (ret > 0) ? ret * sizeof(struct io_event) : ret; +} + +/* iogrp_setup + * Allocate and initialize a kiogrp in the given + * context at idx. For positive values of idx, + * attempts to install the iogrp at idx, negative + * means allocate one. + * Error returns are by means of ERR_PTR's. + */ +static inline struct kiogrp *iogrp_setup(struct kioctx *ctx, int idx) +{ + struct kiogrp *iogrp; + + iogrp = ERR_PTR(-EINVAL); + if (idx >= ctx->max_reqs) + goto out; + + iogrp = kiogrp_alloc(ctx); + if (IS_ERR(iogrp)) + goto out; + + /* Get a reference to ze iogrp so that it isn't reported + * as complete before we're done queuing it. + */ + //atomic_inc(&iogrp->count); + + /* Assign the iogrp an id. */ + + /* FIXME: use cmpxchg instead of spin_lock? */ + spin_lock(&aio_req_lock); + if (idx < 0) { + for (idx=0; (idx<ctx->max_reqs) && (ctx->reqs[idx]); idx++) + ; + if (idx < ctx->max_reqs) + ctx->reqs[idx] = iogrp; + else { + printk("iogrp_setup: -EAGAIN\n"); + idx = -EAGAIN; + } + } else if (idx < ctx->max_reqs) { + if (!ctx->reqs[idx]) + ctx->reqs[idx] = iogrp; + else { + printk("iogrp_setup: -EBUSY\n"); + idx = -EBUSY; + } + } else + idx = -EINVAL; + + spin_unlock(&aio_req_lock); + + iogrp->idx = idx; /* side effect on error: kiogrp_free notices idx < 0 */ + if (idx < 0) { + //atomic_dec(&iogrp->count); + kiogrp_free(iogrp); + iogrp = ERR_PTR(idx); + } + +out: + return iogrp; +} + +static inline struct kioctx *get_ioctx(int ctx_id) +{ + struct file *filp; + + filp = fget(ctx_id); + if (filp) { + if (filp->f_op == &aio_fops) + return filp->private_data; + fput(filp); + } + + return NULL; +} + +static inline void put_ioctx(struct kioctx *ctx) +{ + fput(ctx->filp); +} + + +/* __submit_io + * Copies the aiocb from userspace into the kernel and sets up the + * request. Returns 0 if the request is successfully queued, -errno + * otherwise. + */ +static inline long __submit_io(struct kioctx *ctx, struct iocb *uaiocbp) +{ + struct iocb uaiocb; + long ret; + struct kiogrp *iogrp; + struct kiocb *kiocb; + + iogrp = iogrp_setup(ctx, -1); + ret = PTR_ERR(iogrp); + if (IS_ERR(iogrp)) + goto out_nofree; + + pr_debug("aio: submit %p %p\n", uaiocbp, &uaiocb); + ret = -EFAULT; + if (copy_from_user(&uaiocb, uaiocbp, sizeof(uaiocb))) + goto out; + + kiocb = aio_convert_user_aiocb(iogrp, &uaiocb, uaiocbp); + pr_debug("aio: kiocb = %p\n", kiocb); + ret = PTR_ERR(kiocb); + if (IS_ERR(kiocb)) + goto out; + + /* we don't do scatter gather... 
yet */ + iogrp->nr_iocbs = 1; + iogrp->iocbs = iogrp->atomic_iocbs; + iogrp->iocbs[0] = kiocb; + iogrp->user_data = uaiocbp; + + ret = -EFAULT; + if (put_user((int)iogrp->idx, &uaiocbp->__aio_key)) + goto out; + + /* kio_submit will free the kiocb if it fails. */ + ret = kio_submit(iogrp, kiocb, &uaiocb); + if (!ret) + return 0; + + if (atomic_read(&iogrp->count) != 0) + BUG(); + kiogrp_free(iogrp); + + return ret; + +out: + /* Shoot, something went wrong. Discard the iogrp we allocated. */ + kiogrp_free(iogrp); +out_nofree: + return ret; +} + +/* sys_submit_ios + * Copy an aiocb from userspace into kernel space, then convert it to + * a kiocb, submit and repeat until done. Error codes on copy/submit + * only get returned for the first aiocb copied as otherwise the size + * of aiocbs copied is returned (standard write sematics). + */ +long sys_submit_ios(int ctx_id, int nr, struct iocb **uaiocbpp) +{ + struct kioctx *ctx; + struct iocb *uaiocbp; + int i; + long ret = 0; + + if (ctx_id < 0 || nr <= 0) + goto out_inval; + + ctx = get_ioctx(ctx_id); + if (!ctx) + goto out_inval; + + for (i=0; isession = 1; tsk->pgrp = 1; strcpy(tsk->comm, "bdflush"); + bdflush_tsk = tsk; /* avoid getting signals */ spin_lock_irq(&tsk->sigmask_lock); @@ -2731,16 +2736,22 @@ CHECK_EMERGENCY_SYNC flushed = flush_dirty_buffers(0, 0); + if (free_shortage()) + flushed += page_launder(GFP_KERNEL, 0); /* * If there are still a lot of dirty buffers around, * skip the sleep and flush some more. Otherwise, we * go to sleep waiting a wakeup. */ + set_current_state(TASK_INTERRUPTIBLE); if (!flushed || balance_dirty_state(NODEV) < 0) { run_task_queue(&tq_disk); - interruptible_sleep_on(&bdflush_wait); + schedule(); } + /* Remember to mark us as running otherwise + the next schedule will block. */ + __set_current_state(TASK_RUNNING); } } @@ -2811,3 +2822,251 @@ module_init(bdflush_init) +/* async kio interface */ +struct brw_cb { + struct kiobuf *kiobuf; + int nr; + struct buffer_head *bh[1]; +}; + +static inline void brw_kio_put_iobuf(struct brw_cb *brw_cb, struct kiobuf *kiobuf) +{ + if (atomic_dec_and_test(&kiobuf->io_count)) { + int nr; + + /* Walk the buffer heads associated with this kiobuf + * checking for errors and freeing them as we go. + */ + for (nr=0; nr < brw_cb->nr; nr++) { + struct buffer_head *bh = brw_cb->bh[nr]; + if (buffer_uptodate(bh) && !kiobuf->errno) + kiobuf->transferred += bh->b_size; + else if (!kiobuf->errno) + kiobuf->errno = -EIO; + kmem_cache_free(bh_cachep, bh); + } + + if (kiobuf->end_io) + kiobuf->end_io(kiobuf); + wake_up(&kiobuf->wait_queue); + + kfree(brw_cb); + } +} + +/* + * IO completion routine for a buffer_head being used for kiobuf IO: we + * can't dispatch the kiobuf callback until io_count reaches 0. + */ + +static void end_buffer_io_kiobuf_async(struct buffer_head *bh, int uptodate) +{ + struct brw_cb *brw_cb; + struct kiobuf *kiobuf; + + mark_buffer_uptodate(bh, uptodate); + + brw_cb = bh->b_private; + unlock_buffer(bh); + + kiobuf = brw_cb->kiobuf; + if (!uptodate && !kiobuf->errno) + brw_cb->kiobuf->errno = -EIO; + brw_kio_put_iobuf(brw_cb, kiobuf); +} + + +/* + * Start I/O on a physical range of kernel memory, defined by a vector + * of kiobuf structs (much like a user-space iovec list). + * + * The kiobuf must already be locked for IO. IO is submitted + * asynchronously: you need to check page->locked, page->uptodate, and + * maybe wait on page->wait. 
+ * + * It is up to the caller to make sure that there are enough blocks + * passed in to completely map the iobufs to disk. + */ + +int brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int err; + int length; + int bufind; + int pageind; + int bhind; + int offset; + unsigned long blocknr; + struct kiobuf * iobuf = NULL; + struct page * map; + struct buffer_head *tmp; + int bh_nr; + int i; + +#define MAX_KIOVEC_NR 8 + struct brw_cb *brw_cb_table[MAX_KIOVEC_NR]; + struct brw_cb *brw_cb; + + if (!nr) + return 0; + + if (nr > MAX_KIOVEC_NR) { + printk("kiovec too large: %d\n", nr); + BUG(); + } + + /* + * First, do some alignment and validity checks + */ + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + if ((iobuf->offset & (sector_size-1)) || + (iobuf->length & (sector_size-1))) { + printk("brw_kiovec_async: iobuf->offset=0x%x length=0x%x sector_size: 0x%x\n", iobuf->offset, iobuf->length, sector_size); + return -EINVAL; + } + + if (!iobuf->nr_pages) + panic("brw_kiovec: iobuf not initialised"); + } + + /* + * OK to walk down the iovec doing page IO on each page we find. + */ + bufind = bhind = err = 0; + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + offset = iobuf->offset; + length = iobuf->length; + iobuf->errno = 0; + iobuf->transferred = 0; + atomic_inc(&iobuf->io_count); + + bh_nr = ((iobuf->nr_pages * PAGE_SIZE) - offset) / sector_size; + if (!bh_nr) { + printk("brw_kiovec_async: !bh_nr\n"); + return -EINVAL; + } + + /* FIXME: tie into userbeans here */ + brw_cb = kmalloc(sizeof(*brw_cb) + (bh_nr * sizeof(struct buffer_head *)), GFP_KERNEL); + if (!brw_cb) + return -ENOMEM; + + brw_cb_table[i] = brw_cb; + brw_cb->kiobuf = iobuf; + brw_cb->nr = 0; + + for (pageind = 0; pageind < iobuf->nr_pages; pageind++) { + map = iobuf->maplist[pageind]; + err = -EFAULT; + if (!map) + goto error; + + while (length > 0 && (bufind < nr_blocks)) { + blocknr = b[bufind++]; + tmp = kmem_cache_alloc(bh_cachep, SLAB_BUFFER); + err = -ENOMEM; + if (!tmp) + goto error; + + memset(tmp, 0, sizeof(*tmp)); + init_waitqueue_head(&tmp->b_wait); + tmp->b_dev = B_FREE; + tmp->b_size = sector_size; + set_bh_page(tmp, map, offset); + tmp->b_this_page = tmp; + + init_buffer(tmp, end_buffer_io_kiobuf_async, NULL); + tmp->b_dev = dev; + tmp->b_blocknr = blocknr; + tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req); + tmp->b_private = brw_cb; + + if (rw == WRITE) { + set_bit(BH_Uptodate, &tmp->b_state); + clear_bit(BH_Dirty, &tmp->b_state); + } + + brw_cb->bh[brw_cb->nr++] = tmp; + length -= sector_size; + offset += sector_size; + + atomic_inc(&iobuf->io_count); + + if (offset >= PAGE_SIZE) { + offset = 0; + break; + } + } /* End of block loop */ + } /* End of page loop */ + } /* End of iovec loop */ + + /* okay, we've setup all our io requests, now fire them off! */ + for (i = 0; i < nr; i++) { + int j; + brw_cb = brw_cb_table[i]; +#if 1 + for (j=0; jnr; j++) + submit_bh(rw, brw_cb->bh[j]); + //ll_rw_block(rw, brw_cb->nr, brw_cb->bh); +#else + generic_make_requests(dev, rw, brw_cb->bh, brw_cb->nr); +#endif + brw_kio_put_iobuf(brw_cb, brw_cb->kiobuf); + } + + return 0; + + error: + /* Walk brw_cb_table freeing all the goop associated with each kiobuf */ + do { + brw_cb = brw_cb_table[i]; + if (brw_cb) { + /* We got an error allocating the bh'es. Just free the current + buffer_heads and exit. 
*/ + for (bhind = brw_cb->nr; bhind--; ) + kmem_cache_free(bh_cachep, brw_cb->bh[bhind]); + atomic_dec(&brw_cb->kiobuf->io_count); + kfree(brw_cb); + } + } while (i--) ; + + return err; +} + +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int i; + int transferred = 0; + int err = 0; + + if (!nr) + return 0; + + /* queue up and trigger the io */ + err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size); + if (err) + goto out; + + /* wait on the last iovec first -- it's more likely to finish last */ + for (i=nr; --i >= 0; ) + kiobuf_wait_for_io(iovec[i]); + + run_task_queue(&tq_disk); + + /* okay, how much data actually got through? */ + for (i=0; ierrno) { + if (!err) + err = iovec[i]->errno; + break; + } + transferred += iovec[i]->length; + } + +out: + return transferred ? transferred : err; +} diff -urN /md0/kernels/2.4/v2.4.4-ac10/fs/ext2/file.c ac10-aio/fs/ext2/file.c --- /md0/kernels/2.4/v2.4.4-ac10/fs/ext2/file.c Thu May 17 15:25:10 2001 +++ ac10-aio/fs/ext2/file.c Thu May 24 17:53:00 2001 @@ -41,6 +41,7 @@ struct file_operations ext2_file_operations = { read: generic_file_read, write: generic_file_write, + rw_kiovec: generic_file_rw_kiovec, ioctl: ext2_ioctl, mmap: generic_file_mmap, open: generic_file_open, diff -urN /md0/kernels/2.4/v2.4.4-ac10/fs/foopp ac10-aio/fs/foopp --- /md0/kernels/2.4/v2.4.4-ac10/fs/foopp Wed Dec 31 19:00:00 1969 +++ ac10-aio/fs/foopp Thu May 24 17:53:02 2001 @@ -0,0 +1,3134 @@ +/* + * linux/fs/buffer.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * 'buffer.c' implements the buffer-cache functions. Race-conditions have + * been avoided by NEVER letting an interrupt change a buffer (except for the + * data, of course), but instead letting the caller do it. + */ + +/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */ + +/* Removed a lot of unnecessary code and simplified things now that + * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 + */ + +/* Speed up hash, lru, and free list operations. Use gfp() for allocating + * hash table, use SLAB cache for buffer heads. -DaveM + */ + +/* Added 32k buffer block sizes - these are required older ARM systems. + * - RMK + */ + +/* Thread it... -DaveM */ + +/* async buffer flushing, 1999 Andrea Arcangeli */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct brw_cb { + struct kiobuf *kiobuf; + int nr; + struct buffer_head *bh[1]; +}; + +#include +#include +#include +#include + +#define NR_SIZES 7 +static char buffersize_index[65] = +{-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, + 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1, + 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1, + 6}; + +#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9]) +#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512) +#define NR_RESERVED (2*MAX_BUF_PER_PAGE) +#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this + number of unused buffer heads */ + +/* Anti-deadlock ordering: + * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock + */ + +#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers) + +/* + * Hash table gook.. 
+ */ +static unsigned int bh_hash_mask; +static unsigned int bh_hash_shift; +static struct buffer_head **hash_table; +static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED; + +static struct buffer_head *lru_list[NR_LIST]; +static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED; +static int nr_buffers_type[NR_LIST]; +static unsigned long size_buffers_type[NR_LIST]; + +static struct buffer_head * unused_list; +static int nr_unused_buffer_heads; +static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED; +static DECLARE_WAIT_QUEUE_HEAD(buffer_wait); + +struct bh_free_head { + struct buffer_head *list; + spinlock_t lock; +}; +static struct bh_free_head free_list[NR_SIZES]; + +static int grow_buffers(int size); +static void __refile_buffer(struct buffer_head *); + +/* This is used by some architectures to estimate available memory. */ +atomic_t buffermem_pages = ATOMIC_INIT(0); + +/* Here is the parameter block for the bdflush process. If you add or + * remove any of the parameters, make sure to update kernel/sysctl.c. + */ + +#define N_PARAM 9 + +/* The dummy values in this structure are left in there for compatibility + * with old programs that play with the /proc entries. + */ +union bdflush_param { + struct { + int nfract; /* Percentage of buffer cache dirty to + activate bdflush */ + int ndirty; /* Maximum number of dirty blocks to write out per + wake-cycle */ + int nrefill; /* Number of clean buffers to try to obtain + each time we call refill */ + int dummy1; /* unused */ + int interval; /* jiffies delay between kupdate flushes */ + int age_buffer; /* Time for normal buffer to age before we flush it */ + int nfract_sync; /* Percentage of buffer cache dirty to + activate bdflush synchronously */ + int dummy2; /* unused */ + int dummy3; /* unused */ + } b_un; + unsigned int data[N_PARAM]; +} bdf_prm = {{30, 64, 64, 256, 5*HZ, 30*HZ, 60, 0, 0}}; + +/* These are the min and max parameter values that we will allow to be assigned */ +int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 0, 0, 0}; +int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 100, 0, 0}; + +/* + * Rewrote the wait-routines to use the "new" wait-queue functionality, + * and getting rid of the cli-sti pairs. The wait-queue routines still + * need cli-sti, but now it's just a couple of 386 instructions or so. + * + * Note that the real wait_on_buffer() is an inline function that checks + * if 'b_wait' is set before calling this, so that the queues aren't set + * up unnecessarily. + */ +void __wait_on_buffer(struct buffer_head * bh) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + atomic_inc(&bh->b_count); + add_wait_queue(&bh->b_wait, &wait); + do { + run_task_queue(&tq_disk); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!buffer_locked(bh)) + break; + schedule(); + } while (buffer_locked(bh)); + tsk->state = TASK_RUNNING; + remove_wait_queue(&bh->b_wait, &wait); + atomic_dec(&bh->b_count); +} + +/* Call sync_buffers with wait!=0 to ensure that the call does not + * return until all buffer writes have completed. Sync() may return + * before the writes have finished; fsync() may not. + */ + +/* Godamity-damn. Some buffers (bitmaps for filesystems) + * spontaneously dirty themselves without ever brelse being called. + * We will ultimately want to put these in a separate list, but for + * now we search all of the lists for dirty buffers. 
+ */ +static int sync_buffers(kdev_t dev, int wait) +{ + int i, retry, pass = 0, err = 0; + struct buffer_head * bh, *next; + + /* One pass for no-wait, three for wait: + * 0) write out all dirty, unlocked buffers; + * 1) write out all dirty buffers, waiting if locked; + * 2) wait for completion by waiting for all buffers to unlock. + */ + do { + retry = 0; + + /* We search all lists as a failsafe mechanism, not because we expect + * there to be dirty buffers on any of the other lists. + */ +repeat: + spin_lock(&lru_list_lock); + bh = lru_list[BUF_DIRTY]; + if (!bh) + goto repeat2; + + for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) { + next = bh->b_next_free; + + if (!lru_list[BUF_DIRTY]) + break; + if (dev && bh->b_dev != dev) + continue; + if (buffer_locked(bh)) { + /* Buffer is locked; skip it unless wait is + * requested AND pass > 0. + */ + if (!wait || !pass) { + retry = 1; + continue; + } + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + wait_on_buffer (bh); + atomic_dec(&bh->b_count); + goto repeat; + } + + /* If an unlocked buffer is not uptodate, there has + * been an IO error. Skip it. + */ + if (wait && buffer_req(bh) && !buffer_locked(bh) && + !buffer_dirty(bh) && !buffer_uptodate(bh)) { + err = -EIO; + continue; + } + + /* Don't write clean buffers. Don't write ANY buffers + * on the third pass. + */ + if (!buffer_dirty(bh) || pass >= 2) + continue; + + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + ll_rw_block(WRITE, 1, &bh); + atomic_dec(&bh->b_count); + retry = 1; + goto repeat; + } + + repeat2: + bh = lru_list[BUF_LOCKED]; + if (!bh) { + spin_unlock(&lru_list_lock); + break; + } + for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) { + next = bh->b_next_free; + + if (!lru_list[BUF_LOCKED]) + break; + if (dev && bh->b_dev != dev) + continue; + if (buffer_locked(bh)) { + /* Buffer is locked; skip it unless wait is + * requested AND pass > 0. + */ + if (!wait || !pass) { + retry = 1; + continue; + } + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + wait_on_buffer (bh); + spin_lock(&lru_list_lock); + atomic_dec(&bh->b_count); + goto repeat2; + } + } + spin_unlock(&lru_list_lock); + + /* If we are waiting for the sync to succeed, and if any dirty + * blocks were written, then repeat; on the second pass, only + * wait for buffers being written (do not pass to write any + * more buffers on the second pass). + */ + } while (wait && retry && ++pass<=2); + return err; +} + +void sync_dev(kdev_t dev) +{ + sync_supers(dev); + sync_inodes(dev); + DQUOT_SYNC(dev); + /* sync all the dirty buffers out to disk only _after_ all the + high level layers finished generated buffer dirty data + (or we'll return with some buffer still dirty on the blockdevice + so breaking the semantics of this call) */ + sync_buffers(dev, 0); + /* + * FIXME(eric) we need to sync the physical devices here. + * This is because some (scsi) controllers have huge amounts of + * cache onboard (hundreds of Mb), and we need to instruct + * them to commit all of the dirty memory to disk, and we should + * not return until this has happened. + * + * This would need to get implemented by going through the assorted + * layers so that each block major number can be synced, and this + * would call down into the upper and mid-layer scsi. 
+ */ +} + +int fsync_dev(kdev_t dev) +{ + sync_buffers(dev, 0); + + lock_kernel(); + sync_supers(dev); + sync_inodes(dev); + DQUOT_SYNC(dev); + unlock_kernel(); + + return sync_buffers(dev, 1); +} + +asmlinkage long sys_sync(void) +{ + fsync_dev(0); + return 0; +} + +/* + * filp may be NULL if called via the msync of a vma. + */ + +int file_fsync(struct file *filp, struct dentry *dentry, int datasync) +{ + struct inode * inode = dentry->d_inode; + struct super_block * sb; + kdev_t dev; + int ret; + + lock_kernel(); + /* sync the inode to buffers */ + write_inode_now(inode, 0); + + /* sync the superblock to buffers */ + sb = inode->i_sb; + lock_super(sb); + if (sb->s_op && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + + /* .. finally sync the buffers to disk */ + dev = inode->i_dev; + ret = sync_buffers(dev, 1); + unlock_kernel(); + return ret; +} + +asmlinkage long sys_fsync(unsigned int fd) +{ + struct file * file; + struct dentry * dentry; + struct inode * inode; + int err; + + err = -EBADF; + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_dentry; + inode = dentry->d_inode; + + err = -EINVAL; + if (!file->f_op || !file->f_op->fsync) + goto out_putf; + + /* We need to protect against concurrent writers.. */ + down(&inode->i_sem); + filemap_fdatasync(inode->i_mapping); + err = file->f_op->fsync(file, dentry, 0); + filemap_fdatawait(inode->i_mapping); + up(&inode->i_sem); + +out_putf: + fput(file); +out: + return err; +} + +asmlinkage long sys_fdatasync(unsigned int fd) +{ + struct file * file; + struct dentry * dentry; + struct inode * inode; + int err; + + err = -EBADF; + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_dentry; + inode = dentry->d_inode; + + err = -EINVAL; + if (!file->f_op || !file->f_op->fsync) + goto out_putf; + + down(&inode->i_sem); + filemap_fdatasync(inode->i_mapping); + err = file->f_op->fsync(file, dentry, 1); + filemap_fdatawait(inode->i_mapping); + up(&inode->i_sem); + +out_putf: + fput(file); +out: + return err; +} + +/* After several hours of tedious analysis, the following hash + * function won. Do not mess with it... 
-DaveM + */ +#define _hashfn(dev,block) \ + ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \ + (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \ + ((block) << (bh_hash_shift - 12)))) +#define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)] + +static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head) +{ + if ((bh->b_next = *head) != NULL) + bh->b_next->b_pprev = &bh->b_next; + *head = bh; + bh->b_pprev = head; +} + +static __inline__ void __hash_unlink(struct buffer_head *bh) +{ + if (bh->b_pprev) { + if (bh->b_next) + bh->b_next->b_pprev = bh->b_pprev; + *(bh->b_pprev) = bh->b_next; + bh->b_pprev = NULL; + } +} + +static void __insert_into_lru_list(struct buffer_head * bh, int blist) +{ + struct buffer_head **bhp = &lru_list[blist]; + + if(!*bhp) { + *bhp = bh; + bh->b_prev_free = bh; + } + bh->b_next_free = *bhp; + bh->b_prev_free = (*bhp)->b_prev_free; + (*bhp)->b_prev_free->b_next_free = bh; + (*bhp)->b_prev_free = bh; + nr_buffers_type[blist]++; + size_buffers_type[blist] += bh->b_size; +} + +static void __remove_from_lru_list(struct buffer_head * bh, int blist) +{ + if (bh->b_prev_free || bh->b_next_free) { + bh->b_prev_free->b_next_free = bh->b_next_free; + bh->b_next_free->b_prev_free = bh->b_prev_free; + if (lru_list[blist] == bh) + lru_list[blist] = bh->b_next_free; + if (lru_list[blist] == bh) + lru_list[blist] = NULL; + bh->b_next_free = bh->b_prev_free = NULL; + nr_buffers_type[blist]--; + size_buffers_type[blist] -= bh->b_size; + } +} + +static void __remove_from_free_list(struct buffer_head * bh, int index) +{ + if(bh->b_next_free == bh) + free_list[index].list = NULL; + else { + bh->b_prev_free->b_next_free = bh->b_next_free; + bh->b_next_free->b_prev_free = bh->b_prev_free; + if (free_list[index].list == bh) + free_list[index].list = bh->b_next_free; + } + bh->b_next_free = bh->b_prev_free = NULL; +} + +/* must be called with both the hash_table_lock and the lru_list_lock + held */ +static void __remove_from_queues(struct buffer_head *bh) +{ + __hash_unlink(bh); + __remove_from_lru_list(bh, bh->b_list); +} + +static void __insert_into_queues(struct buffer_head *bh) +{ + struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr); + + __hash_link(bh, head); + __insert_into_lru_list(bh, bh->b_list); +} + +/* This function must only run if there are no other + * references _anywhere_ to this buffer head. + */ +static void put_last_free(struct buffer_head * bh) +{ + struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)]; + struct buffer_head **bhp = &head->list; + + bh->b_state = 0; + + spin_lock(&head->lock); + bh->b_dev = B_FREE; + if(!*bhp) { + *bhp = bh; + bh->b_prev_free = bh; + } + bh->b_next_free = *bhp; + bh->b_prev_free = (*bhp)->b_prev_free; + (*bhp)->b_prev_free->b_next_free = bh; + (*bhp)->b_prev_free = bh; + spin_unlock(&head->lock); +} + +/* + * Why like this, I hear you say... The reason is race-conditions. + * As we don't lock buffers (unless we are reading them, that is), + * something might happen to it while we sleep (ie a read-error + * will force it bad). This shouldn't really happen currently, but + * the code is ready. 
+ */ +static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size) +{ + struct buffer_head *bh = hash(dev, block); + + for (; bh; bh = bh->b_next) + if (bh->b_blocknr == block && + bh->b_size == size && + bh->b_dev == dev) + break; + if (bh) + atomic_inc(&bh->b_count); + + return bh; +} + +struct buffer_head * get_hash_table(kdev_t dev, int block, int size) +{ + struct buffer_head *bh; + + read_lock(&hash_table_lock); + bh = __get_hash_table(dev, block, size); + read_unlock(&hash_table_lock); + + return bh; +} + +unsigned int get_hardblocksize(kdev_t dev) +{ + int blksize = 0; + /* + * Get the hard sector size for the given device. + * If we don't know what it is, return 0. + */ + if (hardsect_size[MAJOR(dev)] != NULL) + blksize = hardsect_size[MAJOR(dev)][MINOR(dev)]; + return blksize; +} + +void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode) +{ + spin_lock(&lru_list_lock); + if (bh->b_inode) + list_del(&bh->b_inode_buffers); + bh->b_inode = inode; + list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers); + spin_unlock(&lru_list_lock); +} + +/* The caller must have the lru_list lock before calling the + remove_inode_queue functions. */ +static void __remove_inode_queue(struct buffer_head *bh) +{ + bh->b_inode = NULL; + list_del(&bh->b_inode_buffers); +} + +static inline void remove_inode_queue(struct buffer_head *bh) +{ + if (bh->b_inode) + __remove_inode_queue(bh); +} + +int inode_has_buffers(struct inode *inode) +{ + int ret; + + spin_lock(&lru_list_lock); + ret = !list_empty(&inode->i_dirty_buffers); + spin_unlock(&lru_list_lock); + + return ret; +} + + +/* If invalidate_buffers() will trash dirty buffers, it means some kind + of fs corruption is going on. Trashing dirty data always imply losing + information that was supposed to be just stored on the physical layer + by the user. + + Thus invalidate_buffers in general usage is not allwowed to trash dirty + buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved. + + NOTE: In the case where the user removed a removable-media-disk even if + there's still dirty data not synced on disk (due a bug in the device driver + or due an error of the user), by not destroying the dirty buffers we could + generate corruption also on the next media inserted, thus a parameter is + necessary to handle this case in the most safe way possible (trying + to not corrupt also the new disk inserted with the data belonging to + the old now corrupted disk). Also for the ramdisk the natural thing + to do in order to release the ramdisk memory is to destroy dirty buffers. + + These are two special cases. Normal usage imply the device driver + to issue a sync on the device (without waiting I/O completation) and + then an invalidate_buffers call that doesn't trash dirty buffers. */ +void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers) +{ + int i, nlist, slept; + struct buffer_head * bh, * bh_next; + + retry: + slept = 0; + spin_lock(&lru_list_lock); + for(nlist = 0; nlist < NR_LIST; nlist++) { + bh = lru_list[nlist]; + if (!bh) + continue; + for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) { + bh_next = bh->b_next_free; + + /* Another device? */ + if (bh->b_dev != dev) + continue; + /* Part of a mapping? 
*/ + if (bh->b_page->mapping) + continue; + if (buffer_locked(bh)) { + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + slept = 1; + spin_lock(&lru_list_lock); + atomic_dec(&bh->b_count); + } + + write_lock(&hash_table_lock); + if (!atomic_read(&bh->b_count) && + (destroy_dirty_buffers || !buffer_dirty(bh))) { + remove_inode_queue(bh); + __remove_from_queues(bh); + put_last_free(bh); + } + /* else complain loudly? */ + + write_unlock(&hash_table_lock); + if (slept) + goto out; + } + } +out: + spin_unlock(&lru_list_lock); + if (slept) + goto retry; +} + +void set_blocksize(kdev_t dev, int size) +{ + extern int *blksize_size[]; + int i, nlist, slept; + struct buffer_head * bh, * bh_next; + + if (!blksize_size[MAJOR(dev)]) + return; + + /* Size must be a power of two, and between 512 and PAGE_SIZE */ + if (size > PAGE_SIZE || size < 512 || (size & (size-1))) + panic("Invalid blocksize passed to set_blocksize"); + + if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) { + blksize_size[MAJOR(dev)][MINOR(dev)] = size; + return; + } + if (blksize_size[MAJOR(dev)][MINOR(dev)] == size) + return; + sync_buffers(dev, 2); + blksize_size[MAJOR(dev)][MINOR(dev)] = size; + + retry: + slept = 0; + spin_lock(&lru_list_lock); + for(nlist = 0; nlist < NR_LIST; nlist++) { + bh = lru_list[nlist]; + if (!bh) + continue; + for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) { + bh_next = bh->b_next_free; + if (bh->b_dev != dev || bh->b_size == size) + continue; + if (buffer_locked(bh)) { + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + slept = 1; + spin_lock(&lru_list_lock); + atomic_dec(&bh->b_count); + } + + write_lock(&hash_table_lock); + if (!atomic_read(&bh->b_count)) { + if (buffer_dirty(bh)) + printk(KERN_WARNING + "set_blocksize: dev %s buffer_dirty %lu size %hu\n", + kdevname(dev), bh->b_blocknr, bh->b_size); + remove_inode_queue(bh); + __remove_from_queues(bh); + put_last_free(bh); + } else { + if (atomic_set_buffer_clean(bh)) + __refile_buffer(bh); + clear_bit(BH_Uptodate, &bh->b_state); + printk(KERN_WARNING + "set_blocksize: " + "b_count %d, dev %s, block %lu, from %p\n", + atomic_read(&bh->b_count), bdevname(bh->b_dev), + bh->b_blocknr, __builtin_return_address(0)); + } + write_unlock(&hash_table_lock); + if (slept) + goto out; + } + } + out: + spin_unlock(&lru_list_lock); + if (slept) + goto retry; +} + +/* + * We used to try various strange things. Let's not. + * We'll just try to balance dirty buffers, and possibly + * launder some pages. + */ +static void refill_freelist(int size) +{ + balance_dirty(NODEV); + if (free_shortage()) + page_launder(GFP_BUFFER, 0); + grow_buffers(size); +} + +void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) +{ + bh->b_list = BUF_CLEAN; + bh->b_end_io = handler; + bh->b_private = private; +} + +static void end_buffer_io_async(struct buffer_head * bh, int uptodate) +{ + static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED; + unsigned long flags; + struct buffer_head *tmp; + struct page *page; + + mark_buffer_uptodate(bh, uptodate); + + /* This is a temporary buffer used for page I/O. */ + page = bh->b_page; + + if (!uptodate) + SetPageError(page); + + /* + * Be _very_ careful from here on. Bad things can happen if + * two buffer heads end IO at almost the same time and both + * decide that the page is now completely done. + * + * Async buffer_heads are here only as labels for IO, and get + * thrown away once the IO for this page is complete. 
IO is + * deemed complete once all buffers have been visited + * (b_count==0) and are now unlocked. We must make sure that + * only the _last_ buffer that decrements its count is the one + * that unlock the page.. + */ + spin_lock_irqsave(&page_uptodate_lock, flags); + unlock_buffer(bh); + atomic_dec(&bh->b_count); + tmp = bh->b_this_page; + while (tmp != bh) { + if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp)) + goto still_busy; + tmp = tmp->b_this_page; + } + + /* OK, the async IO on this page is complete. */ + spin_unlock_irqrestore(&page_uptodate_lock, flags); + + /* + * if none of the buffers had errors then we can set the + * page uptodate: + */ + if (!PageError(page)) + SetPageUptodate(page); + + /* + * Run the hooks that have to be done when a page I/O has completed. + */ + if (PageTestandClearDecrAfter(page)) + atomic_dec(&nr_async_pages); + + UnlockPage(page); + + return; + +still_busy: + spin_unlock_irqrestore(&page_uptodate_lock, flags); + return; +} + +void set_buffer_async_io(struct buffer_head *bh) { + bh->b_end_io = end_buffer_io_async ; +} + +/* + * Synchronise all the inode's dirty buffers to the disk. + * + * We have conflicting pressures: we want to make sure that all + * initially dirty buffers get waited on, but that any subsequently + * dirtied buffers don't. After all, we don't want fsync to last + * forever if somebody is actively writing to the file. + * + * Do this in two main stages: first we copy dirty buffers to a + * temporary inode list, queueing the writes as we go. Then we clean + * up, waiting for those writes to complete. + * + * During this second stage, any subsequent updates to the file may end + * up refiling the buffer on the original inode's dirty list again, so + * there is a chance we will end up with a buffer queued for write but + * not yet completed on that list. So, as a final cleanup we go through + * the osync code to catch these locked, dirty buffers without requeuing + * any newly dirty buffers for write. + */ + +int fsync_inode_buffers(struct inode *inode) +{ + struct buffer_head *bh; + struct inode tmp; + int err = 0, err2; + + INIT_LIST_HEAD(&tmp.i_dirty_buffers); + + spin_lock(&lru_list_lock); + + while (!list_empty(&inode->i_dirty_buffers)) { + bh = BH_ENTRY(inode->i_dirty_buffers.next); + list_del(&bh->b_inode_buffers); + if (!buffer_dirty(bh) && !buffer_locked(bh)) + bh->b_inode = NULL; + else { + bh->b_inode = &tmp; + list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers); + atomic_inc(&bh->b_count); + if (buffer_dirty(bh)) { + spin_unlock(&lru_list_lock); + ll_rw_block(WRITE, 1, &bh); + spin_lock(&lru_list_lock); + } + } + } + + while (!list_empty(&tmp.i_dirty_buffers)) { + bh = BH_ENTRY(tmp.i_dirty_buffers.prev); + remove_inode_queue(bh); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(&lru_list_lock); + } + + spin_unlock(&lru_list_lock); + err2 = osync_inode_buffers(inode); + + if (err) + return err; + else + return err2; +} + + +/* + * osync is designed to support O_SYNC io. It waits synchronously for + * all already-submitted IO to complete, but does not queue any new + * writes to the disk. + * + * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as + * you dirty the buffers, and then use osync_inode_buffers to wait for + * completion. Any other dirty buffers which are not yet queued for + * write will not be flushed to disk by the osync. 
+ */ + +int osync_inode_buffers(struct inode *inode) +{ + struct buffer_head *bh; + struct list_head *list; + int err = 0; + + spin_lock(&lru_list_lock); + + repeat: + + for (list = inode->i_dirty_buffers.prev; + bh = BH_ENTRY(list), list != &inode->i_dirty_buffers; + list = bh->b_inode_buffers.prev) { + if (buffer_locked(bh)) { + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + brelse(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + spin_lock(&lru_list_lock); + goto repeat; + } + } + + spin_unlock(&lru_list_lock); + return err; +} + + +/* + * Invalidate any and all dirty buffers on a given inode. We are + * probably unmounting the fs, but that doesn't mean we have already + * done a sync(). Just drop the buffers from the inode list. + */ + +void invalidate_inode_buffers(struct inode *inode) +{ + struct list_head *list, *next; + + spin_lock(&lru_list_lock); + list = inode->i_dirty_buffers.next; + while (list != &inode->i_dirty_buffers) { + next = list->next; + remove_inode_queue(BH_ENTRY(list)); + list = next; + } + spin_unlock(&lru_list_lock); +} + + +/* + * Ok, this is getblk, and it isn't very clear, again to hinder + * race-conditions. Most of the code is seldom used, (ie repeating), + * so it should be much more efficient than it looks. + * + * The algorithm is changed: hopefully better, and an elusive bug removed. + * + * 14.02.92: changed it to sync dirty buffers a bit: better performance + * when the filesystem starts to get full of dirty blocks (I hope). + */ +struct buffer_head * getblk(kdev_t dev, int block, int size) +{ + struct buffer_head * bh; + int isize; + +repeat: + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + bh = __get_hash_table(dev, block, size); + if (bh) + goto out; + + isize = BUFSIZE_INDEX(size); + spin_lock(&free_list[isize].lock); + bh = free_list[isize].list; + if (bh) { + __remove_from_free_list(bh, isize); + atomic_set(&bh->b_count, 1); + } + spin_unlock(&free_list[isize].lock); + + /* + * OK, FINALLY we know that this buffer is the only one of + * its kind, we hold a reference (b_count>0), it is unlocked, + * and it is clean. + */ + if (bh) { + init_buffer(bh, NULL, NULL); + bh->b_dev = dev; + bh->b_blocknr = block; + bh->b_state = 1 << BH_Mapped; + + /* Insert the buffer into the regular lists */ + __insert_into_queues(bh); + out: + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + touch_buffer(bh); + return bh; + } + + /* + * If we block while refilling the free list, somebody may + * create the buffer first ... search the hashes again. + */ + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + refill_freelist(size); + goto repeat; +} + +/* -1 -> no need to flush + 0 -> async flush + 1 -> sync flush (wait for I/O completation) */ +int balance_dirty_state(kdev_t dev) +{ + unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit; + + dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT; + tot = nr_free_buffer_pages(); + + dirty *= 100; + soft_dirty_limit = tot * bdf_prm.b_un.nfract; + hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync; + + /* First, check for the "real" dirty limit. */ + if (dirty > soft_dirty_limit) { + if (dirty > hard_dirty_limit) + return 1; + return 0; + } + + return -1; +} + +/* + * if a new dirty buffer is created we need to balance bdflush. + * + * in the future we might want to make bdflush aware of different + * pressures on different devices - thus the (currently unused) + * 'dev' parameter. 
+ */ +void balance_dirty(kdev_t dev) +{ + int state = balance_dirty_state(dev); + + if (state < 0) + return; + + if (state && (!dev || MAJOR(dev) == LOOP_MAJOR)) + state = 0; + + wakeup_bdflush(state); +} + +static __inline__ void __mark_dirty(struct buffer_head *bh) +{ + bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer; + refile_buffer(bh); +} + +/* atomic version, the user must call balance_dirty() by hand + as soon as it become possible to block */ +void __mark_buffer_dirty(struct buffer_head *bh) +{ + if (!atomic_set_buffer_dirty(bh)) + __mark_dirty(bh); +} + +void mark_buffer_dirty(struct buffer_head *bh) +{ + if (!atomic_set_buffer_dirty(bh)) { + __mark_dirty(bh); + balance_dirty(bh->b_dev); + } +} + +/* + * A buffer may need to be moved from one buffer list to another + * (e.g. in case it is not shared any more). Handle this. + */ +static void __refile_buffer(struct buffer_head *bh) +{ + int dispose = BUF_CLEAN; + if (buffer_locked(bh)) + dispose = BUF_LOCKED; + if (buffer_dirty(bh)) + dispose = BUF_DIRTY; + if (buffer_protected(bh)) + dispose = BUF_PROTECTED; + if (dispose != bh->b_list) { + __remove_from_lru_list(bh, bh->b_list); + bh->b_list = dispose; + if (dispose == BUF_CLEAN) + remove_inode_queue(bh); + __insert_into_lru_list(bh, dispose); + } +} + +void refile_buffer(struct buffer_head *bh) +{ + spin_lock(&lru_list_lock); + __refile_buffer(bh); + spin_unlock(&lru_list_lock); +} + +/* + * Release a buffer head + */ +void __brelse(struct buffer_head * buf) +{ + if (atomic_read(&buf->b_count)) { + atomic_dec(&buf->b_count); + return; + } + printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n"); +} + +/* + * bforget() is like brelse(), except it puts the buffer on the + * free list if it can.. We can NOT free the buffer if: + * - there are other users of it + * - it is locked and thus can have active IO + */ +void __bforget(struct buffer_head * buf) +{ + /* grab the lru lock here to block bdflush. */ + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf) || buffer_protected(buf)) + goto in_use; + __hash_unlink(buf); + remove_inode_queue(buf); + write_unlock(&hash_table_lock); + __remove_from_lru_list(buf, buf->b_list); + spin_unlock(&lru_list_lock); + put_last_free(buf); + return; + + in_use: + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); +} + +/* + * bread() reads a specified block and returns the buffer that contains + * it. It returns NULL if the block was unreadable. + */ +struct buffer_head * bread(kdev_t dev, int block, int size) +{ + struct buffer_head * bh; + + bh = getblk(dev, block, size); + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + brelse(bh); + return NULL; +} + +/* + * Note: the caller should wake up the buffer_wait list if needed. + */ +static __inline__ void __put_unused_buffer_head(struct buffer_head * bh) +{ + if (bh->b_inode) + BUG(); + if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { + kmem_cache_free(bh_cachep, bh); + } else { + bh->b_blocknr = -1; + init_waitqueue_head(&bh->b_wait); + nr_unused_buffer_heads++; + bh->b_next_free = unused_list; + bh->b_this_page = NULL; + unused_list = bh; + } +} + +/* + * Reserve NR_RESERVED buffer heads for async IO requests to avoid + * no-buffer-head deadlock. Return NULL on failure; waiting for + * buffer heads is now handled in create_buffers(). 
+ */ +static struct buffer_head * get_unused_buffer_head(int async) +{ + struct buffer_head * bh; + + spin_lock(&unused_list_lock); + if (nr_unused_buffer_heads > NR_RESERVED) { + bh = unused_list; + unused_list = bh->b_next_free; + nr_unused_buffer_heads--; + spin_unlock(&unused_list_lock); + return bh; + } + spin_unlock(&unused_list_lock); + + /* This is critical. We can't swap out pages to get + * more buffer heads, because the swap-out may need + * more buffer-heads itself. Thus SLAB_BUFFER. + */ + if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) { + memset(bh, 0, sizeof(*bh)); + init_waitqueue_head(&bh->b_wait); + return bh; + } + + /* + * If we need an async buffer, use the reserved buffer heads. + */ + if (async) { + spin_lock(&unused_list_lock); + if (unused_list) { + bh = unused_list; + unused_list = bh->b_next_free; + nr_unused_buffer_heads--; + spin_unlock(&unused_list_lock); + return bh; + } + spin_unlock(&unused_list_lock); + } +#if 0 + /* + * (Pending further analysis ...) + * Ordinary (non-async) requests can use a different memory priority + * to free up pages. Any swapping thus generated will use async + * buffer heads. + */ + if(!async && + (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) { + memset(bh, 0, sizeof(*bh)); + init_waitqueue_head(&bh->b_wait); + return bh; + } +#endif + + return NULL; +} + +void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset) +{ + bh->b_page = page; + if (offset >= PAGE_SIZE) + BUG(); + if (PageHighMem(page)) + /* + * This catches illegal uses and preserves the offset: + */ + bh->b_data = (char *)(0 + offset); + else + bh->b_data = page_address(page) + offset; +} + +/* + * Create the appropriate buffers when given a page for data area and + * the size of each buffer.. Use the bh->b_this_page linked list to + * follow the buffers created. Return NULL if unable to create more + * buffers. + * The async flag is used to differentiate async IO (paging, swapping) + * from ordinary buffer allocations, and only async requests are allowed + * to sleep waiting for buffer heads. + */ +static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async) +{ + struct buffer_head *bh, *head; + long offset; + +try_again: + head = NULL; + offset = PAGE_SIZE; + while ((offset -= size) >= 0) { + bh = get_unused_buffer_head(async); + if (!bh) + goto no_grow; + + bh->b_dev = B_FREE; /* Flag as unused */ + bh->b_this_page = head; + head = bh; + + bh->b_state = 0; + bh->b_next_free = NULL; + bh->b_pprev = NULL; + atomic_set(&bh->b_count, 0); + bh->b_size = size; + + set_bh_page(bh, page, offset); + + bh->b_list = BUF_CLEAN; + bh->b_end_io = NULL; + } + return head; +/* + * In case anything failed, we just free everything we got. + */ +no_grow: + if (head) { + spin_lock(&unused_list_lock); + do { + bh = head; + head = head->b_this_page; + __put_unused_buffer_head(bh); + } while (head); + spin_unlock(&unused_list_lock); + + /* Wake up any waiters ... */ + wake_up(&buffer_wait); + } + + /* + * Return failure for non-async IO requests. Async IO requests + * are not allowed to fail, so we have to wait until buffer heads + * become available. But we don't want tasks sleeping with + * partially complete buffers, so all were released above. + */ + if (!async) + return NULL; + + /* We're _really_ low on memory. Now we just + * wait for old buffer heads to become free due to + * finishing IO. 
Since this is an async request and + * the reserve list is empty, we're sure there are + * async buffer heads in use. + */ + run_task_queue(&tq_disk); + + /* + * Set our state for sleeping, then check again for buffer heads. + * This ensures we won't miss a wake_up from an interrupt. + */ + wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE); + goto try_again; +} + +static void unmap_buffer(struct buffer_head * bh) +{ + if (buffer_mapped(bh)) { + mark_buffer_clean(bh); + wait_on_buffer(bh); + clear_bit(BH_Uptodate, &bh->b_state); + clear_bit(BH_Mapped, &bh->b_state); + clear_bit(BH_Req, &bh->b_state); + clear_bit(BH_New, &bh->b_state); + } +} + +/* + * We don't have to release all buffers here, but + * we have to be sure that no dirty buffer is left + * and no IO is going on (no buffer is locked), because + * we have truncated the file and are going to free the + * blocks on-disk.. + */ +int block_flushpage(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + + if (!PageLocked(page)) + BUG(); + if (!page->buffers) + return 1; + + head = page->buffers; + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + /* + * is this block fully flushed? + */ + if (offset <= curr_off) + unmap_buffer(bh); + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * subtle. We release buffer-heads only if this is + * the 'final' flushpage. We have invalidated the get_block + * cached value unconditionally, so real IO is not + * possible anymore. + * + * If the free doesn't work out, the buffers can be + * left around - they just turn into anonymous buffers + * instead. + */ + if (!offset) { + if (!try_to_free_buffers(page, 0)) { + atomic_inc(&buffermem_pages); + return 0; + } + } + + return 1; +} + +static void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize) +{ + struct buffer_head *bh, *head, *tail; + + head = create_buffers(page, blocksize, 1); + if (page->buffers) + BUG(); + + bh = head; + do { + bh->b_dev = dev; + bh->b_blocknr = 0; + bh->b_end_io = NULL; + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + page->buffers = head; + page_cache_get(page); +} + +/* + * We are taking a block for data and we don't want any output from any + * buffer-cache aliases starting from return from that function and + * until the moment when something will explicitly mark the buffer + * dirty (hopefully that will not happen until we will free that block ;-) + * We don't even need to mark it not-uptodate - nobody can expect + * anything from a newly allocated buffer anyway. We used to used + * unmap_buffer() for such invalidation, but that was wrong. We definitely + * don't want to mark the alias unmapped, for example - it would confuse + * anyone who might pick it with bread() afterwards... + */ + +static void unmap_underlying_metadata(struct buffer_head * bh) +{ + struct buffer_head *old_bh; + + old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); + if (old_bh) { + mark_buffer_clean(old_bh); + wait_on_buffer(old_bh); + clear_bit(BH_Req, &old_bh->b_state); + /* Here we could run brelse or bforget. We use + bforget because it will try to put the buffer + in the freelist. */ + __bforget(old_bh); + } +} + +/* + * NOTE! 
All mapped/uptodate combinations are valid: + * + * Mapped Uptodate Meaning + * + * No No "unknown" - must do get_block() + * No Yes "hole" - zero-filled + * Yes No "allocated" - allocated on disk, not read in + * Yes Yes "valid" - allocated and up-to-date in memory. + * + * "Dirty" is valid only with the last case (mapped+uptodate). + */ + +/* + * block_write_full_page() is SMP-safe - currently it's still + * being called with the kernel lock held, but the code is ready. + */ +static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block) +{ + int err, i; + unsigned long block; + struct buffer_head *bh, *head; + int need_unlock = 1; + + if (!PageLocked(page)) + BUG(); + + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, inode->i_sb->s_blocksize); + head = page->buffers; + + block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + bh = head; + i = 0; + + /* Stage 1: make sure we have all the buffers mapped! */ + do { + /* + * If the buffer isn't up-to-date, we can't be sure + * that the buffer has been initialized with the proper + * block number information etc.. + * + * Leave it to the low-level FS to make all those + * decisions (block #0 may actually be a valid block) + */ + if (!buffer_mapped(bh)) { + err = get_block(inode, block, bh, 1); + if (err) + goto out; + if (buffer_new(bh)) + unmap_underlying_metadata(bh); + } + bh = bh->b_this_page; + block++; + } while (bh != head); + + /* Stage 2: lock the buffers, mark them clean */ + do { + lock_buffer(bh); + bh->b_end_io = end_buffer_io_async; + atomic_inc(&bh->b_count); + set_bit(BH_Uptodate, &bh->b_state); + clear_bit(BH_Dirty, &bh->b_state); + bh = bh->b_this_page; + } while (bh != head); + + SetPageUptodate(page); + /* Stage 3: submit the IO */ + do { + submit_bh(WRITE, bh); + bh = bh->b_this_page; + } while (bh != head); + + /* Done - end_buffer_io_async will unlock */ + return 0; + +out: + ClearPageUptodate(page); + bh = head; + need_unlock = 1; + /* Recovery: lock and submit the mapped buffers */ + do { + if (buffer_mapped(bh)) { + lock_buffer(bh); + need_unlock = 0; + } + bh = bh->b_this_page; + } while (bh != head); + do { + if (buffer_mapped(bh)) { + bh->b_end_io = end_buffer_io_async; + atomic_inc(&bh->b_count); + set_bit(BH_Uptodate, &bh->b_state); + clear_bit(BH_Dirty, &bh->b_state); + submit_bh(WRITE, bh); + } + bh = bh->b_this_page; + } while(bh != head); + if (need_unlock) + UnlockPage(page); + return err; +} + +static int __block_prepare_write(struct inode *inode, struct page *page, + unsigned from, unsigned to, get_block_t *get_block) +{ + unsigned block_start, block_end; + unsigned long block; + int err = 0; + unsigned blocksize, bbits; + struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; + char *kaddr = kmap(page); + + blocksize = inode->i_sb->s_blocksize; + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, blocksize); + head = page->buffers; + + bbits = inode->i_sb->s_blocksize_bits; + block = page->index << (PAGE_CACHE_SHIFT - bbits); + + for(bh = head, block_start = 0; bh != head || !block_start; + block++, block_start=block_end, bh = bh->b_this_page) { + if (!bh) + BUG(); + block_end = block_start+blocksize; + if (block_end <= from) + continue; + if (block_start >= to) + break; + if (!buffer_mapped(bh)) { + err = get_block(inode, block, bh, 1); + if (err) + goto out; + if (buffer_new(bh)) { + unmap_underlying_metadata(bh); + if (Page_Uptodate(page)) { + set_bit(BH_Uptodate, &bh->b_state); + continue; + } + if (block_end > 
to) + memset(kaddr+to, 0, block_end-to); + if (block_start < from) + memset(kaddr+block_start, 0, from-block_start); + if (block_end > to || block_start < from) + flush_dcache_page(page); + continue; + } + } + if (Page_Uptodate(page)) { + set_bit(BH_Uptodate, &bh->b_state); + continue; + } + if (!buffer_uptodate(bh) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } + } + /* + * If we issued read requests - let them complete. + */ + while(wait_bh > wait) { + wait_on_buffer(*--wait_bh); + err = -EIO; + if (!buffer_uptodate(*wait_bh)) + goto out; + } + return 0; +out: + bh = head; + block_start = 0; + do { + if (buffer_new(bh) && !buffer_uptodate(bh)) { + memset(kaddr+block_start, 0, bh->b_size); + set_bit(BH_Uptodate, &bh->b_state); + mark_buffer_dirty(bh); + } + block_start += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); + return err; +} + +static int __block_commit_write(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0, need_balance_dirty = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + + blocksize = inode->i_sb->s_blocksize; + + for(bh = head = page->buffers, block_start = 0; + bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_bit(BH_Uptodate, &bh->b_state); + if (!atomic_set_buffer_dirty(bh)) { + __mark_dirty(bh); + buffer_insert_inode_queue(bh, inode); + need_balance_dirty = 1; + } + } + } + + if (need_balance_dirty) + balance_dirty(bh->b_dev); + /* + * is this a partial write that happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' wether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return 0; +} + +/* + * Generic "read page" function for block devices that have the normal + * get_block functionality. This is most of the block device filesystems. + * Reads the page asynchronously --- the unlock_buffer() and + * mark_buffer_uptodate() functions propagate buffer state into the + * page struct once IO has completed. 
+ */ +int block_read_full_page(struct page *page, get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + unsigned long iblock, lblock; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + unsigned int blocksize, blocks; + int nr, i; + + if (!PageLocked(page)) + PAGE_BUG(page); + blocksize = inode->i_sb->s_blocksize; + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, blocksize); + head = page->buffers; + + blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits; + iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits; + bh = head; + nr = 0; + i = 0; + + do { + if (buffer_uptodate(bh)) + continue; + + if (!buffer_mapped(bh)) { + if (iblock < lblock) { + if (get_block(inode, iblock, bh, 0)) + continue; + } + if (!buffer_mapped(bh)) { + memset(kmap(page) + i*blocksize, 0, blocksize); + flush_dcache_page(page); + kunmap(page); + set_bit(BH_Uptodate, &bh->b_state); + continue; + } + /* get_block() might have updated the buffer synchronously */ + if (buffer_uptodate(bh)) + continue; + } + + arr[nr] = bh; + nr++; + } while (i++, iblock++, (bh = bh->b_this_page) != head); + + if (!nr) { + /* + * all buffers are uptodate - we can set the page + * uptodate as well. + */ + SetPageUptodate(page); + UnlockPage(page); + return 0; + } + + /* Stage two: lock the buffers */ + for (i = 0; i < nr; i++) { + struct buffer_head * bh = arr[i]; + lock_buffer(bh); + bh->b_end_io = end_buffer_io_async; + atomic_inc(&bh->b_count); + } + + /* Stage 3: start the IO */ + for (i = 0; i < nr; i++) + submit_bh(READ, arr[i]); + + return 0; +} + +/* + * For moronic filesystems that do not allow holes in file. + * We may have to extend the file. + */ + +int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct page *new_page; + unsigned long pgpos; + long status; + unsigned zerofrom; + unsigned blocksize = inode->i_sb->s_blocksize; + char *kaddr; + + while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { + status = -ENOMEM; + new_page = grab_cache_page(mapping, pgpos); + if (!new_page) + goto out; + /* we might sleep */ + if (*bytes>>PAGE_CACHE_SHIFT != pgpos) { + UnlockPage(new_page); + page_cache_release(new_page); + continue; + } + zerofrom = *bytes & ~PAGE_CACHE_MASK; + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + status = __block_prepare_write(inode, new_page, zerofrom, + PAGE_CACHE_SIZE, get_block); + if (status) + goto out_unmap; + kaddr = page_address(new_page); + memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); + flush_dcache_page(new_page); + __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE); + kunmap(new_page); + UnlockPage(new_page); + page_cache_release(new_page); + } + + if (page->index < pgpos) { + /* completely inside the area */ + zerofrom = offset; + } else { + /* page covers the boundary, find the boundary offset */ + zerofrom = *bytes & ~PAGE_CACHE_MASK; + + /* if we will expand the thing last block will be filled */ + if (to > zerofrom && (zerofrom & (blocksize-1))) { + *bytes |= (blocksize-1); + (*bytes)++; + } + + /* starting below the boundary? 
Nothing to zero out */ + if (offset <= zerofrom) + zerofrom = offset; + } + status = __block_prepare_write(inode, page, zerofrom, to, get_block); + if (status) + goto out1; + kaddr = page_address(page); + if (zerofrom < offset) { + memset(kaddr+zerofrom, 0, offset-zerofrom); + flush_dcache_page(page); + __block_commit_write(inode, page, zerofrom, offset); + } + return 0; +out1: + ClearPageUptodate(page); + kunmap(page); + return status; + +out_unmap: + ClearPageUptodate(new_page); + kunmap(new_page); + UnlockPage(new_page); + page_cache_release(new_page); +out: + return status; +} + +int block_prepare_write(struct page *page, unsigned from, unsigned to, + get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + int err = __block_prepare_write(inode, page, from, to, get_block); + if (err) { + ClearPageUptodate(page); + kunmap(page); + } + return err; +} + +int generic_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + __block_commit_write(inode,page,from,to); + kunmap(page); + if (pos > inode->i_size) { + inode->i_size = pos; + mark_inode_dirty(inode); + } + return 0; +} + +int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) +{ + unsigned long index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head *bh; + int err; + + blocksize = inode->i_sb->s_blocksize; + length = offset & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, blocksize); + + /* Find the buffer that contains "offset" */ + bh = page->buffers; + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (!buffer_mapped(bh)) { + /* Hole? Nothing to do */ + if (buffer_uptodate(bh)) + goto unlock; + get_block(inode, iblock, bh, 0); + /* Still unmapped? Nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (Page_Uptodate(page)) + set_bit(BH_Uptodate, &bh->b_state); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + memset(kmap(page) + offset, 0, length); + flush_dcache_page(page); + kunmap(page); + + __mark_buffer_dirty(bh); + err = 0; + +unlock: + UnlockPage(page); + page_cache_release(page); +out: + return err; +} + +int block_write_full_page(struct page *page, get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + int err; + + /* easy case */ + if (page->index < end_index) + return __block_write_full_page(inode, page, get_block); + + /* things got complicated... */ + offset = inode->i_size & (PAGE_CACHE_SIZE-1); + /* OK, are we completely out? */ + if (page->index >= end_index+1 || !offset) { + UnlockPage(page); + return -EIO; + } + + /* Sigh... will have to work, then... 
*/ + err = __block_prepare_write(inode, page, 0, offset, get_block); + if (!err) { + memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + __block_commit_write(inode,page,0,offset); +done: + kunmap(page); + UnlockPage(page); + return err; + } + ClearPageUptodate(page); + goto done; +} + +int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block) +{ + struct buffer_head tmp; + struct inode *inode = mapping->host; + tmp.b_state = 0; + tmp.b_blocknr = 0; + get_block(inode, block, &tmp, 0); + return tmp.b_blocknr; +} + +static inline void brw_kio_put_iobuf(struct brw_cb *brw_cb, struct kiobuf *kiobuf) +{ + if (atomic_dec_and_test(&kiobuf->io_count)) { + int nr; + + /* Walk the buffer heads associated with this kiobuf + * checking for errors and freeing them as we go. + */ + for (nr=0; nr < brw_cb->nr; nr++) { + struct buffer_head *bh = brw_cb->bh[nr]; + if (buffer_uptodate(bh) && !kiobuf->errno) + kiobuf->transferred += bh->b_size; + else if (!kiobuf->errno) + kiobuf->errno = -EIO; + kmem_cache_free(bh_cachep, bh); + } + + if (kiobuf->end_io) + kiobuf->end_io(kiobuf); + wake_up(&kiobuf->wait_queue); + + kfree(brw_cb); + } +} + +/* + * IO completion routine for a buffer_head being used for kiobuf IO: we + * can't dispatch the kiobuf callback until io_count reaches 0. + */ + +static void end_buffer_io_kiobuf_async(struct buffer_head *bh, int uptodate) +{ + struct brw_cb *brw_cb; + struct kiobuf *kiobuf; + + mark_buffer_uptodate(bh, uptodate); + + brw_cb = bh->b_private; + unlock_buffer(bh); + + kiobuf = brw_cb->kiobuf; + if (!uptodate && !kiobuf->errno) + brw_cb->kiobuf->errno = -EIO; + brw_kio_put_iobuf(brw_cb, kiobuf); +} + + +/* + * Start I/O on a physical range of kernel memory, defined by a vector + * of kiobuf structs (much like a user-space iovec list). + * + * The kiobuf must already be locked for IO. IO is submitted + * asynchronously: you need to check page->locked, page->uptodate, and + * maybe wait on page->wait. + * + * It is up to the caller to make sure that there are enough blocks + * passed in to completely map the iobufs to disk. + */ + +int brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int err; + int length; + int bufind; + int pageind; + int bhind; + int offset; + unsigned long blocknr; + struct kiobuf * iobuf = NULL; + struct page * map; + struct buffer_head *tmp; + int bh_nr; + int i; + +#define MAX_KIOVEC_NR 8 + struct brw_cb *brw_cb_table[MAX_KIOVEC_NR]; + struct brw_cb *brw_cb; + + if (!nr) + return 0; + + if (nr > MAX_KIOVEC_NR) { + printk("kiovec too large: %d\n", nr); + BUG(); + } + + /* + * First, do some alignment and validity checks + */ + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + if ((iobuf->offset & (sector_size-1)) || + (iobuf->length & (sector_size-1))) { + printk("brw_kiovec_async: iobuf->offset=0x%x length=0x%x sector_size: 0x%x\n", iobuf->offset, iobuf->length, sector_size); + return -EINVAL; + } + + if (!iobuf->nr_pages) + panic("brw_kiovec: iobuf not initialised"); + } + + /* + * OK to walk down the iovec doing page IO on each page we find. 
+ */ + bufind = bhind = err = 0; + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + offset = iobuf->offset; + length = iobuf->length; + iobuf->errno = 0; + iobuf->transferred = 0; + atomic_inc(&iobuf->io_count); + + bh_nr = ((iobuf->nr_pages * PAGE_SIZE) - offset) / sector_size; + if (!bh_nr) { + printk("brw_kiovec_async: !bh_nr\n"); + return -EINVAL; + } + + /* FIXME: tie into userbeans here */ + brw_cb = kmalloc(sizeof(*brw_cb) + (bh_nr * sizeof(struct buffer_head *)), GFP_KERNEL); + if (!brw_cb) + return -ENOMEM; + + brw_cb_table[i] = brw_cb; + brw_cb->kiobuf = iobuf; + brw_cb->nr = 0; + + for (pageind = 0; pageind < iobuf->nr_pages; pageind++) { + map = iobuf->maplist[pageind]; + err = -EFAULT; + if (!map) + goto error; + + while (length > 0 && (bufind < nr_blocks)) { + blocknr = b[bufind++]; + tmp = kmem_cache_alloc(bh_cachep, SLAB_BUFFER); + err = -ENOMEM; + if (!tmp) + goto error; + + memset(tmp, 0, sizeof(*tmp)); + init_waitqueue_head(&tmp->b_wait); + tmp->b_dev = B_FREE; + tmp->b_size = sector_size; + set_bh_page(tmp, map, offset); + tmp->b_this_page = tmp; + + init_buffer(tmp, end_buffer_io_kiobuf_async, NULL); + tmp->b_dev = dev; + tmp->b_blocknr = blocknr; + tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req); + tmp->b_private = brw_cb; + + if (rw == WRITE) { + set_bit(BH_Uptodate, &tmp->b_state); + clear_bit(BH_Dirty, &tmp->b_state); + } + + brw_cb->bh[brw_cb->nr++] = tmp; + length -= sector_size; + offset += sector_size; + + atomic_inc(&iobuf->io_count); + + if (offset >= PAGE_SIZE) { + offset = 0; + break; + } + } /* End of block loop */ + } /* End of page loop */ + } /* End of iovec loop */ + + /* okay, we've setup all our io requests, now fire them off! */ + for (i = 0; i < nr; i++) { + int j; + brw_cb = brw_cb_table[i]; +#if 1 + for (j=0; j<brw_cb->nr; j++) + submit_bh(rw, brw_cb->bh[j]); + //ll_rw_block(rw, brw_cb->nr, brw_cb->bh); +#else + generic_make_requests(dev, rw, brw_cb->bh, brw_cb->nr); +#endif + brw_kio_put_iobuf(brw_cb, brw_cb->kiobuf); + } + + return 0; + + error: + /* Walk brw_cb_table freeing all the goop associated with each kiobuf */ + do { + brw_cb = brw_cb_table[i]; + if (brw_cb) { + /* We got an error allocating the bh'es. Just free the current + buffer_heads and exit. */ + for (bhind = brw_cb->nr; bhind--; ) + kmem_cache_free(bh_cachep, brw_cb->bh[bhind]); + atomic_dec(&brw_cb->kiobuf->io_count); + kfree(brw_cb); + } + } while (i--) ; + + return err; +} + +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int i; + int transferred = 0; + int err = 0; + + if (!nr) + return 0; + + /* queue up and trigger the io */ + err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size); + if (err) + goto out; + + /* wait on the last iovec first -- it's more likely to finish last */ + for (i=nr; --i >= 0; ) + kiobuf_wait_for_io(iovec[i]); + + run_task_queue(&tq_disk); + + /* okay, how much data actually got through? */ + for (i=0; i<nr; i++) { + if (iovec[i]->errno) { + if (!err) + err = iovec[i]->errno; + break; + } + transferred += iovec[i]->length; + } + +out: + return transferred ? transferred : err; +} + +/* + * Start I/O on a page. + * This function expects the page to be locked and may return + * before I/O is complete. You then have to check page->locked, + * page->uptodate, and maybe wait on page->wait. + * + * brw_page() is SMP-safe, although it's being called with the + * kernel lock held - but the code is ready. 
+ * + * FIXME: we need a swapper_inode->get_block function to remove + * some of the bmap kludges and interface ugliness here. + */ +int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size) +{ + struct buffer_head *head, *bh; + + if (!PageLocked(page)) + panic("brw_page: page not locked for I/O"); + + if (!page->buffers) + create_empty_buffers(page, dev, size); + head = bh = page->buffers; + + /* Stage 1: lock all the buffers */ + do { + lock_buffer(bh); + bh->b_blocknr = *(b++); + set_bit(BH_Mapped, &bh->b_state); + bh->b_end_io = end_buffer_io_async; + atomic_inc(&bh->b_count); + bh = bh->b_this_page; + } while (bh != head); + + /* Stage 2: start the IO */ + do { + submit_bh(rw, bh); + bh = bh->b_this_page; + } while (bh != head); + return 0; +} + +int block_symlink(struct inode *inode, const char *symname, int len) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + int err = -ENOMEM; + char *kaddr; + + if (!page) + goto fail; + err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); + if (err) + goto fail_map; + kaddr = page_address(page); + memcpy(kaddr, symname, len-1); + mapping->a_ops->commit_write(NULL, page, 0, len-1); + /* + * Notice that we are _not_ going to block here - end of page is + * unmapped, so this will only try to map the rest of page, see + * that it is unmapped (typically even will not look into inode - + * ->i_size will be enough for everything) and zero it out. + * OTOH it's obviously correct and should make the page up-to-date. + */ + err = mapping->a_ops->readpage(NULL, page); + wait_on_page(page); + page_cache_release(page); + if (err < 0) + goto fail; + mark_inode_dirty(inode); + return 0; +fail_map: + UnlockPage(page); + page_cache_release(page); +fail: + return err; +} + +/* + * Try to increase the number of buffers available: the size argument + * is used to determine what kind of buffers we want. + */ +static int grow_buffers(int size) +{ + struct page * page; + struct buffer_head *bh, *tmp; + struct buffer_head * insert_point; + int isize; + + if ((size & 511) || (size > PAGE_SIZE)) { + printk(KERN_ERR "VFS: grow_buffers: size = %d\n",size); + return 0; + } + + page = alloc_page(GFP_BUFFER); + if (!page) + goto out; + LockPage(page); + bh = create_buffers(page, size, 0); + if (!bh) + goto no_buffer_head; + + isize = BUFSIZE_INDEX(size); + + spin_lock(&free_list[isize].lock); + insert_point = free_list[isize].list; + tmp = bh; + while (1) { + if (insert_point) { + tmp->b_next_free = insert_point->b_next_free; + tmp->b_prev_free = insert_point; + insert_point->b_next_free->b_prev_free = tmp; + insert_point->b_next_free = tmp; + } else { + tmp->b_prev_free = tmp; + tmp->b_next_free = tmp; + } + insert_point = tmp; + if (tmp->b_this_page) + tmp = tmp->b_this_page; + else + break; + } + tmp->b_this_page = bh; + free_list[isize].list = bh; + spin_unlock(&free_list[isize].lock); + + page->buffers = bh; + page->flags &= ~(1 << PG_referenced); + lru_cache_add(page); + UnlockPage(page); + atomic_inc(&buffermem_pages); + return 1; + +no_buffer_head: + UnlockPage(page); + page_cache_release(page); +out: + return 0; +} + +/* + * Sync all the buffers on one page.. + * + * If we have old buffers that are locked, we'll + * wait on them, but we won't wait on the new ones + * we're writing out now. + * + * This all is required so that we can free up memory + * later. 
+ * + * Wait: + * 0 - no wait (this does not get called - see try_to_free_buffers below) + * 1 - start IO for dirty buffers + * 2 - wait for completion of locked buffers + */ +static void sync_page_buffers(struct buffer_head *bh, int wait) +{ + struct buffer_head * tmp = bh; + + do { + struct buffer_head *p = tmp; + tmp = tmp->b_this_page; + if (buffer_locked(p)) { + if (wait > 1) + __wait_on_buffer(p); + } else if (buffer_dirty(p)) + ll_rw_block(WRITE, 1, &p); + } while (tmp != bh); +} + +/* + * Can the buffer be thrown out? + */ +#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected)) +#define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS)) + +/* + * try_to_free_buffers() checks if all the buffers on this particular page + * are unused, and free's the page if so. + * + * Wake up bdflush() if this fails - if we're running low on memory due + * to dirty buffers, we need to flush them out as quickly as possible. + * + * NOTE: There are quite a number of ways that threads of control can + * obtain a reference to a buffer head within a page. So we must + * lock out all of these paths to cleanly toss the page. + */ +int try_to_free_buffers(struct page * page, int wait) +{ + struct buffer_head * tmp, * bh = page->buffers; + int index = BUFSIZE_INDEX(bh->b_size); + int loop = 0; + +cleaned_buffers_try_again: + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + spin_lock(&free_list[index].lock); + tmp = bh; + do { + struct buffer_head *p = tmp; + + tmp = tmp->b_this_page; + if (buffer_busy(p)) + goto busy_buffer_page; + } while (tmp != bh); + + spin_lock(&unused_list_lock); + tmp = bh; + do { + struct buffer_head * p = tmp; + tmp = tmp->b_this_page; + + /* The buffer can be either on the regular + * queues or on the free list.. + */ + if (p->b_dev != B_FREE) { + remove_inode_queue(p); + __remove_from_queues(p); + } else + __remove_from_free_list(p, index); + __put_unused_buffer_head(p); + } while (tmp != bh); + spin_unlock(&unused_list_lock); + + /* Wake up anyone waiting for buffer heads */ + wake_up(&buffer_wait); + + /* And free the page */ + page->buffers = NULL; + page_cache_release(page); + spin_unlock(&free_list[index].lock); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + return 1; + +busy_buffer_page: + /* Uhhuh, start writeback so that we don't end up with all dirty pages */ + spin_unlock(&free_list[index].lock); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + if (wait) { + sync_page_buffers(bh, wait); + /* We waited synchronously, so we can free the buffers. 
*/ + if (wait > 1 && !loop) { + loop = 1; + goto cleaned_buffers_try_again; + } + wakeup_bdflush(0); + } + return 0; +} + +/* ================== Debugging =================== */ + +void show_buffers(void) +{ +#ifdef CONFIG_SMP + struct buffer_head * bh; + int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0; + int protected = 0; + int nlist; + static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", }; +#endif + + printk("Buffer memory: %6dkB\n", + atomic_read(&buffermem_pages) << (PAGE_SHIFT-10)); + +#ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */ + if (!spin_trylock(&lru_list_lock)) + return; + for(nlist = 0; nlist < NR_LIST; nlist++) { + found = locked = dirty = used = lastused = protected = 0; + bh = lru_list[nlist]; + if(!bh) continue; + + do { + found++; + if (buffer_locked(bh)) + locked++; + if (buffer_protected(bh)) + protected++; + if (buffer_dirty(bh)) + dirty++; + if (atomic_read(&bh->b_count)) + used++, lastused = found; + bh = bh->b_next_free; + } while (bh != lru_list[nlist]); + { + int tmp = nr_buffers_type[nlist]; + if (found != tmp) + printk("%9s: BUG -> found %d, reported %d\n", + buf_types[nlist], found, tmp); + } + printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), " + "%d locked, %d protected, %d dirty\n", + buf_types[nlist], found, size_buffers_type[nlist]>>10, + used, lastused, locked, protected, dirty); + } + spin_unlock(&lru_list_lock); +#endif +} + +/* ===================== Init ======================= */ + +/* + * allocate the hash table and init the free list + * Use gfp() for the hash table to decrease TLB misses, use + * SLAB cache for buffer heads. + */ +void __init buffer_init(unsigned long mempages) +{ + int order, i; + unsigned int nr_hash; + + /* The buffer cache hash table is less important these days, + * trim it a bit. + */ + mempages >>= 14; + + mempages *= sizeof(struct buffer_head *); + + for (order = 0; (1 << order) < mempages; order++) + ; + + /* try to allocate something until we get it or we're asking + for something that is really too small */ + + do { + unsigned long tmp; + + nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *); + bh_hash_mask = (nr_hash - 1); + + tmp = nr_hash; + bh_hash_shift = 0; + while((tmp >>= 1UL) != 0UL) + bh_hash_shift++; + + hash_table = (struct buffer_head **) + __get_free_pages(GFP_ATOMIC, order); + } while (hash_table == NULL && --order > 0); + printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", + nr_hash, order, (PAGE_SIZE << order)); + + if (!hash_table) + panic("Failed to allocate buffer hash table\n"); + + /* Setup hash chains. */ + for(i = 0; i < nr_hash; i++) + hash_table[i] = NULL; + + /* Setup free lists. */ + for(i = 0; i < NR_SIZES; i++) { + free_list[i].list = NULL; + free_list[i].lock = SPIN_LOCK_UNLOCKED; + } + + /* Setup lru lists. */ + for(i = 0; i < NR_LIST; i++) + lru_list[i] = NULL; + +} + + +/* ====================== bdflush support =================== */ + +/* This is a simple kernel daemon, whose job it is to provide a dynamic + * response to dirty buffers. Once this process is activated, we write back + * a limited number of buffers to the disks and then go back to sleep again. + */ + +/* This is the _only_ function that deals with flushing async writes + to disk. + NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list + as all dirty buffers lives _only_ in the DIRTY lru list. + As we never browse the LOCKED and CLEAN lru lists they are infact + completly useless. 
*/ +static int flush_dirty_buffers(int check_flushtime) +{ + struct buffer_head * bh, *next; + int flushed = 0, i; + + restart: + spin_lock(&lru_list_lock); + bh = lru_list[BUF_DIRTY]; + if (!bh) + goto out_unlock; + for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) { + next = bh->b_next_free; + + if (!buffer_dirty(bh)) { + __refile_buffer(bh); + continue; + } + if (buffer_locked(bh)) + continue; + + if (check_flushtime) { + /* The dirty lru list is chronologically ordered so + if the current bh is not yet timed out, + then also all the following bhs + will be too young. */ + if (time_before(jiffies, bh->b_flushtime)) + goto out_unlock; + } else { + if (++flushed > bdf_prm.b_un.ndirty) + goto out_unlock; + } + + /* OK, now we are committed to write it out. */ + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + ll_rw_block(WRITE, 1, &bh); + atomic_dec(&bh->b_count); + + if (current->need_resched) { + /* kick what we've already pushed down */ + run_task_queue(&tq_disk); + schedule(); + } + goto restart; + } + out_unlock: + spin_unlock(&lru_list_lock); + + return flushed; +} + +struct task_struct *bdflush_tsk = 0; + +void wakeup_bdflush(int block) +{ + if (current != bdflush_tsk) { + wake_up_process(bdflush_tsk); + + if (block) + flush_dirty_buffers(0); + } +} + +/* + * Here we attempt to write back old buffers. We also try to flush inodes + * and supers as well, since this function is essentially "update", and + * otherwise there would be no way of ensuring that these quantities ever + * get written back. Ideally, we would have a timestamp on the inodes + * and superblocks so that we could write back only the old ones as well + */ + +static int sync_old_buffers(void) +{ + lock_kernel(); + sync_supers(0); + sync_inodes(0); + unlock_kernel(); + + flush_dirty_buffers(1); + /* must really sync all the active I/O request to disk here */ + run_task_queue(&tq_disk); + return 0; +} + +int block_sync_page(struct page *page) +{ + run_task_queue(&tq_disk); + return 0; +} + +/* This is the interface to bdflush. As we get more sophisticated, we can + * pass tuning parameters to this "process", to adjust how it behaves. + * We would want to verify each parameter, however, to make sure that it + * is reasonable. */ + +asmlinkage long sys_bdflush(int func, long data) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (func == 1) { + /* do_exit directly and let kupdate to do its work alone. */ + do_exit(0); +#if 0 /* left here as it's the only example of lazy-mm-stuff used from + a syscall that doesn't care about the current mm context. */ + int error; + struct mm_struct *user_mm; + + /* + * bdflush will spend all of it's time in kernel-space, + * without touching user-space, so we can switch it into + * 'lazy TLB mode' to reduce the cost of context-switches + * to and from bdflush. + */ + user_mm = start_lazy_tlb(); + error = sync_old_buffers(); + end_lazy_tlb(user_mm); + return error; +#endif + } + + /* Basically func 1 means read param 1, 2 means write param 1, etc */ + if (func >= 2) { + int i = (func-2) >> 1; + if (i >= 0 && i < N_PARAM) { + if ((func & 1) == 0) + return put_user(bdf_prm.data[i], (int*)data); + + if (data >= bdflush_min[i] && data <= bdflush_max[i]) { + bdf_prm.data[i] = data; + return 0; + } + } + return -EINVAL; + } + + /* Having func 0 used to launch the actual bdflush and then never + * return (unless explicitly killed). We return zero here to + * remain semi-compatible with present update(8) programs. 
*/ + return 0; +} + +/* + * This is the actual bdflush daemon itself. It used to be started from + * the syscall above, but now we launch it ourselves internally with + * kernel_thread(...) directly after the first thread in init/main.c + */ +int bdflush(void *sem) +{ + struct task_struct *tsk = current; + int flushed; + /* + * We have a bare-bones task_struct, and really should fill + * in a few more things so "top" and /proc/2/{exe,root,cwd} + * display semi-sane things. Not real crucial though... + */ + + tsk->session = 1; + tsk->pgrp = 1; + strcpy(tsk->comm, "bdflush"); + bdflush_tsk = tsk; + + /* avoid getting signals */ + spin_lock_irq(&tsk->sigmask_lock); + flush_signals(tsk); + sigfillset(&tsk->blocked); + recalc_sigpending(tsk); + spin_unlock_irq(&tsk->sigmask_lock); + + up((struct semaphore *)sem); + + for (;;) { + CHECK_EMERGENCY_SYNC + + flushed = flush_dirty_buffers(0); + if (free_shortage()) + flushed += page_launder(GFP_KERNEL, 0); + + /* + * If there are still a lot of dirty buffers around, + * skip the sleep and flush some more. Otherwise, we + * go to sleep waiting a wakeup. + */ + set_current_state(TASK_INTERRUPTIBLE); + if (!flushed || balance_dirty_state(NODEV) < 0) { + run_task_queue(&tq_disk); + schedule(); + } + /* Remember to mark us as running otherwise + the next schedule will block. */ + __set_current_state(TASK_RUNNING); + } +} + +/* + * This is the kernel update daemon. It was used to live in userspace + * but since it's need to run safely we want it unkillable by mistake. + * You don't need to change your userspace configuration since + * the userspace `update` will do_exit(0) at the first sys_bdflush(). + */ +int kupdate(void *sem) +{ + struct task_struct * tsk = current; + int interval; + + tsk->session = 1; + tsk->pgrp = 1; + strcpy(tsk->comm, "kupdated"); + + /* sigstop and sigcont will stop and wakeup kupdate */ + spin_lock_irq(&tsk->sigmask_lock); + sigfillset(&tsk->blocked); + siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP)); + recalc_sigpending(tsk); + spin_unlock_irq(&tsk->sigmask_lock); + + up((struct semaphore *)sem); + + for (;;) { + /* update interval */ + interval = bdf_prm.b_un.interval; + if (interval) { + tsk->state = TASK_INTERRUPTIBLE; + schedule_timeout(interval); + } else { + stop_kupdate: + tsk->state = TASK_STOPPED; + schedule(); /* wait for SIGCONT */ + } + /* check for sigstop */ + if (signal_pending(tsk)) { + int stopped = 0; + spin_lock_irq(&tsk->sigmask_lock); + if (sigismember(&tsk->pending.signal, SIGSTOP)) { + sigdelset(&tsk->pending.signal, SIGSTOP); + stopped = 1; + } + recalc_sigpending(tsk); + spin_unlock_irq(&tsk->sigmask_lock); + if (stopped) + goto stop_kupdate; + } +#ifdef DEBUG + printk(KERN_DEBUG "kupdate() activated...\n"); +#endif + sync_old_buffers(); + } +} + +static int __init bdflush_init(void) +{ + DECLARE_MUTEX_LOCKED(sem); + kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + down(&sem); + kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + down(&sem); + return 0; +} + +module_init(bdflush_init) + +/* async kio interface */ +struct brw_cb { + struct kiobuf *kiobuf; + int nr; + struct buffer_head *bh[1]; +}; + +static inline void brw_kio_put_iobuf(struct brw_cb *brw_cb, struct kiobuf *kiobuf) +{ + if (atomic_dec_and_test(&kiobuf->io_count)) { + int nr; + + /* Walk the buffer heads associated with this kiobuf + * checking for errors and freeing them as we go. 
+ */ + for (nr=0; nr < brw_cb->nr; nr++) { + struct buffer_head *bh = brw_cb->bh[nr]; + if (buffer_uptodate(bh) && !kiobuf->errno) + kiobuf->transferred += bh->b_size; + else if (!kiobuf->errno) + kiobuf->errno = -EIO; + kmem_cache_free(bh_cachep, bh); + } + + if (kiobuf->end_io) + kiobuf->end_io(kiobuf); + wake_up(&kiobuf->wait_queue); + + kfree(brw_cb); + } +} + +/* + * IO completion routine for a buffer_head being used for kiobuf IO: we + * can't dispatch the kiobuf callback until io_count reaches 0. + */ + +static void end_buffer_io_kiobuf_async(struct buffer_head *bh, int uptodate) +{ + struct brw_cb *brw_cb; + struct kiobuf *kiobuf; + + mark_buffer_uptodate(bh, uptodate); + + brw_cb = bh->b_private; + unlock_buffer(bh); + + kiobuf = brw_cb->kiobuf; + if (!uptodate && !kiobuf->errno) + brw_cb->kiobuf->errno = -EIO; + brw_kio_put_iobuf(brw_cb, kiobuf); +} + + +/* + * Start I/O on a physical range of kernel memory, defined by a vector + * of kiobuf structs (much like a user-space iovec list). + * + * The kiobuf must already be locked for IO. IO is submitted + * asynchronously: you need to check page->locked, page->uptodate, and + * maybe wait on page->wait. + * + * It is up to the caller to make sure that there are enough blocks + * passed in to completely map the iobufs to disk. + */ + +int brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int err; + int length; + int bufind; + int pageind; + int bhind; + int offset; + unsigned long blocknr; + struct kiobuf * iobuf = NULL; + struct page * map; + struct buffer_head *tmp; + int bh_nr; + int i; + +#define MAX_KIOVEC_NR 8 + struct brw_cb *brw_cb_table[MAX_KIOVEC_NR]; + struct brw_cb *brw_cb; + + if (!nr) + return 0; + + if (nr > MAX_KIOVEC_NR) { + printk("kiovec too large: %d\n", nr); + BUG(); + } + + /* + * First, do some alignment and validity checks + */ + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + if ((iobuf->offset & (sector_size-1)) || + (iobuf->length & (sector_size-1))) { + printk("brw_kiovec_async: iobuf->offset=0x%x length=0x%x sector_size: 0x%x\n", iobuf->offset, iobuf->length, sector_size); + return -EINVAL; + } + + if (!iobuf->nr_pages) + panic("brw_kiovec: iobuf not initialised"); + } + + /* + * OK to walk down the iovec doing page IO on each page we find. 
+ */ + bufind = bhind = err = 0; + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + offset = iobuf->offset; + length = iobuf->length; + iobuf->errno = 0; + iobuf->transferred = 0; + atomic_inc(&iobuf->io_count); + + bh_nr = ((iobuf->nr_pages * PAGE_SIZE) - offset) / sector_size; + if (!bh_nr) { + printk("brw_kiovec_async: !bh_nr\n"); + return -EINVAL; + } + + /* FIXME: tie into userbeans here */ + brw_cb = kmalloc(sizeof(*brw_cb) + (bh_nr * sizeof(struct buffer_head *)), GFP_KERNEL); + if (!brw_cb) + return -ENOMEM; + + brw_cb_table[i] = brw_cb; + brw_cb->kiobuf = iobuf; + brw_cb->nr = 0; + + for (pageind = 0; pageind < iobuf->nr_pages; pageind++) { + map = iobuf->maplist[pageind]; + err = -EFAULT; + if (!map) + goto error; + + while (length > 0 && (bufind < nr_blocks)) { + blocknr = b[bufind++]; + tmp = kmem_cache_alloc(bh_cachep, SLAB_BUFFER); + err = -ENOMEM; + if (!tmp) + goto error; + + memset(tmp, 0, sizeof(*tmp)); + init_waitqueue_head(&tmp->b_wait); + tmp->b_dev = B_FREE; + tmp->b_size = sector_size; + set_bh_page(tmp, map, offset); + tmp->b_this_page = tmp; + + init_buffer(tmp, end_buffer_io_kiobuf_async, NULL); + tmp->b_dev = dev; + tmp->b_blocknr = blocknr; + tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req); + tmp->b_private = brw_cb; + + if (rw == WRITE) { + set_bit(BH_Uptodate, &tmp->b_state); + clear_bit(BH_Dirty, &tmp->b_state); + } + + brw_cb->bh[brw_cb->nr++] = tmp; + length -= sector_size; + offset += sector_size; + + atomic_inc(&iobuf->io_count); + + if (offset >= PAGE_SIZE) { + offset = 0; + break; + } + } /* End of block loop */ + } /* End of page loop */ + } /* End of iovec loop */ + + /* okay, we've setup all our io requests, now fire them off! */ + for (i = 0; i < nr; i++) { + int j; + brw_cb = brw_cb_table[i]; +#if 1 + for (j=0; j<brw_cb->nr; j++) + submit_bh(rw, brw_cb->bh[j]); + //ll_rw_block(rw, brw_cb->nr, brw_cb->bh); +#else + generic_make_requests(dev, rw, brw_cb->bh, brw_cb->nr); +#endif + brw_kio_put_iobuf(brw_cb, brw_cb->kiobuf); + } + + return 0; + + error: + /* Walk brw_cb_table freeing all the goop associated with each kiobuf */ + do { + brw_cb = brw_cb_table[i]; + if (brw_cb) { + /* We got an error allocating the bh'es. Just free the current + buffer_heads and exit. */ + for (bhind = brw_cb->nr; bhind--; ) + kmem_cache_free(bh_cachep, brw_cb->bh[bhind]); + atomic_dec(&brw_cb->kiobuf->io_count); + kfree(brw_cb); + } + } while (i--) ; + + return err; +} + +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int i; + int transferred = 0; + int err = 0; + + if (!nr) + return 0; + + /* queue up and trigger the io */ + err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size); + if (err) + goto out; + + /* wait on the last iovec first -- it's more likely to finish last */ + for (i=nr; --i >= 0; ) + kiobuf_wait_for_io(iovec[i]); + + run_task_queue(&tq_disk); + + /* okay, how much data actually got through? */ + for (i=0; i<nr; i++) { + if (iovec[i]->errno) { + if (!err) + err = iovec[i]->errno; + break; + } + transferred += iovec[i]->length; + } + +out: + return transferred ? 
transferred : err; +} diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/asm-i386/unistd.h ac10-aio/include/asm-i386/unistd.h --- /md0/kernels/2.4/v2.4.4-ac10/include/asm-i386/unistd.h Fri Aug 11 17:39:23 2000 +++ ac10-aio/include/asm-i386/unistd.h Thu May 24 17:53:04 2001 @@ -227,6 +227,11 @@ #define __NR_madvise1 219 /* delete when C lib stub is removed */ #define __NR_getdents64 220 #define __NR_fcntl64 221 +/* reserved for tux 222 */ +#define __NR___io_cancel 224 +#define __NR___io_wait 225 +#define __NR___io_getevents 226 +#define __NR_submit_ios 227 /* user-visible error numbers are in the range -1 - -124: see */ diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/aio.h ac10-aio/include/linux/aio.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/aio.h Wed Dec 31 19:00:00 1969 +++ ac10-aio/include/linux/aio.h Thu May 24 17:53:04 2001 @@ -0,0 +1,130 @@ +/* linux/aio.h + * Written by Benjamin LaHaise + */ +#ifndef __AIO_H__ +#define __AIO_H__ + +#define IOCB_CMD_FINISHING -3 /* kernel internal */ + +#define IOCB_CMD_READ 0 +#define IOCB_CMD_WRITE 1 +#define IOCB_CMD_NOP 2 +#define IOCB_CMD_CANCEL 3 +#define IOCB_CMD_FSYNC 4 +#define IOCB_CMD_FDSYNC 5 +#define IOCB_CMD_RUNNING 6 +#define IOCB_CMD_DONE 7 + +#define AIO_RING_SIZE 8000 + +/* Notification method. Not implemented yet. */ +#define AIO_IOCTL_SET_NOTIFY_SIGNAL 0x10c11005 + +struct io_group { + int nr; + void *data; + struct iocb **list; +}; + +struct io_group_list { + int nr; + struct io_group *list; +}; + +/* read() from /dev/aio returns these structures. */ +enum io_event_types { + IO_EVENT_NONE, + IO_EVENT_IOCB_DONE, +}; + +struct io_event { + long type; + long flags; + long key; + void *data; +}; + +struct aio_ring { + unsigned long head; + unsigned long tail; + unsigned long woke; + unsigned long __reserved; + struct io_event io_events[AIO_RING_SIZE]; +}; + +/* + * we always use a 64bit off_t when communicating + * with userland. its up to libraries to do the + * proper padding and aio_error abstraction + * + * FIXME: this must change from glibc's definition + * as we do *not* use the sigevent structure which + * is big and bloated. + */ + +struct iocb { + int aio_fildes; + short aio_lio_opcode; + short aio_reqprio; + void *aio_buf; + size_t aio_nbytes; + loff_t aio_offset; + + /* these are internal to the kernel/libc. */ + ssize_t __aio_return; /* the kernel writes the return code here */ + long __aio_key; /* the kernel sets this to -1 when completed, + * otherwise is the >= 0 iogrp #. 
*/ +}; /* 32 bytes on 32 bit machines, 48 on 64 */ + +#ifdef __KERNEL__ +#define AIO_MAXSEGS 4 +#define AIO_KIOGRP_NR_ATOMIC 8 + +struct kiocb { + int nr_kiovec; + struct kiobuf *kiovec[AIO_MAXSEGS]; + struct iocb *user_aiocb; + struct file *filp; + long aio_return; +}; + +#define IOGRP_STATE_SETUP 0 +#define IOGRP_STATE_DONE 1 + +struct kiogrp { + int locked:1; + atomic_t count; /* ios left */ + void *user_data; + struct kioctx *ctx; + int idx; + int nr_iocbs; + struct kiocb **iocbs; + struct kiocb *atomic_iocbs[AIO_KIOGRP_NR_ATOMIC]; +}; + +struct kioctx { + atomic_t users; + + wait_queue_head_t wait; + + int max_reqs; + struct kiogrp **reqs; + + spinlock_t done_lock; + + int pid; /* pid to send wakeups to */ + struct aio_ring *ring; + struct file *filp; +}; + +extern struct file_operations aio_fops; + +extern void __aioctx_put(struct kioctx *ctx); + +#define aioctx_get(kioctx) atomic_inc(&(kioctx)->users) +#define aioctx_put(kioctx) do { if (atomic_dec_and_test(&(kioctx)->users)) __aioctx_put(kioctx); } while (0) + +#endif /*__KERNEL__*/ + +#endif /* __AIO_H__ */ + diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/blkdev.h ac10-aio/include/linux/blkdev.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/blkdev.h Thu May 17 15:25:12 2001 +++ ac10-aio/include/linux/blkdev.h Thu May 24 18:01:23 2001 @@ -149,7 +149,7 @@ extern struct blk_dev_struct blk_dev[MAX_BLKDEV]; extern void grok_partitions(struct gendisk *dev, int drive, unsigned minors, long size); extern void register_disk(struct gendisk *dev, kdev_t first, unsigned minors, struct block_device_operations *ops, long size); -extern void generic_make_request(int rw, struct buffer_head * bh); +extern void generic_make_request(int rw, struct buffer_head *bh); extern request_queue_t *blk_get_queue(kdev_t dev); extern inline request_queue_t *__blk_get_queue(kdev_t dev); extern void blkdev_release_request(struct request *); diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/event.h ac10-aio/include/linux/event.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/event.h Wed Dec 31 19:00:00 1969 +++ ac10-aio/include/linux/event.h Thu May 24 17:53:04 2001 @@ -0,0 +1,21 @@ +#ifndef _LINUX_KEVENTQ_H +#define _LINUX_KEVENTQ_H + +typedef struct file *keventq_t; + +keventq_t keventq_get(int qid); +#define keventq_put(evq) fput(evq) + +keventq_t keventq_get(int qid) +{ + struct file *filp = fget(qid); + if (filp) { + if (&keventq_fops == filp->f_op) + return filp; + fput(filp); + } + return NULL; +} + + +#endif diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/fs.h ac10-aio/include/linux/fs.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/fs.h Thu May 17 15:25:12 2001 +++ ac10-aio/include/linux/fs.h Thu May 24 18:01:23 2001 @@ -20,7 +20,6 @@ #include #include #include -#include #include @@ -762,7 +761,13 @@ * NOTE: * read, write, poll, fsync, readv, writev can be called * without the big kernel lock held in all filesystems. + * + * rw_kiovec returns the number of bytes that will actually + * be transferred into the kiovec, or an error that occurred + * during queueing. 
*/ +struct kiobuf; + struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); @@ -782,6 +787,7 @@ ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + int (*rw_kiovec)(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos); }; struct inode_operations { @@ -1323,6 +1329,7 @@ extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *); extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *); extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t); +extern int generic_file_rw_kiovec(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos); extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *); extern int generic_file_open(struct inode *, struct file *); diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/iobuf.h ac10-aio/include/linux/iobuf.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/iobuf.h Fri May 18 20:10:57 2001 +++ ac10-aio/include/linux/iobuf.h Thu May 24 18:01:23 2001 @@ -53,8 +53,10 @@ /* Dynamic state for IO completion: */ atomic_t io_count; /* IOs still in progress */ + int transferred; /* Number of bytes of completed IO at the beginning of the buffer */ int errno; /* Status of completed IO */ void (*end_io) (struct kiobuf *); /* Completion callback */ + void *end_io_data; wait_queue_head_t wait_queue; }; @@ -80,7 +82,9 @@ /* fs/buffer.c */ +int brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int size); int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], - kdev_t dev, unsigned long b[], int size); + kdev_t dev, int nr_blocks, unsigned long b[], int size); #endif /* __LINUX_IOBUF_H */ diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/locks.h ac10-aio/include/linux/locks.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/locks.h Thu May 17 15:25:12 2001 +++ ac10-aio/include/linux/locks.h Thu May 24 18:01:23 2001 @@ -30,8 +30,7 @@ { clear_bit(BH_Lock, &bh->b_state); smp_mb__after_clear_bit(); - if (waitqueue_active(&bh->b_wait)) - wake_up(&bh->b_wait); + wake_up(&bh->b_wait); } /* diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/mm.h ac10-aio/include/linux/mm.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/mm.h Thu May 17 15:25:12 2001 +++ ac10-aio/include/linux/mm.h Thu May 24 18:01:23 2001 @@ -315,8 +315,7 @@ smp_mb__before_clear_bit(); \ if (!test_and_clear_bit(PG_locked, &(page)->flags)) BUG(); \ smp_mb__after_clear_bit(); \ - if (waitqueue_active(&(page)->wait)) \ - wake_up(&(page)->wait); \ + wake_up(&(page)->wait); \ } while (0) #define PageError(page) test_bit(PG_error, &(page)->flags) #define SetPageError(page) set_bit(PG_error, &(page)->flags) diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/sched.h ac10-aio/include/linux/sched.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/sched.h Thu May 17 15:25:12 2001 +++ ac10-aio/include/linux/sched.h Thu May 24 18:01:23 2001 @@ -758,6 +758,7 @@ extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); +extern void FASTCALL(add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait)); extern void 
FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); #define __wait_event(wq, condition) \ diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/tqueue.h ac10-aio/include/linux/tqueue.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/tqueue.h Fri May 18 20:10:50 2001 +++ ac10-aio/include/linux/tqueue.h Thu May 24 18:01:23 2001 @@ -67,6 +67,7 @@ #define TQ_ACTIVE(q) (!list_empty(&q)) extern task_queue tq_timer, tq_immediate, tq_disk; +extern struct tq_struct run_disk_tq; /* * To implement your own list of active bottom halfs, use the following diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/wait.h ac10-aio/include/linux/wait.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/wait.h Thu May 17 15:25:12 2001 +++ ac10-aio/include/linux/wait.h Thu May 24 18:01:23 2001 @@ -28,17 +28,20 @@ #define WAITQUEUE_DEBUG 0 #endif +typedef struct __wait_queue wait_queue_t; +typedef void (*wait_queue_func_t)(wait_queue_t *wait); + struct __wait_queue { unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 struct task_struct * task; struct list_head task_list; + wait_queue_func_t func; #if WAITQUEUE_DEBUG long __magic; long __waker; #endif }; -typedef struct __wait_queue wait_queue_t; /* * 'dual' spinlock architecture. Can be switched between spinlock_t and @@ -137,6 +140,7 @@ #endif #define __WAITQUEUE_INITIALIZER(name, tsk) { \ + func: NULL, \ task: tsk, \ task_list: { NULL, NULL }, \ __WAITQUEUE_DEBUG_INIT(name)} @@ -174,6 +178,22 @@ #endif q->flags = 0; q->task = p; + q->func = NULL; +#if WAITQUEUE_DEBUG + q->__magic = (long)&q->__magic; +#endif +} + +static inline void init_waitqueue_func_entry(wait_queue_t *q, + wait_queue_func_t func) +{ +#if WAITQUEUE_DEBUG + if (!q || !func) + WQ_BUG(); +#endif + q->flags = 0; + q->task = NULL; + q->func = func; #if WAITQUEUE_DEBUG q->__magic = (long)&q->__magic; #endif @@ -230,6 +250,19 @@ #endif list_del(&old->task_list); } + +#define add_wait_queue_cond(q, wait, cond, fail) \ + do { \ + unsigned long flags; \ + wq_write_lock_irqsave(&(q)->lock, flags); \ + (wait)->flags = 0; \ + if (cond) \ + __add_wait_queue((q), (wait)); \ + else { \ + fail; \ + } \ + wq_write_unlock_irqrestore(&(q)->lock, flags); \ + } while (0) #endif /* __KERNEL__ */ diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/worktodo.h ac10-aio/include/linux/worktodo.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/worktodo.h Wed Dec 31 19:00:00 1969 +++ ac10-aio/include/linux/worktodo.h Thu May 24 18:01:25 2001 @@ -0,0 +1,40 @@ +#ifndef _LINUX_WORKTODO_H +#define _LINUX_WORKTODO_H + +#ifndef _LINUX_WAIT_H +#include <linux/wait.h> +#endif +#ifndef _LINUX_TQUEUE_H +#include <linux/tqueue.h> +#endif + +struct worktodo { + wait_queue_t wait; + struct tq_struct tq; + + void *data; /* for use by the wtd_ primitives */ +}; + +/* FIXME NOTE: factor from kernel/context.c */ +#define wtd_queue(wtd) schedule_task(&(wtd)->tq) + +#define wtd_set_action(wtd, action, wtddata) \ + do { \ + (wtd)->tq.routine = (action); \ + (wtd)->tq.data = (wtddata); \ + } while (0) + +struct page; +extern void wtd_wait_page(struct worktodo *wtd, struct page *page); +extern void wtd_lock_page(struct worktodo *wtd, struct page *page); +struct buffer_head; +extern void wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh); + +#if 0 /* not implemented yet */ +extern void wtd_down(struct worktodo *wtd, struct semaphore *sem); +extern void wtd_down_write(struct worktodo *wtd, struct rw_semaphore *sem); +extern void wtd_down_read(struct worktodo *wtd, struct rw_semaphore *sem); +#endif + +#endif /* _LINUX_WORKTODO_H */ + diff -urN 
/md0/kernels/2.4/v2.4.4-ac10/init/main.c ac10-aio/init/main.c --- /md0/kernels/2.4/v2.4.4-ac10/init/main.c Thu May 17 15:25:12 2001 +++ ac10-aio/init/main.c Thu May 24 17:53:02 2001 @@ -803,8 +803,13 @@ if (initrd_start && mount_initrd) root_mountflags &= ~MS_RDONLY; else mount_initrd =0; #endif - - start_context_thread(); + { + int i = smp_num_cpus; + if (i < 2) + i = 2; + for (; i>0; i--) + start_context_thread(); + } do_initcalls(); #ifdef CONFIG_IRDA diff -urN /md0/kernels/2.4/v2.4.4-ac10/kernel/context.c ac10-aio/kernel/context.c --- /md0/kernels/2.4/v2.4.4-ac10/kernel/context.c Thu May 17 15:25:12 2001 +++ ac10-aio/kernel/context.c Thu May 24 17:53:02 2001 @@ -91,12 +91,18 @@ */ for (;;) { set_task_state(curtask, TASK_INTERRUPTIBLE); - add_wait_queue(&context_task_wq, &wait); - if (TQ_ACTIVE(tq_context)) + add_wait_queue_exclusive_lifo(&context_task_wq, &wait); + if (spin_is_locked(&tqueue_lock) || TQ_ACTIVE(tq_context)) set_task_state(curtask, TASK_RUNNING); - schedule(); + else + schedule(); remove_wait_queue(&context_task_wq, &wait); run_task_queue(&tq_context); + while (TQ_ACTIVE(tq_context)) { + if (current->need_resched) + schedule(); + run_task_queue(&tq_context); + } wake_up(&context_task_done); if (signal_pending(curtask)) { while (waitpid(-1, (unsigned int *)0, __WALL|WNOHANG) > 0) diff -urN /md0/kernels/2.4/v2.4.4-ac10/kernel/fork.c ac10-aio/kernel/fork.c --- /md0/kernels/2.4/v2.4.4-ac10/kernel/fork.c Thu May 17 15:25:12 2001 +++ ac10-aio/kernel/fork.c Thu May 24 17:53:02 2001 @@ -44,6 +44,16 @@ wq_write_unlock_irqrestore(&q->lock, flags); } +void add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait) +{ + unsigned long flags; + + wq_write_lock_irqsave(&q->lock, flags); + wait->flags = WQ_FLAG_EXCLUSIVE; + __add_wait_queue(q, wait); + wq_write_unlock_irqrestore(&q->lock, flags); +} + void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait) { unsigned long flags; diff -urN /md0/kernels/2.4/v2.4.4-ac10/kernel/sched.c ac10-aio/kernel/sched.c --- /md0/kernels/2.4/v2.4.4-ac10/kernel/sched.c Thu May 17 15:25:12 2001 +++ ac10-aio/kernel/sched.c Thu May 24 17:53:02 2001 @@ -716,13 +716,13 @@ } /* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything - * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the - * non-exclusive tasks and one exclusive task. + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small + * +ve number) then we wake all the non-exclusive tasks and one exclusive task. * * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero - * in this (rare) case, and we handle it by contonuing to scan the queue. + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by contonuing to scan the queue. 
*/ static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode, int nr_exclusive, const int sync) @@ -735,14 +735,25 @@ list_for_each(tmp,&q->task_list) { unsigned int state; - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + wait_queue_func_t func; CHECK_MAGIC(curr->__magic); + func = curr->func; + if (func) { + unsigned flags = curr->flags; + func(curr); + if ((flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + continue; + } p = curr->task; state = p->state; if (state & mode) { WQ_NOTE_WAKER(curr); - if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + if (try_to_wake_up(p, sync) && + (curr->flags & WQ_FLAG_EXCLUSIVE) && + !--nr_exclusive) break; } } diff -urN /md0/kernels/2.4/v2.4.4-ac10/kernel/softirq.c ac10-aio/kernel/softirq.c --- /md0/kernels/2.4/v2.4.4-ac10/kernel/softirq.c Fri Dec 29 17:07:24 2000 +++ ac10-aio/kernel/softirq.c Thu May 24 17:53:02 2001 @@ -311,6 +311,7 @@ data = p->data; wmb(); p->sync = 0; + smp_mb(); if (f) f(data); } diff -urN /md0/kernels/2.4/v2.4.4-ac10/mm/filemap.c ac10-aio/mm/filemap.c --- /md0/kernels/2.4/v2.4.4-ac10/mm/filemap.c Thu May 17 15:25:12 2001 +++ ac10-aio/mm/filemap.c Thu May 24 17:53:02 2001 @@ -23,12 +23,14 @@ #include #include #include +#include #include #include #include #include +#include /* * Shared mappings implemented 30.11.1994. It's not fully working yet, @@ -2723,3 +2725,729 @@ panic("Failed to allocate page hash table\n"); memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *)); } + +/* address_space_map + * Maps a series of pages from the page cache into the given array. + */ +static int address_space_map(struct address_space *as, unsigned long index, + int nr, struct page **pages, + int *nr_newp, struct page **new_pages) +{ + struct page *cached_page = NULL; + int nr_new = 0; + int ret; + + ret = -EINVAL; + if (nr <= 0) + goto out; + + ret = 0; + + spin_lock(&pagecache_lock); + + while (nr > 0) { + struct page **hash = page_hash(as, index); + struct page *page; + + page = __find_page_nolock(as, index, *hash); + if (page) { + page_cache_get(page); +got_page: + pages[ret++] = page; + index++; + nr--; + continue; + } + + if (cached_page) { + __add_to_page_cache(cached_page, as, index, hash); + nr_new++; + *new_pages++ = page = cached_page; + cached_page = NULL; + goto got_page; + } + spin_unlock(&pagecache_lock); + + cached_page = page_cache_alloc(as); + if (!cached_page) + goto out; + + /* Okay, we now have an allocated page. Retry + * the search and add. */ + spin_lock(&pagecache_lock); + } + + spin_unlock(&pagecache_lock); + +out: + if (cached_page) + page_cache_free(cached_page); + + *nr_newp = nr_new; + return ret ? 
ret : -ENOMEM; +} + +struct iodesc { + struct worktodo wtd; + + struct page *good_page; /* the highest Uptodate page */ + int good_idx; + int err; + int did_read; + int rw; + + struct page **pages; + struct page **new_pages; + struct page **cur_pagep; + struct page **src_pagep; + int nr_pages; + int nr_new_pages; + + struct address_space *as; + struct file *file; + struct kiobuf *kiovec[8]; + int kio_nr; + + size_t size; + unsigned long transferred; + unsigned offset; + unsigned src_offset; + struct kiobuf *iobuf; + + int sync; + +#define READDESC_NR_DEF 3 + struct page *def_pages[READDESC_NR_DEF]; + struct page *def_new_pages[READDESC_NR_DEF]; +}; + +static void __iodesc_free(struct iodesc *io) +{ + int i; + + for (i=0; i<io->nr_pages; i++) + page_cache_release(io->pages[i]); + + if (io->new_pages != io->def_new_pages) + kfree(io->new_pages); + if (io->pages != io->def_pages) + kfree(io->pages); + kfree(io); +} + +/* By the time this function is called, all of the pages prior to + * the current good_idx have been released appropriately. The remaining + * duties are to release any remaining pages and to honour O_SYNC. + */ +static void __iodesc_finish_write(struct iodesc *io) +{ + int i; + + pr_debug("__iodesc_finish_write(%p)\n", io); + + if (WRITE == io->rw) + for (i=0; i<io->nr_pages; i++) { + struct page *page = io->pages[i]; + UnlockPage(page); + deactivate_page(page); + //page_cache_release(page); + } + + /* FIXME: this is buggy */ + { + struct kiobuf *iobuf = io->kiovec[0]; + iobuf->transferred = io->transferred; + iobuf->errno = io->err; + iobuf->end_io(iobuf); + } + + __iodesc_free(io); +} + +/* This is mostly ripped from generic_file_write */ +static int __iodesc_write_page(struct iodesc *io, struct page *page) +{ + unsigned long bytes; + unsigned long offset, src_offset; + struct page *src_page; + long status; + char *kaddr; + int src_bytes; + char *src; + int done = 0; + unsigned left; + + src_bytes = PAGE_CACHE_SIZE - io->src_offset; + src_page = *io->src_pagep; + src = kmap(src_page) + io->src_offset; + + offset = io->offset; + src_offset = io->src_offset; + kaddr = kmap(page); + kaddr += offset; + + bytes = PAGE_CACHE_SIZE - offset; + if (io->size < bytes) + bytes = io->size; + + pr_debug("__iodesc_write_page(%p (%lu), %lu %lu %lu)\n", page, page->index, offset, bytes, src_offset); + + io->err = io->as->a_ops->prepare_write(io->file, page, + offset, offset + bytes); + if (io->err) { +printk("prepare_write: %d\n", io->err); + goto unlock; + } + + left = bytes; + for (;;) { + if (left < src_bytes) + src_bytes = left; + + memcpy(kaddr, src, src_bytes); + kaddr += src_bytes; + src += src_bytes; + left -= src_bytes; + src_offset += src_bytes; + src_offset &= PAGE_SIZE - 1; + if (!src_offset) + io->src_pagep++; + + if (left <= 0) + break; + + if (!src_offset) { + kunmap(src_page); + src_page = *io->src_pagep; + src = kmap(src_page); + src_bytes = PAGE_SIZE; + } + } + flush_dcache_page(page); + status = io->as->a_ops->commit_write(io->file, page, + offset, offset+bytes); + + /* We don't handle short writes */ + if (status > 0 && status != bytes) + done = 1; + + if (!status) + status = bytes; +else +printk("commit_write: %ld\n", status); + + if (status > 0) { + io->transferred += status; + io->size -= status; + io->offset = (offset + status) & (PAGE_CACHE_SIZE - 1); + + if (io->offset) + done = 1; + + io->src_offset += status; + io->src_offset &= PAGE_CACHE_SIZE - 1; + } else { + io->err = status; + done = 1; + } + +unlock: + kunmap(page); + kunmap(src_page); + + //UnlockPage(page); + 
//deactivate_page(page); + //page_cache_release(page); + + return done; +} + +void __iodesc_sync_wait_page(void *data) +{ + struct iodesc *io = data; + + do { + struct buffer_head *bh, *head = io->pages[io->good_idx]->buffers; + + if (!head) + continue; + + bh = head; + do { + if (buffer_locked(bh)) { +//printk("waiting on bh=%pi io=%p\n", bh, io); + wtd_wait_on_buffer(&io->wtd, bh); + return; + } + if (buffer_req(bh) && !buffer_uptodate(bh)) { +//printk("io err bh=%p (%p)\n", bh, io); + io->err = -EIO; + break; + } + } while ((bh = bh->b_this_page) != head); + } while (!io->err && ++io->good_idx < io->nr_pages) ; + +//printk("finish_write(%p)\n", io); + __iodesc_finish_write(io); +} + +static void __iodesc_do_write(void *data) +{ + struct iodesc *io = data; + unsigned i; + + up(&io->file->f_dentry->d_inode->i_sem); + + for (i=0; i<io->nr_pages; i++) + if (__iodesc_write_page(io, io->pages[i])) + break; + + if (io->sync) { + io->good_idx = 0; + +//printk("writing out pages(%p)\n", io); + for (i=0; i<io->nr_pages; i++) { + if (io->pages[i]->buffers) + writeout_one_page(io->pages[i]); + } + +//printk("calling __iodesc_sync_wait_page(%p)\n", io); + wtd_set_action(&io->wtd, __iodesc_sync_wait_page, io); + __iodesc_sync_wait_page(io); + return; + } + + __iodesc_finish_write(io); +} + +static void __iodesc_write_lock_next_page(void *data) +{ + struct iodesc *io = data; + pr_debug("__iodesc_write_next_page(%p)\n", io); + + while (io->good_idx < io->nr_pages) { + io->good_page = io->pages[io->good_idx++]; + if (io->good_page == *io->cur_pagep) + io->cur_pagep++; + else { + wtd_lock_page(&io->wtd, io->good_page); + return; + } + } + + //__iodesc_do_write(io); + wtd_set_action(&io->wtd, __iodesc_do_write, io); + wtd_queue(&io->wtd); +} + +static +void __generic_file_write_iodesc(struct iodesc *io) +{ + struct inode *inode = io->file->f_dentry->d_inode; + time_t now = CURRENT_TIME; + + remove_suid(inode); + if (inode->i_ctime != now || inode->i_mtime != now) { + inode->i_ctime = inode->i_mtime = now; + mark_inode_dirty_sync(inode); + } + + wtd_set_action(&io->wtd, __iodesc_write_lock_next_page, io); + io->sync = !!(io->file->f_flags & O_SYNC); + io->good_idx = 0; + io->cur_pagep = io->new_pages; + io->src_offset = io->kiovec[0]->offset; + io->src_pagep = io->kiovec[0]->maplist; + __iodesc_write_lock_next_page(io); +} + +static void __iodesc_read_finish(struct iodesc *io) +{ + char *dst_addr, *src_addr; + int src_off, i; + size_t size; + size_t valid; + + struct page **src_pagep; + + pr_debug("__iodesc_read_finish: good_idx = %d\n", io->good_idx); + if (io->good_idx <= 0) + goto no_data; + + size = io->size; + src_off = io->offset; + src_pagep = io->pages; + src_addr = kmap(*src_pagep); + + valid = (size_t)io->good_idx << PAGE_CACHE_SHIFT; + valid -= src_off; + pr_debug("size=%d valid=%d src_off=%d\n", size, valid, src_off); + + if (valid < size) + size = valid; + + for (i=0; i<io->kio_nr; i++) { + struct kiobuf *iobuf = io->kiovec[i]; + int dst_len = iobuf->length; + int dst_off = iobuf->offset; + struct page **dst_pagep = iobuf->maplist; + + dst_addr = kmap(*dst_pagep); + iobuf->transferred = 0; + + while (size > 0) { + int this = PAGE_CACHE_SIZE - src_off; + if ((PAGE_SIZE - dst_off) < this) + this = PAGE_SIZE - dst_off; + if (size < this) + this = size; + pr_debug("this=%d src_off=%d dst_off=%d dst_len=%d\n", + this, src_off, dst_off, dst_len); + memcpy(dst_addr + dst_off, src_addr + src_off, this); + + src_off += this; + dst_off += this; + dst_len -= this; + size -= this; + iobuf->transferred += this; + 
pr_debug("read_finish: this=%d transferred=%d\n", this, iobuf->transferred); + + if (dst_len <= 0) + break; + + if (size <= 0) + break; + + if (dst_off >= PAGE_SIZE) { + kunmap(*dst_pagep); + dst_pagep++; + dst_addr = kmap(*dst_pagep); + dst_off = 0; + } + + if (src_off >= PAGE_SIZE) { /* FIXME: PAGE_CACHE_SIZE */ + kunmap(*src_pagep); +pr_debug("page(%lu)->count = %d\n", (*src_pagep)->index, atomic_read(&(*src_pagep)->count)); + src_pagep++; + src_addr = kmap(*src_pagep); + src_off = 0; + } + } + kunmap(*dst_pagep); + + iobuf->errno = iobuf->transferred ? 0 : io->err; + if (iobuf->errno && i) + iobuf->errno = -EAGAIN; + iobuf->end_io(iobuf); + } + + kunmap(*src_pagep); + __iodesc_free(io); + + return; + +no_data: + io->kiovec[0]->errno = io->err; + io->kiovec[0]->transferred = 0; + io->kiovec[0]->end_io(io->kiovec[0]); + + for (i=1; i<io->kio_nr; i++) { + struct kiobuf *iobuf = io->kiovec[i]; + + iobuf->errno = -EAGAIN; + iobuf->transferred = 0; + iobuf->end_io(iobuf); + } + __iodesc_free(io); +} + +static void __iodesc_make_uptodate(void *data) +{ + struct iodesc *io = data; + struct page *page = io->good_page; + int locked = 1; + + pr_debug("__iodesc_make_uptodate: io=%p index=%lu\n", io, page->index); + while (Page_Uptodate(page)) { +again: + pr_debug("page index %lu uptodate\n", page->index); + if (locked) { + UnlockPage(page); + locked = 0; + } + io->did_read = 0; + io->good_idx++; + if (io->good_idx >= io->nr_pages) { + __iodesc_read_finish(io); + return; + } + page = io->good_page = io->pages[io->good_idx]; + pr_debug("__iodesc_make_uptodate: index=%lu\n", page->index); + } + + if (!locked) { + wtd_lock_page(&io->wtd, page); + return; + } + + if (!io->did_read) { + /* We haven't tried reading this page before, give it a go. */ + printk("attempting to read %lu\n", page->index); + io->did_read = 1; + io->err = page->mapping->a_ops->readpage(io->file, page); + if (!io->err) { + if (Page_Uptodate(page)) + goto again; + wtd_lock_page(&io->wtd, page); + return; + } + } + + if (locked) + UnlockPage(page); + + /* We've already read this page before. Set err to EIO and quit */ + if (!io->err) + io->err = -EIO; + __iodesc_read_finish(io); +} + +static void __wtdgeneric_file_read_iodesc(void *data); + +static void __generic_file_read_iodesc(struct iodesc *io, int mayblock) +{ + int (*readpage)(struct file *, struct page *); + int i; + + wtd_set_action(&io->wtd, __iodesc_make_uptodate, io); + readpage = io->as->a_ops->readpage; + for (i=0; i<io->nr_new_pages; i++) { + int foo; + if (!mayblock) + goto do_wtd; + foo = readpage(io->file, io->new_pages[i]); + if (foo) + printk(KERN_DEBUG "__generic_file_read_kiovec: readpage(%lu) = %d\n", io->new_pages[i]->index, foo); + } + + for (i=0; i<io->nr_pages; i++) { + struct page *page = io->pages[i]; + if (Page_Uptodate(page)) { + pr_debug("__generic_file_read_iodesc: %lu is uptodate\n", page->index); + continue; + } + + if (!mayblock) + goto do_wtd; + if (!TryLockPage(page)) { + int foo = readpage(io->file, page); + if (foo) + printk(KERN_DEBUG "__generic_file_read_iodesc: readpage(%lu): %d\n", page->index, foo); + } + + if (!Page_Uptodate(page) && io->good_idx == -1) { + pr_debug("first good_idx=%d (%lu)\n", i, page->index); + io->good_idx = i; + io->good_page = page; + } + } + + /* Whee, all the pages are uptodate! 
*/ + if (!io->good_page) { + do {static int zoo; if (!mayblock && zoo++ < 5) printk("all uptodate\n");} while(0); + pr_debug("all pages uptodate!\n"); + io->good_idx = io->nr_pages; + __iodesc_read_finish(io); + return; + } + + pr_debug("locking good_page\n"); + wtd_lock_page(&io->wtd, io->good_page); + return; + +do_wtd: + do {static int zoo; if (zoo++ < 5) printk("read sleep\n");} while(0); + wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io); + wtd_queue(&io->wtd); +} + +static void __wtdgeneric_file_read_iodesc(void *data) +{ + struct iodesc *io = data; + __generic_file_read_iodesc(io, 1); +} + +int generic_file_rw_kiovec(struct file *file, int rw, + int kio_nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos) +{ + struct inode *inode = file->f_dentry->d_inode; + struct address_space *as = inode->i_mapping; + unsigned long index; + unsigned long eindex; + unsigned long nr_pages; + struct iodesc *io = NULL; + int ret; + + ret = -EINVAL; + if (rw != READ && rw != WRITE) + goto out; + + ret = -ENOMEM; + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) + goto out; + + memset(io, 0, sizeof(*io)); + io->size = size; + + if (READ == rw) { + pr_debug("pos=%Ld i_size=%Ld\n", pos, inode->i_size); + + if (pos > inode->i_size) + size = 0; + else if ((pos + size) > inode->i_size) + size = inode->i_size - pos; + + if (io->size < size) + size = io->size; + else if (size < io->size) + io->size = size; + + pr_debug("io->size=%d size=%d\n", io->size, size); + } + + index = pos >> PAGE_CACHE_SHIFT; + eindex = (pos + size - 1) >> PAGE_CACHE_SHIFT; + nr_pages = eindex - index + 1; + + pr_debug("nr_pages: %lu\n", nr_pages); + + io->good_idx = -1; + io->good_page = NULL; + io->did_read = 0; + io->err = 0; + io->rw = rw; + io->as = as; + io->offset = (unsigned long)pos & (PAGE_CACHE_SIZE - 1); + io->file = file; + io->kio_nr = kio_nr; + if (kio_nr > 8) + BUG(); + memcpy(io->kiovec, kiovec, sizeof(struct kiobuf *) * kio_nr); + if (nr_pages < READDESC_NR_DEF) { + io->pages = io->def_pages; + io->new_pages = io->def_new_pages; + } else { + io->pages = kmalloc(sizeof(*io->pages) * (nr_pages + 1), GFP_KERNEL); + if (!io->pages) + goto out_io; + + io->new_pages = kmalloc(sizeof(*io->new_pages) * (nr_pages + 1), GFP_KERNEL); + if (!io->new_pages) + goto out_pages; + } + + /* FIXME: make the down a WTD_op */ + if (rw == WRITE) + down(&io->file->f_dentry->d_inode->i_sem); + + ret = address_space_map(as, index, nr_pages, io->pages, + &io->nr_new_pages, io->new_pages); + pr_debug("as_map: %d (%d new)\n", ret, io->nr_new_pages); + if (ret <= 0) + goto out_new_pages; + + io->nr_pages = ret; + io->pages[io->nr_pages] = NULL; + io->new_pages[io->nr_new_pages] = NULL; + + if (rw == READ) + __generic_file_read_iodesc(io, 0); + else if (rw == WRITE) + __generic_file_write_iodesc(io); + + return 0; + +out_new_pages: + if (io->new_pages != io->def_new_pages) + kfree(io->new_pages); +out_pages: + if (io->pages != io->def_pages) + kfree(io->pages); +out_io: + kfree(io); +out: + return ret; +} + +static void __wtd_lock_page_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct page *page = (struct page *)wtd->data; + + if (!TryLockPage(page)) { + __remove_wait_queue(&page->wait, &wtd->wait); + wtd_queue(wtd); + } else { + schedule_task(&run_disk_tq); + } +} + +void wtd_lock_page(struct worktodo *wtd, struct page *page) +{ + if (TryLockPage(page)) { + int raced = 0; + wtd->data = page; + init_waitqueue_func_entry(&wtd->wait, __wtd_lock_page_waiter); + 
add_wait_queue_cond(&page->wait, &wtd->wait, TryLockPage(page), raced = 1); + + if (!raced) { + run_task_queue(&tq_disk); + return; + } + } + + wtd->tq.routine(wtd->tq.data); +} + +static void __wtd_bh_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct buffer_head *bh = (struct buffer_head *)wtd->data; + + if (!buffer_locked(bh)) { + __remove_wait_queue(&bh->b_wait, &wtd->wait); + wtd_queue(wtd); + } else { + schedule_task(&run_disk_tq); + } +} + +void wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh) +{ + int raced = 0; + + if (!buffer_locked(bh)) { + wtd->tq.routine(wtd->tq.data); + return; + } + wtd->data = bh; + init_waitqueue_func_entry(&wtd->wait, __wtd_bh_waiter); + add_wait_queue_cond(&bh->b_wait, &wtd->wait, buffer_locked(bh), raced = 1); + + if (raced) + wtd->tq.routine(wtd->tq.data); + else + run_task_queue(&tq_disk); +} + +void do_run_tq_disk(void *data) +{ + run_task_queue(&tq_disk); +} + +struct tq_struct run_disk_tq = { + routine: do_run_tq_disk, + data: NULL +}; +
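
For illustration, here is a minimal, hypothetical userspace sketch of how the interface added above might be driven. Only the iocb/io_event layouts, the IOCB_CMD_*/IO_EVENT_* constants, the note that read() from /dev/aio returns io_event structures, and the syscall numbers added to asm-i386/unistd.h come from this patch; the argument order of the submit_ios() call, the open flags on /dev/aio, and the /dev/raw1 path are assumptions made purely for the example.

/* Rough sketch only -- see the caveats above.  submit_ios() is assumed to
 * take (aio fd, count, array of iocb pointers); that layout is not visible
 * in this diff. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio.h>		/* struct iocb, struct io_event from this patch */

#ifndef __NR_submit_ios
#define __NR_submit_ios 227	/* matches the i386 number added above */
#endif

int main(void)
{
	static char buf[2048];		/* a multiple of the device sector size */
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	int aio_fd = open("/dev/aio", O_RDWR);		/* completion queue handle */
	int data_fd = open("/dev/raw1", O_RDONLY);	/* any fd whose f_op provides rw_kiovec */

	if (aio_fd < 0 || data_fd < 0)
		return 1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = data_fd;
	cb.aio_lio_opcode = IOCB_CMD_READ;
	cb.aio_reqprio = 0;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);
	cb.aio_offset = 0;

	/* assumed calling convention -- see the note above */
	if (syscall(__NR_submit_ios, aio_fd, 1, cbs) < 0) {
		perror("submit_ios");
		return 1;
	}

	/* completions are read() back from /dev/aio as io_events */
	while (read(aio_fd, &ev, sizeof(ev)) == (ssize_t)sizeof(ev)) {
		if (ev.type == IO_EVENT_IOCB_DONE) {
			printf("io done, return=%ld\n", (long)cb.__aio_return);
			break;
		}
	}
	return 0;
}

Kernel-side, a request submitted this way is intended to be serviced through the new rw_kiovec file operation, e.g. generic_file_rw_kiovec() above for page-cache backed files.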