diff -urN /md0/kernels/2.4/v2.4.4-ac10/Makefile ac10-aio/Makefile --- /md0/kernels/2.4/v2.4.4-ac10/Makefile Thu May 17 15:25:02 2001 +++ ac10-aio/Makefile Thu May 24 17:53:00 2001 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 4 -EXTRAVERSION = -ac10 +EXTRAVERSION = -ac10-aio1 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -urN /md0/kernels/2.4/v2.4.4-ac10/arch/i386/kernel/entry.S ac10-aio/arch/i386/kernel/entry.S --- /md0/kernels/2.4/v2.4.4-ac10/arch/i386/kernel/entry.S Wed Nov 8 20:09:50 2000 +++ ac10-aio/arch/i386/kernel/entry.S Thu May 24 17:53:22 2001 @@ -646,6 +646,11 @@ .long SYMBOL_NAME(sys_getdents64) /* 220 */ .long SYMBOL_NAME(sys_fcntl64) .long SYMBOL_NAME(sys_ni_syscall) /* reserved for TUX */ + .long SYMBOL_NAME(sys_ni_syscall) /* 223 */ + .long SYMBOL_NAME(sys___io_cancel) + .long SYMBOL_NAME(sys___io_wait) + .long SYMBOL_NAME(sys___io_getevents) + .long SYMBOL_NAME(sys_submit_ios) /* * NOTE!! This doesn't have to be exact - we just have diff -urN /md0/kernels/2.4/v2.4.4-ac10/drivers/char/mem.c ac10-aio/drivers/char/mem.c --- /md0/kernels/2.4/v2.4.4-ac10/drivers/char/mem.c Thu May 17 15:25:04 2001 +++ ac10-aio/drivers/char/mem.c Thu May 24 17:53:13 2001 @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -571,6 +572,9 @@ case 9: filp->f_op = &urandom_fops; break; + case 10: + filp->f_op = &aio_fops; + break; default: return -ENXIO; } @@ -595,7 +599,8 @@ {5, "zero", S_IRUGO | S_IWUGO, &zero_fops}, {7, "full", S_IRUGO | S_IWUGO, &full_fops}, {8, "random", S_IRUGO | S_IWUSR, &random_fops}, - {9, "urandom", S_IRUGO | S_IWUSR, &urandom_fops} + {9, "urandom", S_IRUGO | S_IWUSR, &urandom_fops}, + {10,"aio", S_IRUGO | S_IWUSR, &aio_fops}, }; int i; diff -urN /md0/kernels/2.4/v2.4.4-ac10/drivers/char/raw.c ac10-aio/drivers/char/raw.c --- /md0/kernels/2.4/v2.4.4-ac10/drivers/char/raw.c Thu May 3 11:22:10 2001 +++ ac10-aio/drivers/char/raw.c Thu May 24 17:53:14 2001 @@ -16,6 +16,8 @@ #include #include #include +#include +#include #define dprintk(x...) @@ -36,13 +38,14 @@ int raw_open(struct inode *, struct file *); int raw_release(struct inode *, struct file *); int raw_ctl_ioctl(struct inode *, struct file *, unsigned int, unsigned long); - +int raw_rw_kiovec(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos); static struct file_operations raw_fops = { read: raw_read, write: raw_write, open: raw_open, release: raw_release, + rw_kiovec: raw_rw_kiovec, }; static struct file_operations raw_ctl_fops = { @@ -130,7 +133,8 @@ * the blocksize on a device which is already mounted. 
*/ - sector_size = 512; + //sector_size = 512; + sector_size = 2048; if (get_super(rdev) != NULL) { if (blksize_size[MAJOR(rdev)]) sector_size = blksize_size[MAJOR(rdev)][MINOR(rdev)]; @@ -259,7 +263,6 @@ } - ssize_t raw_read(struct file *filp, char * buf, size_t size, loff_t *offp) { @@ -360,7 +363,7 @@ for (i=0; i < blocks; i++) iobuf->blocks[i] = blocknr++; - err = brw_kiovec(rw, 1, &iobuf, dev, iobuf->blocks, sector_size); + err = brw_kiovec(rw, 1, &iobuf, dev, blocks, iobuf->blocks, sector_size); if (rw == READ && err > 0) mark_dirty_kiobuf(iobuf, err); @@ -390,3 +393,92 @@ out: return err; } + +int raw_rw_kiovec(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos) +{ + int err; + unsigned long blocknr, blocks; + unsigned long __b[KIO_MAX_SECTORS]; + unsigned long *b = __b; + int i; + int minor; + kdev_t dev; + unsigned long limit; + + int sector_size, sector_bits, sector_mask; + int max_sectors; + +#if 0 /* FIXME: this is wrong. */ + err = 0; + if (!size) + goto out_complete; +#endif + + pr_debug("raw_rw_kiovec: %p %d %d %p %d %d %Lu\n", filp, rw, nr, kiovec, flags, size, pos); + /* + * First, a few checks on device size limits + */ + + minor = MINOR(filp->f_dentry->d_inode->i_rdev); + dev = to_kdev_t(raw_devices[minor].binding->bd_dev); + sector_size = raw_devices[minor].sector_size; + sector_bits = raw_devices[minor].sector_bits; + sector_mask = sector_size- 1; + max_sectors = 25000; //KIO_MAX_SECTORS >> (sector_bits - 9); + + if (blk_size[MAJOR(dev)]) + limit = (((loff_t) blk_size[MAJOR(dev)][MINOR(dev)]) << BLOCK_SIZE_BITS) >> sector_bits; + else + limit = INT_MAX; + dprintk ("rw_raw_dev_async: dev %d:%d (+%d)\n", + MAJOR(dev), MINOR(dev), limit); + + err = -EINVAL; + if ((pos < 0) || (pos & sector_mask) || (size & sector_mask)) { + printk("pos/size wrong\n"); + goto out; + } + + err = -ENXIO; + if ((pos >> sector_bits) >= limit) { + printk("raw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits); + goto out; + } + + /* + * Split the IO into KIO_MAX_SECTORS chunks, mapping and + * unmapping the single kiobuf as we go to perform each chunk of + * IO. 
+ */ + + blocknr = pos >> sector_bits; + blocks = size >> sector_bits; + if (blocks > max_sectors) + blocks = max_sectors; + if (blocks > limit - blocknr) + blocks = limit - blocknr; + err = -ENXIO; + pr_debug("raw: !blocks %d %ld %ld\n", max_sectors, limit, blocknr); + if (!blocks) + goto out; + + if (blocks > KIO_MAX_SECTORS) { + err = -ENOMEM; + b = kmalloc(sizeof(*b) * blocks, GFP_KERNEL); + if (!b) + goto out; + } + + for (i=0; i < blocks; i++) + b[i] = blocknr++; + + err = brw_kiovec_async(rw, nr, kiovec, dev, blocks, b, sector_size); + pr_debug("brw_kiovec_async: %d\n", err); + + if (b != __b) + kfree(b); +out: + pr_debug("brw_kiovec_async: ret is %d\n", err); + return err; +} + diff -urN /md0/kernels/2.4/v2.4.4-ac10/fs/Makefile ac10-aio/fs/Makefile --- /md0/kernels/2.4/v2.4.4-ac10/fs/Makefile Thu May 17 15:25:10 2001 +++ ac10-aio/fs/Makefile Thu May 24 17:53:00 2001 @@ -12,7 +12,7 @@ obj-y := open.o read_write.o devices.o file_table.o buffer.o \ super.o block_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ - ioctl.o readdir.o select.o fifo.o locks.o \ + ioctl.o readdir.o select.o fifo.o locks.o aio.o \ dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \ filesystems.o diff -urN /md0/kernels/2.4/v2.4.4-ac10/fs/aio.c ac10-aio/fs/aio.c --- /md0/kernels/2.4/v2.4.4-ac10/fs/aio.c Wed Dec 31 19:00:00 1969 +++ ac10-aio/fs/aio.c Thu May 24 17:53:02 2001 @@ -0,0 +1,894 @@ +/* drivers/char/aio.c + * Copyright 2000 Red Hat, Inc. All Rights Reserved. + * + * An async IO implementation for Linux + * Written by Benjamin LaHaise + * + * Implements /dev/aio, something on top of which it should be possible + * to write a POSIX AIO library. + * + * Notes on interface: + * - aiocbs are submitted by doing a submit_ios syscall + * on an array of aiocbs to the /dev/aio fd + * - on completion, the aiocb, events are placed in + * a ringbuffer + * - the contents of the ring buffer can be read via the + * __io_getevents syscall. + * - each open(/dev/aio) instance provides a unique aio + * control space + */ +//#define DEBUG 1 + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#undef KERN_DEBUG +#define KERN_DEBUG "" + +static spinlock_t aio_read_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t aio_req_lock = SPIN_LOCK_UNLOCKED; + +static kmem_cache_t *kiocb_cachep; +static kmem_cache_t *kiogrp_cachep; +static kmem_cache_t *kioctx_cachep; + +/* aio_setup + * Creates the slab caches used by the aio routines, panic on + * failure as this is done early during the boot sequence. + */ +static int __init aio_setup(void) +{ + kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kiocb_cachep) + panic("unable to create kiocb cache\n"); + + kiogrp_cachep = kmem_cache_create("kiogrp", sizeof(struct kiogrp), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kiogrp_cachep) + panic("unable to create kiogrp cache\n"); + + kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kioctx_cachep) + panic("unable to create kioctx cache"); + + printk(KERN_NOTICE "aio_setup: okay!\n"); + printk(KERN_NOTICE "aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); + + return 0; +} + +/* ioctx_alloc + * Allocates and initializes an aioctx. Returns an ERR_PTR if it failed. 
+ */ +static struct kioctx *ioctx_alloc(void) +{ + struct kioctx *ctx; + + ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL); + if (ctx) { + memset(ctx, 0, sizeof(*ctx)); + atomic_set(&ctx->users, 1); + spin_lock_init(&ctx->done_lock); + init_waitqueue_head(&ctx->wait); + + ctx->max_reqs = AIO_RING_SIZE; + ctx->reqs = kmalloc(sizeof(struct iocb *) * ctx->max_reqs, GFP_KERNEL); + if (ctx->reqs) { + memset(ctx->reqs, 0, sizeof(struct iocb *) * ctx->max_reqs); + ctx->ring = kmalloc(sizeof(*ctx->ring), GFP_KERNEL); + if (ctx->ring) { + memset(ctx->ring, 0, sizeof(*ctx->ring)); + printk("aio: allocated aioctx %p\n", ctx); + return ctx; + } + kfree(ctx->reqs); + ctx->reqs = NULL; + } + kmem_cache_free(kioctx_cachep, ctx); + ctx = ERR_PTR(-ENOMEM); + } + + printk("aio: error allocating aioctx %p\n", ctx); + return ctx; +} + +struct kiogrp *kiogrp_alloc(struct kioctx *ctx) +{ + struct kiogrp *iogrp; + + iogrp = kmem_cache_alloc(kiogrp_cachep, GFP_KERNEL); + if (iogrp) { + memset(iogrp, 0, sizeof(*iogrp)); + aioctx_get(ctx); + iogrp->ctx = ctx; + iogrp->idx = -1; + } + return iogrp; +} + +void kiocb_free(struct kiocb *iocb) +{ + int i; + + for (i=0; i<iocb->nr_kiovec; i++) + unmap_kiobuf(iocb->kiovec[i]); + + free_kiovec(iocb->nr_kiovec, iocb->kiovec); + iocb->nr_kiovec = 0; + fput(iocb->filp); + iocb->filp = NULL; + kmem_cache_free(kiocb_cachep, iocb); +} + +void kiogrp_free(struct kiogrp *iogrp) +{ + struct kioctx *ctx = iogrp->ctx; + int i; + pr_debug("kio_free: %p/%d\n", iogrp, iogrp->idx); + + if ((i=atomic_read(&iogrp->count))) { + printk("kiogrp_free: %d/%p/%d still active!!!\n", i, iogrp, iogrp->idx); + return; + } + + if ((iogrp->idx >= 0) && (iogrp->idx < ctx->max_reqs)) + ctx->reqs[iogrp->idx] = NULL; + + for (i=0; i<iogrp->nr_iocbs; i++) { + kiocb_free(iogrp->iocbs[i]); + } + kmem_cache_free(kiogrp_cachep, iogrp); + aioctx_put(ctx); +} + +/* iogrp_putio + * Called when the io count on iogrp is decremented. Checks + * to see if the kiogrp the request belongs to has finished, + * and if so sends the completion notice to its context. + */ +static void iogrp_putio(struct kiogrp *iogrp) +{ + struct kioctx *ctx = iogrp->ctx; + struct aio_ring *ring = ctx->ring; + unsigned long flags; + unsigned long tail; + + /* Is this the last io to complete in the group? */ + if (!atomic_dec_and_test(&iogrp->count)) { + if (atomic_read(&iogrp->count) < 0) + BUG(); + return; + } + + /* Yes we are, go ahead with completion */ + aioctx_get(ctx); + + /* add a completion event to the ring buffer. + * must be done holding done_lock to prevent + * other code from messing with the tail + * pointer since we might be called from irq + * context. + */ + spin_lock_irqsave(&ctx->done_lock, flags); + + tail = (ring->tail + 1) % AIO_RING_SIZE; + + ring->io_events[tail].data = iogrp->user_data; + ring->io_events[tail].key = iogrp->idx; + ring->io_events[tail].type = IO_EVENT_IOCB_DONE; + + /* after flagging the request as done, we + * must never even look at it again + */ + barrier(); + + ring->tail = tail; + + wmb(); + if (!ring->woke) + ring->woke = 1; + + spin_unlock_irqrestore(&ctx->done_lock, flags); + + pr_debug("added to ring %p at [%lu]\n", iogrp, tail); +#if 0 + if (!wake) { + printk("kio_complete: should send user of %p a signal...\n", ctx); + } +#endif + + wake_up(&ctx->wait); + + aioctx_put(ctx); +} + +/* aio_kiobuf_endio + * Called when io on a given kiobuf is complete. 
+ */ +static void aio_kiobuf_endio(struct kiobuf *iobuf) +{ + struct kiogrp *iogrp = iobuf->end_io_data; + + /* TODO: possibly put the return code into the iocb + * here. This only really makes sense if it's being + * put into the user's iocb, which would mean pinning + * it down in memory. Maybe. + */ + pr_debug("aio_kiobuf_endio: %p %p/%d\n", iobuf, iogrp, iogrp->idx); + iogrp_putio(iogrp); +} + +/* kio_submit: + * Submits an actual aiocb + */ +static inline int kio_submit(struct kiogrp *iogrp, struct kiocb *iocb, + struct iocb *aiocb) +{ + int (*rw_kiovec)(struct file *, int, int, struct kiobuf **, int, size_t, loff_t); + int ret = -ENOSYS; + int rw; + + switch(aiocb->aio_lio_opcode) { + case IOCB_CMD_WRITE: + rw = WRITE; + break; + case IOCB_CMD_READ: + rw = READ; + break; + default: + printk("kio_submit: lio_opcode = %d\n", aiocb->aio_lio_opcode); + goto out; + } + + rw_kiovec = iocb->filp->f_op->rw_kiovec; + if (rw_kiovec) + ret = rw_kiovec(iocb->filp, rw, iocb->nr_kiovec, iocb->kiovec, /*flags*/ 0, aiocb->aio_nbytes, aiocb->aio_offset); + else { + iocb->kiovec[0]->transferred = 0; + iocb->kiovec[0]->errno = -ENOSYS; + aio_kiobuf_endio(iocb->kiovec[0]); + ret = 0; + } + +out: + if (ret) { + static int count; + if (count < 10) { + count++; + printk("kio_submit: failed!\n"); + } + atomic_dec(&iogrp->count); + if (atomic_read(&iogrp->count) < 0) + BUG(); + } + + return ret; +} + +/*----------------- /dev/aio interface ----------------------- */ +static inline struct kiocb *aio_convert_user_aiocb(struct kiogrp *iogrp, + struct iocb *uaiocb, struct iocb *user_aiocb) +{ + struct kiocb *iocb; + int rw = WRITE; + int ret = -ENOMEM; + int i; + + iocb = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); + if (!iocb) + goto out; + + atomic_inc(&iogrp->count); /* FIXME: should be according to number of iobufs in this iocb */ + + memset(iocb, 0, sizeof(*iocb)); + + iocb->user_aiocb = user_aiocb; + iocb->filp = fget(uaiocb->aio_fildes); + ret = -EBADF; + if (!iocb->filp) + goto out_err; + + iocb->nr_kiovec = 1; + ret = alloc_kiovec(1, iocb->kiovec); + if (ret) + goto out_err; + + for (i=0; i < iocb->nr_kiovec; i++) { + iocb->kiovec[i]->end_io = aio_kiobuf_endio; + iocb->kiovec[i]->end_io_data = iogrp; + } + + switch (uaiocb->aio_lio_opcode) { + case IOCB_CMD_READ: rw = READ; + case IOCB_CMD_WRITE: + pr_debug("aio: map_user_kiobuf(%d, %p, %lu, %lu) = ", + rw, iocb->kiovec[0], (unsigned long)uaiocb->aio_buf, + (unsigned long)uaiocb->aio_nbytes); + ret = map_user_kiobuf(rw, iocb->kiovec[0], + (unsigned long)uaiocb->aio_buf, + uaiocb->aio_nbytes); + pr_debug("%d\n", ret); + if (ret) + goto out_kiobuf_err; + break; + default: + ret = -EINVAL; + printk("aio_convert_user_aiocb: lio_opcode = %d\n", uaiocb->aio_lio_opcode); + goto out_kiobuf_err; + } + + pr_debug("kio_convert_user_aiocb: (%p, %p) / %p\n", iogrp, uaiocb, iocb); + + return iocb; + +out_kiobuf_err: +out_err: + kiocb_free(iocb); +out: + return ERR_PTR(ret); +} + +/* aio_open + * Open method for /dev/aio. Allocates an aioctx for this open()er + * and places it in the file's private_data field. Can fail because + * of memory allocation failure. + */ +int aio_open(struct inode *inode, struct file *filp) +{ + struct kioctx *ctx = ioctx_alloc(); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + filp->private_data = ctx; + ctx->filp = filp; + return 0; +} + +/* aio_release + * Free the aioctx associated with the file. FIXME! 
+ */ +int aio_release(struct inode *inode, struct file *filp) +{ + struct kioctx *ioctx = filp->private_data; + printk("aio_release(%p)\n", filp->private_data); + aioctx_put(ioctx); + filp->private_data = NULL; + return 0; +} + +/* kiocb_get + * + */ +static inline struct kiogrp *kiogrp_get(struct kioctx *ctx, int idx, void *key) +{ + struct kiogrp *iogrp; + + spin_lock(&aio_req_lock); + iogrp = ctx->reqs[idx]; + if (iogrp && iogrp->user_data == key) { + if (!iogrp->locked) + iogrp->locked = 1; + else + iogrp = ERR_PTR(-EBUSY); + } else + iogrp = ERR_PTR(-ENOENT); + spin_unlock(&aio_req_lock); + return iogrp; +} + +/* aio_complete + * Checks if the kiogrp in ctx at idx is finished. If so, copies the + * completion codes into userspace, and then releases the kiogrp. + */ +static int aio_complete(struct kioctx *ctx, int idx, void *key, int please_wait) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + struct kiogrp *iogrp; + int ret = -EINVAL; + unsigned i; + + pr_debug("aio_complete: %p %d %p %d\n", ctx, idx, key, please_wait); + if (idx < 0 || idx >= ctx->max_reqs) { + printk("aio_complete: idx(%d) is invalid\n", idx); + goto out; + } + + ret = -EBUSY; + + if (please_wait) { + add_wait_queue(&ctx->wait, &wait); + + do { + set_task_state(tsk, TASK_INTERRUPTIBLE); + iogrp = kiogrp_get(ctx, idx, key); + if (iogrp == ERR_PTR(-EBUSY)) { + schedule(); + + /* interrupted due to a signal? */ + iogrp = ERR_PTR(-EINTR); + if (signal_pending(tsk)) + break; + iogrp = kiogrp_get(ctx, idx, key); + } + } while (iogrp == ERR_PTR(-EBUSY)); + + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + } else + iogrp = kiogrp_get(ctx, idx, key); + + ret = PTR_ERR(iogrp); + if (IS_ERR(iogrp)) { + printk("aio_complete: ERR: %d [%d, %p] from %p\n", ret, idx, key, __builtin_return_address(0)); + goto out; + } + + pr_debug("aio_complete: [%d] = %p\n", idx, iogrp); + + ret = -EFAULT; + for (i=0; i<iogrp->nr_iocbs; i++) { + struct kiocb *iocb = iogrp->iocbs[i]; + + /* FIXME: decide kiovec vs iocb interaction, this is a KLUDGE */ + iocb->aio_return = iocb->kiovec[0]->transferred ? + iocb->kiovec[0]->transferred : + iocb->kiovec[0]->errno; + + if (put_user(iocb->aio_return, &iocb->user_aiocb->__aio_return)) + goto out_undo; + if (put_user(-1, &iocb->user_aiocb->__aio_key)) + goto out_undo; + } + + /* everything turned out well, dispose of the aiocb. */ + kiogrp_free(iogrp); + + return 0; + +out_undo: +printk("out_undo\n"); + /* unlock and wakeup so anyone else waiting can attempt this iocb */ + iogrp->locked = 0; + wake_up(&ctx->wait); + +out: + return ret; +} + +/* aio_read_evt + * Pull an event off of the aioctx's event ring. + * FIXME: make this use cmpxchg. + * TODO: make the ringbuffer user mmap()able (requires FIXME). 
+ */ +static int aio_read_evt(struct aio_ring *ring, struct io_event *ent) +{ + unsigned long head; + int ret = -EAGAIN; + + pr_debug("in aio_read_evt h%lu t%lu\n", ring->head, ring->tail); + barrier(); + if (ring->head == ring->tail) + goto out; + + spin_lock(&aio_read_lock); /* investigate the value of making this per-ctx */ + + head = ring->head; + if (head != ring->tail) { + head = (head + 1) % AIO_RING_SIZE; + *ent = ring->io_events[head]; + barrier(); + ring->head = head; + ret = 0; + } + spin_unlock(&aio_read_lock); + +out: + pr_debug("leaving aio_read_evt: %d h%lu t%lu\n", ret, ring->head, ring->tail); + return ret; +} + +struct timeout { + struct timer_list timer; + int timed_out; + wait_queue_head_t wait; +}; + +static void timeout_func(unsigned long data) +{ + struct timeout *to = (struct timeout *)data; + + to->timed_out = 1; + wake_up(&to->wait); +} + +static inline void init_timeout(struct timeout *to) +{ + init_timer(&to->timer); + to->timer.data = (unsigned long)to; + to->timer.function = timeout_func; + to->timed_out = 0; + init_waitqueue_head(&to->wait); +} + +static inline void set_timeout(struct timeout *to, struct timespec *ts) +{ + unsigned long how_long; + + if (!ts->tv_sec && !ts->tv_nsec) { + to->timed_out = 1; + return; + } + + how_long = ts->tv_sec * HZ; +#define HZ_NS (1000000000 / HZ) + how_long += (ts->tv_nsec + HZ_NS - 1) / HZ_NS; + + to->timer.expires = jiffies + how_long; + add_timer(&to->timer); +} + +static inline void clear_timeout(struct timeout *to) +{ + del_timer_sync(&to->timer); +} + +static int read_events(struct kioctx *ctx, struct io_event *event, int max_nr, + struct timespec *timeout) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + DECLARE_WAITQUEUE(to_wait, tsk); + int ret = -EINVAL; + int nr = 0; + struct io_event ent; + struct timespec ts; + struct timeout to; + + init_timeout(&to); + + if (timeout) { + ret = -EFAULT; + if (copy_from_user(&ts, timeout, sizeof(ts))) + goto out; + + set_timeout(&to, &ts); + } + + memset(&ent, 0, sizeof(ent)); + ret = 0; + + while (nr < max_nr) { + ret = aio_read_evt(ctx->ring, &ent); + if (ret) { + if (nr) + break; + + add_wait_queue(&ctx->wait, &wait); + add_wait_queue(&to.wait, &to_wait); + do { + set_task_state(tsk, TASK_INTERRUPTIBLE); + + ret = aio_read_evt(ctx->ring, &ent); + if (!ret) + break; + ret = -ETIMEDOUT; + if (to.timed_out) + break; + schedule(); + if (to.timed_out) + break; + if (signal_pending(tsk)) { + ret = -EINTR; + break; + } + ret = aio_read_evt(ctx->ring, &ent); + } while (ret) ; + + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + remove_wait_queue(&to.wait, &to_wait); + } + + if (ret) + break; + + /* FIXME: split checks in two */ + ret = -EFAULT; + if (copy_to_user(event, &ent, sizeof(ent))) { + /* FIXME: we lose an event here. */ + printk(KERN_DEBUG "aio: lost an event due to EFAULT.\n"); + break; + } + + /* Now complete the aio request and copy the result codes to userland. */ + ret = aio_complete(ctx, ent.key, ent.data, 0); + if (ret) { + printk(KERN_DEBUG "aio: lost an event -- aio_complete: %d.\n", ret); + break; /* FIXME: we lose an event here */ + } + + event ++; + nr ++; + } + + if (timeout) + clear_timeout(&to); +out: + return nr ? nr : ret; +} + +/* __aioctx_put + * Called when the last user of an aio context has gone away, + * and the struct needs to be freed. 
+ */ +void __aioctx_put(struct kioctx *ctx) +{ + struct io_event ent; + printk("aio: free aioctx %p\n", ctx); + + /* release any io requests that were not reaped by the user process */ + while (!aio_read_evt(ctx->ring, &ent)) { + struct kiogrp *iogrp = kiogrp_get(ctx, ent.key, ent.data); + if (!IS_ERR(iogrp)) + kiogrp_free(iogrp); + } + + kfree(ctx->ring); + kfree(ctx->reqs); + kmem_cache_free(kioctx_cachep, ctx); +} + +/* aio_read + * read() method for /dev/aio. Reads the next iogrp completion + * event off of the queue and then copies the iocb's return codes + * back into the userspace aiocbs. + * FIXME: error handling isn't complete. Bummer. + * TODO: implement O_NONBLOCK. + */ +static ssize_t aio_read(struct file *filp, char *buf, size_t size, loff_t *offp) +{ + struct kioctx *ctx; + int ret; + + if (size < 0) + return -EINVAL; + + size /= sizeof(struct io_event); + ctx = filp->private_data; + + ret = read_events(ctx, (struct io_event *)buf, size, NULL); + + return (ret > 0) ? ret * sizeof(struct io_event) : ret; +} + +/* iogrp_setup + * Allocate and initialize a kiogrp in the given + * context at idx. For positive values of idx, + * attempts to install the iogrp at idx, negative + * means allocate one. + * Error returns are by means of ERR_PTR's. + */ +static inline struct kiogrp *iogrp_setup(struct kioctx *ctx, int idx) +{ + struct kiogrp *iogrp; + + iogrp = ERR_PTR(-EINVAL); + if (idx >= ctx->max_reqs) + goto out; + + iogrp = kiogrp_alloc(ctx); + if (IS_ERR(iogrp)) + goto out; + + /* Get a reference to ze iogrp so that it isn't reported + * as complete before we're done queuing it. + */ + //atomic_inc(&iogrp->count); + + /* Assign the iogrp an id. */ + + /* FIXME: use cmpxchg instead of spin_lock? */ + spin_lock(&aio_req_lock); + if (idx < 0) { + for (idx=0; (idx<ctx->max_reqs) && (ctx->reqs[idx]); idx++) + ; + if (idx < ctx->max_reqs) + ctx->reqs[idx] = iogrp; + else { + printk("iogrp_setup: -EAGAIN\n"); + idx = -EAGAIN; + } + } else if (idx < ctx->max_reqs) { + if (!ctx->reqs[idx]) + ctx->reqs[idx] = iogrp; + else { + printk("iogrp_setup: -EBUSY\n"); + idx = -EBUSY; + } + } else + idx = -EINVAL; + + spin_unlock(&aio_req_lock); + + iogrp->idx = idx; /* side effect on error: kiogrp_free notices idx < 0 */ + if (idx < 0) { + //atomic_dec(&iogrp->count); + kiogrp_free(iogrp); + iogrp = ERR_PTR(idx); + } + +out: + return iogrp; +} + +static inline struct kioctx *get_ioctx(int ctx_id) +{ + struct file *filp; + + filp = fget(ctx_id); + if (filp) { + if (filp->f_op == &aio_fops) + return filp->private_data; + fput(filp); + } + + return NULL; +} + +static inline void put_ioctx(struct kioctx *ctx) +{ + fput(ctx->filp); +} + + +/* __submit_io + * Copies the aiocb from userspace into the kernel and sets up the + * request. Returns 0 if the request is successfully queued, -errno + * otherwise. + */ +static inline long __submit_io(struct kioctx *ctx, struct iocb *uaiocbp) +{ + struct iocb uaiocb; + long ret; + struct kiogrp *iogrp; + struct kiocb *kiocb; + + iogrp = iogrp_setup(ctx, -1); + ret = PTR_ERR(iogrp); + if (IS_ERR(iogrp)) + goto out_nofree; + + pr_debug("aio: submit %p %p\n", uaiocbp, &uaiocb); + ret = -EFAULT; + if (copy_from_user(&uaiocb, uaiocbp, sizeof(uaiocb))) + goto out; + + kiocb = aio_convert_user_aiocb(iogrp, &uaiocb, uaiocbp); + pr_debug("aio: kiocb = %p\n", kiocb); + ret = PTR_ERR(kiocb); + if (IS_ERR(kiocb)) + goto out; + + /* we don't do scatter gather... 
yet */ + iogrp->nr_iocbs = 1; + iogrp->iocbs = iogrp->atomic_iocbs; + iogrp->iocbs[0] = kiocb; + iogrp->user_data = uaiocbp; + + ret = -EFAULT; + if (put_user((int)iogrp->idx, &uaiocbp->__aio_key)) + goto out; + + /* kio_submit will free the kiocb if it fails. */ + ret = kio_submit(iogrp, kiocb, &uaiocb); + if (!ret) + return 0; + + if (atomic_read(&iogrp->count) != 0) + BUG(); + kiogrp_free(iogrp); + + return ret; + +out: + /* Shoot, something went wrong. Discard the iogrp we allocated. */ + kiogrp_free(iogrp); +out_nofree: + return ret; +} + +/* sys_submit_ios + * Copy an aiocb from userspace into kernel space, then convert it to + * a kiocb, submit and repeat until done. Error codes on copy/submit + * only get returned for the first aiocb copied as otherwise the size + * of aiocbs copied is returned (standard write sematics). + */ +long sys_submit_ios(int ctx_id, int nr, struct iocb **uaiocbpp) +{ + struct kioctx *ctx; + struct iocb *uaiocbp; + int i; + long ret = 0; + + if (ctx_id < 0 || nr <= 0) + goto out_inval; + + ctx = get_ioctx(ctx_id); + if (!ctx) + goto out_inval; + + for (i=0; isession = 1; tsk->pgrp = 1; strcpy(tsk->comm, "bdflush"); + bdflush_tsk = tsk; /* avoid getting signals */ spin_lock_irq(&tsk->sigmask_lock); @@ -2731,16 +2736,22 @@ CHECK_EMERGENCY_SYNC flushed = flush_dirty_buffers(0, 0); + if (free_shortage()) + flushed += page_launder(GFP_KERNEL, 0); /* * If there are still a lot of dirty buffers around, * skip the sleep and flush some more. Otherwise, we * go to sleep waiting a wakeup. */ + set_current_state(TASK_INTERRUPTIBLE); if (!flushed || balance_dirty_state(NODEV) < 0) { run_task_queue(&tq_disk); - interruptible_sleep_on(&bdflush_wait); + schedule(); } + /* Remember to mark us as running otherwise + the next schedule will block. */ + __set_current_state(TASK_RUNNING); } } @@ -2811,3 +2822,251 @@ module_init(bdflush_init) +/* async kio interface */ +struct brw_cb { + struct kiobuf *kiobuf; + int nr; + struct buffer_head *bh[1]; +}; + +static inline void brw_kio_put_iobuf(struct brw_cb *brw_cb, struct kiobuf *kiobuf) +{ + if (atomic_dec_and_test(&kiobuf->io_count)) { + int nr; + + /* Walk the buffer heads associated with this kiobuf + * checking for errors and freeing them as we go. + */ + for (nr=0; nr < brw_cb->nr; nr++) { + struct buffer_head *bh = brw_cb->bh[nr]; + if (buffer_uptodate(bh) && !kiobuf->errno) + kiobuf->transferred += bh->b_size; + else if (!kiobuf->errno) + kiobuf->errno = -EIO; + kmem_cache_free(bh_cachep, bh); + } + + if (kiobuf->end_io) + kiobuf->end_io(kiobuf); + wake_up(&kiobuf->wait_queue); + + kfree(brw_cb); + } +} + +/* + * IO completion routine for a buffer_head being used for kiobuf IO: we + * can't dispatch the kiobuf callback until io_count reaches 0. + */ + +static void end_buffer_io_kiobuf_async(struct buffer_head *bh, int uptodate) +{ + struct brw_cb *brw_cb; + struct kiobuf *kiobuf; + + mark_buffer_uptodate(bh, uptodate); + + brw_cb = bh->b_private; + unlock_buffer(bh); + + kiobuf = brw_cb->kiobuf; + if (!uptodate && !kiobuf->errno) + brw_cb->kiobuf->errno = -EIO; + brw_kio_put_iobuf(brw_cb, kiobuf); +} + + +/* + * Start I/O on a physical range of kernel memory, defined by a vector + * of kiobuf structs (much like a user-space iovec list). + * + * The kiobuf must already be locked for IO. IO is submitted + * asynchronously: you need to check page->locked, page->uptodate, and + * maybe wait on page->wait. 
+ * + * It is up to the caller to make sure that there are enough blocks + * passed in to completely map the iobufs to disk. + */ + +int brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int err; + int length; + int bufind; + int pageind; + int bhind; + int offset; + unsigned long blocknr; + struct kiobuf * iobuf = NULL; + struct page * map; + struct buffer_head *tmp; + int bh_nr; + int i; + +#define MAX_KIOVEC_NR 8 + struct brw_cb *brw_cb_table[MAX_KIOVEC_NR]; + struct brw_cb *brw_cb; + + if (!nr) + return 0; + + if (nr > MAX_KIOVEC_NR) { + printk("kiovec too large: %d\n", nr); + BUG(); + } + + /* + * First, do some alignment and validity checks + */ + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + if ((iobuf->offset & (sector_size-1)) || + (iobuf->length & (sector_size-1))) { + printk("brw_kiovec_async: iobuf->offset=0x%x length=0x%x sector_size: 0x%x\n", iobuf->offset, iobuf->length, sector_size); + return -EINVAL; + } + + if (!iobuf->nr_pages) + panic("brw_kiovec: iobuf not initialised"); + } + + /* + * OK to walk down the iovec doing page IO on each page we find. + */ + bufind = bhind = err = 0; + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + offset = iobuf->offset; + length = iobuf->length; + iobuf->errno = 0; + iobuf->transferred = 0; + atomic_inc(&iobuf->io_count); + + bh_nr = ((iobuf->nr_pages * PAGE_SIZE) - offset) / sector_size; + if (!bh_nr) { + printk("brw_kiovec_async: !bh_nr\n"); + return -EINVAL; + } + + /* FIXME: tie into userbeans here */ + brw_cb = kmalloc(sizeof(*brw_cb) + (bh_nr * sizeof(struct buffer_head *)), GFP_KERNEL); + if (!brw_cb) + return -ENOMEM; + + brw_cb_table[i] = brw_cb; + brw_cb->kiobuf = iobuf; + brw_cb->nr = 0; + + for (pageind = 0; pageind < iobuf->nr_pages; pageind++) { + map = iobuf->maplist[pageind]; + err = -EFAULT; + if (!map) + goto error; + + while (length > 0 && (bufind < nr_blocks)) { + blocknr = b[bufind++]; + tmp = kmem_cache_alloc(bh_cachep, SLAB_BUFFER); + err = -ENOMEM; + if (!tmp) + goto error; + + memset(tmp, 0, sizeof(*tmp)); + init_waitqueue_head(&tmp->b_wait); + tmp->b_dev = B_FREE; + tmp->b_size = sector_size; + set_bh_page(tmp, map, offset); + tmp->b_this_page = tmp; + + init_buffer(tmp, end_buffer_io_kiobuf_async, NULL); + tmp->b_dev = dev; + tmp->b_blocknr = blocknr; + tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req); + tmp->b_private = brw_cb; + + if (rw == WRITE) { + set_bit(BH_Uptodate, &tmp->b_state); + clear_bit(BH_Dirty, &tmp->b_state); + } + + brw_cb->bh[brw_cb->nr++] = tmp; + length -= sector_size; + offset += sector_size; + + atomic_inc(&iobuf->io_count); + + if (offset >= PAGE_SIZE) { + offset = 0; + break; + } + } /* End of block loop */ + } /* End of page loop */ + } /* End of iovec loop */ + + /* okay, we've setup all our io requests, now fire them off! */ + for (i = 0; i < nr; i++) { + int j; + brw_cb = brw_cb_table[i]; +#if 1 + for (j=0; jnr; j++) + submit_bh(rw, brw_cb->bh[j]); + //ll_rw_block(rw, brw_cb->nr, brw_cb->bh); +#else + generic_make_requests(dev, rw, brw_cb->bh, brw_cb->nr); +#endif + brw_kio_put_iobuf(brw_cb, brw_cb->kiobuf); + } + + return 0; + + error: + /* Walk brw_cb_table freeing all the goop associated with each kiobuf */ + do { + brw_cb = brw_cb_table[i]; + if (brw_cb) { + /* We got an error allocating the bh'es. Just free the current + buffer_heads and exit. 
*/ + for (bhind = brw_cb->nr; bhind--; ) + kmem_cache_free(bh_cachep, brw_cb->bh[bhind]); + atomic_dec(&brw_cb->kiobuf->io_count); + kfree(brw_cb); + } + } while (i--) ; + + return err; +} + +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int i; + int transferred = 0; + int err = 0; + + if (!nr) + return 0; + + /* queue up and trigger the io */ + err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size); + if (err) + goto out; + + /* wait on the last iovec first -- it's more likely to finish last */ + for (i=nr; --i >= 0; ) + kiobuf_wait_for_io(iovec[i]); + + run_task_queue(&tq_disk); + + /* okay, how much data actually got through? */ + for (i=0; ierrno) { + if (!err) + err = iovec[i]->errno; + break; + } + transferred += iovec[i]->length; + } + +out: + return transferred ? transferred : err; +} diff -urN /md0/kernels/2.4/v2.4.4-ac10/fs/ext2/file.c ac10-aio/fs/ext2/file.c --- /md0/kernels/2.4/v2.4.4-ac10/fs/ext2/file.c Thu May 17 15:25:10 2001 +++ ac10-aio/fs/ext2/file.c Thu May 24 17:53:00 2001 @@ -41,6 +41,7 @@ struct file_operations ext2_file_operations = { read: generic_file_read, write: generic_file_write, + rw_kiovec: generic_file_rw_kiovec, ioctl: ext2_ioctl, mmap: generic_file_mmap, open: generic_file_open, diff -urN /md0/kernels/2.4/v2.4.4-ac10/fs/foopp ac10-aio/fs/foopp --- /md0/kernels/2.4/v2.4.4-ac10/fs/foopp Wed Dec 31 19:00:00 1969 +++ ac10-aio/fs/foopp Thu May 24 17:53:02 2001 @@ -0,0 +1,3134 @@ +/* + * linux/fs/buffer.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * 'buffer.c' implements the buffer-cache functions. Race-conditions have + * been avoided by NEVER letting an interrupt change a buffer (except for the + * data, of course), but instead letting the caller do it. + */ + +/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */ + +/* Removed a lot of unnecessary code and simplified things now that + * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 + */ + +/* Speed up hash, lru, and free list operations. Use gfp() for allocating + * hash table, use SLAB cache for buffer heads. -DaveM + */ + +/* Added 32k buffer block sizes - these are required older ARM systems. + * - RMK + */ + +/* Thread it... -DaveM */ + +/* async buffer flushing, 1999 Andrea Arcangeli */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct brw_cb { + struct kiobuf *kiobuf; + int nr; + struct buffer_head *bh[1]; +}; + +#include +#include +#include +#include + +#define NR_SIZES 7 +static char buffersize_index[65] = +{-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, + 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1, + 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1, + 6}; + +#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9]) +#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512) +#define NR_RESERVED (2*MAX_BUF_PER_PAGE) +#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this + number of unused buffer heads */ + +/* Anti-deadlock ordering: + * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock + */ + +#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers) + +/* + * Hash table gook.. 
+ */ +static unsigned int bh_hash_mask; +static unsigned int bh_hash_shift; +static struct buffer_head **hash_table; +static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED; + +static struct buffer_head *lru_list[NR_LIST]; +static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED; +static int nr_buffers_type[NR_LIST]; +static unsigned long size_buffers_type[NR_LIST]; + +static struct buffer_head * unused_list; +static int nr_unused_buffer_heads; +static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED; +static DECLARE_WAIT_QUEUE_HEAD(buffer_wait); + +struct bh_free_head { + struct buffer_head *list; + spinlock_t lock; +}; +static struct bh_free_head free_list[NR_SIZES]; + +static int grow_buffers(int size); +static void __refile_buffer(struct buffer_head *); + +/* This is used by some architectures to estimate available memory. */ +atomic_t buffermem_pages = ATOMIC_INIT(0); + +/* Here is the parameter block for the bdflush process. If you add or + * remove any of the parameters, make sure to update kernel/sysctl.c. + */ + +#define N_PARAM 9 + +/* The dummy values in this structure are left in there for compatibility + * with old programs that play with the /proc entries. + */ +union bdflush_param { + struct { + int nfract; /* Percentage of buffer cache dirty to + activate bdflush */ + int ndirty; /* Maximum number of dirty blocks to write out per + wake-cycle */ + int nrefill; /* Number of clean buffers to try to obtain + each time we call refill */ + int dummy1; /* unused */ + int interval; /* jiffies delay between kupdate flushes */ + int age_buffer; /* Time for normal buffer to age before we flush it */ + int nfract_sync; /* Percentage of buffer cache dirty to + activate bdflush synchronously */ + int dummy2; /* unused */ + int dummy3; /* unused */ + } b_un; + unsigned int data[N_PARAM]; +} bdf_prm = {{30, 64, 64, 256, 5*HZ, 30*HZ, 60, 0, 0}}; + +/* These are the min and max parameter values that we will allow to be assigned */ +int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 0, 0, 0}; +int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 100, 0, 0}; + +/* + * Rewrote the wait-routines to use the "new" wait-queue functionality, + * and getting rid of the cli-sti pairs. The wait-queue routines still + * need cli-sti, but now it's just a couple of 386 instructions or so. + * + * Note that the real wait_on_buffer() is an inline function that checks + * if 'b_wait' is set before calling this, so that the queues aren't set + * up unnecessarily. + */ +void __wait_on_buffer(struct buffer_head * bh) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + atomic_inc(&bh->b_count); + add_wait_queue(&bh->b_wait, &wait); + do { + run_task_queue(&tq_disk); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!buffer_locked(bh)) + break; + schedule(); + } while (buffer_locked(bh)); + tsk->state = TASK_RUNNING; + remove_wait_queue(&bh->b_wait, &wait); + atomic_dec(&bh->b_count); +} + +/* Call sync_buffers with wait!=0 to ensure that the call does not + * return until all buffer writes have completed. Sync() may return + * before the writes have finished; fsync() may not. + */ + +/* Godamity-damn. Some buffers (bitmaps for filesystems) + * spontaneously dirty themselves without ever brelse being called. + * We will ultimately want to put these in a separate list, but for + * now we search all of the lists for dirty buffers. 
+ */ +static int sync_buffers(kdev_t dev, int wait) +{ + int i, retry, pass = 0, err = 0; + struct buffer_head * bh, *next; + + /* One pass for no-wait, three for wait: + * 0) write out all dirty, unlocked buffers; + * 1) write out all dirty buffers, waiting if locked; + * 2) wait for completion by waiting for all buffers to unlock. + */ + do { + retry = 0; + + /* We search all lists as a failsafe mechanism, not because we expect + * there to be dirty buffers on any of the other lists. + */ +repeat: + spin_lock(&lru_list_lock); + bh = lru_list[BUF_DIRTY]; + if (!bh) + goto repeat2; + + for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) { + next = bh->b_next_free; + + if (!lru_list[BUF_DIRTY]) + break; + if (dev && bh->b_dev != dev) + continue; + if (buffer_locked(bh)) { + /* Buffer is locked; skip it unless wait is + * requested AND pass > 0. + */ + if (!wait || !pass) { + retry = 1; + continue; + } + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + wait_on_buffer (bh); + atomic_dec(&bh->b_count); + goto repeat; + } + + /* If an unlocked buffer is not uptodate, there has + * been an IO error. Skip it. + */ + if (wait && buffer_req(bh) && !buffer_locked(bh) && + !buffer_dirty(bh) && !buffer_uptodate(bh)) { + err = -EIO; + continue; + } + + /* Don't write clean buffers. Don't write ANY buffers + * on the third pass. + */ + if (!buffer_dirty(bh) || pass >= 2) + continue; + + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + ll_rw_block(WRITE, 1, &bh); + atomic_dec(&bh->b_count); + retry = 1; + goto repeat; + } + + repeat2: + bh = lru_list[BUF_LOCKED]; + if (!bh) { + spin_unlock(&lru_list_lock); + break; + } + for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) { + next = bh->b_next_free; + + if (!lru_list[BUF_LOCKED]) + break; + if (dev && bh->b_dev != dev) + continue; + if (buffer_locked(bh)) { + /* Buffer is locked; skip it unless wait is + * requested AND pass > 0. + */ + if (!wait || !pass) { + retry = 1; + continue; + } + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + wait_on_buffer (bh); + spin_lock(&lru_list_lock); + atomic_dec(&bh->b_count); + goto repeat2; + } + } + spin_unlock(&lru_list_lock); + + /* If we are waiting for the sync to succeed, and if any dirty + * blocks were written, then repeat; on the second pass, only + * wait for buffers being written (do not pass to write any + * more buffers on the second pass). + */ + } while (wait && retry && ++pass<=2); + return err; +} + +void sync_dev(kdev_t dev) +{ + sync_supers(dev); + sync_inodes(dev); + DQUOT_SYNC(dev); + /* sync all the dirty buffers out to disk only _after_ all the + high level layers finished generated buffer dirty data + (or we'll return with some buffer still dirty on the blockdevice + so breaking the semantics of this call) */ + sync_buffers(dev, 0); + /* + * FIXME(eric) we need to sync the physical devices here. + * This is because some (scsi) controllers have huge amounts of + * cache onboard (hundreds of Mb), and we need to instruct + * them to commit all of the dirty memory to disk, and we should + * not return until this has happened. + * + * This would need to get implemented by going through the assorted + * layers so that each block major number can be synced, and this + * would call down into the upper and mid-layer scsi. 
+ */ +} + +int fsync_dev(kdev_t dev) +{ + sync_buffers(dev, 0); + + lock_kernel(); + sync_supers(dev); + sync_inodes(dev); + DQUOT_SYNC(dev); + unlock_kernel(); + + return sync_buffers(dev, 1); +} + +asmlinkage long sys_sync(void) +{ + fsync_dev(0); + return 0; +} + +/* + * filp may be NULL if called via the msync of a vma. + */ + +int file_fsync(struct file *filp, struct dentry *dentry, int datasync) +{ + struct inode * inode = dentry->d_inode; + struct super_block * sb; + kdev_t dev; + int ret; + + lock_kernel(); + /* sync the inode to buffers */ + write_inode_now(inode, 0); + + /* sync the superblock to buffers */ + sb = inode->i_sb; + lock_super(sb); + if (sb->s_op && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + + /* .. finally sync the buffers to disk */ + dev = inode->i_dev; + ret = sync_buffers(dev, 1); + unlock_kernel(); + return ret; +} + +asmlinkage long sys_fsync(unsigned int fd) +{ + struct file * file; + struct dentry * dentry; + struct inode * inode; + int err; + + err = -EBADF; + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_dentry; + inode = dentry->d_inode; + + err = -EINVAL; + if (!file->f_op || !file->f_op->fsync) + goto out_putf; + + /* We need to protect against concurrent writers.. */ + down(&inode->i_sem); + filemap_fdatasync(inode->i_mapping); + err = file->f_op->fsync(file, dentry, 0); + filemap_fdatawait(inode->i_mapping); + up(&inode->i_sem); + +out_putf: + fput(file); +out: + return err; +} + +asmlinkage long sys_fdatasync(unsigned int fd) +{ + struct file * file; + struct dentry * dentry; + struct inode * inode; + int err; + + err = -EBADF; + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_dentry; + inode = dentry->d_inode; + + err = -EINVAL; + if (!file->f_op || !file->f_op->fsync) + goto out_putf; + + down(&inode->i_sem); + filemap_fdatasync(inode->i_mapping); + err = file->f_op->fsync(file, dentry, 1); + filemap_fdatawait(inode->i_mapping); + up(&inode->i_sem); + +out_putf: + fput(file); +out: + return err; +} + +/* After several hours of tedious analysis, the following hash + * function won. Do not mess with it... 
-DaveM + */ +#define _hashfn(dev,block) \ + ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \ + (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \ + ((block) << (bh_hash_shift - 12)))) +#define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)] + +static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head) +{ + if ((bh->b_next = *head) != NULL) + bh->b_next->b_pprev = &bh->b_next; + *head = bh; + bh->b_pprev = head; +} + +static __inline__ void __hash_unlink(struct buffer_head *bh) +{ + if (bh->b_pprev) { + if (bh->b_next) + bh->b_next->b_pprev = bh->b_pprev; + *(bh->b_pprev) = bh->b_next; + bh->b_pprev = NULL; + } +} + +static void __insert_into_lru_list(struct buffer_head * bh, int blist) +{ + struct buffer_head **bhp = &lru_list[blist]; + + if(!*bhp) { + *bhp = bh; + bh->b_prev_free = bh; + } + bh->b_next_free = *bhp; + bh->b_prev_free = (*bhp)->b_prev_free; + (*bhp)->b_prev_free->b_next_free = bh; + (*bhp)->b_prev_free = bh; + nr_buffers_type[blist]++; + size_buffers_type[blist] += bh->b_size; +} + +static void __remove_from_lru_list(struct buffer_head * bh, int blist) +{ + if (bh->b_prev_free || bh->b_next_free) { + bh->b_prev_free->b_next_free = bh->b_next_free; + bh->b_next_free->b_prev_free = bh->b_prev_free; + if (lru_list[blist] == bh) + lru_list[blist] = bh->b_next_free; + if (lru_list[blist] == bh) + lru_list[blist] = NULL; + bh->b_next_free = bh->b_prev_free = NULL; + nr_buffers_type[blist]--; + size_buffers_type[blist] -= bh->b_size; + } +} + +static void __remove_from_free_list(struct buffer_head * bh, int index) +{ + if(bh->b_next_free == bh) + free_list[index].list = NULL; + else { + bh->b_prev_free->b_next_free = bh->b_next_free; + bh->b_next_free->b_prev_free = bh->b_prev_free; + if (free_list[index].list == bh) + free_list[index].list = bh->b_next_free; + } + bh->b_next_free = bh->b_prev_free = NULL; +} + +/* must be called with both the hash_table_lock and the lru_list_lock + held */ +static void __remove_from_queues(struct buffer_head *bh) +{ + __hash_unlink(bh); + __remove_from_lru_list(bh, bh->b_list); +} + +static void __insert_into_queues(struct buffer_head *bh) +{ + struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr); + + __hash_link(bh, head); + __insert_into_lru_list(bh, bh->b_list); +} + +/* This function must only run if there are no other + * references _anywhere_ to this buffer head. + */ +static void put_last_free(struct buffer_head * bh) +{ + struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)]; + struct buffer_head **bhp = &head->list; + + bh->b_state = 0; + + spin_lock(&head->lock); + bh->b_dev = B_FREE; + if(!*bhp) { + *bhp = bh; + bh->b_prev_free = bh; + } + bh->b_next_free = *bhp; + bh->b_prev_free = (*bhp)->b_prev_free; + (*bhp)->b_prev_free->b_next_free = bh; + (*bhp)->b_prev_free = bh; + spin_unlock(&head->lock); +} + +/* + * Why like this, I hear you say... The reason is race-conditions. + * As we don't lock buffers (unless we are reading them, that is), + * something might happen to it while we sleep (ie a read-error + * will force it bad). This shouldn't really happen currently, but + * the code is ready. 
+ */ +static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size) +{ + struct buffer_head *bh = hash(dev, block); + + for (; bh; bh = bh->b_next) + if (bh->b_blocknr == block && + bh->b_size == size && + bh->b_dev == dev) + break; + if (bh) + atomic_inc(&bh->b_count); + + return bh; +} + +struct buffer_head * get_hash_table(kdev_t dev, int block, int size) +{ + struct buffer_head *bh; + + read_lock(&hash_table_lock); + bh = __get_hash_table(dev, block, size); + read_unlock(&hash_table_lock); + + return bh; +} + +unsigned int get_hardblocksize(kdev_t dev) +{ + int blksize = 0; + /* + * Get the hard sector size for the given device. + * If we don't know what it is, return 0. + */ + if (hardsect_size[MAJOR(dev)] != NULL) + blksize = hardsect_size[MAJOR(dev)][MINOR(dev)]; + return blksize; +} + +void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode) +{ + spin_lock(&lru_list_lock); + if (bh->b_inode) + list_del(&bh->b_inode_buffers); + bh->b_inode = inode; + list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers); + spin_unlock(&lru_list_lock); +} + +/* The caller must have the lru_list lock before calling the + remove_inode_queue functions. */ +static void __remove_inode_queue(struct buffer_head *bh) +{ + bh->b_inode = NULL; + list_del(&bh->b_inode_buffers); +} + +static inline void remove_inode_queue(struct buffer_head *bh) +{ + if (bh->b_inode) + __remove_inode_queue(bh); +} + +int inode_has_buffers(struct inode *inode) +{ + int ret; + + spin_lock(&lru_list_lock); + ret = !list_empty(&inode->i_dirty_buffers); + spin_unlock(&lru_list_lock); + + return ret; +} + + +/* If invalidate_buffers() will trash dirty buffers, it means some kind + of fs corruption is going on. Trashing dirty data always imply losing + information that was supposed to be just stored on the physical layer + by the user. + + Thus invalidate_buffers in general usage is not allwowed to trash dirty + buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved. + + NOTE: In the case where the user removed a removable-media-disk even if + there's still dirty data not synced on disk (due a bug in the device driver + or due an error of the user), by not destroying the dirty buffers we could + generate corruption also on the next media inserted, thus a parameter is + necessary to handle this case in the most safe way possible (trying + to not corrupt also the new disk inserted with the data belonging to + the old now corrupted disk). Also for the ramdisk the natural thing + to do in order to release the ramdisk memory is to destroy dirty buffers. + + These are two special cases. Normal usage imply the device driver + to issue a sync on the device (without waiting I/O completation) and + then an invalidate_buffers call that doesn't trash dirty buffers. */ +void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers) +{ + int i, nlist, slept; + struct buffer_head * bh, * bh_next; + + retry: + slept = 0; + spin_lock(&lru_list_lock); + for(nlist = 0; nlist < NR_LIST; nlist++) { + bh = lru_list[nlist]; + if (!bh) + continue; + for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) { + bh_next = bh->b_next_free; + + /* Another device? */ + if (bh->b_dev != dev) + continue; + /* Part of a mapping? 
*/ + if (bh->b_page->mapping) + continue; + if (buffer_locked(bh)) { + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + slept = 1; + spin_lock(&lru_list_lock); + atomic_dec(&bh->b_count); + } + + write_lock(&hash_table_lock); + if (!atomic_read(&bh->b_count) && + (destroy_dirty_buffers || !buffer_dirty(bh))) { + remove_inode_queue(bh); + __remove_from_queues(bh); + put_last_free(bh); + } + /* else complain loudly? */ + + write_unlock(&hash_table_lock); + if (slept) + goto out; + } + } +out: + spin_unlock(&lru_list_lock); + if (slept) + goto retry; +} + +void set_blocksize(kdev_t dev, int size) +{ + extern int *blksize_size[]; + int i, nlist, slept; + struct buffer_head * bh, * bh_next; + + if (!blksize_size[MAJOR(dev)]) + return; + + /* Size must be a power of two, and between 512 and PAGE_SIZE */ + if (size > PAGE_SIZE || size < 512 || (size & (size-1))) + panic("Invalid blocksize passed to set_blocksize"); + + if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) { + blksize_size[MAJOR(dev)][MINOR(dev)] = size; + return; + } + if (blksize_size[MAJOR(dev)][MINOR(dev)] == size) + return; + sync_buffers(dev, 2); + blksize_size[MAJOR(dev)][MINOR(dev)] = size; + + retry: + slept = 0; + spin_lock(&lru_list_lock); + for(nlist = 0; nlist < NR_LIST; nlist++) { + bh = lru_list[nlist]; + if (!bh) + continue; + for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) { + bh_next = bh->b_next_free; + if (bh->b_dev != dev || bh->b_size == size) + continue; + if (buffer_locked(bh)) { + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + slept = 1; + spin_lock(&lru_list_lock); + atomic_dec(&bh->b_count); + } + + write_lock(&hash_table_lock); + if (!atomic_read(&bh->b_count)) { + if (buffer_dirty(bh)) + printk(KERN_WARNING + "set_blocksize: dev %s buffer_dirty %lu size %hu\n", + kdevname(dev), bh->b_blocknr, bh->b_size); + remove_inode_queue(bh); + __remove_from_queues(bh); + put_last_free(bh); + } else { + if (atomic_set_buffer_clean(bh)) + __refile_buffer(bh); + clear_bit(BH_Uptodate, &bh->b_state); + printk(KERN_WARNING + "set_blocksize: " + "b_count %d, dev %s, block %lu, from %p\n", + atomic_read(&bh->b_count), bdevname(bh->b_dev), + bh->b_blocknr, __builtin_return_address(0)); + } + write_unlock(&hash_table_lock); + if (slept) + goto out; + } + } + out: + spin_unlock(&lru_list_lock); + if (slept) + goto retry; +} + +/* + * We used to try various strange things. Let's not. + * We'll just try to balance dirty buffers, and possibly + * launder some pages. + */ +static void refill_freelist(int size) +{ + balance_dirty(NODEV); + if (free_shortage()) + page_launder(GFP_BUFFER, 0); + grow_buffers(size); +} + +void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) +{ + bh->b_list = BUF_CLEAN; + bh->b_end_io = handler; + bh->b_private = private; +} + +static void end_buffer_io_async(struct buffer_head * bh, int uptodate) +{ + static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED; + unsigned long flags; + struct buffer_head *tmp; + struct page *page; + + mark_buffer_uptodate(bh, uptodate); + + /* This is a temporary buffer used for page I/O. */ + page = bh->b_page; + + if (!uptodate) + SetPageError(page); + + /* + * Be _very_ careful from here on. Bad things can happen if + * two buffer heads end IO at almost the same time and both + * decide that the page is now completely done. + * + * Async buffer_heads are here only as labels for IO, and get + * thrown away once the IO for this page is complete. 
IO is + * deemed complete once all buffers have been visited + * (b_count==0) and are now unlocked. We must make sure that + * only the _last_ buffer that decrements its count is the one + * that unlock the page.. + */ + spin_lock_irqsave(&page_uptodate_lock, flags); + unlock_buffer(bh); + atomic_dec(&bh->b_count); + tmp = bh->b_this_page; + while (tmp != bh) { + if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp)) + goto still_busy; + tmp = tmp->b_this_page; + } + + /* OK, the async IO on this page is complete. */ + spin_unlock_irqrestore(&page_uptodate_lock, flags); + + /* + * if none of the buffers had errors then we can set the + * page uptodate: + */ + if (!PageError(page)) + SetPageUptodate(page); + + /* + * Run the hooks that have to be done when a page I/O has completed. + */ + if (PageTestandClearDecrAfter(page)) + atomic_dec(&nr_async_pages); + + UnlockPage(page); + + return; + +still_busy: + spin_unlock_irqrestore(&page_uptodate_lock, flags); + return; +} + +void set_buffer_async_io(struct buffer_head *bh) { + bh->b_end_io = end_buffer_io_async ; +} + +/* + * Synchronise all the inode's dirty buffers to the disk. + * + * We have conflicting pressures: we want to make sure that all + * initially dirty buffers get waited on, but that any subsequently + * dirtied buffers don't. After all, we don't want fsync to last + * forever if somebody is actively writing to the file. + * + * Do this in two main stages: first we copy dirty buffers to a + * temporary inode list, queueing the writes as we go. Then we clean + * up, waiting for those writes to complete. + * + * During this second stage, any subsequent updates to the file may end + * up refiling the buffer on the original inode's dirty list again, so + * there is a chance we will end up with a buffer queued for write but + * not yet completed on that list. So, as a final cleanup we go through + * the osync code to catch these locked, dirty buffers without requeuing + * any newly dirty buffers for write. + */ + +int fsync_inode_buffers(struct inode *inode) +{ + struct buffer_head *bh; + struct inode tmp; + int err = 0, err2; + + INIT_LIST_HEAD(&tmp.i_dirty_buffers); + + spin_lock(&lru_list_lock); + + while (!list_empty(&inode->i_dirty_buffers)) { + bh = BH_ENTRY(inode->i_dirty_buffers.next); + list_del(&bh->b_inode_buffers); + if (!buffer_dirty(bh) && !buffer_locked(bh)) + bh->b_inode = NULL; + else { + bh->b_inode = &tmp; + list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers); + atomic_inc(&bh->b_count); + if (buffer_dirty(bh)) { + spin_unlock(&lru_list_lock); + ll_rw_block(WRITE, 1, &bh); + spin_lock(&lru_list_lock); + } + } + } + + while (!list_empty(&tmp.i_dirty_buffers)) { + bh = BH_ENTRY(tmp.i_dirty_buffers.prev); + remove_inode_queue(bh); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(&lru_list_lock); + } + + spin_unlock(&lru_list_lock); + err2 = osync_inode_buffers(inode); + + if (err) + return err; + else + return err2; +} + + +/* + * osync is designed to support O_SYNC io. It waits synchronously for + * all already-submitted IO to complete, but does not queue any new + * writes to the disk. + * + * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as + * you dirty the buffers, and then use osync_inode_buffers to wait for + * completion. Any other dirty buffers which are not yet queued for + * write will not be flushed to disk by the osync. 
+ */ + +int osync_inode_buffers(struct inode *inode) +{ + struct buffer_head *bh; + struct list_head *list; + int err = 0; + + spin_lock(&lru_list_lock); + + repeat: + + for (list = inode->i_dirty_buffers.prev; + bh = BH_ENTRY(list), list != &inode->i_dirty_buffers; + list = bh->b_inode_buffers.prev) { + if (buffer_locked(bh)) { + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + brelse(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + spin_lock(&lru_list_lock); + goto repeat; + } + } + + spin_unlock(&lru_list_lock); + return err; +} + + +/* + * Invalidate any and all dirty buffers on a given inode. We are + * probably unmounting the fs, but that doesn't mean we have already + * done a sync(). Just drop the buffers from the inode list. + */ + +void invalidate_inode_buffers(struct inode *inode) +{ + struct list_head *list, *next; + + spin_lock(&lru_list_lock); + list = inode->i_dirty_buffers.next; + while (list != &inode->i_dirty_buffers) { + next = list->next; + remove_inode_queue(BH_ENTRY(list)); + list = next; + } + spin_unlock(&lru_list_lock); +} + + +/* + * Ok, this is getblk, and it isn't very clear, again to hinder + * race-conditions. Most of the code is seldom used, (ie repeating), + * so it should be much more efficient than it looks. + * + * The algorithm is changed: hopefully better, and an elusive bug removed. + * + * 14.02.92: changed it to sync dirty buffers a bit: better performance + * when the filesystem starts to get full of dirty blocks (I hope). + */ +struct buffer_head * getblk(kdev_t dev, int block, int size) +{ + struct buffer_head * bh; + int isize; + +repeat: + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + bh = __get_hash_table(dev, block, size); + if (bh) + goto out; + + isize = BUFSIZE_INDEX(size); + spin_lock(&free_list[isize].lock); + bh = free_list[isize].list; + if (bh) { + __remove_from_free_list(bh, isize); + atomic_set(&bh->b_count, 1); + } + spin_unlock(&free_list[isize].lock); + + /* + * OK, FINALLY we know that this buffer is the only one of + * its kind, we hold a reference (b_count>0), it is unlocked, + * and it is clean. + */ + if (bh) { + init_buffer(bh, NULL, NULL); + bh->b_dev = dev; + bh->b_blocknr = block; + bh->b_state = 1 << BH_Mapped; + + /* Insert the buffer into the regular lists */ + __insert_into_queues(bh); + out: + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + touch_buffer(bh); + return bh; + } + + /* + * If we block while refilling the free list, somebody may + * create the buffer first ... search the hashes again. + */ + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + refill_freelist(size); + goto repeat; +} + +/* -1 -> no need to flush + 0 -> async flush + 1 -> sync flush (wait for I/O completation) */ +int balance_dirty_state(kdev_t dev) +{ + unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit; + + dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT; + tot = nr_free_buffer_pages(); + + dirty *= 100; + soft_dirty_limit = tot * bdf_prm.b_un.nfract; + hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync; + + /* First, check for the "real" dirty limit. */ + if (dirty > soft_dirty_limit) { + if (dirty > hard_dirty_limit) + return 1; + return 0; + } + + return -1; +} + +/* + * if a new dirty buffer is created we need to balance bdflush. + * + * in the future we might want to make bdflush aware of different + * pressures on different devices - thus the (currently unused) + * 'dev' parameter. 
+ */ +void balance_dirty(kdev_t dev) +{ + int state = balance_dirty_state(dev); + + if (state < 0) + return; + + if (state && (!dev || MAJOR(dev) == LOOP_MAJOR)) + state = 0; + + wakeup_bdflush(state); +} + +static __inline__ void __mark_dirty(struct buffer_head *bh) +{ + bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer; + refile_buffer(bh); +} + +/* atomic version, the user must call balance_dirty() by hand + as soon as it become possible to block */ +void __mark_buffer_dirty(struct buffer_head *bh) +{ + if (!atomic_set_buffer_dirty(bh)) + __mark_dirty(bh); +} + +void mark_buffer_dirty(struct buffer_head *bh) +{ + if (!atomic_set_buffer_dirty(bh)) { + __mark_dirty(bh); + balance_dirty(bh->b_dev); + } +} + +/* + * A buffer may need to be moved from one buffer list to another + * (e.g. in case it is not shared any more). Handle this. + */ +static void __refile_buffer(struct buffer_head *bh) +{ + int dispose = BUF_CLEAN; + if (buffer_locked(bh)) + dispose = BUF_LOCKED; + if (buffer_dirty(bh)) + dispose = BUF_DIRTY; + if (buffer_protected(bh)) + dispose = BUF_PROTECTED; + if (dispose != bh->b_list) { + __remove_from_lru_list(bh, bh->b_list); + bh->b_list = dispose; + if (dispose == BUF_CLEAN) + remove_inode_queue(bh); + __insert_into_lru_list(bh, dispose); + } +} + +void refile_buffer(struct buffer_head *bh) +{ + spin_lock(&lru_list_lock); + __refile_buffer(bh); + spin_unlock(&lru_list_lock); +} + +/* + * Release a buffer head + */ +void __brelse(struct buffer_head * buf) +{ + if (atomic_read(&buf->b_count)) { + atomic_dec(&buf->b_count); + return; + } + printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n"); +} + +/* + * bforget() is like brelse(), except it puts the buffer on the + * free list if it can.. We can NOT free the buffer if: + * - there are other users of it + * - it is locked and thus can have active IO + */ +void __bforget(struct buffer_head * buf) +{ + /* grab the lru lock here to block bdflush. */ + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf) || buffer_protected(buf)) + goto in_use; + __hash_unlink(buf); + remove_inode_queue(buf); + write_unlock(&hash_table_lock); + __remove_from_lru_list(buf, buf->b_list); + spin_unlock(&lru_list_lock); + put_last_free(buf); + return; + + in_use: + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); +} + +/* + * bread() reads a specified block and returns the buffer that contains + * it. It returns NULL if the block was unreadable. + */ +struct buffer_head * bread(kdev_t dev, int block, int size) +{ + struct buffer_head * bh; + + bh = getblk(dev, block, size); + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + brelse(bh); + return NULL; +} + +/* + * Note: the caller should wake up the buffer_wait list if needed. + */ +static __inline__ void __put_unused_buffer_head(struct buffer_head * bh) +{ + if (bh->b_inode) + BUG(); + if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { + kmem_cache_free(bh_cachep, bh); + } else { + bh->b_blocknr = -1; + init_waitqueue_head(&bh->b_wait); + nr_unused_buffer_heads++; + bh->b_next_free = unused_list; + bh->b_this_page = NULL; + unused_list = bh; + } +} + +/* + * Reserve NR_RESERVED buffer heads for async IO requests to avoid + * no-buffer-head deadlock. Return NULL on failure; waiting for + * buffer heads is now handled in create_buffers(). 
+ */ +static struct buffer_head * get_unused_buffer_head(int async) +{ + struct buffer_head * bh; + + spin_lock(&unused_list_lock); + if (nr_unused_buffer_heads > NR_RESERVED) { + bh = unused_list; + unused_list = bh->b_next_free; + nr_unused_buffer_heads--; + spin_unlock(&unused_list_lock); + return bh; + } + spin_unlock(&unused_list_lock); + + /* This is critical. We can't swap out pages to get + * more buffer heads, because the swap-out may need + * more buffer-heads itself. Thus SLAB_BUFFER. + */ + if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) { + memset(bh, 0, sizeof(*bh)); + init_waitqueue_head(&bh->b_wait); + return bh; + } + + /* + * If we need an async buffer, use the reserved buffer heads. + */ + if (async) { + spin_lock(&unused_list_lock); + if (unused_list) { + bh = unused_list; + unused_list = bh->b_next_free; + nr_unused_buffer_heads--; + spin_unlock(&unused_list_lock); + return bh; + } + spin_unlock(&unused_list_lock); + } +#if 0 + /* + * (Pending further analysis ...) + * Ordinary (non-async) requests can use a different memory priority + * to free up pages. Any swapping thus generated will use async + * buffer heads. + */ + if(!async && + (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) { + memset(bh, 0, sizeof(*bh)); + init_waitqueue_head(&bh->b_wait); + return bh; + } +#endif + + return NULL; +} + +void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset) +{ + bh->b_page = page; + if (offset >= PAGE_SIZE) + BUG(); + if (PageHighMem(page)) + /* + * This catches illegal uses and preserves the offset: + */ + bh->b_data = (char *)(0 + offset); + else + bh->b_data = page_address(page) + offset; +} + +/* + * Create the appropriate buffers when given a page for data area and + * the size of each buffer.. Use the bh->b_this_page linked list to + * follow the buffers created. Return NULL if unable to create more + * buffers. + * The async flag is used to differentiate async IO (paging, swapping) + * from ordinary buffer allocations, and only async requests are allowed + * to sleep waiting for buffer heads. + */ +static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async) +{ + struct buffer_head *bh, *head; + long offset; + +try_again: + head = NULL; + offset = PAGE_SIZE; + while ((offset -= size) >= 0) { + bh = get_unused_buffer_head(async); + if (!bh) + goto no_grow; + + bh->b_dev = B_FREE; /* Flag as unused */ + bh->b_this_page = head; + head = bh; + + bh->b_state = 0; + bh->b_next_free = NULL; + bh->b_pprev = NULL; + atomic_set(&bh->b_count, 0); + bh->b_size = size; + + set_bh_page(bh, page, offset); + + bh->b_list = BUF_CLEAN; + bh->b_end_io = NULL; + } + return head; +/* + * In case anything failed, we just free everything we got. + */ +no_grow: + if (head) { + spin_lock(&unused_list_lock); + do { + bh = head; + head = head->b_this_page; + __put_unused_buffer_head(bh); + } while (head); + spin_unlock(&unused_list_lock); + + /* Wake up any waiters ... */ + wake_up(&buffer_wait); + } + + /* + * Return failure for non-async IO requests. Async IO requests + * are not allowed to fail, so we have to wait until buffer heads + * become available. But we don't want tasks sleeping with + * partially complete buffers, so all were released above. + */ + if (!async) + return NULL; + + /* We're _really_ low on memory. Now we just + * wait for old buffer heads to become free due to + * finishing IO. 
Since this is an async request and + * the reserve list is empty, we're sure there are + * async buffer heads in use. + */ + run_task_queue(&tq_disk); + + /* + * Set our state for sleeping, then check again for buffer heads. + * This ensures we won't miss a wake_up from an interrupt. + */ + wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE); + goto try_again; +} + +static void unmap_buffer(struct buffer_head * bh) +{ + if (buffer_mapped(bh)) { + mark_buffer_clean(bh); + wait_on_buffer(bh); + clear_bit(BH_Uptodate, &bh->b_state); + clear_bit(BH_Mapped, &bh->b_state); + clear_bit(BH_Req, &bh->b_state); + clear_bit(BH_New, &bh->b_state); + } +} + +/* + * We don't have to release all buffers here, but + * we have to be sure that no dirty buffer is left + * and no IO is going on (no buffer is locked), because + * we have truncated the file and are going to free the + * blocks on-disk.. + */ +int block_flushpage(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + + if (!PageLocked(page)) + BUG(); + if (!page->buffers) + return 1; + + head = page->buffers; + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + /* + * is this block fully flushed? + */ + if (offset <= curr_off) + unmap_buffer(bh); + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * subtle. We release buffer-heads only if this is + * the 'final' flushpage. We have invalidated the get_block + * cached value unconditionally, so real IO is not + * possible anymore. + * + * If the free doesn't work out, the buffers can be + * left around - they just turn into anonymous buffers + * instead. + */ + if (!offset) { + if (!try_to_free_buffers(page, 0)) { + atomic_inc(&buffermem_pages); + return 0; + } + } + + return 1; +} + +static void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize) +{ + struct buffer_head *bh, *head, *tail; + + head = create_buffers(page, blocksize, 1); + if (page->buffers) + BUG(); + + bh = head; + do { + bh->b_dev = dev; + bh->b_blocknr = 0; + bh->b_end_io = NULL; + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + page->buffers = head; + page_cache_get(page); +} + +/* + * We are taking a block for data and we don't want any output from any + * buffer-cache aliases starting from return from that function and + * until the moment when something will explicitly mark the buffer + * dirty (hopefully that will not happen until we will free that block ;-) + * We don't even need to mark it not-uptodate - nobody can expect + * anything from a newly allocated buffer anyway. We used to used + * unmap_buffer() for such invalidation, but that was wrong. We definitely + * don't want to mark the alias unmapped, for example - it would confuse + * anyone who might pick it with bread() afterwards... + */ + +static void unmap_underlying_metadata(struct buffer_head * bh) +{ + struct buffer_head *old_bh; + + old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); + if (old_bh) { + mark_buffer_clean(old_bh); + wait_on_buffer(old_bh); + clear_bit(BH_Req, &old_bh->b_state); + /* Here we could run brelse or bforget. We use + bforget because it will try to put the buffer + in the freelist. */ + __bforget(old_bh); + } +} + +/* + * NOTE! 
All mapped/uptodate combinations are valid: + * + * Mapped Uptodate Meaning + * + * No No "unknown" - must do get_block() + * No Yes "hole" - zero-filled + * Yes No "allocated" - allocated on disk, not read in + * Yes Yes "valid" - allocated and up-to-date in memory. + * + * "Dirty" is valid only with the last case (mapped+uptodate). + */ + +/* + * block_write_full_page() is SMP-safe - currently it's still + * being called with the kernel lock held, but the code is ready. + */ +static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block) +{ + int err, i; + unsigned long block; + struct buffer_head *bh, *head; + int need_unlock = 1; + + if (!PageLocked(page)) + BUG(); + + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, inode->i_sb->s_blocksize); + head = page->buffers; + + block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + bh = head; + i = 0; + + /* Stage 1: make sure we have all the buffers mapped! */ + do { + /* + * If the buffer isn't up-to-date, we can't be sure + * that the buffer has been initialized with the proper + * block number information etc.. + * + * Leave it to the low-level FS to make all those + * decisions (block #0 may actually be a valid block) + */ + if (!buffer_mapped(bh)) { + err = get_block(inode, block, bh, 1); + if (err) + goto out; + if (buffer_new(bh)) + unmap_underlying_metadata(bh); + } + bh = bh->b_this_page; + block++; + } while (bh != head); + + /* Stage 2: lock the buffers, mark them clean */ + do { + lock_buffer(bh); + bh->b_end_io = end_buffer_io_async; + atomic_inc(&bh->b_count); + set_bit(BH_Uptodate, &bh->b_state); + clear_bit(BH_Dirty, &bh->b_state); + bh = bh->b_this_page; + } while (bh != head); + + SetPageUptodate(page); + /* Stage 3: submit the IO */ + do { + submit_bh(WRITE, bh); + bh = bh->b_this_page; + } while (bh != head); + + /* Done - end_buffer_io_async will unlock */ + return 0; + +out: + ClearPageUptodate(page); + bh = head; + need_unlock = 1; + /* Recovery: lock and submit the mapped buffers */ + do { + if (buffer_mapped(bh)) { + lock_buffer(bh); + need_unlock = 0; + } + bh = bh->b_this_page; + } while (bh != head); + do { + if (buffer_mapped(bh)) { + bh->b_end_io = end_buffer_io_async; + atomic_inc(&bh->b_count); + set_bit(BH_Uptodate, &bh->b_state); + clear_bit(BH_Dirty, &bh->b_state); + submit_bh(WRITE, bh); + } + bh = bh->b_this_page; + } while(bh != head); + if (need_unlock) + UnlockPage(page); + return err; +} + +static int __block_prepare_write(struct inode *inode, struct page *page, + unsigned from, unsigned to, get_block_t *get_block) +{ + unsigned block_start, block_end; + unsigned long block; + int err = 0; + unsigned blocksize, bbits; + struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; + char *kaddr = kmap(page); + + blocksize = inode->i_sb->s_blocksize; + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, blocksize); + head = page->buffers; + + bbits = inode->i_sb->s_blocksize_bits; + block = page->index << (PAGE_CACHE_SHIFT - bbits); + + for(bh = head, block_start = 0; bh != head || !block_start; + block++, block_start=block_end, bh = bh->b_this_page) { + if (!bh) + BUG(); + block_end = block_start+blocksize; + if (block_end <= from) + continue; + if (block_start >= to) + break; + if (!buffer_mapped(bh)) { + err = get_block(inode, block, bh, 1); + if (err) + goto out; + if (buffer_new(bh)) { + unmap_underlying_metadata(bh); + if (Page_Uptodate(page)) { + set_bit(BH_Uptodate, &bh->b_state); + continue; + } + if (block_end > 
to) + memset(kaddr+to, 0, block_end-to); + if (block_start < from) + memset(kaddr+block_start, 0, from-block_start); + if (block_end > to || block_start < from) + flush_dcache_page(page); + continue; + } + } + if (Page_Uptodate(page)) { + set_bit(BH_Uptodate, &bh->b_state); + continue; + } + if (!buffer_uptodate(bh) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } + } + /* + * If we issued read requests - let them complete. + */ + while(wait_bh > wait) { + wait_on_buffer(*--wait_bh); + err = -EIO; + if (!buffer_uptodate(*wait_bh)) + goto out; + } + return 0; +out: + bh = head; + block_start = 0; + do { + if (buffer_new(bh) && !buffer_uptodate(bh)) { + memset(kaddr+block_start, 0, bh->b_size); + set_bit(BH_Uptodate, &bh->b_state); + mark_buffer_dirty(bh); + } + block_start += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); + return err; +} + +static int __block_commit_write(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0, need_balance_dirty = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + + blocksize = inode->i_sb->s_blocksize; + + for(bh = head = page->buffers, block_start = 0; + bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_bit(BH_Uptodate, &bh->b_state); + if (!atomic_set_buffer_dirty(bh)) { + __mark_dirty(bh); + buffer_insert_inode_queue(bh, inode); + need_balance_dirty = 1; + } + } + } + + if (need_balance_dirty) + balance_dirty(bh->b_dev); + /* + * is this a partial write that happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' wether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return 0; +} + +/* + * Generic "read page" function for block devices that have the normal + * get_block functionality. This is most of the block device filesystems. + * Reads the page asynchronously --- the unlock_buffer() and + * mark_buffer_uptodate() functions propagate buffer state into the + * page struct once IO has completed. 
+ */ +int block_read_full_page(struct page *page, get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + unsigned long iblock, lblock; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + unsigned int blocksize, blocks; + int nr, i; + + if (!PageLocked(page)) + PAGE_BUG(page); + blocksize = inode->i_sb->s_blocksize; + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, blocksize); + head = page->buffers; + + blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits; + iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits; + bh = head; + nr = 0; + i = 0; + + do { + if (buffer_uptodate(bh)) + continue; + + if (!buffer_mapped(bh)) { + if (iblock < lblock) { + if (get_block(inode, iblock, bh, 0)) + continue; + } + if (!buffer_mapped(bh)) { + memset(kmap(page) + i*blocksize, 0, blocksize); + flush_dcache_page(page); + kunmap(page); + set_bit(BH_Uptodate, &bh->b_state); + continue; + } + /* get_block() might have updated the buffer synchronously */ + if (buffer_uptodate(bh)) + continue; + } + + arr[nr] = bh; + nr++; + } while (i++, iblock++, (bh = bh->b_this_page) != head); + + if (!nr) { + /* + * all buffers are uptodate - we can set the page + * uptodate as well. + */ + SetPageUptodate(page); + UnlockPage(page); + return 0; + } + + /* Stage two: lock the buffers */ + for (i = 0; i < nr; i++) { + struct buffer_head * bh = arr[i]; + lock_buffer(bh); + bh->b_end_io = end_buffer_io_async; + atomic_inc(&bh->b_count); + } + + /* Stage 3: start the IO */ + for (i = 0; i < nr; i++) + submit_bh(READ, arr[i]); + + return 0; +} + +/* + * For moronic filesystems that do not allow holes in file. + * We may have to extend the file. + */ + +int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct page *new_page; + unsigned long pgpos; + long status; + unsigned zerofrom; + unsigned blocksize = inode->i_sb->s_blocksize; + char *kaddr; + + while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { + status = -ENOMEM; + new_page = grab_cache_page(mapping, pgpos); + if (!new_page) + goto out; + /* we might sleep */ + if (*bytes>>PAGE_CACHE_SHIFT != pgpos) { + UnlockPage(new_page); + page_cache_release(new_page); + continue; + } + zerofrom = *bytes & ~PAGE_CACHE_MASK; + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + status = __block_prepare_write(inode, new_page, zerofrom, + PAGE_CACHE_SIZE, get_block); + if (status) + goto out_unmap; + kaddr = page_address(new_page); + memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); + flush_dcache_page(new_page); + __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE); + kunmap(new_page); + UnlockPage(new_page); + page_cache_release(new_page); + } + + if (page->index < pgpos) { + /* completely inside the area */ + zerofrom = offset; + } else { + /* page covers the boundary, find the boundary offset */ + zerofrom = *bytes & ~PAGE_CACHE_MASK; + + /* if we will expand the thing last block will be filled */ + if (to > zerofrom && (zerofrom & (blocksize-1))) { + *bytes |= (blocksize-1); + (*bytes)++; + } + + /* starting below the boundary? 
Nothing to zero out */ + if (offset <= zerofrom) + zerofrom = offset; + } + status = __block_prepare_write(inode, page, zerofrom, to, get_block); + if (status) + goto out1; + kaddr = page_address(page); + if (zerofrom < offset) { + memset(kaddr+zerofrom, 0, offset-zerofrom); + flush_dcache_page(page); + __block_commit_write(inode, page, zerofrom, offset); + } + return 0; +out1: + ClearPageUptodate(page); + kunmap(page); + return status; + +out_unmap: + ClearPageUptodate(new_page); + kunmap(new_page); + UnlockPage(new_page); + page_cache_release(new_page); +out: + return status; +} + +int block_prepare_write(struct page *page, unsigned from, unsigned to, + get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + int err = __block_prepare_write(inode, page, from, to, get_block); + if (err) { + ClearPageUptodate(page); + kunmap(page); + } + return err; +} + +int generic_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + __block_commit_write(inode,page,from,to); + kunmap(page); + if (pos > inode->i_size) { + inode->i_size = pos; + mark_inode_dirty(inode); + } + return 0; +} + +int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) +{ + unsigned long index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head *bh; + int err; + + blocksize = inode->i_sb->s_blocksize; + length = offset & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, blocksize); + + /* Find the buffer that contains "offset" */ + bh = page->buffers; + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (!buffer_mapped(bh)) { + /* Hole? Nothing to do */ + if (buffer_uptodate(bh)) + goto unlock; + get_block(inode, iblock, bh, 0); + /* Still unmapped? Nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (Page_Uptodate(page)) + set_bit(BH_Uptodate, &bh->b_state); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + memset(kmap(page) + offset, 0, length); + flush_dcache_page(page); + kunmap(page); + + __mark_buffer_dirty(bh); + err = 0; + +unlock: + UnlockPage(page); + page_cache_release(page); +out: + return err; +} + +int block_write_full_page(struct page *page, get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + int err; + + /* easy case */ + if (page->index < end_index) + return __block_write_full_page(inode, page, get_block); + + /* things got complicated... */ + offset = inode->i_size & (PAGE_CACHE_SIZE-1); + /* OK, are we completely out? */ + if (page->index >= end_index+1 || !offset) { + UnlockPage(page); + return -EIO; + } + + /* Sigh... will have to work, then... 
*/ + err = __block_prepare_write(inode, page, 0, offset, get_block); + if (!err) { + memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + __block_commit_write(inode,page,0,offset); +done: + kunmap(page); + UnlockPage(page); + return err; + } + ClearPageUptodate(page); + goto done; +} + +int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block) +{ + struct buffer_head tmp; + struct inode *inode = mapping->host; + tmp.b_state = 0; + tmp.b_blocknr = 0; + get_block(inode, block, &tmp, 0); + return tmp.b_blocknr; +} + +static inline void brw_kio_put_iobuf(struct brw_cb *brw_cb, struct kiobuf *kiobuf) +{ + if (atomic_dec_and_test(&kiobuf->io_count)) { + int nr; + + /* Walk the buffer heads associated with this kiobuf + * checking for errors and freeing them as we go. + */ + for (nr=0; nr < brw_cb->nr; nr++) { + struct buffer_head *bh = brw_cb->bh[nr]; + if (buffer_uptodate(bh) && !kiobuf->errno) + kiobuf->transferred += bh->b_size; + else if (!kiobuf->errno) + kiobuf->errno = -EIO; + kmem_cache_free(bh_cachep, bh); + } + + if (kiobuf->end_io) + kiobuf->end_io(kiobuf); + wake_up(&kiobuf->wait_queue); + + kfree(brw_cb); + } +} + +/* + * IO completion routine for a buffer_head being used for kiobuf IO: we + * can't dispatch the kiobuf callback until io_count reaches 0. + */ + +static void end_buffer_io_kiobuf_async(struct buffer_head *bh, int uptodate) +{ + struct brw_cb *brw_cb; + struct kiobuf *kiobuf; + + mark_buffer_uptodate(bh, uptodate); + + brw_cb = bh->b_private; + unlock_buffer(bh); + + kiobuf = brw_cb->kiobuf; + if (!uptodate && !kiobuf->errno) + brw_cb->kiobuf->errno = -EIO; + brw_kio_put_iobuf(brw_cb, kiobuf); +} + + +/* + * Start I/O on a physical range of kernel memory, defined by a vector + * of kiobuf structs (much like a user-space iovec list). + * + * The kiobuf must already be locked for IO. IO is submitted + * asynchronously: you need to check page->locked, page->uptodate, and + * maybe wait on page->wait. + * + * It is up to the caller to make sure that there are enough blocks + * passed in to completely map the iobufs to disk. + */ + +int brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int err; + int length; + int bufind; + int pageind; + int bhind; + int offset; + unsigned long blocknr; + struct kiobuf * iobuf = NULL; + struct page * map; + struct buffer_head *tmp; + int bh_nr; + int i; + +#define MAX_KIOVEC_NR 8 + struct brw_cb *brw_cb_table[MAX_KIOVEC_NR]; + struct brw_cb *brw_cb; + + if (!nr) + return 0; + + if (nr > MAX_KIOVEC_NR) { + printk("kiovec too large: %d\n", nr); + BUG(); + } + + /* + * First, do some alignment and validity checks + */ + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + if ((iobuf->offset & (sector_size-1)) || + (iobuf->length & (sector_size-1))) { + printk("brw_kiovec_async: iobuf->offset=0x%x length=0x%x sector_size: 0x%x\n", iobuf->offset, iobuf->length, sector_size); + return -EINVAL; + } + + if (!iobuf->nr_pages) + panic("brw_kiovec: iobuf not initialised"); + } + + /* + * OK to walk down the iovec doing page IO on each page we find. 
+ */ + bufind = bhind = err = 0; + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + offset = iobuf->offset; + length = iobuf->length; + iobuf->errno = 0; + iobuf->transferred = 0; + atomic_inc(&iobuf->io_count); + + bh_nr = ((iobuf->nr_pages * PAGE_SIZE) - offset) / sector_size; + if (!bh_nr) { + printk("brw_kiovec_async: !bh_nr\n"); + return -EINVAL; + } + + /* FIXME: tie into userbeans here */ + brw_cb = kmalloc(sizeof(*brw_cb) + (bh_nr * sizeof(struct buffer_head *)), GFP_KERNEL); + if (!brw_cb) + return -ENOMEM; + + brw_cb_table[i] = brw_cb; + brw_cb->kiobuf = iobuf; + brw_cb->nr = 0; + + for (pageind = 0; pageind < iobuf->nr_pages; pageind++) { + map = iobuf->maplist[pageind]; + err = -EFAULT; + if (!map) + goto error; + + while (length > 0 && (bufind < nr_blocks)) { + blocknr = b[bufind++]; + tmp = kmem_cache_alloc(bh_cachep, SLAB_BUFFER); + err = -ENOMEM; + if (!tmp) + goto error; + + memset(tmp, 0, sizeof(*tmp)); + init_waitqueue_head(&tmp->b_wait); + tmp->b_dev = B_FREE; + tmp->b_size = sector_size; + set_bh_page(tmp, map, offset); + tmp->b_this_page = tmp; + + init_buffer(tmp, end_buffer_io_kiobuf_async, NULL); + tmp->b_dev = dev; + tmp->b_blocknr = blocknr; + tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req); + tmp->b_private = brw_cb; + + if (rw == WRITE) { + set_bit(BH_Uptodate, &tmp->b_state); + clear_bit(BH_Dirty, &tmp->b_state); + } + + brw_cb->bh[brw_cb->nr++] = tmp; + length -= sector_size; + offset += sector_size; + + atomic_inc(&iobuf->io_count); + + if (offset >= PAGE_SIZE) { + offset = 0; + break; + } + } /* End of block loop */ + } /* End of page loop */ + } /* End of iovec loop */ + + /* okay, we've setup all our io requests, now fire them off! */ + for (i = 0; i < nr; i++) { + int j; + brw_cb = brw_cb_table[i]; +#if 1 + for (j=0; j<brw_cb->nr; j++) + submit_bh(rw, brw_cb->bh[j]); + //ll_rw_block(rw, brw_cb->nr, brw_cb->bh); +#else + generic_make_requests(dev, rw, brw_cb->bh, brw_cb->nr); +#endif + brw_kio_put_iobuf(brw_cb, brw_cb->kiobuf); + } + + return 0; + + error: + /* Walk brw_cb_table freeing all the goop associated with each kiobuf */ + do { + brw_cb = brw_cb_table[i]; + if (brw_cb) { + /* We got an error allocating the bh'es. Just free the current + buffer_heads and exit. */ + for (bhind = brw_cb->nr; bhind--; ) + kmem_cache_free(bh_cachep, brw_cb->bh[bhind]); + atomic_dec(&brw_cb->kiobuf->io_count); + kfree(brw_cb); + } + } while (i--) ; + + return err; +} + +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int i; + int transferred = 0; + int err = 0; + + if (!nr) + return 0; + + /* queue up and trigger the io */ + err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size); + if (err) + goto out; + + /* wait on the last iovec first -- it's more likely to finish last */ + for (i=nr; --i >= 0; ) + kiobuf_wait_for_io(iovec[i]); + + run_task_queue(&tq_disk); + + /* okay, how much data actually got through? */ + for (i=0; i<nr; i++) { + if (iovec[i]->errno) { + if (!err) + err = iovec[i]->errno; + break; + } + transferred += iovec[i]->length; + } + +out: + return transferred ? transferred : err; +} + +/* + * Start I/O on a page. + * This function expects the page to be locked and may return + * before I/O is complete. You then have to check page->locked, + * page->uptodate, and maybe wait on page->wait. + * + * brw_page() is SMP-safe, although it's being called with the + * kernel lock held - but the code is ready. 
+ * + * FIXME: we need a swapper_inode->get_block function to remove + * some of the bmap kludges and interface ugliness here. + */ +int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size) +{ + struct buffer_head *head, *bh; + + if (!PageLocked(page)) + panic("brw_page: page not locked for I/O"); + + if (!page->buffers) + create_empty_buffers(page, dev, size); + head = bh = page->buffers; + + /* Stage 1: lock all the buffers */ + do { + lock_buffer(bh); + bh->b_blocknr = *(b++); + set_bit(BH_Mapped, &bh->b_state); + bh->b_end_io = end_buffer_io_async; + atomic_inc(&bh->b_count); + bh = bh->b_this_page; + } while (bh != head); + + /* Stage 2: start the IO */ + do { + submit_bh(rw, bh); + bh = bh->b_this_page; + } while (bh != head); + return 0; +} + +int block_symlink(struct inode *inode, const char *symname, int len) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + int err = -ENOMEM; + char *kaddr; + + if (!page) + goto fail; + err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); + if (err) + goto fail_map; + kaddr = page_address(page); + memcpy(kaddr, symname, len-1); + mapping->a_ops->commit_write(NULL, page, 0, len-1); + /* + * Notice that we are _not_ going to block here - end of page is + * unmapped, so this will only try to map the rest of page, see + * that it is unmapped (typically even will not look into inode - + * ->i_size will be enough for everything) and zero it out. + * OTOH it's obviously correct and should make the page up-to-date. + */ + err = mapping->a_ops->readpage(NULL, page); + wait_on_page(page); + page_cache_release(page); + if (err < 0) + goto fail; + mark_inode_dirty(inode); + return 0; +fail_map: + UnlockPage(page); + page_cache_release(page); +fail: + return err; +} + +/* + * Try to increase the number of buffers available: the size argument + * is used to determine what kind of buffers we want. + */ +static int grow_buffers(int size) +{ + struct page * page; + struct buffer_head *bh, *tmp; + struct buffer_head * insert_point; + int isize; + + if ((size & 511) || (size > PAGE_SIZE)) { + printk(KERN_ERR "VFS: grow_buffers: size = %d\n",size); + return 0; + } + + page = alloc_page(GFP_BUFFER); + if (!page) + goto out; + LockPage(page); + bh = create_buffers(page, size, 0); + if (!bh) + goto no_buffer_head; + + isize = BUFSIZE_INDEX(size); + + spin_lock(&free_list[isize].lock); + insert_point = free_list[isize].list; + tmp = bh; + while (1) { + if (insert_point) { + tmp->b_next_free = insert_point->b_next_free; + tmp->b_prev_free = insert_point; + insert_point->b_next_free->b_prev_free = tmp; + insert_point->b_next_free = tmp; + } else { + tmp->b_prev_free = tmp; + tmp->b_next_free = tmp; + } + insert_point = tmp; + if (tmp->b_this_page) + tmp = tmp->b_this_page; + else + break; + } + tmp->b_this_page = bh; + free_list[isize].list = bh; + spin_unlock(&free_list[isize].lock); + + page->buffers = bh; + page->flags &= ~(1 << PG_referenced); + lru_cache_add(page); + UnlockPage(page); + atomic_inc(&buffermem_pages); + return 1; + +no_buffer_head: + UnlockPage(page); + page_cache_release(page); +out: + return 0; +} + +/* + * Sync all the buffers on one page.. + * + * If we have old buffers that are locked, we'll + * wait on them, but we won't wait on the new ones + * we're writing out now. + * + * This all is required so that we can free up memory + * later. 
+ * + * Wait: + * 0 - no wait (this does not get called - see try_to_free_buffers below) + * 1 - start IO for dirty buffers + * 2 - wait for completion of locked buffers + */ +static void sync_page_buffers(struct buffer_head *bh, int wait) +{ + struct buffer_head * tmp = bh; + + do { + struct buffer_head *p = tmp; + tmp = tmp->b_this_page; + if (buffer_locked(p)) { + if (wait > 1) + __wait_on_buffer(p); + } else if (buffer_dirty(p)) + ll_rw_block(WRITE, 1, &p); + } while (tmp != bh); +} + +/* + * Can the buffer be thrown out? + */ +#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected)) +#define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS)) + +/* + * try_to_free_buffers() checks if all the buffers on this particular page + * are unused, and free's the page if so. + * + * Wake up bdflush() if this fails - if we're running low on memory due + * to dirty buffers, we need to flush them out as quickly as possible. + * + * NOTE: There are quite a number of ways that threads of control can + * obtain a reference to a buffer head within a page. So we must + * lock out all of these paths to cleanly toss the page. + */ +int try_to_free_buffers(struct page * page, int wait) +{ + struct buffer_head * tmp, * bh = page->buffers; + int index = BUFSIZE_INDEX(bh->b_size); + int loop = 0; + +cleaned_buffers_try_again: + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + spin_lock(&free_list[index].lock); + tmp = bh; + do { + struct buffer_head *p = tmp; + + tmp = tmp->b_this_page; + if (buffer_busy(p)) + goto busy_buffer_page; + } while (tmp != bh); + + spin_lock(&unused_list_lock); + tmp = bh; + do { + struct buffer_head * p = tmp; + tmp = tmp->b_this_page; + + /* The buffer can be either on the regular + * queues or on the free list.. + */ + if (p->b_dev != B_FREE) { + remove_inode_queue(p); + __remove_from_queues(p); + } else + __remove_from_free_list(p, index); + __put_unused_buffer_head(p); + } while (tmp != bh); + spin_unlock(&unused_list_lock); + + /* Wake up anyone waiting for buffer heads */ + wake_up(&buffer_wait); + + /* And free the page */ + page->buffers = NULL; + page_cache_release(page); + spin_unlock(&free_list[index].lock); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + return 1; + +busy_buffer_page: + /* Uhhuh, start writeback so that we don't end up with all dirty pages */ + spin_unlock(&free_list[index].lock); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + if (wait) { + sync_page_buffers(bh, wait); + /* We waited synchronously, so we can free the buffers. 
*/ + if (wait > 1 && !loop) { + loop = 1; + goto cleaned_buffers_try_again; + } + wakeup_bdflush(0); + } + return 0; +} + +/* ================== Debugging =================== */ + +void show_buffers(void) +{ +#ifdef CONFIG_SMP + struct buffer_head * bh; + int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0; + int protected = 0; + int nlist; + static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", }; +#endif + + printk("Buffer memory: %6dkB\n", + atomic_read(&buffermem_pages) << (PAGE_SHIFT-10)); + +#ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */ + if (!spin_trylock(&lru_list_lock)) + return; + for(nlist = 0; nlist < NR_LIST; nlist++) { + found = locked = dirty = used = lastused = protected = 0; + bh = lru_list[nlist]; + if(!bh) continue; + + do { + found++; + if (buffer_locked(bh)) + locked++; + if (buffer_protected(bh)) + protected++; + if (buffer_dirty(bh)) + dirty++; + if (atomic_read(&bh->b_count)) + used++, lastused = found; + bh = bh->b_next_free; + } while (bh != lru_list[nlist]); + { + int tmp = nr_buffers_type[nlist]; + if (found != tmp) + printk("%9s: BUG -> found %d, reported %d\n", + buf_types[nlist], found, tmp); + } + printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), " + "%d locked, %d protected, %d dirty\n", + buf_types[nlist], found, size_buffers_type[nlist]>>10, + used, lastused, locked, protected, dirty); + } + spin_unlock(&lru_list_lock); +#endif +} + +/* ===================== Init ======================= */ + +/* + * allocate the hash table and init the free list + * Use gfp() for the hash table to decrease TLB misses, use + * SLAB cache for buffer heads. + */ +void __init buffer_init(unsigned long mempages) +{ + int order, i; + unsigned int nr_hash; + + /* The buffer cache hash table is less important these days, + * trim it a bit. + */ + mempages >>= 14; + + mempages *= sizeof(struct buffer_head *); + + for (order = 0; (1 << order) < mempages; order++) + ; + + /* try to allocate something until we get it or we're asking + for something that is really too small */ + + do { + unsigned long tmp; + + nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *); + bh_hash_mask = (nr_hash - 1); + + tmp = nr_hash; + bh_hash_shift = 0; + while((tmp >>= 1UL) != 0UL) + bh_hash_shift++; + + hash_table = (struct buffer_head **) + __get_free_pages(GFP_ATOMIC, order); + } while (hash_table == NULL && --order > 0); + printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", + nr_hash, order, (PAGE_SIZE << order)); + + if (!hash_table) + panic("Failed to allocate buffer hash table\n"); + + /* Setup hash chains. */ + for(i = 0; i < nr_hash; i++) + hash_table[i] = NULL; + + /* Setup free lists. */ + for(i = 0; i < NR_SIZES; i++) { + free_list[i].list = NULL; + free_list[i].lock = SPIN_LOCK_UNLOCKED; + } + + /* Setup lru lists. */ + for(i = 0; i < NR_LIST; i++) + lru_list[i] = NULL; + +} + + +/* ====================== bdflush support =================== */ + +/* This is a simple kernel daemon, whose job it is to provide a dynamic + * response to dirty buffers. Once this process is activated, we write back + * a limited number of buffers to the disks and then go back to sleep again. + */ + +/* This is the _only_ function that deals with flushing async writes + to disk. + NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list + as all dirty buffers lives _only_ in the DIRTY lru list. + As we never browse the LOCKED and CLEAN lru lists they are infact + completly useless. 
*/ +static int flush_dirty_buffers(int check_flushtime) +{ + struct buffer_head * bh, *next; + int flushed = 0, i; + + restart: + spin_lock(&lru_list_lock); + bh = lru_list[BUF_DIRTY]; + if (!bh) + goto out_unlock; + for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) { + next = bh->b_next_free; + + if (!buffer_dirty(bh)) { + __refile_buffer(bh); + continue; + } + if (buffer_locked(bh)) + continue; + + if (check_flushtime) { + /* The dirty lru list is chronologically ordered so + if the current bh is not yet timed out, + then also all the following bhs + will be too young. */ + if (time_before(jiffies, bh->b_flushtime)) + goto out_unlock; + } else { + if (++flushed > bdf_prm.b_un.ndirty) + goto out_unlock; + } + + /* OK, now we are committed to write it out. */ + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + ll_rw_block(WRITE, 1, &bh); + atomic_dec(&bh->b_count); + + if (current->need_resched) { + /* kick what we've already pushed down */ + run_task_queue(&tq_disk); + schedule(); + } + goto restart; + } + out_unlock: + spin_unlock(&lru_list_lock); + + return flushed; +} + +struct task_struct *bdflush_tsk = 0; + +void wakeup_bdflush(int block) +{ + if (current != bdflush_tsk) { + wake_up_process(bdflush_tsk); + + if (block) + flush_dirty_buffers(0); + } +} + +/* + * Here we attempt to write back old buffers. We also try to flush inodes + * and supers as well, since this function is essentially "update", and + * otherwise there would be no way of ensuring that these quantities ever + * get written back. Ideally, we would have a timestamp on the inodes + * and superblocks so that we could write back only the old ones as well + */ + +static int sync_old_buffers(void) +{ + lock_kernel(); + sync_supers(0); + sync_inodes(0); + unlock_kernel(); + + flush_dirty_buffers(1); + /* must really sync all the active I/O request to disk here */ + run_task_queue(&tq_disk); + return 0; +} + +int block_sync_page(struct page *page) +{ + run_task_queue(&tq_disk); + return 0; +} + +/* This is the interface to bdflush. As we get more sophisticated, we can + * pass tuning parameters to this "process", to adjust how it behaves. + * We would want to verify each parameter, however, to make sure that it + * is reasonable. */ + +asmlinkage long sys_bdflush(int func, long data) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (func == 1) { + /* do_exit directly and let kupdate to do its work alone. */ + do_exit(0); +#if 0 /* left here as it's the only example of lazy-mm-stuff used from + a syscall that doesn't care about the current mm context. */ + int error; + struct mm_struct *user_mm; + + /* + * bdflush will spend all of it's time in kernel-space, + * without touching user-space, so we can switch it into + * 'lazy TLB mode' to reduce the cost of context-switches + * to and from bdflush. + */ + user_mm = start_lazy_tlb(); + error = sync_old_buffers(); + end_lazy_tlb(user_mm); + return error; +#endif + } + + /* Basically func 1 means read param 1, 2 means write param 1, etc */ + if (func >= 2) { + int i = (func-2) >> 1; + if (i >= 0 && i < N_PARAM) { + if ((func & 1) == 0) + return put_user(bdf_prm.data[i], (int*)data); + + if (data >= bdflush_min[i] && data <= bdflush_max[i]) { + bdf_prm.data[i] = data; + return 0; + } + } + return -EINVAL; + } + + /* Having func 0 used to launch the actual bdflush and then never + * return (unless explicitly killed). We return zero here to + * remain semi-compatible with present update(8) programs. 
*/ + return 0; +} + +/* + * This is the actual bdflush daemon itself. It used to be started from + * the syscall above, but now we launch it ourselves internally with + * kernel_thread(...) directly after the first thread in init/main.c + */ +int bdflush(void *sem) +{ + struct task_struct *tsk = current; + int flushed; + /* + * We have a bare-bones task_struct, and really should fill + * in a few more things so "top" and /proc/2/{exe,root,cwd} + * display semi-sane things. Not real crucial though... + */ + + tsk->session = 1; + tsk->pgrp = 1; + strcpy(tsk->comm, "bdflush"); + bdflush_tsk = tsk; + + /* avoid getting signals */ + spin_lock_irq(&tsk->sigmask_lock); + flush_signals(tsk); + sigfillset(&tsk->blocked); + recalc_sigpending(tsk); + spin_unlock_irq(&tsk->sigmask_lock); + + up((struct semaphore *)sem); + + for (;;) { + CHECK_EMERGENCY_SYNC + + flushed = flush_dirty_buffers(0); + if (free_shortage()) + flushed += page_launder(GFP_KERNEL, 0); + + /* + * If there are still a lot of dirty buffers around, + * skip the sleep and flush some more. Otherwise, we + * go to sleep waiting a wakeup. + */ + set_current_state(TASK_INTERRUPTIBLE); + if (!flushed || balance_dirty_state(NODEV) < 0) { + run_task_queue(&tq_disk); + schedule(); + } + /* Remember to mark us as running otherwise + the next schedule will block. */ + __set_current_state(TASK_RUNNING); + } +} + +/* + * This is the kernel update daemon. It was used to live in userspace + * but since it's need to run safely we want it unkillable by mistake. + * You don't need to change your userspace configuration since + * the userspace `update` will do_exit(0) at the first sys_bdflush(). + */ +int kupdate(void *sem) +{ + struct task_struct * tsk = current; + int interval; + + tsk->session = 1; + tsk->pgrp = 1; + strcpy(tsk->comm, "kupdated"); + + /* sigstop and sigcont will stop and wakeup kupdate */ + spin_lock_irq(&tsk->sigmask_lock); + sigfillset(&tsk->blocked); + siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP)); + recalc_sigpending(tsk); + spin_unlock_irq(&tsk->sigmask_lock); + + up((struct semaphore *)sem); + + for (;;) { + /* update interval */ + interval = bdf_prm.b_un.interval; + if (interval) { + tsk->state = TASK_INTERRUPTIBLE; + schedule_timeout(interval); + } else { + stop_kupdate: + tsk->state = TASK_STOPPED; + schedule(); /* wait for SIGCONT */ + } + /* check for sigstop */ + if (signal_pending(tsk)) { + int stopped = 0; + spin_lock_irq(&tsk->sigmask_lock); + if (sigismember(&tsk->pending.signal, SIGSTOP)) { + sigdelset(&tsk->pending.signal, SIGSTOP); + stopped = 1; + } + recalc_sigpending(tsk); + spin_unlock_irq(&tsk->sigmask_lock); + if (stopped) + goto stop_kupdate; + } +#ifdef DEBUG + printk(KERN_DEBUG "kupdate() activated...\n"); +#endif + sync_old_buffers(); + } +} + +static int __init bdflush_init(void) +{ + DECLARE_MUTEX_LOCKED(sem); + kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + down(&sem); + kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + down(&sem); + return 0; +} + +module_init(bdflush_init) + +/* async kio interface */ +struct brw_cb { + struct kiobuf *kiobuf; + int nr; + struct buffer_head *bh[1]; +}; + +static inline void brw_kio_put_iobuf(struct brw_cb *brw_cb, struct kiobuf *kiobuf) +{ + if (atomic_dec_and_test(&kiobuf->io_count)) { + int nr; + + /* Walk the buffer heads associated with this kiobuf + * checking for errors and freeing them as we go. 
+ */ + for (nr=0; nr < brw_cb->nr; nr++) { + struct buffer_head *bh = brw_cb->bh[nr]; + if (buffer_uptodate(bh) && !kiobuf->errno) + kiobuf->transferred += bh->b_size; + else if (!kiobuf->errno) + kiobuf->errno = -EIO; + kmem_cache_free(bh_cachep, bh); + } + + if (kiobuf->end_io) + kiobuf->end_io(kiobuf); + wake_up(&kiobuf->wait_queue); + + kfree(brw_cb); + } +} + +/* + * IO completion routine for a buffer_head being used for kiobuf IO: we + * can't dispatch the kiobuf callback until io_count reaches 0. + */ + +static void end_buffer_io_kiobuf_async(struct buffer_head *bh, int uptodate) +{ + struct brw_cb *brw_cb; + struct kiobuf *kiobuf; + + mark_buffer_uptodate(bh, uptodate); + + brw_cb = bh->b_private; + unlock_buffer(bh); + + kiobuf = brw_cb->kiobuf; + if (!uptodate && !kiobuf->errno) + brw_cb->kiobuf->errno = -EIO; + brw_kio_put_iobuf(brw_cb, kiobuf); +} + + +/* + * Start I/O on a physical range of kernel memory, defined by a vector + * of kiobuf structs (much like a user-space iovec list). + * + * The kiobuf must already be locked for IO. IO is submitted + * asynchronously: you need to check page->locked, page->uptodate, and + * maybe wait on page->wait. + * + * It is up to the caller to make sure that there are enough blocks + * passed in to completely map the iobufs to disk. + */ + +int brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int err; + int length; + int bufind; + int pageind; + int bhind; + int offset; + unsigned long blocknr; + struct kiobuf * iobuf = NULL; + struct page * map; + struct buffer_head *tmp; + int bh_nr; + int i; + +#define MAX_KIOVEC_NR 8 + struct brw_cb *brw_cb_table[MAX_KIOVEC_NR]; + struct brw_cb *brw_cb; + + if (!nr) + return 0; + + if (nr > MAX_KIOVEC_NR) { + printk("kiovec too large: %d\n", nr); + BUG(); + } + + /* + * First, do some alignment and validity checks + */ + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + if ((iobuf->offset & (sector_size-1)) || + (iobuf->length & (sector_size-1))) { + printk("brw_kiovec_async: iobuf->offset=0x%x length=0x%x sector_size: 0x%x\n", iobuf->offset, iobuf->length, sector_size); + return -EINVAL; + } + + if (!iobuf->nr_pages) + panic("brw_kiovec: iobuf not initialised"); + } + + /* + * OK to walk down the iovec doing page IO on each page we find. 
+ */ + bufind = bhind = err = 0; + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + offset = iobuf->offset; + length = iobuf->length; + iobuf->errno = 0; + iobuf->transferred = 0; + atomic_inc(&iobuf->io_count); + + bh_nr = ((iobuf->nr_pages * PAGE_SIZE) - offset) / sector_size; + if (!bh_nr) { + printk("brw_kiovec_async: !bh_nr\n"); + return -EINVAL; + } + + /* FIXME: tie into userbeans here */ + brw_cb = kmalloc(sizeof(*brw_cb) + (bh_nr * sizeof(struct buffer_head *)), GFP_KERNEL); + if (!brw_cb) + return -ENOMEM; + + brw_cb_table[i] = brw_cb; + brw_cb->kiobuf = iobuf; + brw_cb->nr = 0; + + for (pageind = 0; pageind < iobuf->nr_pages; pageind++) { + map = iobuf->maplist[pageind]; + err = -EFAULT; + if (!map) + goto error; + + while (length > 0 && (bufind < nr_blocks)) { + blocknr = b[bufind++]; + tmp = kmem_cache_alloc(bh_cachep, SLAB_BUFFER); + err = -ENOMEM; + if (!tmp) + goto error; + + memset(tmp, 0, sizeof(*tmp)); + init_waitqueue_head(&tmp->b_wait); + tmp->b_dev = B_FREE; + tmp->b_size = sector_size; + set_bh_page(tmp, map, offset); + tmp->b_this_page = tmp; + + init_buffer(tmp, end_buffer_io_kiobuf_async, NULL); + tmp->b_dev = dev; + tmp->b_blocknr = blocknr; + tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req); + tmp->b_private = brw_cb; + + if (rw == WRITE) { + set_bit(BH_Uptodate, &tmp->b_state); + clear_bit(BH_Dirty, &tmp->b_state); + } + + brw_cb->bh[brw_cb->nr++] = tmp; + length -= sector_size; + offset += sector_size; + + atomic_inc(&iobuf->io_count); + + if (offset >= PAGE_SIZE) { + offset = 0; + break; + } + } /* End of block loop */ + } /* End of page loop */ + } /* End of iovec loop */ + + /* okay, we've setup all our io requests, now fire them off! */ + for (i = 0; i < nr; i++) { + int j; + brw_cb = brw_cb_table[i]; +#if 1 + for (j=0; j<brw_cb->nr; j++) + submit_bh(rw, brw_cb->bh[j]); + //ll_rw_block(rw, brw_cb->nr, brw_cb->bh); +#else + generic_make_requests(dev, rw, brw_cb->bh, brw_cb->nr); +#endif + brw_kio_put_iobuf(brw_cb, brw_cb->kiobuf); + } + + return 0; + + error: + /* Walk brw_cb_table freeing all the goop associated with each kiobuf */ + do { + brw_cb = brw_cb_table[i]; + if (brw_cb) { + /* We got an error allocating the bh'es. Just free the current + buffer_heads and exit. */ + for (bhind = brw_cb->nr; bhind--; ) + kmem_cache_free(bh_cachep, brw_cb->bh[bhind]); + atomic_dec(&brw_cb->kiobuf->io_count); + kfree(brw_cb); + } + } while (i--) ; + + return err; +} + +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int i; + int transferred = 0; + int err = 0; + + if (!nr) + return 0; + + /* queue up and trigger the io */ + err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size); + if (err) + goto out; + + /* wait on the last iovec first -- it's more likely to finish last */ + for (i=nr; --i >= 0; ) + kiobuf_wait_for_io(iovec[i]); + + run_task_queue(&tq_disk); + + /* okay, how much data actually got through? */ + for (i=0; i<nr; i++) { + if (iovec[i]->errno) { + if (!err) + err = iovec[i]->errno; + break; + } + transferred += iovec[i]->length; + } + +out: + return transferred ? 
transferred : err; +} diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/asm-i386/unistd.h ac10-aio/include/asm-i386/unistd.h --- /md0/kernels/2.4/v2.4.4-ac10/include/asm-i386/unistd.h Fri Aug 11 17:39:23 2000 +++ ac10-aio/include/asm-i386/unistd.h Thu May 24 17:53:04 2001 @@ -227,6 +227,11 @@ #define __NR_madvise1 219 /* delete when C lib stub is removed */ #define __NR_getdents64 220 #define __NR_fcntl64 221 +/* reserved for tux 222 */ +#define __NR___io_cancel 224 +#define __NR___io_wait 225 +#define __NR___io_getevents 226 +#define __NR_submit_ios 227 /* user-visible error numbers are in the range -1 - -124: see */ diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/aio.h ac10-aio/include/linux/aio.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/aio.h Wed Dec 31 19:00:00 1969 +++ ac10-aio/include/linux/aio.h Thu May 24 17:53:04 2001 @@ -0,0 +1,130 @@ +/* linux/aio.h + * Written by Benjamin LaHaise + */ +#ifndef __AIO_H__ +#define __AIO_H__ + +#define IOCB_CMD_FINISHING -3 /* kernel internal */ + +#define IOCB_CMD_READ 0 +#define IOCB_CMD_WRITE 1 +#define IOCB_CMD_NOP 2 +#define IOCB_CMD_CANCEL 3 +#define IOCB_CMD_FSYNC 4 +#define IOCB_CMD_FDSYNC 5 +#define IOCB_CMD_RUNNING 6 +#define IOCB_CMD_DONE 7 + +#define AIO_RING_SIZE 8000 + +/* Notification method. Not implemented yet. */ +#define AIO_IOCTL_SET_NOTIFY_SIGNAL 0x10c11005 + +struct io_group { + int nr; + void *data; + struct iocb **list; +}; + +struct io_group_list { + int nr; + struct io_group *list; +}; + +/* read() from /dev/aio returns these structures. */ +enum io_event_types { + IO_EVENT_NONE, + IO_EVENT_IOCB_DONE, +}; + +struct io_event { + long type; + long flags; + long key; + void *data; +}; + +struct aio_ring { + unsigned long head; + unsigned long tail; + unsigned long woke; + unsigned long __reserved; + struct io_event io_events[AIO_RING_SIZE]; +}; + +/* + * we always use a 64bit off_t when communicating + * with userland. its up to libraries to do the + * proper padding and aio_error abstraction + * + * FIXME: this must change from glibc's definition + * as we do *not* use the sigevent structure which + * is big and bloated. + */ + +struct iocb { + int aio_fildes; + short aio_lio_opcode; + short aio_reqprio; + void *aio_buf; + size_t aio_nbytes; + loff_t aio_offset; + + /* these are internal to the kernel/libc. */ + ssize_t __aio_return; /* the kernel writes the return code here */ + long __aio_key; /* the kernel sets this to -1 when completed, + * otherwise is the >= 0 iogrp #. 
*/ +}; /* 32 bytes on 32 bit machines, 48 on 64 */ + +#ifdef __KERNEL__ +#define AIO_MAXSEGS 4 +#define AIO_KIOGRP_NR_ATOMIC 8 + +struct kiocb { + int nr_kiovec; + struct kiobuf *kiovec[AIO_MAXSEGS]; + struct iocb *user_aiocb; + struct file *filp; + long aio_return; +}; + +#define IOGRP_STATE_SETUP 0 +#define IOGRP_STATE_DONE 1 + +struct kiogrp { + int locked:1; + atomic_t count; /* ios left */ + void *user_data; + struct kioctx *ctx; + int idx; + int nr_iocbs; + struct kiocb **iocbs; + struct kiocb *atomic_iocbs[AIO_KIOGRP_NR_ATOMIC]; +}; + +struct kioctx { + atomic_t users; + + wait_queue_head_t wait; + + int max_reqs; + struct kiogrp **reqs; + + spinlock_t done_lock; + + int pid; /* pid to send wakeups to */ + struct aio_ring *ring; + struct file *filp; +}; + +extern struct file_operations aio_fops; + +extern void __aioctx_put(struct kioctx *ctx); + +#define aioctx_get(kioctx) atomic_inc(&(kioctx)->users) +#define aioctx_put(kioctx) do { if (atomic_dec_and_test(&(kioctx)->users)) __aioctx_put(kioctx); } while (0) + +#endif /*__KERNEL__*/ + +#endif /* __AIO_H__ */ + diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/blkdev.h ac10-aio/include/linux/blkdev.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/blkdev.h Thu May 17 15:25:12 2001 +++ ac10-aio/include/linux/blkdev.h Thu May 24 18:01:23 2001 @@ -149,7 +149,7 @@ extern struct blk_dev_struct blk_dev[MAX_BLKDEV]; extern void grok_partitions(struct gendisk *dev, int drive, unsigned minors, long size); extern void register_disk(struct gendisk *dev, kdev_t first, unsigned minors, struct block_device_operations *ops, long size); -extern void generic_make_request(int rw, struct buffer_head * bh); +extern void generic_make_request(int rw, struct buffer_head *bh); extern request_queue_t *blk_get_queue(kdev_t dev); extern inline request_queue_t *__blk_get_queue(kdev_t dev); extern void blkdev_release_request(struct request *); diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/event.h ac10-aio/include/linux/event.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/event.h Wed Dec 31 19:00:00 1969 +++ ac10-aio/include/linux/event.h Thu May 24 17:53:04 2001 @@ -0,0 +1,21 @@ +#ifndef _LINUX_KEVENTQ_H +#define _LINUX_KEVENTQ_H + +typedef struct file *keventq_t; + +keventq_t keventq_get(int qid); +#define keventq_put(evq) fput(evq) + +keventq_t keventq_get(int qid) +{ + struct file *filp = fget(qid); + if (filp) { + if (&keventq_fops == filp->f_op) + return filp; + fput(filp); + } + return NULL; +} + + +#endif diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/fs.h ac10-aio/include/linux/fs.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/fs.h Thu May 17 15:25:12 2001 +++ ac10-aio/include/linux/fs.h Thu May 24 18:01:23 2001 @@ -20,7 +20,6 @@ #include #include #include -#include #include @@ -762,7 +761,13 @@ * NOTE: * read, write, poll, fsync, readv, writev can be called * without the big kernel lock held in all filesystems. + * + * rw_kiovec returns the number of bytes that will actually + * be transferred into the kiovec, or an error that occurred + * during queueing. 
*/ +struct kiobuf; + struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); @@ -782,6 +787,7 @@ ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + int (*rw_kiovec)(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos); }; struct inode_operations { @@ -1323,6 +1329,7 @@ extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *); extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *); extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t); +extern int generic_file_rw_kiovec(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos); extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *); extern int generic_file_open(struct inode *, struct file *); diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/iobuf.h ac10-aio/include/linux/iobuf.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/iobuf.h Fri May 18 20:10:57 2001 +++ ac10-aio/include/linux/iobuf.h Thu May 24 18:01:23 2001 @@ -53,8 +53,10 @@ /* Dynamic state for IO completion: */ atomic_t io_count; /* IOs still in progress */ + int transferred; /* Number of bytes of completed IO at the beginning of the buffer */ int errno; /* Status of completed IO */ void (*end_io) (struct kiobuf *); /* Completion callback */ + void *end_io_data; wait_queue_head_t wait_queue; }; @@ -80,7 +82,9 @@ /* fs/buffer.c */ +int brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int size); int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], - kdev_t dev, unsigned long b[], int size); + kdev_t dev, int nr_blocks, unsigned long b[], int size); #endif /* __LINUX_IOBUF_H */ diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/locks.h ac10-aio/include/linux/locks.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/locks.h Thu May 17 15:25:12 2001 +++ ac10-aio/include/linux/locks.h Thu May 24 18:01:23 2001 @@ -30,8 +30,7 @@ { clear_bit(BH_Lock, &bh->b_state); smp_mb__after_clear_bit(); - if (waitqueue_active(&bh->b_wait)) - wake_up(&bh->b_wait); + wake_up(&bh->b_wait); } /* diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/mm.h ac10-aio/include/linux/mm.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/mm.h Thu May 17 15:25:12 2001 +++ ac10-aio/include/linux/mm.h Thu May 24 18:01:23 2001 @@ -315,8 +315,7 @@ smp_mb__before_clear_bit(); \ if (!test_and_clear_bit(PG_locked, &(page)->flags)) BUG(); \ smp_mb__after_clear_bit(); \ - if (waitqueue_active(&(page)->wait)) \ - wake_up(&(page)->wait); \ + wake_up(&(page)->wait); \ } while (0) #define PageError(page) test_bit(PG_error, &(page)->flags) #define SetPageError(page) set_bit(PG_error, &(page)->flags) diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/sched.h ac10-aio/include/linux/sched.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/sched.h Thu May 17 15:25:12 2001 +++ ac10-aio/include/linux/sched.h Thu May 24 18:01:23 2001 @@ -758,6 +758,7 @@ extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); +extern void FASTCALL(add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait)); extern void 
FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); #define __wait_event(wq, condition) \ diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/tqueue.h ac10-aio/include/linux/tqueue.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/tqueue.h Fri May 18 20:10:50 2001 +++ ac10-aio/include/linux/tqueue.h Thu May 24 18:01:23 2001 @@ -67,6 +67,7 @@ #define TQ_ACTIVE(q) (!list_empty(&q)) extern task_queue tq_timer, tq_immediate, tq_disk; +extern struct tq_struct run_disk_tq; /* * To implement your own list of active bottom halfs, use the following diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/wait.h ac10-aio/include/linux/wait.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/wait.h Thu May 17 15:25:12 2001 +++ ac10-aio/include/linux/wait.h Thu May 24 18:01:23 2001 @@ -28,17 +28,20 @@ #define WAITQUEUE_DEBUG 0 #endif +typedef struct __wait_queue wait_queue_t; +typedef void (*wait_queue_func_t)(wait_queue_t *wait); + struct __wait_queue { unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 struct task_struct * task; struct list_head task_list; + wait_queue_func_t func; #if WAITQUEUE_DEBUG long __magic; long __waker; #endif }; -typedef struct __wait_queue wait_queue_t; /* * 'dual' spinlock architecture. Can be switched between spinlock_t and @@ -137,6 +140,7 @@ #endif #define __WAITQUEUE_INITIALIZER(name, tsk) { \ + func: NULL, \ task: tsk, \ task_list: { NULL, NULL }, \ __WAITQUEUE_DEBUG_INIT(name)} @@ -174,6 +178,22 @@ #endif q->flags = 0; q->task = p; + q->func = NULL; +#if WAITQUEUE_DEBUG + q->__magic = (long)&q->__magic; +#endif +} + +static inline void init_waitqueue_func_entry(wait_queue_t *q, + wait_queue_func_t func) +{ +#if WAITQUEUE_DEBUG + if (!q || !func) + WQ_BUG(); +#endif + q->flags = 0; + q->task = NULL; + q->func = func; #if WAITQUEUE_DEBUG q->__magic = (long)&q->__magic; #endif @@ -230,6 +250,19 @@ #endif list_del(&old->task_list); } + +#define add_wait_queue_cond(q, wait, cond, fail) \ + do { \ + unsigned long flags; \ + wq_write_lock_irqsave(&(q)->lock, flags); \ + (wait)->flags = 0; \ + if (cond) \ + __add_wait_queue((q), (wait)); \ + else { \ + fail; \ + } \ + wq_write_unlock_irqrestore(&(q)->lock, flags); \ + } while (0) #endif /* __KERNEL__ */ diff -urN /md0/kernels/2.4/v2.4.4-ac10/include/linux/worktodo.h ac10-aio/include/linux/worktodo.h --- /md0/kernels/2.4/v2.4.4-ac10/include/linux/worktodo.h Wed Dec 31 19:00:00 1969 +++ ac10-aio/include/linux/worktodo.h Thu May 24 18:01:25 2001 @@ -0,0 +1,40 @@ +#ifndef _LINUX_WORKTODO_H +#define _LINUX_WORKTODO_H + +#ifndef _LINUX_WAIT_H +#include <linux/wait.h> +#endif +#ifndef _LINUX_TQUEUE_H +#include <linux/tqueue.h> +#endif + +struct worktodo { + wait_queue_t wait; + struct tq_struct tq; + + void *data; /* for use by the wtd_ primitives */ +}; + +/* FIXME NOTE: factor from kernel/context.c */ +#define wtd_queue(wtd) schedule_task(&(wtd)->tq) + +#define wtd_set_action(wtd, action, wtddata) \ + do { \ + (wtd)->tq.routine = (action); \ + (wtd)->tq.data = (wtddata); \ + } while (0) + +struct page; +extern void wtd_wait_page(struct worktodo *wtd, struct page *page); +extern void wtd_lock_page(struct worktodo *wtd, struct page *page); +struct buffer_head; +extern void wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh); + +#if 0 /* not implemented yet */ +extern void wtd_down(struct worktodo *wtd, struct semaphore *sem); +extern void wtd_down_write(struct worktodo *wtd, struct rw_semaphore *sem); +extern void wtd_down_read(struct worktodo *wtd, struct rw_semaphore *sem); +#endif + +#endif /* _LINUX_WORKTODO_H */ + diff -urN 
/md0/kernels/2.4/v2.4.4-ac10/init/main.c ac10-aio/init/main.c --- /md0/kernels/2.4/v2.4.4-ac10/init/main.c Thu May 17 15:25:12 2001 +++ ac10-aio/init/main.c Thu May 24 17:53:02 2001 @@ -803,8 +803,13 @@ if (initrd_start && mount_initrd) root_mountflags &= ~MS_RDONLY; else mount_initrd =0; #endif - - start_context_thread(); + { + int i = smp_num_cpus; + if (i < 2) + i = 2; + for (; i>0; i--) + start_context_thread(); + } do_initcalls(); #ifdef CONFIG_IRDA diff -urN /md0/kernels/2.4/v2.4.4-ac10/kernel/context.c ac10-aio/kernel/context.c --- /md0/kernels/2.4/v2.4.4-ac10/kernel/context.c Thu May 17 15:25:12 2001 +++ ac10-aio/kernel/context.c Thu May 24 17:53:02 2001 @@ -91,12 +91,18 @@ */ for (;;) { set_task_state(curtask, TASK_INTERRUPTIBLE); - add_wait_queue(&context_task_wq, &wait); - if (TQ_ACTIVE(tq_context)) + add_wait_queue_exclusive_lifo(&context_task_wq, &wait); + if (spin_is_locked(&tqueue_lock) || TQ_ACTIVE(tq_context)) set_task_state(curtask, TASK_RUNNING); - schedule(); + else + schedule(); remove_wait_queue(&context_task_wq, &wait); run_task_queue(&tq_context); + while (TQ_ACTIVE(tq_context)) { + if (current->need_resched) + schedule(); + run_task_queue(&tq_context); + } wake_up(&context_task_done); if (signal_pending(curtask)) { while (waitpid(-1, (unsigned int *)0, __WALL|WNOHANG) > 0) diff -urN /md0/kernels/2.4/v2.4.4-ac10/kernel/fork.c ac10-aio/kernel/fork.c --- /md0/kernels/2.4/v2.4.4-ac10/kernel/fork.c Thu May 17 15:25:12 2001 +++ ac10-aio/kernel/fork.c Thu May 24 17:53:02 2001 @@ -44,6 +44,16 @@ wq_write_unlock_irqrestore(&q->lock, flags); } +void add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait) +{ + unsigned long flags; + + wq_write_lock_irqsave(&q->lock, flags); + wait->flags = WQ_FLAG_EXCLUSIVE; + __add_wait_queue(q, wait); + wq_write_unlock_irqrestore(&q->lock, flags); +} + void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait) { unsigned long flags; diff -urN /md0/kernels/2.4/v2.4.4-ac10/kernel/sched.c ac10-aio/kernel/sched.c --- /md0/kernels/2.4/v2.4.4-ac10/kernel/sched.c Thu May 17 15:25:12 2001 +++ ac10-aio/kernel/sched.c Thu May 24 17:53:02 2001 @@ -716,13 +716,13 @@ } /* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything - * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the - * non-exclusive tasks and one exclusive task. + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small + * +ve number) then we wake all the non-exclusive tasks and one exclusive task. * * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero - * in this (rare) case, and we handle it by contonuing to scan the queue. + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by contonuing to scan the queue. 
*/ static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode, int nr_exclusive, const int sync) @@ -735,14 +735,25 @@ list_for_each(tmp,&q->task_list) { unsigned int state; - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + wait_queue_func_t func; CHECK_MAGIC(curr->__magic); + func = curr->func; + if (func) { + unsigned flags = curr->flags; + func(curr); + if ((flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + continue; + } p = curr->task; state = p->state; if (state & mode) { WQ_NOTE_WAKER(curr); - if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + if (try_to_wake_up(p, sync) && + (curr->flags & WQ_FLAG_EXCLUSIVE) && + !--nr_exclusive) break; } } diff -urN /md0/kernels/2.4/v2.4.4-ac10/kernel/softirq.c ac10-aio/kernel/softirq.c --- /md0/kernels/2.4/v2.4.4-ac10/kernel/softirq.c Fri Dec 29 17:07:24 2000 +++ ac10-aio/kernel/softirq.c Thu May 24 17:53:02 2001 @@ -311,6 +311,7 @@ data = p->data; wmb(); p->sync = 0; + smp_mb(); if (f) f(data); } diff -urN /md0/kernels/2.4/v2.4.4-ac10/mm/filemap.c ac10-aio/mm/filemap.c --- /md0/kernels/2.4/v2.4.4-ac10/mm/filemap.c Thu May 17 15:25:12 2001 +++ ac10-aio/mm/filemap.c Thu May 24 17:53:02 2001 @@ -23,12 +23,14 @@ #include #include #include +#include #include #include #include #include +#include /* * Shared mappings implemented 30.11.1994. It's not fully working yet, @@ -2723,3 +2725,729 @@ panic("Failed to allocate page hash table\n"); memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *)); } + +/* address_space_map + * Maps a series of pages from the page cache into the given array. + */ +static int address_space_map(struct address_space *as, unsigned long index, + int nr, struct page **pages, + int *nr_newp, struct page **new_pages) +{ + struct page *cached_page = NULL; + int nr_new = 0; + int ret; + + ret = -EINVAL; + if (nr <= 0) + goto out; + + ret = 0; + + spin_lock(&pagecache_lock); + + while (nr > 0) { + struct page **hash = page_hash(as, index); + struct page *page; + + page = __find_page_nolock(as, index, *hash); + if (page) { + page_cache_get(page); +got_page: + pages[ret++] = page; + index++; + nr--; + continue; + } + + if (cached_page) { + __add_to_page_cache(cached_page, as, index, hash); + nr_new++; + *new_pages++ = page = cached_page; + cached_page = NULL; + goto got_page; + } + spin_unlock(&pagecache_lock); + + cached_page = page_cache_alloc(as); + if (!cached_page) + goto out; + + /* Okay, we now have an allocated page. Retry + * the search and add. */ + spin_lock(&pagecache_lock); + } + + spin_unlock(&pagecache_lock); + +out: + if (cached_page) + page_cache_free(cached_page); + + *nr_newp = nr_new; + return ret ? 
ret : -ENOMEM; +} + +struct iodesc { + struct worktodo wtd; + + struct page *good_page; /* the highest Uptodate page */ + int good_idx; + int err; + int did_read; + int rw; + + struct page **pages; + struct page **new_pages; + struct page **cur_pagep; + struct page **src_pagep; + int nr_pages; + int nr_new_pages; + + struct address_space *as; + struct file *file; + struct kiobuf *kiovec[8]; + int kio_nr; + + size_t size; + unsigned long transferred; + unsigned offset; + unsigned src_offset; + struct kiobuf *iobuf; + + int sync; + +#define READDESC_NR_DEF 3 + struct page *def_pages[READDESC_NR_DEF]; + struct page *def_new_pages[READDESC_NR_DEF]; +}; + +static void __iodesc_free(struct iodesc *io) +{ + int i; + + for (i=0; i<io->nr_pages; i++) + page_cache_release(io->pages[i]); + + if (io->new_pages != io->def_new_pages) + kfree(io->new_pages); + if (io->pages != io->def_pages) + kfree(io->pages); + kfree(io); +} + +/* By the time this function is called, all of the pages prior to + * the current good_idx have been released appropriately. The remaining + * duties are to release any remaining pages and to honour O_SYNC. + */ +static void __iodesc_finish_write(struct iodesc *io) +{ + int i; + + pr_debug("__iodesc_finish_write(%p)\n", io); + + if (WRITE == io->rw) + for (i=0; i<io->nr_pages; i++) { + struct page *page = io->pages[i]; + UnlockPage(page); + deactivate_page(page); + //page_cache_release(page); + } + + /* FIXME: this is buggy */ + { + struct kiobuf *iobuf = io->kiovec[0]; + iobuf->transferred = io->transferred; + iobuf->errno = io->err; + iobuf->end_io(iobuf); + } + + __iodesc_free(io); +} + +/* This is mostly ripped from generic_file_write */ +static int __iodesc_write_page(struct iodesc *io, struct page *page) +{ + unsigned long bytes; + unsigned long offset, src_offset; + struct page *src_page; + long status; + char *kaddr; + int src_bytes; + char *src; + int done = 0; + unsigned left; + + src_bytes = PAGE_CACHE_SIZE - io->src_offset; + src_page = *io->src_pagep; + src = kmap(src_page) + io->src_offset; + + offset = io->offset; + src_offset = io->src_offset; + kaddr = kmap(page); + kaddr += offset; + + bytes = PAGE_CACHE_SIZE - offset; + if (io->size < bytes) + bytes = io->size; + + pr_debug("__iodesc_write_page(%p (%lu), %lu %lu %lu)\n", page, page->index, offset, bytes, src_offset); + + io->err = io->as->a_ops->prepare_write(io->file, page, + offset, offset + bytes); + if (io->err) { +printk("prepare_write: %d\n", io->err); + goto unlock; + } + + left = bytes; + for (;;) { + if (left < src_bytes) + src_bytes = left; + + memcpy(kaddr, src, src_bytes); + kaddr += src_bytes; + src += src_bytes; + left -= src_bytes; + src_offset += src_bytes; + src_offset &= PAGE_SIZE - 1; + if (!src_offset) + io->src_pagep++; + + if (left <= 0) + break; + + if (!src_offset) { + kunmap(src_page); + src_page = *io->src_pagep; + src = kmap(src_page); + src_bytes = PAGE_SIZE; + } + } + flush_dcache_page(page); + status = io->as->a_ops->commit_write(io->file, page, + offset, offset+bytes); + + /* We don't handle short writes */ + if (status > 0 && status != bytes) + done = 1; + + if (!status) + status = bytes; +else +printk("commit_write: %ld\n", status); + + if (status > 0) { + io->transferred += status; + io->size -= status; + io->offset = (offset + status) & (PAGE_CACHE_SIZE - 1); + + if (io->offset) + done = 1; + + io->src_offset += status; + io->src_offset &= PAGE_CACHE_SIZE - 1; + } else { + io->err = status; + done = 1; + } + +unlock: + kunmap(page); + kunmap(src_page); + + //UnlockPage(page); + 
//deactivate_page(page); + //page_cache_release(page); + + return done; +} + +void __iodesc_sync_wait_page(void *data) +{ + struct iodesc *io = data; + + do { + struct buffer_head *bh, *head = io->pages[io->good_idx]->buffers; + + if (!head) + continue; + + bh = head; + do { + if (buffer_locked(bh)) { +//printk("waiting on bh=%pi io=%p\n", bh, io); + wtd_wait_on_buffer(&io->wtd, bh); + return; + } + if (buffer_req(bh) && !buffer_uptodate(bh)) { +//printk("io err bh=%p (%p)\n", bh, io); + io->err = -EIO; + break; + } + } while ((bh = bh->b_this_page) != head); + } while (!io->err && ++io->good_idx < io->nr_pages) ; + +//printk("finish_write(%p)\n", io); + __iodesc_finish_write(io); +} + +static void __iodesc_do_write(void *data) +{ + struct iodesc *io = data; + unsigned i; + + up(&io->file->f_dentry->d_inode->i_sem); + + for (i=0; i<io->nr_pages; i++) + if (__iodesc_write_page(io, io->pages[i])) + break; + + if (io->sync) { + io->good_idx = 0; + +//printk("writing out pages(%p)\n", io); + for (i=0; i<io->nr_pages; i++) { + if (io->pages[i]->buffers) + writeout_one_page(io->pages[i]); + } + +//printk("calling __iodesc_sync_wait_page(%p)\n", io); + wtd_set_action(&io->wtd, __iodesc_sync_wait_page, io); + __iodesc_sync_wait_page(io); + return; + } + + __iodesc_finish_write(io); +} + +static void __iodesc_write_lock_next_page(void *data) +{ + struct iodesc *io = data; + pr_debug("__iodesc_write_next_page(%p)\n", io); + + while (io->good_idx < io->nr_pages) { + io->good_page = io->pages[io->good_idx++]; + if (io->good_page == *io->cur_pagep) + io->cur_pagep++; + else { + wtd_lock_page(&io->wtd, io->good_page); + return; + } + } + + //__iodesc_do_write(io); + wtd_set_action(&io->wtd, __iodesc_do_write, io); + wtd_queue(&io->wtd); +} + +static +void __generic_file_write_iodesc(struct iodesc *io) +{ + struct inode *inode = io->file->f_dentry->d_inode; + time_t now = CURRENT_TIME; + + remove_suid(inode); + if (inode->i_ctime != now || inode->i_mtime != now) { + inode->i_ctime = inode->i_mtime = now; + mark_inode_dirty_sync(inode); + } + + wtd_set_action(&io->wtd, __iodesc_write_lock_next_page, io); + io->sync = !!(io->file->f_flags & O_SYNC); + io->good_idx = 0; + io->cur_pagep = io->new_pages; + io->src_offset = io->kiovec[0]->offset; + io->src_pagep = io->kiovec[0]->maplist; + __iodesc_write_lock_next_page(io); +} + +static void __iodesc_read_finish(struct iodesc *io) +{ + char *dst_addr, *src_addr; + int src_off, i; + size_t size; + size_t valid; + + struct page **src_pagep; + + pr_debug("__iodesc_read_finish: good_idx = %d\n", io->good_idx); + if (io->good_idx <= 0) + goto no_data; + + size = io->size; + src_off = io->offset; + src_pagep = io->pages; + src_addr = kmap(*src_pagep); + + valid = (size_t)io->good_idx << PAGE_CACHE_SHIFT; + valid -= src_off; + pr_debug("size=%d valid=%d src_off=%d\n", size, valid, src_off); + + if (valid < size) + size = valid; + + for (i=0; i<io->kio_nr; i++) { + struct kiobuf *iobuf = io->kiovec[i]; + int dst_len = iobuf->length; + int dst_off = iobuf->offset; + struct page **dst_pagep = iobuf->maplist; + + dst_addr = kmap(*dst_pagep); + iobuf->transferred = 0; + + while (size > 0) { + int this = PAGE_CACHE_SIZE - src_off; + if ((PAGE_SIZE - dst_off) < this) + this = PAGE_SIZE - dst_off; + if (size < this) + this = size; + pr_debug("this=%d src_off=%d dst_off=%d dst_len=%d\n", + this, src_off, dst_off, dst_len); + memcpy(dst_addr + dst_off, src_addr + src_off, this); + + src_off += this; + dst_off += this; + dst_len -= this; + size -= this; + iobuf->transferred += this; + 
pr_debug("read_finish: this=%d transferred=%d\n", this, iobuf->transferred); + + if (dst_len <= 0) + break; + + if (size <= 0) + break; + + if (dst_off >= PAGE_SIZE) { + kunmap(*dst_pagep); + dst_pagep++; + dst_addr = kmap(*dst_pagep); + dst_off = 0; + } + + if (src_off >= PAGE_SIZE) { /* FIXME: PAGE_CACHE_SIZE */ + kunmap(*src_pagep); +pr_debug("page(%lu)->count = %d\n", (*src_pagep)->index, atomic_read(&(*src_pagep)->count)); + src_pagep++; + src_addr = kmap(*src_pagep); + src_off = 0; + } + } + kunmap(*dst_pagep); + + iobuf->errno = iobuf->transferred ? 0 : io->err; + if (iobuf->errno && i) + iobuf->errno = -EAGAIN; + iobuf->end_io(iobuf); + } + + kunmap(*src_pagep); + __iodesc_free(io); + + return; + +no_data: + io->kiovec[0]->errno = io->err; + io->kiovec[0]->transferred = 0; + io->kiovec[0]->end_io(io->kiovec[0]); + + for (i=1; i<io->kio_nr; i++) { + struct kiobuf *iobuf = io->kiovec[i]; + + iobuf->errno = -EAGAIN; + iobuf->transferred = 0; + iobuf->end_io(iobuf); + } + __iodesc_free(io); +} + +static void __iodesc_make_uptodate(void *data) +{ + struct iodesc *io = data; + struct page *page = io->good_page; + int locked = 1; + + pr_debug("__iodesc_make_uptodate: io=%p index=%lu\n", io, page->index); + while (Page_Uptodate(page)) { +again: + pr_debug("page index %lu uptodate\n", page->index); + if (locked) { + UnlockPage(page); + locked = 0; + } + io->did_read = 0; + io->good_idx++; + if (io->good_idx >= io->nr_pages) { + __iodesc_read_finish(io); + return; + } + page = io->good_page = io->pages[io->good_idx]; + pr_debug("__iodesc_make_uptodate: index=%lu\n", page->index); + } + + if (!locked) { + wtd_lock_page(&io->wtd, page); + return; + } + + if (!io->did_read) { + /* We haven't tried reading this page before, give it a go. */ + printk("attempting to read %lu\n", page->index); + io->did_read = 1; + io->err = page->mapping->a_ops->readpage(io->file, page); + if (!io->err) { + if (Page_Uptodate(page)) + goto again; + wtd_lock_page(&io->wtd, page); + return; + } + } + + if (locked) + UnlockPage(page); + + /* We've already read this page before. Set err to EIO and quit */ + if (!io->err) + io->err = -EIO; + __iodesc_read_finish(io); +} + +static void __wtdgeneric_file_read_iodesc(void *data); + +static void __generic_file_read_iodesc(struct iodesc *io, int mayblock) +{ + int (*readpage)(struct file *, struct page *); + int i; + + wtd_set_action(&io->wtd, __iodesc_make_uptodate, io); + readpage = io->as->a_ops->readpage; + for (i=0; i<io->nr_new_pages; i++) { + int foo; + if (!mayblock) + goto do_wtd; + foo = readpage(io->file, io->new_pages[i]); + if (foo) + printk(KERN_DEBUG "__generic_file_read_kiovec: readpage(%lu) = %d\n", io->new_pages[i]->index, foo); + } + + for (i=0; i<io->nr_pages; i++) { + struct page *page = io->pages[i]; + if (Page_Uptodate(page)) { + pr_debug("__generic_file_read_iodesc: %lu is uptodate\n", page->index); + continue; + } + + if (!mayblock) + goto do_wtd; + if (!TryLockPage(page)) { + int foo = readpage(io->file, page); + if (foo) + printk(KERN_DEBUG "__generic_file_read_iodesc: readpage(%lu): %d\n", page->index, foo); + } + + if (!Page_Uptodate(page) && io->good_idx == -1) { + pr_debug("first good_idx=%d (%lu)\n", i, page->index); + io->good_idx = i; + io->good_page = page; + } + } + + /* Whee, all the pages are uptodate! 
*/ + if (!io->good_page) { + do {static int zoo; if (!mayblock && zoo++ < 5) printk("all uptodate\n");} while(0); + pr_debug("all pages uptodate!\n"); + io->good_idx = io->nr_pages; + __iodesc_read_finish(io); + return; + } + + pr_debug("locking good_page\n"); + wtd_lock_page(&io->wtd, io->good_page); + return; + +do_wtd: + do {static int zoo; if (zoo++ < 5) printk("read sleep\n");} while(0); + wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io); + wtd_queue(&io->wtd); +} + +static void __wtdgeneric_file_read_iodesc(void *data) +{ + struct iodesc *io = data; + __generic_file_read_iodesc(io, 1); +} + +int generic_file_rw_kiovec(struct file *file, int rw, + int kio_nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos) +{ + struct inode *inode = file->f_dentry->d_inode; + struct address_space *as = inode->i_mapping; + unsigned long index; + unsigned long eindex; + unsigned long nr_pages; + struct iodesc *io = NULL; + int ret; + + ret = -EINVAL; + if (rw != READ && rw != WRITE) + goto out; + + ret = -ENOMEM; + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) + goto out; + + memset(io, 0, sizeof(*io)); + io->size = size; + + if (READ == rw) { + pr_debug("pos=%Ld i_size=%Ld\n", pos, inode->i_size); + + if (pos > inode->i_size) + size = 0; + else if ((pos + size) > inode->i_size) + size = inode->i_size - pos; + + if (io->size < size) + size = io->size; + else if (size < io->size) + io->size = size; + + pr_debug("io->size=%d size=%d\n", io->size, size); + } + + index = pos >> PAGE_CACHE_SHIFT; + eindex = (pos + size - 1) >> PAGE_CACHE_SHIFT; + nr_pages = eindex - index + 1; + + pr_debug("nr_pages: %lu\n", nr_pages); + + io->good_idx = -1; + io->good_page = NULL; + io->did_read = 0; + io->err = 0; + io->rw = rw; + io->as = as; + io->offset = (unsigned long)pos & (PAGE_CACHE_SIZE - 1); + io->file = file; + io->kio_nr = kio_nr; + if (kio_nr > 8) + BUG(); + memcpy(io->kiovec, kiovec, sizeof(struct kiobuf *) * kio_nr); + if (nr_pages < READDESC_NR_DEF) { + io->pages = io->def_pages; + io->new_pages = io->def_new_pages; + } else { + io->pages = kmalloc(sizeof(*io->pages) * (nr_pages + 1), GFP_KERNEL); + if (!io->pages) + goto out_io; + + io->new_pages = kmalloc(sizeof(*io->new_pages) * (nr_pages + 1), GFP_KERNEL); + if (!io->new_pages) + goto out_pages; + } + + /* FIXME: make the down a WTD_op */ + if (rw == WRITE) + down(&io->file->f_dentry->d_inode->i_sem); + + ret = address_space_map(as, index, nr_pages, io->pages, + &io->nr_new_pages, io->new_pages); + pr_debug("as_map: %d (%d new)\n", ret, io->nr_new_pages); + if (ret <= 0) + goto out_new_pages; + + io->nr_pages = ret; + io->pages[io->nr_pages] = NULL; + io->new_pages[io->nr_new_pages] = NULL; + + if (rw == READ) + __generic_file_read_iodesc(io, 0); + else if (rw == WRITE) + __generic_file_write_iodesc(io); + + return 0; + +out_new_pages: + if (io->new_pages != io->def_new_pages) + kfree(io->new_pages); +out_pages: + if (io->pages != io->def_pages) + kfree(io->pages); +out_io: + kfree(io); +out: + return ret; +} + +static void __wtd_lock_page_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct page *page = (struct page *)wtd->data; + + if (!TryLockPage(page)) { + __remove_wait_queue(&page->wait, &wtd->wait); + wtd_queue(wtd); + } else { + schedule_task(&run_disk_tq); + } +} + +void wtd_lock_page(struct worktodo *wtd, struct page *page) +{ + if (TryLockPage(page)) { + int raced = 0; + wtd->data = page; + init_waitqueue_func_entry(&wtd->wait, __wtd_lock_page_waiter); + 
add_wait_queue_cond(&page->wait, &wtd->wait, TryLockPage(page), raced = 1); + + if (!raced) { + run_task_queue(&tq_disk); + return; + } + } + + wtd->tq.routine(wtd->tq.data); +} + +static void __wtd_bh_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct buffer_head *bh = (struct buffer_head *)wtd->data; + + if (!buffer_locked(bh)) { + __remove_wait_queue(&bh->b_wait, &wtd->wait); + wtd_queue(wtd); + } else { + schedule_task(&run_disk_tq); + } +} + +void wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh) +{ + int raced = 0; + + if (!buffer_locked(bh)) { + wtd->tq.routine(wtd->tq.data); + return; + } + wtd->data = bh; + init_waitqueue_func_entry(&wtd->wait, __wtd_bh_waiter); + add_wait_queue_cond(&bh->b_wait, &wtd->wait, buffer_locked(bh), raced = 1); + + if (raced) + wtd->tq.routine(wtd->tq.data); + else + run_task_queue(&tq_disk); +} + +void do_run_tq_disk(void *data) +{ + run_task_queue(&tq_disk); +} + +struct tq_struct run_disk_tq = { + routine: do_run_tq_disk, + data: NULL +}; +
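
For illustration, here is a minimal, hypothetical userspace sketch of how the interface added above might be driven. Only the iocb/io_event layouts, the IOCB_CMD_*/IO_EVENT_* constants, the note that read() from /dev/aio returns io_event structures, and the syscall numbers added to asm-i386/unistd.h come from this patch; the argument order of the submit_ios() call, the open flags on /dev/aio, and the /dev/raw1 path are assumptions made purely for the example.

/* Rough sketch only -- see the caveats above.  submit_ios() is assumed to
 * take (aio fd, count, array of iocb pointers); that layout is not visible
 * in this diff. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio.h>		/* struct iocb, struct io_event from this patch */

#ifndef __NR_submit_ios
#define __NR_submit_ios 227	/* matches the i386 number added above */
#endif

int main(void)
{
	static char buf[2048];		/* a multiple of the device sector size */
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	int aio_fd = open("/dev/aio", O_RDWR);		/* completion queue handle */
	int data_fd = open("/dev/raw1", O_RDONLY);	/* any fd whose f_op provides rw_kiovec */

	if (aio_fd < 0 || data_fd < 0)
		return 1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = data_fd;
	cb.aio_lio_opcode = IOCB_CMD_READ;
	cb.aio_reqprio = 0;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);
	cb.aio_offset = 0;

	/* assumed calling convention -- see the note above */
	if (syscall(__NR_submit_ios, aio_fd, 1, cbs) < 0) {
		perror("submit_ios");
		return 1;
	}

	/* completions are read() back from /dev/aio as io_events */
	while (read(aio_fd, &ev, sizeof(ev)) == (ssize_t)sizeof(ev)) {
		if (ev.type == IO_EVENT_IOCB_DONE) {
			printf("io done, return=%ld\n", (long)cb.__aio_return);
			break;
		}
	}
	return 0;
}

Kernel-side, a request submitted this way is intended to be serviced through the new rw_kiovec file operation, e.g. generic_file_rw_kiovec() above for page-cache backed files.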