diff -urN v2.4.19/AIO-NOTES aio-2.4.19.diff/AIO-NOTES --- v2.4.19/AIO-NOTES Wed Dec 31 19:00:00 1969 +++ aio-2.4.19.diff/AIO-NOTES Mon Sep 16 21:54:13 2002 @@ -0,0 +1,3 @@ +- aio context destruction is now synchronous: it waits for all pending + ios to complete. This will now cause a task that is exiting to be + delayed if outstanding ios are executing. diff -urN v2.4.19/MAINTAINERS aio-2.4.19.diff/MAINTAINERS --- v2.4.19/MAINTAINERS Fri Aug 9 13:49:02 2002 +++ aio-2.4.19.diff/MAINTAINERS Mon Sep 16 21:54:13 2002 @@ -228,6 +228,12 @@ L: linux-net@vger.kernel.org S: Maintained +ASYNC IO +P: Benjamin LaHaise +M: bcrl@redhat.com +L: linux-aio@kvack.org +S: Maintained + AX.25 NETWORK LAYER P: Matthias Welwarsky M: dg2fef@afthd.tu-darmstadt.de diff -urN v2.4.19/arch/i386/Makefile aio-2.4.19.diff/arch/i386/Makefile --- v2.4.19/arch/i386/Makefile Thu May 3 11:22:07 2001 +++ aio-2.4.19.diff/arch/i386/Makefile Mon Sep 16 21:54:13 2002 @@ -22,6 +22,7 @@ LINKFLAGS =-T $(TOPDIR)/arch/i386/vmlinux.lds $(LDFLAGS) CFLAGS += -pipe +CFLAGS+=-freorder-blocks # prevent gcc from keeping the stack 16 byte aligned CFLAGS += $(shell if $(CC) -mpreferred-stack-boundary=2 -S -o /dev/null -xc /dev/null >/dev/null 2>&1; then echo "-mpreferred-stack-boundary=2"; fi) @@ -98,7 +99,7 @@ DRIVERS += arch/i386/math-emu/math.o endif -arch/i386/kernel: dummy +arch/i386/kernel: dummy include/linux/compile.h $(MAKE) linuxsubdirs SUBDIRS=arch/i386/kernel arch/i386/mm: dummy diff -urN v2.4.19/arch/i386/kernel/entry.S aio-2.4.19.diff/arch/i386/kernel/entry.S --- v2.4.19/arch/i386/kernel/entry.S Fri Aug 9 13:49:03 2002 +++ aio-2.4.19.diff/arch/i386/kernel/entry.S Mon Sep 16 21:54:13 2002 @@ -45,6 +45,7 @@ #include #include #include +#include EBX = 0x00 ECX = 0x04 @@ -639,6 +640,13 @@ .long SYMBOL_NAME(sys_ni_syscall) /* 240 reserved for futex */ .long SYMBOL_NAME(sys_ni_syscall) /* reserved for sched_setaffinity */ .long SYMBOL_NAME(sys_ni_syscall) /* reserved for sched_getaffinity */ + .long SYMBOL_NAME(sys_ni_syscall) /* reserved for set_thread_area */ + .long SYMBOL_NAME(sys_ni_syscall) /* reserved for get_thread_area */ + .long SYMBOL_NAME(sys_io_setup) /* 245 */ + .long SYMBOL_NAME(sys_io_destroy) + .long SYMBOL_NAME(sys_io_getevents) + .long SYMBOL_NAME(sys_io_submit) + .long SYMBOL_NAME(sys_io_cancel) .rept NR_syscalls-(.-sys_call_table)/4 .long SYMBOL_NAME(sys_ni_syscall) diff -urN v2.4.19/arch/i386/kernel/irq.c aio-2.4.19.diff/arch/i386/kernel/irq.c --- v2.4.19/arch/i386/kernel/irq.c Mon Nov 12 17:49:47 2001 +++ aio-2.4.19.diff/arch/i386/kernel/irq.c Mon Sep 16 21:54:13 2002 @@ -577,7 +577,17 @@ irq_desc_t *desc = irq_desc + irq; struct irqaction * action; unsigned int status; + long esp; + /* Debugging check for stack overflow: is there less than 2KB free? 
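A minimal sketch of the arithmetic behind this debug check, assuming the 2.4-era i386 layout in which the task_struct and the kernel stack share one 8KB (8192-byte) region with the task_struct at the bottom; the helper name below is illustrative, not part of the patch:

#include <stddef.h>

/* Illustrative only: esp & 8191 is the offset of the stack pointer
 * within the shared 8KB task_struct + stack region, so everything
 * between sizeof(struct task_struct) and that offset is stack space
 * still unused.  do_IRQ() below warns when this drops under 2048. */
static long stack_bytes_free(unsigned long esp, size_t task_struct_size)
{
        return (long)(esp & 8191) - (long)task_struct_size;
}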
*/ + __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (8191)); + if (esp < (sizeof(struct task_struct) + 2048)) { + printk("do_IRQ: stack overflow: %ld\n", + esp - sizeof(struct task_struct)); + __asm__ __volatile__("movl %%esp,%0" : "=r" (esp)); + show_stack((void *)esp); + } + kstat.irqs[cpu][irq]++; spin_lock(&desc->lock); desc->handler->ack(irq); diff -urN v2.4.19/arch/i386/kernel/semaphore.c aio-2.4.19.diff/arch/i386/kernel/semaphore.c --- v2.4.19/arch/i386/kernel/semaphore.c Fri Aug 9 13:49:03 2002 +++ aio-2.4.19.diff/arch/i386/kernel/semaphore.c Mon Sep 16 21:54:13 2002 @@ -14,6 +14,7 @@ */ #include #include +#include #include /* @@ -54,6 +55,54 @@ static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; +void __wtd_down(struct semaphore * sem, struct worktodo *wtd); + +void __wtd_down_action(void *data) +{ + struct worktodo *wtd = data; + struct semaphore *sem; + + wtd_pop(wtd); + sem = wtd->data; + + __wtd_down(sem, wtd); +} + +void __wtd_down_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct semaphore *sem = wtd->data; + + __remove_wait_queue(&sem->wait, &wtd->wait); + wtd_push(wtd, __wtd_down_action, wtd); + wtd_queue(wtd); +} + +void __wtd_down(struct semaphore * sem, struct worktodo *wtd) +{ + int gotit; + int sleepers; + + init_waitqueue_func_entry(&wtd->wait, __wtd_down_waiter); + wtd->data = sem; + + spin_lock_irq(&semaphore_lock); + sem->sleepers++; + sleepers = sem->sleepers; + gotit = add_wait_queue_exclusive_cond(&sem->wait, &wtd->wait, + atomic_add_negative(sleepers - 1, &sem->count)); + if (gotit) + sem->sleepers = 0; + else + sem->sleepers = 1; + spin_unlock_irq(&semaphore_lock); + + if (gotit) { + wake_up(&sem->wait); + wtd_queue(wtd); + } +} + void __down(struct semaphore * sem) { struct task_struct *tsk = current; @@ -254,6 +303,21 @@ "popl %ecx\n\t" "popl %edx\n\t" "popl %eax\n\t" + "ret" +); + +asm( +".text\n" +".align 4\n" +".globl __wtd_down_failed\n" +"__wtd_down_failed:\n\t" + "pushl %eax\n\t" + "pushl %edx\n\t" + "pushl %ecx\n\t" + "call __wtd_down\n\t" + "popl %ecx\n\t" + "popl %edx\n\t" + "popl %eax\n\t" "ret" ); diff -urN v2.4.19/arch/i386/mm/fault.c aio-2.4.19.diff/arch/i386/mm/fault.c --- v2.4.19/arch/i386/mm/fault.c Fri Aug 9 13:49:03 2002 +++ aio-2.4.19.diff/arch/i386/mm/fault.c Mon Sep 16 21:54:13 2002 @@ -27,6 +27,8 @@ extern void die(const char *,struct pt_regs *,long); +spinlock_t oops_lock = SPIN_LOCK_UNLOCKED; + /* * Ugly, ugly, but the goto's result in better assembly.. */ @@ -306,7 +308,7 @@ * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. 
*/ - + spin_lock(&oops_lock); bust_spinlocks(1); if (address < PAGE_SIZE) @@ -327,6 +329,7 @@ } die("Oops", regs, error_code); bust_spinlocks(0); + spin_unlock(&oops_lock); do_exit(SIGKILL); /* diff -urN v2.4.19/arch/ia64/kernel/entry.S aio-2.4.19.diff/arch/ia64/kernel/entry.S --- v2.4.19/arch/ia64/kernel/entry.S Fri Aug 9 13:49:03 2002 +++ aio-2.4.19.diff/arch/ia64/kernel/entry.S Mon Sep 16 21:54:13 2002 @@ -1154,11 +1154,11 @@ data8 ia64_ni_syscall // 1235 data8 ia64_ni_syscall data8 ia64_ni_syscall - data8 ia64_ni_syscall - data8 ia64_ni_syscall - data8 ia64_ni_syscall // 1240 - data8 ia64_ni_syscall - data8 ia64_ni_syscall + data8 sys_io_setup + data8 sys_io_destroy + data8 sys_io_getevents // 1240 + data8 sys_io_submit + data8 sys_io_cancel data8 ia64_ni_syscall data8 ia64_ni_syscall data8 ia64_ni_syscall // 1245 diff -urN v2.4.19/arch/ia64/kernel/semaphore.c aio-2.4.19.diff/arch/ia64/kernel/semaphore.c --- v2.4.19/arch/ia64/kernel/semaphore.c Thu May 3 11:22:08 2001 +++ aio-2.4.19.diff/arch/ia64/kernel/semaphore.c Mon Sep 16 21:54:13 2002 @@ -24,7 +24,7 @@ * where we want to avoid any extra jumps and calls. */ #include - +#include #include /* @@ -45,6 +45,70 @@ static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; +void __wtd_down(struct semaphore * sem, struct worktodo *wtd); + +void __wtd_down_action(void *data) +{ + struct worktodo *wtd = data; + struct semaphore *sem; + + wtd_pop(wtd); + sem = wtd->data; + + __wtd_down(sem, wtd); +} + +void __wtd_down_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct semaphore *sem = wtd->data; + + __remove_wait_queue(&sem->wait, &wtd->wait); + wtd_push(wtd, __wtd_down_action, wtd); + wtd_queue(wtd); +} + +void __wtd_down(struct semaphore * sem, struct worktodo *wtd) +{ + int gotit; + int sleepers; + + init_waitqueue_func_entry(&wtd->wait, __wtd_down_waiter); + wtd->data = sem; + + spin_lock_irq(&semaphore_lock); + sem->sleepers++; + sleepers = sem->sleepers; + gotit = add_wait_queue_exclusive_cond(&sem->wait, &wtd->wait, + atomic_add_negative(sleepers - 1, &sem->count)); + if (gotit) + sem->sleepers = 0; + else + sem->sleepers = 1; + spin_unlock_irq(&semaphore_lock); + + if (gotit) { + wake_up(&sem->wait); + wtd_queue(wtd); + } +} + +/* Returns 0 if we acquired the semaphore, 1 if it was queued. */ +int wtd_down(struct worktodo *wtd, struct semaphore *sem) +{ +#if WAITQUEUE_DEBUG + CHECK_MAGIC(sem->__magic); +#endif + if (atomic_dec_return(&sem->count) < 0) { + __wtd_down(sem, wtd); + return 1; + } + else { + return 0; + } +} + + void __down (struct semaphore *sem) { diff -urN v2.4.19/drivers/block/loop.c aio-2.4.19.diff/drivers/block/loop.c --- v2.4.19/drivers/block/loop.c Fri Aug 9 13:49:22 2002 +++ aio-2.4.19.diff/drivers/block/loop.c Mon Sep 16 21:54:13 2002 @@ -283,7 +283,7 @@ spin_lock_irq(&lo->lo_lock); file = lo->lo_backing_file; spin_unlock_irq(&lo->lo_lock); - do_generic_file_read(file, &pos, &desc, lo_read_actor); + do_generic_file_read(file, &pos, &desc, lo_read_actor, 0); return desc.error; } diff -urN v2.4.19/drivers/char/raw.c aio-2.4.19.diff/drivers/char/raw.c --- v2.4.19/drivers/char/raw.c Fri Aug 9 13:49:27 2002 +++ aio-2.4.19.diff/drivers/char/raw.c Mon Sep 16 21:54:13 2002 @@ -16,6 +16,8 @@ #include #include #include +#include +#include #define dprintk(x...) 
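The wtd_down() added above gives semaphore acquisition a continuation-passing form: it returns 0 when the semaphore was taken immediately and 1 when the request was queued. A hedged sketch of how a caller might use it; start_locked_work() and my_continuation() are placeholders, and the worktodo initialisation helpers live in linux/worktodo.h, which this diff section does not show:

/* Sketch only: assumes 'wtd' was initialised so that queueing it later
 * runs my_continuation(), which must release the semaphore itself. */
static void start_locked_work(struct semaphore *sem, struct worktodo *wtd)
{
        if (wtd_down(wtd, sem) == 0) {
                /* Fast path: the count stayed non-negative, we hold the
                 * semaphore right now, so run the continuation inline. */
                my_continuation(wtd);
        }
        /* Slow path: wtd_down() returned 1; the worktodo was queued on
         * the semaphore's wait queue and my_continuation() will be run
         * from process context once the semaphore is obtained. */
}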
@@ -35,6 +37,9 @@ int raw_release(struct inode *, struct file *); int raw_ctl_ioctl(struct inode *, struct file *, unsigned int, unsigned long); int raw_ioctl(struct inode *, struct file *, unsigned int, unsigned long); +int raw_kvec_read(struct file *filp, kvec_cb_t cb, size_t size, loff_t pos); +int raw_kvec_write(struct file *filp, kvec_cb_t cb, size_t size, loff_t pos); + static struct file_operations raw_fops = { @@ -43,6 +48,10 @@ open: raw_open, release: raw_release, ioctl: raw_ioctl, + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, + kvec_read: raw_kvec_read, + kvec_write: raw_kvec_write, }; static struct file_operations raw_ctl_fops = { @@ -271,7 +280,6 @@ } - ssize_t raw_read(struct file *filp, char * buf, size_t size, loff_t *offp) { @@ -402,3 +410,99 @@ out: return err; } + +static int raw_kvec_rw(struct file *filp, int rw, kvec_cb_t cb, size_t size, loff_t pos); +int raw_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + return raw_kvec_rw(file, READ, cb, size, pos); +} + +int raw_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + return raw_kvec_rw(file, WRITE, cb, size, pos); +} + +int raw_kvec_rw(struct file *filp, int rw, kvec_cb_t cb, size_t size, loff_t pos) +{ + int err; + unsigned minor; + kdev_t dev; + unsigned long limit, blocknr, blocks; + + unsigned sector_size, sector_bits, sector_mask; + unsigned max_sectors; + unsigned i; + + pr_debug("raw_kvec_rw: %p %d %d %p %d %d %Lu\n", filp, rw, nr, kiovec, flags, size, pos); + /* + * First, a few checks on device size limits + */ + + minor = MINOR(filp->f_dentry->d_inode->i_rdev); + dev = to_kdev_t(raw_devices[minor].binding->bd_dev); + sector_size = raw_devices[minor].sector_size; + sector_bits = raw_devices[minor].sector_bits; + sector_mask = sector_size- 1; + max_sectors = 25000; //KIO_MAX_SECTORS >> (sector_bits - 9); + + if (blk_size[MAJOR(dev)]) + limit = (((loff_t) blk_size[MAJOR(dev)][MINOR(dev)]) << BLOCK_SIZE_BITS) >> sector_bits; + else + limit = INT_MAX; + pr_debug ("raw_kvec_rw: dev %d:%d (+%d)\n", + MAJOR(dev), MINOR(dev), limit); + + /* EOF at the end */ + err = 0; + if (!size || (pos >> sector_bits) == limit) { + pr_debug("raw_kvec_rw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits); + cb.fn(cb.data, cb.vec, err); + return 0; + } + + /* ENXIO for io beyond the end */ + err = -ENXIO; + if ((pos >> sector_bits) >= limit) { + pr_debug("raw_kvec_rw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits); + goto out; + } + + err = -EINVAL; + if ((pos < 0) || (pos & sector_mask) || (size & sector_mask)) { + pr_debug("pos(%Ld)/size(%lu) wrong(%d)\n", pos, size, sector_mask); + goto out; + } + + /* Verify that the scatter-gather list is sector aligned. */ + for (i=0; inr; i++) + if ((cb.vec->veclet[i].offset & sector_mask) || + (cb.vec->veclet[i].length & sector_mask)) { + pr_debug("veclet offset/length wrong"); + goto out; + } + + /* + * Split the IO into KIO_MAX_SECTORS chunks, mapping and + * unmapping the single kiobuf as we go to perform each chunk of + * IO. 
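A small sketch of the bounds math used above, assuming blk_size[][] holds the device size in 1KB units (BLOCK_SIZE_BITS is 10 on 2.4); the helper name is illustrative:

/* Sketch: how many sectors the bound device spans.  INT_MAX is used
 * when no size table exists, as in raw_kvec_rw() above. */
static unsigned long raw_limit_sectors(unsigned long size_kb, int sector_bits)
{
        return (size_kb << 10) >> sector_bits;   /* bytes -> sectors */
}
/* A request is then rejected with -EINVAL unless pos, size and every
 * veclet offset/length are multiples of (1 << sector_bits), and with
 * -ENXIO once pos >> sector_bits is past this limit. */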
+ */ + + blocknr = pos >> sector_bits; + blocks = size >> sector_bits; + if (blocks > max_sectors) + blocks = max_sectors; + if (blocks > limit - blocknr) + blocks = limit - blocknr; + err = -ENXIO; + if (!blocks) { + pr_debug("raw: !blocks %d %ld %ld\n", max_sectors, limit, blocknr); + goto out; + } + + err = brw_kvec_async(rw, cb, dev, blocks, blocknr, sector_bits); +out: + if (err) + printk(KERN_DEBUG "raw_kvec_rw: ret is %d\n", err); + return err; +} + diff -urN v2.4.19/fs/Makefile aio-2.4.19.diff/fs/Makefile --- v2.4.19/fs/Makefile Thu Mar 7 16:40:03 2002 +++ aio-2.4.19.diff/fs/Makefile Mon Sep 16 21:54:13 2002 @@ -22,6 +22,9 @@ obj-y += noquot.o endif +obj-y += aio.o +export-objs += aio.o + subdir-$(CONFIG_PROC_FS) += proc subdir-y += partitions diff -urN v2.4.19/fs/aio.c aio-2.4.19.diff/fs/aio.c --- v2.4.19/fs/aio.c Wed Dec 31 19:00:00 1969 +++ aio-2.4.19.diff/fs/aio.c Mon Sep 16 21:54:13 2002 @@ -0,0 +1,1387 @@ +/* fs/aio.c + * An async IO implementation for Linux + * Written by Benjamin LaHaise + * + * Implements an efficient asynchronous io interface. + * + * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +//#define DEBUG 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#if DEBUG > 1 +#define dprintk printk +#else +#define dprintk(x...) do { ; } while (0) +#endif + +/*------ sysctl variables----*/ +unsigned aio_nr; /* current system wide number of aio requests */ +unsigned aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ +unsigned aio_max_size = 0x20000; /* 128KB per chunk */ +unsigned aio_max_pinned; /* set to mem/4 in aio_setup */ +/*----end sysctl variables---*/ + +static kmem_cache_t *kiocb_cachep; +static kmem_cache_t *kioctx_cachep; + +/* tunable. Needs to be added to sysctl. */ +int max_aio_reqs = 0x10000; + +/* Used for rare fput completion. */ +static void aio_fput_routine(void *); +static struct tq_struct fput_tqueue = { + routine: aio_fput_routine, +}; + +static spinlock_t fput_lock = SPIN_LOCK_UNLOCKED; +LIST_HEAD(fput_head); + +/* forward prototypes */ +static void generic_aio_complete_read(void *_iocb, struct kvec *vec, ssize_t res); +static void generic_aio_complete_write(void *_iocb, struct kvec *vec, ssize_t res); + +/* aio_setup + * Creates the slab caches used by the aio routines, panic on + * failure as this is done early during the boot sequence. 
+ */ +static int __init aio_setup(void) +{ + kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kiocb_cachep) + panic("unable to create kiocb cache\n"); + + kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kioctx_cachep) + panic("unable to create kioctx cache"); + + aio_max_pinned = num_physpages/4; + + printk(KERN_NOTICE "aio_setup: num_physpages = %u\n", aio_max_pinned); + printk(KERN_NOTICE "aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); + + return 0; +} + +static void ioctx_free_reqs(struct kioctx *ctx) +{ + struct list_head *pos, *next; + list_for_each_safe(pos, next, &ctx->free_reqs) { + struct kiocb *iocb = list_kiocb(pos); + list_del(&iocb->list); + kmem_cache_free(kiocb_cachep, iocb); + } +} + +static void aio_free_ring(struct kioctx *ctx) +{ + struct aio_ring_info *info = &ctx->ring_info; + + if (info->kvec) { + unmap_kvec(info->kvec, 1); + free_kvec(info->kvec); + } + + if (info->mmap_size) { + down_write(&ctx->mm->mmap_sem); + do_munmap(ctx->mm, info->mmap_base, info->mmap_size); + up_write(&ctx->mm->mmap_sem); + } + + if (info->ring_pages && info->ring_pages != info->internal_pages) + kfree(info->ring_pages); + info->ring_pages = NULL; + info->nr = 0; +} + +static int aio_setup_ring(struct kioctx *ctx) +{ + struct aio_ring *ring; + struct aio_ring_info *info = &ctx->ring_info; + unsigned nr_reqs = ctx->max_reqs; + unsigned long size; + int nr_pages, i; + + /* Compensate for the ring buffer's head/tail overlap entry */ + nr_reqs += 2; /* 1 is required, 2 for good luck */ + + size = sizeof(struct aio_ring); + size += sizeof(struct io_event) * nr_reqs; + nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + + if (nr_pages < 0) + return -EINVAL; + + info->nr_pages = nr_pages; + + nr_reqs = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + + info->nr = 0; + info->ring_pages = info->internal_pages; + if (nr_pages > AIO_RING_PAGES) { + info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL); + if (!info->ring_pages) + return -ENOMEM; + memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages); + } + + info->mmap_size = nr_pages * PAGE_SIZE; + dprintk("attempting mmap of %lu bytes\n", info->mmap_size); + down_write(&ctx->mm->mmap_sem); + info->mmap_base = do_mmap(NULL, 0, info->mmap_size, + PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, + 0); + up_write(&ctx->mm->mmap_sem); + if (IS_ERR((void *)info->mmap_base)) { + printk("mmap err: %ld\n", -info->mmap_base); + info->mmap_size = 0; + aio_free_ring(ctx); + return -EAGAIN; + } + dprintk("mmap address: 0x%08lx\n", info->mmap_base); + info->kvec = map_user_kvec(READ, info->mmap_base, info->mmap_size); + if (unlikely(IS_ERR(info->kvec))) { + info->kvec = NULL; + aio_free_ring(ctx); + return -EAGAIN; + } + + if (unlikely(info->kvec->nr != nr_pages)) + BUG(); + + for (i=0; ikvec->veclet[i].offset)) + BUG(); + info->ring_pages[i] = info->kvec->veclet[i].page; + //printk("[%d] %p -> %p\n", i, info->kvec->veclet[i].page, + // info->pages[i]); + } + + + ctx->user_id = info->mmap_base; + + info->nr = nr_reqs; /* trusted copy */ + + ring = kmap_atomic(info->ring_pages[0], KM_USER0); + ring->nr = nr_reqs; /* user copy */ + ring->id = ctx->user_id; + kunmap_atomic(ring, KM_USER0); + + return 0; +} + +/* aio_ring_event: returns a pointer to the event at the given index from + * kmap_atomic(, km). 
Release the pointer with put_aio_ring_event(); + */ +static inline struct io_event *aio_ring_event(struct aio_ring_info *info, int nr, enum km_type km) +{ + struct io_event *events; +#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) +#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) + + if (nr < AIO_EVENTS_FIRST_PAGE) { + struct aio_ring *ring; + ring = kmap_atomic(info->ring_pages[0], km); + return &ring->io_events[nr]; + } + nr -= AIO_EVENTS_FIRST_PAGE; + + events = kmap_atomic(info->ring_pages[1 + nr / AIO_EVENTS_PER_PAGE], km); + + return events + (nr % AIO_EVENTS_PER_PAGE); +} + +static inline void put_aio_ring_event(struct io_event *event, enum km_type km) +{ + void *p = (void *)((unsigned long)event & PAGE_MASK); + kunmap_atomic(p, km); +} + +/* ioctx_alloc + * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. + */ +static struct kioctx *ioctx_alloc(unsigned nr_reqs) +{ + struct kioctx *ctx; + unsigned i; + + /* Prevent overflows */ + if ((nr_reqs > (0x10000000U / sizeof(struct io_event))) || + (nr_reqs > (0x10000000U / sizeof(struct kiocb)))) { + pr_debug("ENOMEM: nr_reqs too high\n"); + return ERR_PTR(-EINVAL); + } + + if (nr_reqs > aio_max_nr) + return ERR_PTR(-EAGAIN); + + ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + memset(ctx, 0, sizeof(*ctx)); + ctx->max_reqs = nr_reqs; + ctx->mm = current->mm; + atomic_inc(&ctx->mm->mm_count); + + atomic_set(&ctx->users, 1); + spin_lock_init(&ctx->lock); + spin_lock_init(&ctx->ring_info.ring_lock); + init_waitqueue_head(&ctx->wait); + + INIT_LIST_HEAD(&ctx->free_reqs); + INIT_LIST_HEAD(&ctx->active_reqs); + //ctx->user_id = ++current->mm->new_ioctx_id; + + if (aio_setup_ring(ctx) < 0) + goto out_freectx; + + /* Allocate nr_reqs iocbs for io. Free iocbs are on the + * ctx->free_reqs list. When active they migrate to the + * active_reqs list. During completion and cancellation + * the request may temporarily not be on any list. + */ + for (i=0; ikey = i; + iocb->users = 0; + list_add(&iocb->list, &ctx->free_reqs); + } + + /* now link into global list. kludge. FIXME */ + br_write_lock(BR_AIO_REQ_LOCK); + if (unlikely(aio_nr + ctx->max_reqs > aio_max_nr)) + goto out_cleanup; + aio_nr += ctx->max_reqs; /* undone by __put_ioctx */ + ctx->next = current->mm->ioctx_list; + current->mm->ioctx_list = ctx; + br_write_unlock(BR_AIO_REQ_LOCK); + + dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", + ctx, ctx->user_id, current->mm, ctx->ring_info.ring->nr); + return ctx; + +out_cleanup: + br_write_unlock(BR_AIO_REQ_LOCK); + ctx->max_reqs = 0; /* prevent __put_ioctx from sub'ing aio_nr */ + __put_ioctx(ctx); + return ERR_PTR(-EAGAIN); + +out_freering: + aio_free_ring(ctx); + ioctx_free_reqs(ctx); +out_freectx: + kmem_cache_free(kioctx_cachep, ctx); + ctx = ERR_PTR(-ENOMEM); + + dprintk("aio: error allocating ioctx %p\n", ctx); + return ctx; +} + +/* aio_cancel_all + * Cancels all outstanding aio requests on an aio context. Used + * when the processes owning a context have all exited to encourage + * the rapid destruction of the kioctx. 
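A compact restatement of the index math in aio_ring_event() above. The AIO_EVENTS_* values are passed in as parameters here because their exact values depend on sizeof(struct aio_ring) and sizeof(struct io_event), which come from headers not shown in this diff:

/* Sketch: which ring page and slot hold event number 'nr'.  Page 0
 * loses space to the struct aio_ring header, later pages hold nothing
 * but events, exactly as aio_ring_event() computes above. */
static void event_slot(unsigned nr, unsigned events_first_page,
                       unsigned events_per_page,
                       unsigned *page, unsigned *slot)
{
        if (nr < events_first_page) {
                *page = 0;
                *slot = nr;
        } else {
                nr -= events_first_page;
                *page = 1 + nr / events_per_page;
                *slot = nr % events_per_page;
        }
}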
+ */ +static void aio_cancel_all(struct kioctx *ctx) +{ + int (*cancel)(struct kiocb *); + spin_lock_irq(&ctx->lock); + ctx->dead = 1; + while (!list_empty(&ctx->active_reqs)) { + struct list_head *pos = ctx->active_reqs.next; + struct kiocb *iocb = list_kiocb(pos); + list_del_init(&iocb->list); + cancel = iocb->cancel; + if (cancel) + iocb->users++; + spin_unlock_irq(&ctx->lock); + if (cancel) + cancel(iocb); + spin_lock_irq(&ctx->lock); + } + spin_unlock_irq(&ctx->lock); +} + +void wait_for_all_aios(struct kioctx *ctx) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + if (!ctx->reqs_active) + return; + + add_wait_queue(&ctx->wait, &wait); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + while (ctx->reqs_active) { + printk("ctx->reqs_active = %d\n", ctx->reqs_active); + schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + } + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); +} + +/* exit_aio: called when the last user of mm goes away. At this point, + * there is no way for any new requests to be submited or any of the + * io_* syscalls to be called on the context. However, there may be + * outstanding requests which hold references to the context; as they + * go away, they will call put_ioctx and release any pinned memory + * associated with the request (held via struct page * references). + */ +void exit_aio(struct mm_struct *mm) +{ + struct kioctx *ctx = mm->ioctx_list; + mm->ioctx_list = NULL; + while (ctx) { + struct kioctx *next = ctx->next; + ctx->next = NULL; + aio_cancel_all(ctx); + + wait_for_all_aios(ctx); + + if (1 != atomic_read(&ctx->users)) + printk(KERN_DEBUG + "exit_aio:ioctx still alive: %d %d %d\n", + atomic_read(&ctx->users), ctx->dead, + ctx->reqs_active); + put_ioctx(ctx); + ctx = next; + } +} + +/* __put_ioctx + * Called when the last user of an aio context has gone away, + * and the struct needs to be freed. + */ +void __put_ioctx(struct kioctx *ctx) +{ + unsigned nr_reqs = ctx->max_reqs; + + if (unlikely(ctx->reqs_active)) + BUG(); + + aio_free_ring(ctx); + mmdrop(ctx->mm); + ctx->mm = NULL; + pr_debug("__put_ioctx: freeing %p\n", ctx); + ioctx_free_reqs(ctx); + kmem_cache_free(kioctx_cachep, ctx); + + br_write_lock(BR_AIO_REQ_LOCK); + aio_nr -= nr_reqs; + br_write_unlock(BR_AIO_REQ_LOCK); +} + +/* aio_get_req + * Allocate a slot for an aio request. Increments the users count + * of the kioctx so that the kioctx stays around until all requests are + * complete. Returns -EAGAIN if no requests are free. + */ +static struct kiocb *FASTCALL(__aio_get_req(struct kioctx *ctx)); +static struct kiocb *__aio_get_req(struct kioctx *ctx) +{ + struct kiocb *req = NULL; + struct aio_ring *ring; + + /* Use cmpxchg instead of spin_lock? 
*/ + spin_lock_irq(&ctx->lock); + ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); + if (likely(!list_empty(&ctx->free_reqs) && + (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)))) { + req = list_kiocb(ctx->free_reqs.next); + list_del(&req->list); + list_add(&req->list, &ctx->active_reqs); + ctx->reqs_active++; + req->user_obj = NULL; + get_ioctx(ctx); + + if (unlikely(req->ctx != NULL)) + BUG(); + req->ctx = ctx; + if (unlikely(req->users)) + BUG(); + req->users = 1; + } + kunmap_atomic(ring, KM_USER0); + spin_unlock_irq(&ctx->lock); + + return req; +} + +static inline struct kiocb *aio_get_req(struct kioctx *ctx) +{ + struct kiocb *req; + /* Handle a potential starvation case -- should be exceedingly rare as + * requests will be stuck on fput_head only if the aio_fput_routine is + * delayed and the requests were the last user of the struct file. + */ + req = __aio_get_req(ctx); + if (unlikely(NULL == ctx)) { + aio_fput_routine(NULL); + req = __aio_get_req(ctx); + } + return req; +} + +static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) +{ + req->ctx = NULL; + req->filp = NULL; + req->user_obj = NULL; + ctx->reqs_active--; + list_add(&req->list, &ctx->free_reqs); + + if (unlikely(!ctx->reqs_active && ctx->dead)) + wake_up(&ctx->wait); +} + +static void aio_fput_routine(void *data) +{ + spin_lock_irq(&fput_lock); + while (likely(!list_empty(&fput_head))) { + struct kiocb *req = list_kiocb(fput_head.next); + struct kioctx *ctx = req->ctx; + + list_del(&req->list); + spin_unlock_irq(&fput_lock); + + /* Complete the fput */ + __fput(req->filp); + + /* Link the iocb into the context's free list */ + spin_lock_irq(&ctx->lock); + really_put_req(ctx, req); + spin_unlock_irq(&ctx->lock); + + put_ioctx(ctx); + spin_lock_irq(&fput_lock); + } + spin_unlock_irq(&fput_lock); +} + +/* __aio_put_req + * Returns true if this put was the last user of the request. + */ +static inline int __aio_put_req(struct kioctx *ctx, struct kiocb *req) +{ + dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n", + req, atomic_read(&req->filp->f_count)); + + req->users --; + if (unlikely(req->users < 0)) + BUG(); + if (likely(req->users)) + return 0; + list_del(&req->list); /* remove from active_reqs */ + req->cancel = NULL; + + /* Must be done under the lock to serialise against cancellation. + * Call this aio_fput as it duplicates fput via the fput_tqueue. + */ + if (unlikely(atomic_dec_and_test(&req->filp->f_count))) { + get_ioctx(ctx); + spin_lock(&fput_lock); + list_add(&req->list, &fput_head); + spin_unlock(&fput_lock); + schedule_task(&fput_tqueue); + } else + really_put_req(ctx, req); + return 1; +} + +/* aio_put_req + * Returns true if this put was the last user of the kiocb, + * false if the request is still in use. + */ +int aio_put_req(struct kiocb *req) +{ + struct kioctx *ctx = req->ctx; + int ret; + spin_lock_irq(&ctx->lock); + ret = __aio_put_req(ctx, req); + spin_unlock_irq(&ctx->lock); + if (ret) + put_ioctx(ctx); + return ret; +} + +/* Lookup an ioctx id. ioctx_list is lockless for reads. + * FIXME: this is O(n) and is only suitable for development. 
+ */ +static inline struct kioctx *lookup_ioctx(unsigned long ctx_id) +{ + struct kioctx *ioctx; + struct mm_struct *mm; + + br_read_lock(BR_AIO_REQ_LOCK); + mm = current->mm; + for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next) + if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) { + get_ioctx(ioctx); + break; + } + br_read_unlock(BR_AIO_REQ_LOCK); + + return ioctx; +} + +/* aio_complete + * Called when the io request on the given iocb is complete. + * Returns true if this is the last user of the request. The + * only other user of the request can be the cancellation code. + */ +int aio_complete(struct kiocb *iocb, long res, long res2) +{ + struct kioctx *ctx = iocb->ctx; + struct aio_ring_info *info = &ctx->ring_info; + struct aio_ring *ring; + struct io_event *event; + unsigned long flags; + unsigned long tail; + int ret; + + /* add a completion event to the ring buffer. + * must be done holding ctx->lock to prevent + * other code from messing with the tail + * pointer since we might be called from irq + * context. + */ + spin_lock_irqsave(&ctx->lock, flags); + + ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); + + tail = info->tail; + event = aio_ring_event(info, tail, KM_IRQ0); + tail = (tail + 1) % info->nr; + + event->obj = (u64)(unsigned long)iocb->user_obj; + event->data = iocb->user_data; + event->res = res; + event->res2 = res2; + + dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", + ctx, tail, iocb, iocb->user_obj, iocb->user_data, res, res2); + + /* after flagging the request as done, we + * must never even look at it again + */ + barrier(); + + info->tail = tail; + ring->tail = tail; + + wmb(); + if (!ring->woke) + ring->woke = 1; + + put_aio_ring_event(event, KM_IRQ0); + kunmap_atomic(ring, KM_IRQ1); + + pr_debug("added to ring %p at [%lu]\n", iocb, tail); + + /* everything turned out well, dispose of the aiocb. */ + ret = __aio_put_req(ctx, iocb); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); + + if (ret) + put_ioctx(ctx); + + return ret; +} + +/* aio_read_evt + * Pull an event off of the ioctx's event ring. Returns the number of + * events fetched (0 or 1 ;-) + * FIXME: make this use cmpxchg. + * TODO: make the ringbuffer user mmap()able (requires FIXME). 
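A minimal user-space model of the head/tail discipline used here: aio_complete() above fills the event at the current tail and only then publishes the advanced tail, while aio_read_evt() below copies the event out before advancing the head. Illustrative only; the real ring lives in the pages mapped by aio_setup_ring() and uses barrier()/wmb() at the points noted in the comments:

/* Demo ring: a plain array standing in for the mmap'd aio_ring. */
struct demo_ring { unsigned head, tail, nr; long slot[8]; };

static void demo_complete(struct demo_ring *r, long ev)     /* producer */
{
        unsigned tail = r->tail;
        r->slot[tail] = ev;                /* write the event body first */
        /* the kernel issues a write barrier here */
        r->tail = (tail + 1) % r->nr;      /* then publish the new tail */
}

static int demo_read_evt(struct demo_ring *r, long *ev)     /* consumer */
{
        if (r->head == r->tail)
                return 0;                  /* ring empty */
        *ev = r->slot[r->head];
        r->head = (r->head + 1) % r->nr;   /* copy out, then advance head */
        return 1;
}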
+ */ +static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) +{ + struct aio_ring_info *info = &ioctx->ring_info; + struct aio_ring *ring; + unsigned long head; + int ret = 0; + + ring = kmap_atomic(info->ring_pages[0], KM_USER0); + dprintk("in aio_read_evt h%lu t%lu m%lu\n", + (unsigned long)ring->head, (unsigned long)ring->tail, + (unsigned long)ring->nr); + barrier(); + if (ring->head == ring->tail) + goto out; + + spin_lock(&info->ring_lock); + + head = ring->head % info->nr; + if (head != ring->tail) { + struct io_event *evp = aio_ring_event(info, head, KM_USER1); + *ent = *evp; + head = (head + 1) % info->nr; + barrier(); + ring->head = head; + ret = 1; + put_aio_ring_event(evp, KM_USER1); + } + spin_unlock(&info->ring_lock); + +out: + kunmap_atomic(ring, KM_USER0); + dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, + (unsigned long)ring->head, (unsigned long)ring->tail); + return ret; +} + +struct timeout { + struct timer_list timer; + int timed_out; + struct task_struct *p; +}; + +static void timeout_func(unsigned long data) +{ + struct timeout *to = (struct timeout *)data; + + to->timed_out = 1; + wake_up_process(to->p); +} + +static inline void init_timeout(struct timeout *to) +{ + init_timer(&to->timer); + to->timer.data = (unsigned long)to; + to->timer.function = timeout_func; + to->timed_out = 0; + to->p = current; +} + +static inline void set_timeout(struct timeout *to, const struct timespec *ts) +{ + unsigned long how_long; + + if (!ts->tv_sec && !ts->tv_nsec) { + to->timed_out = 1; + return; + } + + how_long = ts->tv_sec * HZ; +#define HZ_NS (1000000000 / HZ) + how_long += (ts->tv_nsec + HZ_NS - 1) / HZ_NS; + + to->timer.expires = jiffies + how_long; + add_timer(&to->timer); +} + +static inline void update_ts(struct timespec *ts, long jiffies) +{ + struct timespec tmp; + jiffies_to_timespec(jiffies, &tmp); + ts->tv_sec -= tmp.tv_sec; + ts->tv_nsec -= tmp.tv_nsec; + if (ts->tv_nsec < 0) { + ts->tv_nsec += 1000000000; + ts->tv_sec -= 1; + } + if (ts->tv_sec < 0) + ts->tv_sec = ts->tv_nsec = 0; +} + +static inline void clear_timeout(struct timeout *to) +{ + del_timer_sync(&to->timer); +} + +static int read_events(struct kioctx *ctx, + long min_nr, long nr, + struct io_event *event, + struct timespec *timeout) +{ + long start_jiffies = jiffies; + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + int ret; + int i = 0; + struct io_event ent; + struct timeout to; + struct timespec ts; + + /* needed to zero any padding within an entry (there shouldn't be + * any, but C is fun! + */ + memset(&ent, 0, sizeof(ent)); + ret = 0; + + while (likely(i < nr)) { + ret = aio_read_evt(ctx, &ent); + if (unlikely(ret <= 0)) + break; + + dprintk("read event: %Lx %Lx %Lx %Lx\n", + ent.data, ent.obj, ent.res, ent.res2); + + /* FIXME: split checks in two */ + ret = -EFAULT; + if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { + dprintk("aio: lost an event due to EFAULT.\n"); + break; + } + ret = 0; + + /* Good, event copied to userland, update counts. 
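The set_timeout() helper above rounds the requested timespec up to whole jiffies, so even a 1 ns timeout waits at least one tick. A small runnable illustration of that conversion, assuming HZ == 100 (a typical 2.4 i386 value):

#include <stdio.h>

#define HZ     100                       /* assumption for this example */
#define HZ_NS  (1000000000 / HZ)

static unsigned long timespec_to_jiffies_roundup(long sec, long nsec)
{
        return sec * HZ + (nsec + HZ_NS - 1) / HZ_NS;   /* round up */
}

int main(void)
{
        printf("%lu\n", timespec_to_jiffies_roundup(0, 1));          /* 1   */
        printf("%lu\n", timespec_to_jiffies_roundup(1, 500000000));  /* 150 */
        return 0;
}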
*/ + event ++; + i ++; + } + + if (i) + return i; + if (ret) + return ret; + + /* End fast path */ + + init_timeout(&to); + if (timeout) { + ret = -EFAULT; + if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) + goto out; + + set_timeout(&to, &ts); + if (to.timed_out) { + timeout = 0; + clear_timeout(&to); + } + } + + while (likely(i < nr)) { + add_wait_queue_exclusive_lifo(&ctx->wait, &wait); + do { + set_task_state(tsk, TASK_INTERRUPTIBLE); + + ret = aio_read_evt(ctx, &ent); + if (ret) + break; + if (i) + break; + ret = 0; + if (to.timed_out) /* Only check after read evt */ + break; + schedule(); + if (signal_pending(tsk)) { + ret = -EINTR; + break; + } + /*ret = aio_read_evt(ctx, &ent);*/ + } while (1) ; + + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + + if (unlikely(ret <= 0)) + break; + + ret = -EFAULT; + if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { + dprintk("aio: lost an event due to EFAULT.\n"); + break; + } + + /* Good, event copied to userland, update counts. */ + event ++; + i ++; + } + + if (timeout) { + clear_timeout(&to); + update_ts(&ts, jiffies - start_jiffies); + if (copy_to_user(timeout, &ts, sizeof(ts))) + ret = -EFAULT; + } +out: + return i ? i : ret; +} + +/* Take an ioctx and remove it from the list of ioctx's. Protects + * against races with itself via ->dead. + */ +static void io_destroy(struct kioctx *ioctx) +{ + struct kioctx **tmp; + int was_dead; + + /* delete the entry from the list is someone else hasn't already */ + br_write_lock(BR_AIO_REQ_LOCK); + was_dead = ioctx->dead; + ioctx->dead = 1; + for (tmp = ¤t->mm->ioctx_list; *tmp && *tmp != ioctx; + tmp = &(*tmp)->next) + ; + if (*tmp) + *tmp = ioctx->next; + br_write_unlock(BR_AIO_REQ_LOCK); + + dprintk("aio_release(%p)\n", ioctx); + if (likely(!was_dead)) + put_ioctx(ioctx); /* twice for the list */ + + aio_cancel_all(ioctx); + wait_for_all_aios(ioctx); + put_ioctx(ioctx); /* once for the lookup */ +} + +asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t *ctxp) +{ + struct kioctx *ioctx = NULL; + unsigned long ctx; + long ret; + + ret = get_user(ctx, ctxp); + if (unlikely(ret)) + goto out; + + ret = -EINVAL; + if (unlikely(ctx || !nr_reqs || (int)nr_reqs < 0)) { + pr_debug("EINVAL: io_setup: ctx or nr_reqs > max\n"); + goto out; + } + + ret = -EAGAIN; + if (unlikely(nr_reqs > max_aio_reqs)) + goto out; + + ioctx = ioctx_alloc(nr_reqs); + ret = PTR_ERR(ioctx); + if (!IS_ERR(ioctx)) { + ret = put_user(ioctx->user_id, ctxp); + if (!ret) + return 0; + io_destroy(ioctx); + } + +out: + return ret; +} + +/* aio_release + * Release the kioctx associated with the userspace handle. + */ +asmlinkage long sys_io_destroy(aio_context_t ctx) +{ + struct kioctx *ioctx = lookup_ioctx(ctx); + if (likely(NULL != ioctx)) { + io_destroy(ioctx); + return 0; + } + pr_debug("EINVAL: io_destroy: invalid context id\n"); + return -EINVAL; +} + +ssize_t generic_aio_poll(struct file *file, struct kiocb *req, struct iocb *iocb) +{ + unsigned events = iocb->aio_buf; + + /* Did the user set any bits they weren't supposed to? (The + * above is actually a cast. + */ + if (unlikely(events != iocb->aio_buf)) + return -EINVAL; + + return async_poll(req, events); +} + +/* sys_io_submit + * Copy an aiocb from userspace into kernel space, then convert it to + * a kiocb, submit and repeat until done. Error codes on copy/submit + * only get returned for the first aiocb copied as otherwise the size + * of aiocbs copied is returned (standard write sematics). 
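With sys_io_setup(), sys_io_destroy() and the submit/getevents paths below in place, the whole interface can be driven from user space. A hedged end-to-end example, assuming the companion libaio user-space library (its io_setup()/io_submit()/io_getevents() wrappers map directly onto these syscalls; it is not part of this diff):

#include <libaio.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
        io_context_t ctx = 0;
        struct iocb cb, *cbs[1] = { &cb };
        struct io_event ev;
        static char buf[4096];
        int fd, ret;

        fd = open(argc > 1 ? argv[1] : "/etc/fstab", O_RDONLY);
        if (fd < 0)
                return 1;

        if (io_setup(32, &ctx) < 0)                    /* sys_io_setup */
                return 1;

        io_prep_pread(&cb, fd, buf, sizeof(buf), 0);   /* IOCB_CMD_PREAD */
        if (io_submit(ctx, 1, cbs) != 1)               /* sys_io_submit */
                return 1;

        ret = io_getevents(ctx, 1, 1, &ev, NULL);      /* wait, no timeout */
        if (ret == 1)
                printf("read returned %ld\n", (long)ev.res);

        io_destroy(ctx);   /* sys_io_destroy: synchronous, waits for pending ios */
        close(fd);
        return 0;
}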
+ */ +asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp) +{ + struct kioctx *ctx; + long ret = 0; + int i; + + if (unlikely(nr < 0)) + return -EINVAL; + + if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) + return -EFAULT; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) { + pr_debug("EINVAL: io_submit: invalid context id\n"); + return -EINVAL; + } + + for (i=0; ifilp = file; + tmp.aio_key = req->key; + ret = put_user(tmp.aio_key, &iocbp->aio_key); + if (unlikely(ret)) { + dprintk("EFAULT: aio_key\n"); + goto out_put_req; + } + + req->user_obj = iocbp; + req->user_data = tmp.aio_data; + req->buf = tmp.aio_buf; + req->pos = tmp.aio_offset; + req->size = tmp.aio_nbytes; + req->nr_transferred = 0; + req->rlim_fsize = current->rlim[RLIMIT_FSIZE].rlim_cur; + + ret = -EBADF; + if (IOCB_CMD_PREAD == tmp.aio_lio_opcode) { + op = file->f_op->aio_read; + if (unlikely(!(file->f_mode & FMODE_READ))) + goto out_put_req; + } else if (IOCB_CMD_PREADX == tmp.aio_lio_opcode) { + op = file->f_op->aio_readx; + if (unlikely(!(file->f_mode & FMODE_READ))) + goto out_put_req; + } else if (IOCB_CMD_PWRITE == tmp.aio_lio_opcode) { + op = file->f_op->aio_write; + if (unlikely(!(file->f_mode & FMODE_WRITE))) + goto out_put_req; + } else if (IOCB_CMD_FSYNC == tmp.aio_lio_opcode) { + op = file->f_op->aio_fsync; + } else if (IOCB_CMD_POLL == tmp.aio_lio_opcode) { + op = generic_aio_poll; + } else + op = NULL; + + if (unlikely(!op)) { + printk("EINVAL: io_submit: no operation provided\n"); + ret = -EINVAL; + goto out_put_req; + } + + ret = op(file, req, &tmp); + if (likely(!ret)) + continue; + + pr_debug("io_submit: op returned %ld\n", ret); + aio_complete(req, ret, 0); + ret = 0; /* A completion event was sent, so + * submit is a success. */ + continue; + + out_put_req: + aio_put_req(req); + break; + } + + put_ioctx(ctx); + //run_task_queue(&tq_disk); + return i ? i : ret; +} + +static void generic_aio_next_chunk(void *_iocb) +{ + int (*kvec_op)(struct file *, kvec_cb_t, size_t, loff_t); + struct kiocb *iocb = _iocb; + int rw = iocb->this_size; + unsigned long buf = iocb->buf; + unsigned long old_fsize; + kvec_cb_t cb; + ssize_t res; + + iocb->this_size = iocb->size - iocb->nr_transferred; + if (iocb->this_size > aio_max_size) + iocb->this_size = aio_max_size; + + buf += iocb->nr_transferred; + cb.vec = mm_map_user_kvec(iocb->ctx->mm, rw, buf, iocb->this_size); + cb.fn = (rw == READ) ? generic_aio_complete_read + : generic_aio_complete_write; + cb.data = iocb; + + dprintk("generic_aio_rw: cb.vec=%p\n", cb.vec); + if (unlikely(IS_ERR(cb.vec))) + goto done; + + old_fsize = current->rlim[RLIMIT_FSIZE].rlim_cur; + current->rlim[RLIMIT_FSIZE].rlim_cur = iocb->rlim_fsize; + kvec_op = (rw == READ) ? iocb->filp->f_op->kvec_read + : iocb->filp->f_op->kvec_write; + dprintk("submit: %d %d %d\n", iocb->this_size, iocb->nr_transferred, iocb->size); + res = kvec_op(iocb->filp, cb, iocb->this_size, + iocb->pos + iocb->nr_transferred); + current->rlim[RLIMIT_FSIZE].rlim_cur = old_fsize; + if (!res) { + dprintk("submit okay\n"); + return; + } + dprintk("submit failed: %d\n", res); + + cb.fn(cb.data, cb.vec, res); + return; + +done: + if (unlikely(!iocb->nr_transferred)) + BUG(); + aio_complete(iocb, iocb->nr_transferred, 0); +} + +static void generic_aio_complete_rw(int rw, void *_iocb, struct kvec *vec, ssize_t res) +{ + struct kiocb *iocb = _iocb; + + unmap_kvec(vec, rw == READ); + free_kvec(vec); + + if (res > 0) + iocb->nr_transferred += res; + + /* Was this chunk successful? 
Is there more left to transfer? */ + if (res == iocb->this_size && iocb->nr_transferred < iocb->size) { + /* We may be in irq context, so queue processing in + * process context. + */ + iocb->this_size = rw; + INIT_TQUEUE(&iocb->u.tq, generic_aio_next_chunk, iocb); + schedule_task(&iocb->u.tq); + return; + } + + aio_complete(iocb, iocb->nr_transferred ? iocb->nr_transferred : res, + 0); +} + +static void generic_aio_complete_read(void *_iocb, struct kvec *vec, ssize_t res) +{ + generic_aio_complete_rw(READ, _iocb, vec, res); +} + +static void generic_aio_complete_write(void *_iocb, struct kvec *vec, ssize_t res) +{ + generic_aio_complete_rw(WRITE, _iocb, vec, res); +} + +ssize_t generic_aio_rw(int rw, struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size) +{ + int (*kvec_op)(struct file *, kvec_cb_t, size_t, loff_t); + unsigned long buf = iocb->aio_buf; + size_t size = iocb->aio_nbytes; + size_t nr_read = 0; + loff_t pos = iocb->aio_offset; + kvec_cb_t cb; + ssize_t res; + +#if 0 + if (likely(NULL != file->f_op->new_read)) { + nr_read = file->f_op->new_read(file, (void *)buf, size, + &pos, F_ATOMIC); + dprintk("from new_read: nr_read: %ld\n", (long)nr_read); + if ((-EAGAIN == nr_read) || (-EWOULDBLOCKIO == nr_read)) + nr_read = 0; + else if ((nr_read >= min_size) || (nr_read < 0)) { + dprintk("returning nr_read: %ld\n", (long)nr_read); + return nr_read; + } + } + dprintk("nr_read: %ld\n", (long)nr_read); +#endif + + req->nr_transferred = nr_read; + size -= nr_read; + if (size > aio_max_size) + /* We have to split up the request. Pin the mm + * struct for further use with map_user_kvec later. + */ + size = aio_max_size; + else + req->buf = 0; + + req->this_size = size; + + buf += nr_read; + cb.vec = map_user_kvec(rw, buf, size); + cb.fn = (rw == READ) ? generic_aio_complete_read + : generic_aio_complete_write; + cb.data = req; + + dprintk("generic_aio_rw: cb.vec=%p\n", cb.vec); + if (IS_ERR(cb.vec)) + return nr_read ? nr_read : PTR_ERR(cb.vec); + + kvec_op = (rw == READ) ? file->f_op->kvec_read : file->f_op->kvec_write; + + res = kvec_op(file, cb, size, pos); + if (unlikely(res != 0)) { + /* If the first chunk was successful, we have to run + * the callback to attempt the rest of the io. 
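Requests larger than aio_max_size are therefore driven as a series of chunks: generic_aio_rw() above maps and submits at most 128KB at a time, and each completion requeues generic_aio_next_chunk() for the remainder. A runnable sketch of just the size arithmetic (the 0x20000 limit comes from the sysctl defaults earlier in this file):

#include <stdio.h>

#define AIO_MAX_SIZE 0x20000UL   /* 128KB per chunk, from the sysctl defaults */

int main(void)
{
        unsigned long size = 300 * 1024UL;   /* a 300KB request */
        unsigned long done = 0;

        while (done < size) {
                unsigned long chunk = size - done;
                if (chunk > AIO_MAX_SIZE)
                        chunk = AIO_MAX_SIZE;
                printf("map+submit %lu bytes at offset %lu\n", chunk, done);
                done += chunk;   /* the kernel advances nr_transferred from
                                  * the completion callback instead */
        }
        return 0;
}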
+ */ + if (res == size && req->buf) { + cb.fn(cb.data, cb.vec, res); + return 0; + } + + unmap_kvec(cb.vec, rw == READ); + free_kvec(cb.vec); + if (nr_read) { + if (res < 0) + res = 0; + res += nr_read; + } + } + return res; +} + +ssize_t generic_file_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb) +{ + return generic_aio_rw(READ, file, req, iocb, iocb->aio_nbytes); +} + +ssize_t generic_sock_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb) +{ + return generic_aio_rw(READ, file, req, iocb, 1); +} + +ssize_t generic_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size) +{ + return generic_aio_rw(WRITE, file, req, iocb, 1); +#if 0 + unsigned long buf = iocb.aio_buf; + size_t size = iocb.aio_nbytes; + loff_t pos = iocb.aio_offset; + ssize_t nr_written = 0; + kvec_cb_t cb; + long res; +#if 0 + if (likely(NULL != file->f_op->new_write)) { + nr_written = file->f_op->new_write(file, (void *)buf, size, + &pos, F_ATOMIC); + pr_debug("generic_aio_write: new_write: %ld\n", (long)nr_written); + if (-EAGAIN == nr_written) + nr_written = 0; + if ((nr_written >= min_size) || (nr_written < 0)) + return nr_written; + } +#endif + + req->nr_transferred = nr_written; + size -= nr_written; + if (size > aio_max_size) + size = aio_max_size; + req->this_size = size; + buf += nr_written; + cb.vec = map_user_kvec(WRITE, buf, size); + cb.fn = generic_aio_complete_write; + cb.data = req; + + if (IS_ERR(cb.vec)) { + pr_debug("generic_aio_write: map_user_kvec: %ld\n", PTR_ERR(cb.vec)); + return nr_written ? nr_written : PTR_ERR(cb.vec); + } + + res = file->f_op->kvec_write(file, cb, size, iocb.aio_offset); + pr_debug("generic_aio_write: kvec_write: %ld\n", res); + if (unlikely(res != 0)) { + unmap_kvec(cb.vec, 0); + free_kvec(cb.vec); + if (nr_written) { + if (res < 0) + res = 0; + res += nr_written; + } + } + return res; +#endif +} + +ssize_t generic_file_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb) +{ + return generic_aio_write(file, req, iocb, iocb->aio_nbytes); +} + +/* lookup_kiocb + * Finds a given iocb for cancellation. + * MUST be called with ctx->lock held. + */ +struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb *iocb, u32 key) +{ + struct list_head *pos; + /* TODO: use a hash or array, this sucks. */ + list_for_each(pos, &ctx->free_reqs) { + struct kiocb *kiocb = list_kiocb(pos); + if (kiocb->user_obj == iocb && kiocb->key == key) + return kiocb; + } + return NULL; +} + +asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb *iocb) +{ + int (*cancel)(struct kiocb *iocb); + struct kioctx *ctx; + struct kiocb *kiocb; + u32 key; + int ret; + + ret = get_user(key, &iocb->aio_key); + if (unlikely(ret)) + return ret; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) + return -EINVAL; + + spin_lock_irq(&ctx->lock); + ret = -EAGAIN; + kiocb = lookup_kiocb(ctx, iocb, key); + if (kiocb && kiocb->cancel) { + cancel = kiocb->cancel; + kiocb->users ++; + } else + cancel = NULL; + spin_unlock_irq(&ctx->lock); + + if (NULL != cancel) { + printk("calling cancel\n"); + ret = cancel(kiocb); + } else + printk("iocb has no cancel operation\n"); + + put_ioctx(ctx); + + return ret; +} + +asmlinkage long sys_io_wait(aio_context_t ctx_id, struct iocb *iocb, const struct timespec *timeout) +{ +#if 0 /* FIXME. later. 
*/ + struct kioctx *ioctx; + long ret = -EINVAL; + unsigned key; + long obj = (long)iocb; + + ioctx = lookup_ioctx(ctx_id); + if (!ioctx) + goto out; + + ret = get_user(key, &iocb->aio_key); + if (ret) + goto out; + + ret = __aio_complete(ioctx, key, obj, !!timeout); + put_ioctx(ioctx); + +out: + return ret; +#endif + return -ENOSYS; +} + +asmlinkage long sys_io_getevents(aio_context_t ctx_id, + long min_nr, + long nr, + struct io_event *events, + struct timespec *timeout) +{ + struct kioctx *ioctx = lookup_ioctx(ctx_id); + long ret = -EINVAL; + + if (likely(NULL != ioctx)) { + ret = read_events(ioctx, min_nr, nr, events, timeout); + put_ioctx(ioctx); + } + + return ret; +} + +__initcall(aio_setup); + +EXPORT_SYMBOL_GPL(generic_file_kvec_read); +EXPORT_SYMBOL_GPL(generic_file_aio_read); +EXPORT_SYMBOL_GPL(generic_file_kvec_write); +EXPORT_SYMBOL_GPL(generic_file_aio_write); +EXPORT_SYMBOL_GPL(generic_file_new_read); diff -urN v2.4.19/fs/buffer.c aio-2.4.19.diff/fs/buffer.c --- v2.4.19/fs/buffer.c Fri Aug 9 13:50:13 2002 +++ aio-2.4.19.diff/fs/buffer.c Mon Sep 16 21:54:13 2002 @@ -3014,3 +3014,220 @@ module_init(bdflush_init) +/* async kio interface */ +struct brw_cb { + kvec_cb_t cb; + atomic_t io_count; + int nr; + struct buffer_head *bh[1]; +}; + +static inline void brw_cb_put(struct brw_cb *brw_cb) +{ + if (atomic_dec_and_test(&brw_cb->io_count)) { + ssize_t res = 0, err = 0; + int nr; + + /* Walk the buffer heads associated with this kiobuf + * checking for errors and freeing them as we go. + */ + for (nr=0; nr < brw_cb->nr; nr++) { + struct buffer_head *bh = brw_cb->bh[nr]; + if (!err && buffer_uptodate(bh)) + res += bh->b_size; + else + err = -EIO; + kmem_cache_free(bh_cachep, bh); + } + + if (!res) + res = err; + + brw_cb->cb.fn(brw_cb->cb.data, brw_cb->cb.vec, res); + + kfree(brw_cb); + } +} + +/* + * IO completion routine for a buffer_head being used for kiobuf IO: we + * can't dispatch the kiobuf callback until io_count reaches 0. + */ + +static void end_buffer_io_kiobuf_async(struct buffer_head *bh, int uptodate) +{ + struct brw_cb *brw_cb; + + mark_buffer_uptodate(bh, uptodate); + + brw_cb = bh->b_private; + unlock_buffer(bh); + + brw_cb_put(brw_cb); +} + + +/* + * Start I/O on a physical range of kernel memory, defined by a vector + * of kiobuf structs (much like a user-space iovec list). + * + * The kiobuf must already be locked for IO. IO is submitted + * asynchronously: you need to check page->locked, page->uptodate, and + * maybe wait on page->wait. + * + * It is up to the caller to make sure that there are enough blocks + * passed in to completely map the iobufs to disk. + */ + +int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned blocks, unsigned long blknr, int sector_shift) +{ + struct kvec *vec = cb.vec; + struct kveclet *veclet; + int err; + int length; + unsigned sector_size = 1 << sector_shift; + int i; + + struct brw_cb *brw_cb; + + if (!vec->nr) + BUG(); + + /* + * First, do some alignment and validity checks + */ + length = 0; + for (veclet=vec->veclet, i=0; i < vec->nr; i++,veclet++) { + length += veclet->length; + if ((veclet->offset & (sector_size-1)) || + (veclet->length & (sector_size-1))) { + printk("brw_kiovec_async: tuple[%d]->offset=0x%x length=0x%x sector_size: 0x%x\n", i, veclet->offset, veclet->length, sector_size); + return -EINVAL; + } + } + + if (length < (blocks << sector_shift)) + BUG(); + + /* + * OK to walk down the iovec doing page IO on each page we find. 
+ */ + err = 0; + + if (!blocks) { + printk("brw_kiovec_async: !i\n"); + return -EINVAL; + } + + /* FIXME: tie into userbeans here */ + brw_cb = kmalloc(sizeof(*brw_cb) + (blocks * sizeof(struct buffer_head *)), GFP_KERNEL); + if (!brw_cb) + return -ENOMEM; + + brw_cb->cb = cb; + brw_cb->nr = 0; + + /* This is ugly. FIXME. */ + for (i=0, veclet=vec->veclet; inr; i++,veclet++) { + struct page *page = veclet->page; + unsigned offset = veclet->offset; + unsigned length = veclet->length; + + if (!page) + BUG(); + + while (length > 0) { + struct buffer_head *tmp; + tmp = kmem_cache_alloc(bh_cachep, GFP_NOIO); + err = -ENOMEM; + if (!tmp) + goto error; + + tmp->b_dev = B_FREE; + tmp->b_size = sector_size; + set_bh_page(tmp, page, offset); + tmp->b_this_page = tmp; + + init_buffer(tmp, end_buffer_io_kiobuf_async, NULL); + tmp->b_dev = dev; + tmp->b_blocknr = blknr++; + tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) + | (1 << BH_Req); + tmp->b_private = brw_cb; + + if (rw == WRITE) { + set_bit(BH_Uptodate, &tmp->b_state); + clear_bit(BH_Dirty, &tmp->b_state); + } + + brw_cb->bh[brw_cb->nr++] = tmp; + length -= sector_size; + offset += sector_size; + + if (offset >= PAGE_SIZE) { + offset = 0; + break; + } + + if (brw_cb->nr >= blocks) + goto submit; + } /* End of block loop */ + } /* End of page loop */ + +submit: + atomic_set(&brw_cb->io_count, brw_cb->nr+1); + /* okay, we've setup all our io requests, now fire them off! */ + for (i=0; inr; i++) + submit_bh(rw, brw_cb->bh[i]); + brw_cb_put(brw_cb); + run_task_queue(&tq_disk); + return 0; + +error: + /* Walk brw_cb_table freeing all the goop associated with each kiobuf */ + if (brw_cb) { + /* We got an error allocating the bh'es. Just free the current + buffer_heads and exit. */ + for (i=0; inr; i++) + kmem_cache_free(bh_cachep, brw_cb->bh[i]); + kfree(brw_cb); + } + + return err; +} +#if 0 +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int i; + int transferred = 0; + int err = 0; + + if (!nr) + return 0; + + /* queue up and trigger the io */ + err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size); + if (err) + goto out; + + /* wait on the last iovec first -- it's more likely to finish last */ + for (i=nr; --i >= 0; ) + kiobuf_wait_for_io(iovec[i]); + + run_task_queue(&tq_disk); + + /* okay, how much data actually got through? */ + for (i=0; ierrno) { + if (!err) + err = iovec[i]->errno; + break; + } + transferred += iovec[i]->length; + } + +out: + return transferred ? 
transferred : err; +} +#endif diff -urN v2.4.19/fs/exec.c aio-2.4.19.diff/fs/exec.c --- v2.4.19/fs/exec.c Fri Aug 9 13:50:13 2002 +++ aio-2.4.19.diff/fs/exec.c Mon Sep 16 21:54:13 2002 @@ -397,6 +397,7 @@ old_mm = current->mm; if (old_mm && atomic_read(&old_mm->mm_users) == 1) { mm_release(); + exit_aio(old_mm); exit_mmap(old_mm); return 0; } diff -urN v2.4.19/fs/ext2/file.c aio-2.4.19.diff/fs/ext2/file.c --- v2.4.19/fs/ext2/file.c Thu Nov 1 16:40:02 2001 +++ aio-2.4.19.diff/fs/ext2/file.c Mon Sep 16 21:54:13 2002 @@ -40,6 +40,8 @@ */ struct file_operations ext2_file_operations = { llseek: generic_file_llseek, + kvec_read: generic_file_kvec_read, + kvec_write: generic_file_kvec_write, read: generic_file_read, write: generic_file_write, ioctl: ext2_ioctl, @@ -47,6 +49,8 @@ open: generic_file_open, release: ext2_release_file, fsync: ext2_sync_file, + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, }; struct inode_operations ext2_file_inode_operations = { diff -urN v2.4.19/fs/ext3/file.c aio-2.4.19.diff/fs/ext3/file.c --- v2.4.19/fs/ext3/file.c Mon Nov 26 23:43:08 2001 +++ aio-2.4.19.diff/fs/ext3/file.c Mon Sep 16 21:54:13 2002 @@ -78,6 +78,8 @@ struct file_operations ext3_file_operations = { llseek: generic_file_llseek, /* BKL held */ + kvec_read: generic_file_kvec_read, + kvec_write: generic_file_kvec_write, /* FIXME: attributes */ read: generic_file_read, /* BKL not held. Don't need */ write: ext3_file_write, /* BKL not held. Don't need */ ioctl: ext3_ioctl, /* BKL held */ @@ -85,6 +87,8 @@ open: ext3_open_file, /* BKL not held. Don't need */ release: ext3_release_file, /* BKL not held. Don't need */ fsync: ext3_sync_file, /* BKL held */ + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, }; struct inode_operations ext3_file_inode_operations = { diff -urN v2.4.19/fs/file_table.c aio-2.4.19.diff/fs/file_table.c --- v2.4.19/fs/file_table.c Mon Sep 24 02:16:04 2001 +++ aio-2.4.19.diff/fs/file_table.c Mon Sep 16 21:54:13 2002 @@ -99,31 +99,35 @@ void fput(struct file * file) { + if (atomic_dec_and_test(&file->f_count)) + __fput(file); +} + +void __fput(struct file * file) +{ struct dentry * dentry = file->f_dentry; struct vfsmount * mnt = file->f_vfsmnt; struct inode * inode = dentry->d_inode; - if (atomic_dec_and_test(&file->f_count)) { - locks_remove_flock(file); + locks_remove_flock(file); - if (file->f_iobuf) - free_kiovec(1, &file->f_iobuf); + if (file->f_iobuf) + free_kiovec(1, &file->f_iobuf); - if (file->f_op && file->f_op->release) - file->f_op->release(inode, file); - fops_put(file->f_op); - if (file->f_mode & FMODE_WRITE) - put_write_access(inode); - file_list_lock(); - file->f_dentry = NULL; - file->f_vfsmnt = NULL; - list_del(&file->f_list); - list_add(&file->f_list, &free_list); - files_stat.nr_free_files++; - file_list_unlock(); - dput(dentry); - mntput(mnt); - } + if (file->f_op && file->f_op->release) + file->f_op->release(inode, file); + fops_put(file->f_op); + if (file->f_mode & FMODE_WRITE) + put_write_access(inode); + file_list_lock(); + file->f_dentry = NULL; + file->f_vfsmnt = NULL; + list_del(&file->f_list); + list_add(&file->f_list, &free_list); + files_stat.nr_free_files++; + file_list_unlock(); + dput(dentry); + mntput(mnt); } struct file * fget(unsigned int fd) diff -urN v2.4.19/fs/locks.c aio-2.4.19.diff/fs/locks.c --- v2.4.19/fs/locks.c Thu Nov 1 16:40:02 2001 +++ aio-2.4.19.diff/fs/locks.c Mon Sep 16 21:54:13 2002 @@ -440,7 +440,7 @@ while (!list_empty(&blocker->fl_block)) { struct file_lock *waiter = 
list_entry(blocker->fl_block.next, struct file_lock, fl_block); - if (wait) { + if (0) { locks_notify_blocked(waiter); /* Let the blocked process remove waiter from the * block list when it gets scheduled. diff -urN v2.4.19/fs/nfs/file.c aio-2.4.19.diff/fs/nfs/file.c --- v2.4.19/fs/nfs/file.c Thu Mar 7 16:40:04 2002 +++ aio-2.4.19.diff/fs/nfs/file.c Mon Sep 16 21:54:13 2002 @@ -39,9 +39,13 @@ static ssize_t nfs_file_write(struct file *, const char *, size_t, loff_t *); static int nfs_file_flush(struct file *); static int nfs_fsync(struct file *, struct dentry *dentry, int datasync); +static int nfs_kvec_write(struct file *file, kvec_cb_t cb, size_t count, loff_t pos); +static int nfs_kvec_read(struct file *file, kvec_cb_t cb, size_t count, loff_t pos); struct file_operations nfs_file_operations = { llseek: generic_file_llseek, + kvec_read: nfs_kvec_read, + kvec_write: nfs_kvec_write, read: nfs_file_read, write: nfs_file_write, mmap: nfs_file_mmap, @@ -50,6 +54,8 @@ release: nfs_release, fsync: nfs_fsync, lock: nfs_lock, + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, }; struct inode_operations nfs_file_inode_operations = { @@ -88,6 +94,28 @@ return status; } +static int nfs_kvec_write(struct file *file, kvec_cb_t cb, size_t count, loff_t pos) +{ + struct dentry * dentry = file->f_dentry; + struct inode * inode = dentry->d_inode; + int ret; + ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (!ret) + return generic_file_kvec_write(file, cb, count, pos); + return ret; +} + +static int nfs_kvec_read(struct file *file, kvec_cb_t cb, size_t count, loff_t pos) +{ + struct dentry * dentry = file->f_dentry; + struct inode * inode = dentry->d_inode; + int ret; + ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (!ret) + return generic_file_kvec_read(file, cb, count, pos); + return ret; +} + static ssize_t nfs_file_read(struct file * file, char * buf, size_t count, loff_t *ppos) { diff -urN v2.4.19/fs/pipe.c aio-2.4.19.diff/fs/pipe.c --- v2.4.19/fs/pipe.c Fri Aug 9 13:50:14 2002 +++ aio-2.4.19.diff/fs/pipe.c Mon Sep 16 21:54:13 2002 @@ -134,31 +134,235 @@ return ret; } +static int pipe_kvec_read(struct file *filp, kvec_cb_t cb, size_t size, loff_t pos) +{ + return 0; +} + +static int pipe_aio_read_cancel(struct kiocb *iocb) +{ + struct inode *inode = iocb->filp->f_dentry->d_inode; + struct pipe_inode_info *pipe = inode->i_pipe; + struct list_head *pos; + int found = 0; + + pr_debug("cancelling aio pipe read(%p)\n", iocb); + + /* To cancel an aio, we must first prevent writers from + * removing it from the list. We must block here as the + * cancellation may be from the process exit path. 
+ */ + down(PIPE_SEM(*inode)); + + pr_debug("got semaphore\n"); + spin_lock(&pipe->pipe_aio_lock); + + list_for_each(pos, &pipe->read_iocb_list) { + if (pos == &iocb->u.list) { + list_del(pos); + found = 1; + break; + } + } + + spin_unlock(&pipe->pipe_aio_lock); + up(PIPE_SEM(*inode)); + aio_put_req(iocb); + + if (found) { + if (iocb->data) { + unmap_kvec(iocb->data, 1); + free_kvec(iocb->data); + } + + aio_complete(iocb, iocb->nr_transferred, 0); + return 0; + } + + return -EAGAIN; +} + +static ssize_t pipe_aio_read (struct file *file, struct kiocb *iocb, struct iocb *uiocb) +{ + struct inode *inode = file->f_dentry->d_inode; + int queued = 0, failed_sem = 0; + + iocb->data = NULL; + iocb->cancel = pipe_aio_read_cancel; + iocb->this_size = iocb->size; + if (iocb->this_size > aio_max_size) + iocb->this_size = aio_max_size; + + /* 0 length reads are always successful */ + if (unlikely(!iocb->size)) { + aio_complete(iocb, 0, 0); + return 0; + } + + iocb->data = map_user_kvec(READ, iocb->buf, iocb->this_size); + if (unlikely(IS_ERR(iocb->data))) { + pr_debug("pipe_aio_read: map_user_kvec=%ld\n", PTR_ERR(iocb->data)); + return PTR_ERR(iocb->data); + } + + /* down_trylock == 0 if we obtained the semaphore -> if the + * semaphore was not acquired, we queue the read request. + */ + failed_sem = down_trylock(PIPE_SEM(*inode)); + + spin_lock(&inode->i_pipe->pipe_aio_lock); + if (failed_sem || !list_empty(&inode->i_pipe->read_iocb_list)) { + pr_debug("queueing aio pipe read(%p)\n", iocb); + list_add_tail(&iocb->u.list, &inode->i_pipe->read_iocb_list); + queued = 1; + } + spin_unlock(&inode->i_pipe->pipe_aio_lock); + + if (queued) { + if (!failed_sem) + up(PIPE_SEM(*inode)); + return 0; + } + + /* Okay, we're the first read request. Try reading data, otherwise + * fall back and queue. + */ + if (PIPE_EMPTY(*inode)) { +//do_more_read: + /* No writers? EOF. */ + if (!PIPE_WRITERS(*inode)) { + aio_complete(iocb, 0, 0); + goto out; + } + + /* No data. Oh well, queue it at the head. */ + spin_lock(&inode->i_pipe->pipe_aio_lock); + list_add(&iocb->u.list, &inode->i_pipe->read_iocb_list); + spin_unlock(&inode->i_pipe->pipe_aio_lock); + up(PIPE_SEM(*inode)); + return 0; + } + + printk("sorry!\n"); + //BUG(); + spin_lock(&inode->i_pipe->pipe_aio_lock); + list_add(&iocb->u.list, &inode->i_pipe->read_iocb_list); + spin_unlock(&inode->i_pipe->pipe_aio_lock); + up(PIPE_SEM(*inode)); + return 0; + + //pfull = PIPE_FULL(*inode); + +out: + up(PIPE_SEM(*inode)); + /* FIXME: writes may have been queued */ + + unmap_kvec(iocb->data, 1); + free_kvec(iocb->data); + iocb->data = NULL; + + return 0; +} + +/* do_pipe_write_aio: + * Performs a pipe write when there exists an outstanding aio + * read operation. Returns the number of bytes written or -EFAULT. + */ +static inline ssize_t do_pipe_write_aio(struct pipe_inode_info *pipe, + const char *buf, size_t count, struct kiocb *iocb) +{ + ssize_t written = 0; + pr_debug("do_pipe_aio_write\n"); + + while (count > 0) { + size_t len; + len = min(iocb->this_size, count); + if (unlikely(copy_user_to_kvec(iocb->data, iocb->nr_transferred, buf, len))) { + pr_debug("EFAULT?\n"); + break; + } + iocb->nr_transferred += len; + written += len; + buf += len; + count -= len; + + if ((iocb->nr_transferred == iocb->this_size) || + (iocb->filp->f_flags & O_NONBLOCK)) { + struct list_head *first = NULL; + + pr_debug("done this iocb\n"); + + /* Mark the pages as dirty and complete the request. 
+ */ + unmap_kvec(iocb->data, 1); + free_kvec(iocb->data); + + spin_lock(&pipe->pipe_aio_lock); + list_del(&iocb->u.list); + first = list_first(&pipe->read_iocb_list); + spin_unlock(&pipe->pipe_aio_lock); + + aio_complete(iocb, iocb->nr_transferred, 0); + + iocb = NULL; + + /* No more aio reads? */ + if (!first) + break; + + pr_debug("processing another iocb\n"); + iocb = list_entry(first, struct kiocb, u.list); + } + } + + pr_debug("returning: %ld\n", written); + + return written ? written : -EFAULT; +} + static ssize_t pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; + struct list_head *iocb; ssize_t free, written, ret; /* Seeks are not allowed on pipes. */ ret = -ESPIPE; written = 0; - if (ppos != &filp->f_pos) + if (unlikely(ppos != &filp->f_pos)) goto out_nolock; /* Null write succeeds. */ ret = 0; - if (count == 0) + if (unlikely(count == 0)) goto out_nolock; ret = -ERESTARTSYS; - if (down_interruptible(PIPE_SEM(*inode))) + if (unlikely(down_interruptible(PIPE_SEM(*inode)))) goto out_nolock; /* No readers yields SIGPIPE. */ - if (!PIPE_READERS(*inode)) + if (unlikely(!PIPE_READERS(*inode))) goto sigpipe; + spin_lock(&inode->i_pipe->pipe_aio_lock); + iocb = list_first(&inode->i_pipe->read_iocb_list); + spin_unlock(&inode->i_pipe->pipe_aio_lock); + + if (iocb) { + written = do_pipe_write_aio(inode->i_pipe, buf, count, + list_entry(iocb, struct kiocb, u.list)); + if (unlikely(written < 0)) + goto out; + + count -= written; + buf += written; + + if (!count) + goto out; + } + /* If count <= PIPE_BUF, we have to make it atomic. */ free = (count <= PIPE_BUF ? count : 1); @@ -340,6 +544,7 @@ static int pipe_read_open(struct inode *inode, struct file *filp) { + filp->private_data = inode->i_pipe; /* We could have perhaps used atomic_t, but this and friends below are the only places. So it doesn't seem worthwhile. 
*/ down(PIPE_SEM(*inode)); @@ -352,6 +557,7 @@ static int pipe_write_open(struct inode *inode, struct file *filp) { + filp->private_data = inode->i_pipe; down(PIPE_SEM(*inode)); PIPE_WRITERS(*inode)++; up(PIPE_SEM(*inode)); @@ -362,6 +568,7 @@ static int pipe_rdwr_open(struct inode *inode, struct file *filp) { + filp->private_data = inode->i_pipe; down(PIPE_SEM(*inode)); if (filp->f_mode & FMODE_READ) PIPE_READERS(*inode)++; @@ -379,6 +586,7 @@ struct file_operations read_fifo_fops = { llseek: no_llseek, read: pipe_read, + aio_read: pipe_aio_read, write: bad_pipe_w, poll: fifo_poll, ioctl: pipe_ioctl, @@ -399,6 +607,7 @@ struct file_operations rdwr_fifo_fops = { llseek: no_llseek, read: pipe_read, + aio_read: pipe_aio_read, write: pipe_write, poll: fifo_poll, ioctl: pipe_ioctl, @@ -409,6 +618,7 @@ struct file_operations read_pipe_fops = { llseek: no_llseek, read: pipe_read, + aio_read: pipe_aio_read, write: bad_pipe_w, poll: pipe_poll, ioctl: pipe_ioctl, @@ -429,6 +639,7 @@ struct file_operations rdwr_pipe_fops = { llseek: no_llseek, read: pipe_read, + aio_read: pipe_aio_read, write: pipe_write, poll: pipe_poll, ioctl: pipe_ioctl, @@ -454,6 +665,9 @@ PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0; PIPE_WAITING_READERS(*inode) = PIPE_WAITING_WRITERS(*inode) = 0; PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1; + spin_lock_init(&inode->i_pipe->pipe_aio_lock); + INIT_LIST_HEAD(&inode->i_pipe->read_iocb_list); + INIT_LIST_HEAD(&inode->i_pipe->write_iocb_list); return inode; fail_page: diff -urN v2.4.19/fs/select.c aio-2.4.19.diff/fs/select.c --- v2.4.19/fs/select.c Mon Sep 24 02:16:05 2001 +++ aio-2.4.19.diff/fs/select.c Mon Sep 16 21:54:13 2002 @@ -12,6 +12,12 @@ * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). + * June 2001 + * Added async_poll implementation. -bcrl + * Nov 2001 + * Async poll improvements from Suparna Bhattacharya + * April 2002 + * smp safe async poll plus cancellation. -bcrl */ #include @@ -19,6 +25,8 @@ #include #include /* for STICKY_TIMEOUTS */ #include +#include +#include #include @@ -26,19 +34,36 @@ #define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM) struct poll_table_entry { - struct file * filp; wait_queue_t wait; wait_queue_head_t * wait_address; + struct file * filp; + poll_table * p; }; struct poll_table_page { + unsigned long size; struct poll_table_page * next; struct poll_table_entry * entry; struct poll_table_entry entries[0]; }; #define POLL_TABLE_FULL(table) \ - ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) + ((unsigned long)((table)->entry+1) > \ + (table)->size + (unsigned long)(table)) + +/* async poll uses only one entry per poll table as it is linked to an iocb */ +typedef struct async_poll_table_struct { + poll_table pt; + struct worktodo wtd; + int events; /* event mask for async poll */ + int wake; + long sync; + struct poll_table_page pt_page; /* one poll table page hdr */ + struct poll_table_entry entries[1]; /* space for a single entry */ +} async_poll_table; + + +static kmem_cache_t *async_poll_table_cache; /* * Ok, Peter made a complicated, but straightforward multiple_wait() function. @@ -53,7 +78,7 @@ * poll table. 
*/ -void poll_freewait(poll_table* pt) +void __poll_freewait(poll_table* pt, wait_queue_t *wait) { struct poll_table_page * p = pt->table; while (p) { @@ -61,15 +86,154 @@ struct poll_table_page *old; entry = p->entry; + if (entry == p->entries) /* may happen with async poll */ + break; do { entry--; - remove_wait_queue(entry->wait_address,&entry->wait); + if (wait != &entry->wait) + remove_wait_queue(entry->wait_address,&entry->wait); + else + __remove_wait_queue(entry->wait_address,&entry->wait); fput(entry->filp); } while (entry > p->entries); old = p; p = p->next; - free_page((unsigned long) old); + if (old->size == PAGE_SIZE) + free_page((unsigned long) old); } + if (pt->iocb) + kmem_cache_free(async_poll_table_cache, pt); +} + +void poll_freewait(poll_table* pt) +{ + __poll_freewait(pt, NULL); +} + +void async_poll_complete(void *data) +{ + async_poll_table *pasync = data; + poll_table *p = data; + struct kiocb *iocb = p->iocb; + unsigned int mask; + + pasync->wake = 0; + wmb(); + do { + mask = iocb->filp->f_op->poll(iocb->filp, p); + mask &= pasync->events | POLLERR | POLLHUP; + if (mask) { + poll_table *p2 = xchg(&iocb->data, NULL); + if (p2) { + poll_freewait(p2); + aio_complete(iocb, mask, 0); + } + return; + } + pasync->sync = 0; + wmb(); + } while (pasync->wake); +} + +static void do_hack(async_poll_table *pasync, wait_queue_t *wait) +{ + struct kiocb *iocb = pasync->pt.iocb; + unsigned int mask; + + mask = iocb->filp->f_op->poll(iocb->filp, NULL); + mask &= pasync->events | POLLERR | POLLHUP; + if (mask) { + poll_table *p2 = xchg(&iocb->data, NULL); + if (p2) { + __poll_freewait(p2, wait); + aio_complete(iocb, mask, 0); + } + return; + } +} + +static void async_poll_waiter(wait_queue_t *wait) +{ + struct poll_table_entry *entry = (struct poll_table_entry *)wait; + async_poll_table *pasync = (async_poll_table *)(entry->p); + +#if 1 /*OLS HACK*/ + do_hack(pasync, wait); +#else + /* avoid writes to the cacheline if possible for SMP */ + if (!pasync->wake) { + pasync->wake = 1; + /* ensure only one wake up queues the wtd */ + if (!pasync->sync && !test_and_set_bit(0, &pasync->sync)) + wtd_queue(&pasync->wtd); + } +#endif +} + +int async_poll_cancel(struct kiocb *iocb) +{ + poll_table *p; + + /* FIXME: almost right */ + p = xchg(&iocb->data, NULL); + if (p) { + poll_freewait(p); + aio_complete(iocb, 0, 0); + aio_put_req(iocb); + return 0; + } + return -EAGAIN; +} + +int async_poll(struct kiocb *iocb, int events) +{ + unsigned int mask; + async_poll_table *pasync; + poll_table *p; + + /* Fast path */ + if (iocb->filp->f_op && iocb->filp->f_op->poll) { + mask = iocb->filp->f_op->poll(iocb->filp, NULL); + mask &= events | POLLERR | POLLHUP; + if (mask & events) + return events; + } + + pasync = kmem_cache_alloc(async_poll_table_cache, SLAB_KERNEL); + if (!pasync) + return -ENOMEM; + + p = (poll_table *)pasync; + poll_initwait(p); + wtd_set_action(&pasync->wtd, async_poll_complete, pasync); + p->iocb = iocb; + pasync->wake = 0; + pasync->sync = 0; + pasync->events = events; + pasync->pt_page.entry = pasync->pt_page.entries; + pasync->pt_page.size = sizeof(pasync->pt_page); + p->table = &pasync->pt_page; + + iocb->data = p; + wmb(); + iocb->cancel = async_poll_cancel; + + mask = DEFAULT_POLLMASK; +#warning broken + iocb->users ++; + if (iocb->filp->f_op && iocb->filp->f_op->poll) + mask = iocb->filp->f_op->poll(iocb->filp, p); + mask &= events | POLLERR | POLLHUP; + if (mask && !test_and_set_bit(0, &pasync->sync)) + aio_complete(iocb, mask, 0); + + if (aio_put_req(iocb)) + /* Must be 
freed after aio_complete to synchronise with + * cancellation of the request. + */ + poll_freewait(p); + + return 0; } void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) @@ -85,6 +249,7 @@ __set_current_state(TASK_RUNNING); return; } + new_table->size = PAGE_SIZE; new_table->entry = new_table->entries; new_table->next = table; p->table = new_table; @@ -98,7 +263,11 @@ get_file(filp); entry->filp = filp; entry->wait_address = wait_address; - init_waitqueue_entry(&entry->wait, current); + entry->p = p; + if (p->iocb) /* async poll */ + init_waitqueue_func_entry(&entry->wait, async_poll_waiter); + else + init_waitqueue_entry(&entry->wait, current); add_wait_queue(wait_address,&entry->wait); } } @@ -494,3 +663,14 @@ poll_freewait(&table); return err; } + +static int __init async_poll_init(void) +{ + async_poll_table_cache = kmem_cache_create("async poll table", + sizeof(async_poll_table), 0, 0, NULL, NULL); + if (!async_poll_table_cache) + panic("unable to alloc poll_table_cache"); + return 0; +} + +module_init(async_poll_init); diff -urN v2.4.19/include/asm-i386/errno.h aio-2.4.19.diff/include/asm-i386/errno.h --- v2.4.19/include/asm-i386/errno.h Fri Aug 9 13:50:22 2002 +++ aio-2.4.19.diff/include/asm-i386/errno.h Mon Sep 16 21:54:13 2002 @@ -128,5 +128,6 @@ #define ENOMEDIUM 123 /* No medium found */ #define EMEDIUMTYPE 124 /* Wrong medium type */ +#define ECANCELED 125 /* Operation canceled */ #endif diff -urN v2.4.19/include/asm-i386/kmap_types.h aio-2.4.19.diff/include/asm-i386/kmap_types.h --- v2.4.19/include/asm-i386/kmap_types.h Mon Sep 24 02:16:05 2001 +++ aio-2.4.19.diff/include/asm-i386/kmap_types.h Mon Sep 16 21:54:13 2002 @@ -7,6 +7,8 @@ KM_SKB_DATA_SOFTIRQ, KM_USER0, KM_USER1, + KM_IRQ0, + KM_IRQ1, KM_TYPE_NR }; diff -urN v2.4.19/include/asm-i386/param.h aio-2.4.19.diff/include/asm-i386/param.h --- v2.4.19/include/asm-i386/param.h Fri Oct 27 14:04:43 2000 +++ aio-2.4.19.diff/include/asm-i386/param.h Mon Sep 16 21:54:13 2002 @@ -2,7 +2,8 @@ #define _ASMi386_PARAM_H #ifndef HZ -#define HZ 100 +//#define HZ 100 +#define HZ 1024 #endif #define EXEC_PAGESIZE 4096 @@ -18,7 +19,7 @@ #define MAXHOSTNAMELEN 64 /* max length of hostname */ #ifdef __KERNEL__ -# define CLOCKS_PER_SEC 100 /* frequency at which times() counts */ +# define CLOCKS_PER_SEC 8192 /* frequency at which times() counts */ #endif #endif diff -urN v2.4.19/include/asm-i386/semaphore.h aio-2.4.19.diff/include/asm-i386/semaphore.h --- v2.4.19/include/asm-i386/semaphore.h Fri Aug 9 13:50:23 2002 +++ aio-2.4.19.diff/include/asm-i386/semaphore.h Mon Sep 16 21:54:13 2002 @@ -131,6 +131,31 @@ :"memory"); } +/* Returns 0 if we acquired the semaphore, 1 if it was queued. */ +struct worktodo; +static inline int wtd_down(struct worktodo *wtd, struct semaphore *sem) +{ + int ret = 0; +#if WAITQUEUE_DEBUG + CHECK_MAGIC(sem->__magic); +#endif + + __asm__ __volatile__( + "# atomic down operation\n\t" + LOCK "decl %0\n\t" /* --sem->count */ + "js 2f\n" + "1:\n" + LOCK_SECTION_START("") + "2:\tcall __wtd_down_failed\n\t" + "movl $1,%1\n\t" + "jmp 1b\n" + LOCK_SECTION_END + :"=m" (sem->count), "=r" (ret) + :"c" (sem), "1" (ret), "d" (wtd) + :"memory"); + return ret; +} + /* * Interruptible try to acquire a semaphore. If we obtained * it, return zero. 
If we were interrupted, returns -EINTR diff -urN v2.4.19/include/asm-i386/unistd.h aio-2.4.19.diff/include/asm-i386/unistd.h --- v2.4.19/include/asm-i386/unistd.h Fri Aug 9 13:50:23 2002 +++ aio-2.4.19.diff/include/asm-i386/unistd.h Mon Sep 16 21:54:13 2002 @@ -248,6 +248,12 @@ #define __NR_sched_setaffinity 241 #define __NR_sched_getaffinity 242 +#define __NR_io_setup 245 +#define __NR_io_destroy 246 +#define __NR_io_getevents 247 +#define __NR_io_submit 248 +#define __NR_io_cancel 249 + /* user-visible error numbers are in the range -1 - -124: see */ #define __syscall_return(type, res) \ diff -urN v2.4.19/include/asm-ia64/kmap_types.h aio-2.4.19.diff/include/asm-ia64/kmap_types.h --- v2.4.19/include/asm-ia64/kmap_types.h Wed Dec 31 19:00:00 1969 +++ aio-2.4.19.diff/include/asm-ia64/kmap_types.h Mon Sep 16 21:54:13 2002 @@ -0,0 +1,15 @@ +#ifndef _ASM_KMAP_TYPES_H +#define _ASM_KMAP_TYPES_H + +enum km_type { + KM_BOUNCE_READ, + KM_SKB_DATA, + KM_SKB_DATA_SOFTIRQ, + KM_USER0, + KM_USER1, + KM_IRQ0, + KM_IRQ1, + KM_TYPE_NR +}; + +#endif diff -urN v2.4.19/include/linux/aio.h aio-2.4.19.diff/include/linux/aio.h --- v2.4.19/include/linux/aio.h Wed Dec 31 19:00:00 1969 +++ aio-2.4.19.diff/include/linux/aio.h Mon Sep 16 21:54:13 2002 @@ -0,0 +1,130 @@ +#ifndef __LINUX__AIO_H +#define __LINUX__AIO_H + +#include +#include +#include +#include + +#include + +#define AIO_MAXSEGS 4 +#define AIO_KIOGRP_NR_ATOMIC 8 + +struct kioctx; + +/* Notes on cancelling a kiocb: + * If a kiocb is cancelled, aio_complete may return 0 to indicate + * that cancel has not yet disposed of the kiocb. All cancel + * operations *must* call aio_put_req to dispose of the kiocb + * to guard against races with the completion code. + */ +#define KIOCB_C_CANCELLED 0x01 +#define KIOCB_C_COMPLETE 0x02 + +struct kiocb { + struct list_head list; + struct file *filp; + struct kioctx *ctx; + void *user_obj; + __u64 user_data; + loff_t pos; + unsigned long buf; + size_t nr_transferred; /* used for chunking */ + size_t size; + size_t this_size; + unsigned key; /* id of this request */ + int (*cancel)(struct kiocb *kiocb); + void *data; /* for use by the async op */ + int users; + union { + struct tq_struct tq; /* argh. 
*/ + struct list_head list; + } u; + unsigned long rlim_fsize; +}; + +struct aio_ring { + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; + unsigned tail; + + unsigned woke; /* set when a wakeup was sent */ + unsigned pad[3]; + + + struct io_event io_events[0]; +}; /* 128 bytes + ring size */ + +#define aio_ring_avail(info, ring) (((ring)->head + (info)->nr - 1 - (ring)->tail) % (info)->nr) + +#define AIO_RING_PAGES 8 +struct aio_ring_info { + //struct file *mmap_file; + struct kvec *kvec; + unsigned long mmap_base; + unsigned long mmap_size; + + struct page **ring_pages; + spinlock_t ring_lock; + unsigned nr_pages; + + unsigned nr, tail; + + struct page *internal_pages[AIO_RING_PAGES]; +}; + +struct kioctx { + atomic_t users; + int dead; + struct mm_struct *mm; + + /* This needs improving */ + unsigned long user_id; + struct kioctx *next; + + wait_queue_head_t wait; + + spinlock_t lock; + + int reqs_active; + struct list_head free_reqs; + struct list_head active_reqs; /* used for cancellation */ + + unsigned max_reqs; + + struct aio_ring_info ring_info; +}; + +/* prototypes */ +extern unsigned aio_max_size; + +extern int FASTCALL(aio_put_req(struct kiocb *iocb)); +extern int FASTCALL(aio_complete(struct kiocb *iocb, long res, long res2)); +extern void FASTCALL(__put_ioctx(struct kioctx *ctx)); +struct mm_struct; +extern void FASTCALL(exit_aio(struct mm_struct *mm)); + +#define get_ioctx(kioctx) do { if (unlikely(atomic_read(&(kioctx)->users) <= 0)) BUG(); atomic_inc(&(kioctx)->users); } while (0) +#define put_ioctx(kioctx) do { if (unlikely(atomic_dec_and_test(&(kioctx)->users))) __put_ioctx(kioctx); else if (unlikely(atomic_read(&(kioctx)->users) < 0)) BUG(); } while (0) + +#include + +static inline struct kiocb *list_kiocb(struct list_head *h) +{ + return list_entry(h, struct kiocb, list); +} + +struct file; +extern ssize_t generic_aio_poll(struct file *file, struct kiocb *req, struct iocb *iocb); +extern ssize_t generic_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size); +extern ssize_t generic_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size); +extern ssize_t generic_file_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); +extern ssize_t generic_file_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); +extern ssize_t generic_sock_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); + +/* for sysctl: */ +extern unsigned aio_nr, aio_max_nr, aio_max_size, aio_max_pinned; + +#endif /* __LINUX__AIO_H */ diff -urN v2.4.19/include/linux/aio_abi.h aio-2.4.19.diff/include/linux/aio_abi.h --- v2.4.19/include/linux/aio_abi.h Wed Dec 31 19:00:00 1969 +++ aio-2.4.19.diff/include/linux/aio_abi.h Mon Sep 16 21:54:13 2002 @@ -0,0 +1,87 @@ +/* linux/aio_abi.h + * + * Copyright 2000,2001,2002 Red Hat. + * + * Written by Benjamin LaHaise + * + * Permission to use, copy, modify, and distribute this software and its + * documentation is hereby granted, provided that the above copyright + * notice appears in all copies. This software is provided without any + * warranty, express or implied. Red Hat makes no representations about + * the suitability of this software for any purpose. 
+ * + * IN NO EVENT SHALL RED HAT BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, + * SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF + * THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF RED HAT HAS BEEN ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * RED HAT DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND + * RED HAT HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, + * ENHANCEMENTS, OR MODIFICATIONS. + */ +#ifndef __LINUX__AIO_ABI_H +#define __LINUX__AIO_ABI_H + +#include + +typedef unsigned long aio_context_t; + +enum { + IOCB_CMD_PREAD = 0, + IOCB_CMD_PWRITE = 1, + IOCB_CMD_FSYNC = 2, + IOCB_CMD_FDSYNC = 3, + IOCB_CMD_PREADX = 4, + IOCB_CMD_POLL = 5, + IOCB_CMD_NOOP = 6, +}; + +/* read() from /dev/aio returns these structures. */ +struct io_event { + __u64 data; /* the data field from the iocb */ + __u64 obj; /* what iocb this event came from */ + __s64 res; /* result code for this event */ + __s64 res2; /* secondary result */ +}; + +#if defined(__LITTLE_ENDIAN) +#define PADDED(x,y) x, y +#elif defined(__BIG_ENDIAN) +#define PADDED(x,y) y, x +#else +#error edit for your odd byteorder. +#endif + +/* + * we always use a 64bit off_t when communicating + * with userland. its up to libraries to do the + * proper padding and aio_error abstraction + */ + +struct iocb { + /* these are internal to the kernel/libc. */ + __u64 aio_data; /* data to be returned in event's data */ + __u32 PADDED(aio_key, aio_reserved1); + /* the kernel sets aio_key to the req # */ + + /* common fields */ + __u16 aio_lio_opcode; /* see IOCB_CMD_ above */ + __s16 aio_reqprio; + __u32 aio_fildes; + + __u64 aio_buf; + __u64 aio_nbytes; + __s64 aio_offset; + + /* extra parameters */ + __u64 aio_reserved2; + __u64 aio_reserved3; +}; /* 64 bytes */ + +#undef IFBIG +#undef IFLITTLE + +#endif /* __LINUX__AIO_ABI_H */ + diff -urN v2.4.19/include/linux/brlock.h aio-2.4.19.diff/include/linux/brlock.h --- v2.4.19/include/linux/brlock.h Sat Jun 15 05:08:15 2002 +++ aio-2.4.19.diff/include/linux/brlock.h Mon Sep 16 21:54:13 2002 @@ -34,6 +34,7 @@ enum brlock_indices { BR_GLOBALIRQ_LOCK, BR_NETPROTO_LOCK, + BR_AIO_REQ_LOCK, __BR_END }; diff -urN v2.4.19/include/linux/errno.h aio-2.4.19.diff/include/linux/errno.h --- v2.4.19/include/linux/errno.h Tue Jun 11 22:19:17 2002 +++ aio-2.4.19.diff/include/linux/errno.h Mon Sep 16 21:54:13 2002 @@ -21,6 +21,9 @@ #define EBADTYPE 527 /* Type not supported by server */ #define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ +/* Defined for TUX async IO */ +#define EWOULDBLOCKIO 530 /* Would block due to block-IO */ + #endif #endif diff -urN v2.4.19/include/linux/file.h aio-2.4.19.diff/include/linux/file.h --- v2.4.19/include/linux/file.h Fri Aug 9 13:50:40 2002 +++ aio-2.4.19.diff/include/linux/file.h Mon Sep 16 21:54:13 2002 @@ -5,6 +5,7 @@ #ifndef __LINUX_FILE_H #define __LINUX_FILE_H +extern void FASTCALL(__fput(struct file *)); extern void FASTCALL(fput(struct file *)); extern struct file * FASTCALL(fget(unsigned int fd)); diff -urN v2.4.19/include/linux/fs.h aio-2.4.19.diff/include/linux/fs.h --- v2.4.19/include/linux/fs.h Fri Aug 9 13:50:40 2002 +++ aio-2.4.19.diff/include/linux/fs.h Mon Sep 16 21:54:13 2002 @@ -196,6 +196,8 @@ #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ #ifdef __KERNEL__ +#include +#include #include #include @@ -825,6 +827,10 @@ * 
read, write, poll, fsync, readv, writev can be called * without the big kernel lock held in all filesystems. */ + +#define F_ATOMIC 0x0001 +#define F_OFFSETOK 0x0002 + struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); @@ -844,6 +850,16 @@ ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + + /* in-kernel fully async api */ + int (*kvec_read)(struct file *, kvec_cb_t, size_t, loff_t); + int (*kvec_write)(struct file *, kvec_cb_t, size_t, loff_t); + + /* userland aio ops */ + ssize_t (*aio_read)(struct file *, struct kiocb *, struct iocb *); + ssize_t (*aio_readx)(struct file *, struct kiocb *, struct iocb *); + ssize_t (*aio_write)(struct file *, struct kiocb *, struct iocb *); + ssize_t (*aio_fsync)(struct file *, struct kiocb *, struct iocb *); }; struct inode_operations { @@ -1433,12 +1449,16 @@ extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *); +extern ssize_t generic_file_new_read(struct file *, char *, size_t, loff_t *, int); extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *); -extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t); -extern loff_t no_llseek(struct file *file, loff_t offset, int origin); -extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); +extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t, int); +extern int generic_file_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos); +extern int generic_file_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos); + extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *); extern int generic_file_open(struct inode * inode, struct file * filp); +extern loff_t no_llseek(struct file *file, loff_t offset, int origin); +extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); extern struct file_operations generic_ro_fops; diff -urN v2.4.19/include/linux/iobuf.h aio-2.4.19.diff/include/linux/iobuf.h --- v2.4.19/include/linux/iobuf.h Sat Jun 15 05:08:17 2002 +++ aio-2.4.19.diff/include/linux/iobuf.h Mon Sep 16 21:54:13 2002 @@ -53,8 +53,10 @@ /* Dynamic state for IO completion: */ atomic_t io_count; /* IOs still in progress */ + int transferred; /* Number of bytes of completed IO at the beginning of the buffer */ int errno; /* Status of completed IO */ void (*end_io) (struct kiobuf *); /* Completion callback */ + void *end_io_data; wait_queue_head_t wait_queue; }; @@ -80,6 +82,8 @@ /* fs/buffer.c */ +int brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int size); int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], kdev_t dev, unsigned long b[], int size); diff -urN v2.4.19/include/linux/kiovec.h aio-2.4.19.diff/include/linux/kiovec.h --- v2.4.19/include/linux/kiovec.h Wed Dec 31 19:00:00 1969 +++ aio-2.4.19.diff/include/linux/kiovec.h Mon Sep 16 21:54:13 2002 @@ -0,0 +1,125 @@ +#ifndef __LINUX__KIOVEC_H +#define __LINUX__KIOVEC_H + +struct page; +#include + +struct kveclet { + struct page *page; + unsigned 
offset; + unsigned length; +}; + +struct kvec { + unsigned max_nr; + unsigned nr; + struct kveclet veclet[0]; +}; + +struct kvec_cb { + struct kvec *vec; + void (*fn)(void *data, struct kvec *vec, ssize_t res); + void *data; +}; + +struct kvec_cb_list { + struct list_head list; + struct kvec_cb cb; +}; + +#ifndef _LINUX_TYPES_H +#include +#endif +#ifndef _LINUX_KDEV_T_H +#include +#endif +#ifndef _ASM_KMAP_TYPES_H +#include +#endif + +extern struct kvec *FASTCALL(map_user_kvec(int rw, unsigned long va, size_t len)); +extern struct kvec *FASTCALL(mm_map_user_kvec(struct mm_struct *, int rw, + unsigned long va, size_t len)); +extern void FASTCALL(unmap_kvec(struct kvec *, int dirtied)); +extern void FASTCALL(free_kvec(struct kvec *)); + +/* brw_kvec_async: + * Performs direct io to/from disk into cb.vec. Count is the number + * of sectors to read, sector_shift is the blocksize (which must be + * compatible with the kernel's current idea of the device's sector + * size) in log2. blknr is the starting sector offset on dev. + * + */ +extern int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned count, + unsigned long blknr, int sector_shift); + +/* Memory copy helpers usage: + * void foo(... struct kveclet *veclet...) + * + * struct kvec_dst dst; + * + * kvec_dst_init(&dst, KM_USER0); -- resets type + * kvec_dst_set(&dst, veclet); -- set target & clear offset + * kvec_dst_map(&dst); -- activates kmap + * for (...) + * memcpy_to_kvec_dst(&dst, data, size); -- each copy appends + * kvec_dst_unmap(&dst); -- releases kmap + * + * Note that scheduling is not permitted between kvec_dst_map() and + * kvec_dst_unmap(). This is because internally the routines make use + * of an atomic kmap. + */ +struct kvec_dst { + char *addr; + char *dst; + struct kveclet *let; + int space; + int offset; + enum km_type type; +}; + + +#define kvec_dst_set(Xdst, Xlet) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + struct kveclet *_let = (Xlet); \ + _dst->let = _let; \ + _dst->space = _let->length; \ + _dst->offset = 0; \ + } while(0) + +#define kvec_dst_map(Xdst) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + struct kveclet *_let = _dst->let; \ + _dst->dst = _dst->addr = kmap_atomic(_let->page, _dst->type);\ + _dst->dst += _let->offset + _dst->offset; \ + _dst->space = _let->length - _dst->offset; \ + _dst->offset = 0; \ + } while(0) + +#define kvec_dst_init(Xdst, Xtype) \ + do { \ + (Xdst)->space = 0; \ + (Xdst)->addr = 0; \ + (Xdst)->offset = 0; \ + (Xdst)->type = Xtype; \ + } while(0) + +#define kvec_dst_unmap(Xdst) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + kunmap_atomic(_dst->addr, _dst->type); \ + _dst->offset = _dst->dst - _dst->addr; \ + _dst->offset -= _dst->let->offset; \ + _dst->addr = NULL; \ + } while(0) + +extern void FASTCALL(memcpy_to_kvec_dst(struct kvec_dst *dst, + const char *from, long len)); +extern void FASTCALL(memcpy_from_kvec_dst(char *to, + struct kvec_dst *from, long len)); +extern int FASTCALL(copy_user_to_kvec(struct kvec *to, size_t offset, const char *from, size_t len)); + + +#endif /* __LINUX__KIOVEC_H */ diff -urN v2.4.19/include/linux/list.h aio-2.4.19.diff/include/linux/list.h --- v2.4.19/include/linux/list.h Sat Jun 15 05:08:15 2002 +++ aio-2.4.19.diff/include/linux/list.h Mon Sep 16 21:54:13 2002 @@ -170,7 +170,8 @@ #define list_for_each_prev(pos, head) \ for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \ pos = pos->prev, prefetch(pos->prev)) - + +#define list_first(head) (((head)->next != (head)) ? 
(head)->next: (struct list_head *) 0) #endif /* __KERNEL__ || _LVM_H_INCLUDE */ diff -urN v2.4.19/include/linux/mm.h aio-2.4.19.diff/include/linux/mm.h --- v2.4.19/include/linux/mm.h Fri Aug 9 13:50:41 2002 +++ aio-2.4.19.diff/include/linux/mm.h Mon Sep 16 21:54:13 2002 @@ -653,7 +653,7 @@ } /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); +extern struct vm_area_struct * FASTCALL(find_vma(struct mm_struct * mm, unsigned long addr)); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); diff -urN v2.4.19/include/linux/net.h aio-2.4.19.diff/include/linux/net.h --- v2.4.19/include/linux/net.h Sat Jun 15 05:08:15 2002 +++ aio-2.4.19.diff/include/linux/net.h Mon Sep 16 21:54:13 2002 @@ -83,6 +83,9 @@ struct scm_cookie; struct vm_area_struct; struct page; +struct iocb; +struct kioctx; +#include /* shut gcc up */ struct proto_ops { int family; @@ -110,6 +113,8 @@ int (*recvmsg) (struct socket *sock, struct msghdr *m, int total_len, int flags, struct scm_cookie *scm); int (*mmap) (struct file *file, struct socket *sock, struct vm_area_struct * vma); ssize_t (*sendpage) (struct socket *sock, struct page *page, int offset, size_t size, int flags); + int (*kvec_read) (struct socket *sock, kvec_cb_t cb, size_t size); + int (*kvec_write) (struct socket *sock, kvec_cb_t cb, size_t size); }; struct net_proto_family diff -urN v2.4.19/include/linux/pagemap.h aio-2.4.19.diff/include/linux/pagemap.h --- v2.4.19/include/linux/pagemap.h Sat Jun 15 05:08:23 2002 +++ aio-2.4.19.diff/include/linux/pagemap.h Mon Sep 16 21:54:13 2002 @@ -88,6 +88,7 @@ extern void add_to_page_cache(struct page * page, struct address_space *mapping, unsigned long index); extern void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index); extern int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long index, struct page **hash); +extern wait_queue_head_t *FASTCALL(page_waitqueue(struct page *page)); extern void ___wait_on_page(struct page *); diff -urN v2.4.19/include/linux/pipe_fs_i.h aio-2.4.19.diff/include/linux/pipe_fs_i.h --- v2.4.19/include/linux/pipe_fs_i.h Thu May 3 11:22:20 2001 +++ aio-2.4.19.diff/include/linux/pipe_fs_i.h Mon Sep 16 21:54:13 2002 @@ -1,6 +1,9 @@ #ifndef _LINUX_PIPE_FS_I_H #define _LINUX_PIPE_FS_I_H +#include +#include + #define PIPEFS_MAGIC 0x50495045 struct pipe_inode_info { wait_queue_head_t wait; @@ -13,6 +16,10 @@ unsigned int waiting_writers; unsigned int r_counter; unsigned int w_counter; + + spinlock_t pipe_aio_lock; + struct list_head read_iocb_list; + struct list_head write_iocb_list; }; /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual diff -urN v2.4.19/include/linux/poll.h aio-2.4.19.diff/include/linux/poll.h --- v2.4.19/include/linux/poll.h Sat Jun 15 05:08:17 2002 +++ aio-2.4.19.diff/include/linux/poll.h Mon Sep 16 21:54:13 2002 @@ -9,12 +9,15 @@ #include #include #include +#include struct poll_table_page; +struct kiocb; typedef struct poll_table_struct { - int error; - struct poll_table_page * table; + int error; + struct poll_table_page *table; + struct kiocb *iocb; /* iocb for async poll */ } poll_table; extern void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p); @@ -29,8 +32,11 @@ { pt->error = 0; pt->table = NULL; + pt->iocb = NULL; } + extern void poll_freewait(poll_table* pt); +extern 
int async_poll(struct kiocb *iocb, int events); /* diff -urN v2.4.19/include/linux/sched.h aio-2.4.19.diff/include/linux/sched.h --- v2.4.19/include/linux/sched.h Fri Aug 9 13:50:43 2002 +++ aio-2.4.19.diff/include/linux/sched.h Mon Sep 16 21:54:13 2002 @@ -207,6 +207,7 @@ extern int max_map_count; +struct kioctx; struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ rb_root_t mm_rb; @@ -235,6 +236,9 @@ /* Architecture-specific MM context */ mm_context_t context; + + struct kioctx *ioctx_list; + unsigned long new_ioctx_id; }; extern int mmlist_nr; @@ -802,6 +806,7 @@ extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); +extern void FASTCALL(add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); #define __wait_event(wq, condition) \ diff -urN v2.4.19/include/linux/skbuff.h aio-2.4.19.diff/include/linux/skbuff.h --- v2.4.19/include/linux/skbuff.h Fri Aug 9 13:50:44 2002 +++ aio-2.4.19.diff/include/linux/skbuff.h Mon Sep 16 21:54:13 2002 @@ -1128,6 +1128,15 @@ extern unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len, unsigned int csum); extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); +/* skb <-> kvec helpers */ +extern void skb_copy_datagram_kvec(const struct sk_buff *skb, int offset, + struct kvec *vec, int len); +extern int skb_copy_and_csum_datagram_kvec(const struct sk_buff *skb, + int offset, struct kvec *vec, int len); +extern int skb_kvec_recv_datagram(struct sock * sk, kvec_cb_t cb, int len, + void (*finish)(struct sock *sk, kvec_cb_t cb, int len, struct sk_buff *skb)); + + extern void skb_init(void); extern void skb_add_mtu(int mtu); diff -urN v2.4.19/include/linux/sysctl.h aio-2.4.19.diff/include/linux/sysctl.h --- v2.4.19/include/linux/sysctl.h Fri Aug 9 13:50:44 2002 +++ aio-2.4.19.diff/include/linux/sysctl.h Mon Sep 16 21:54:13 2002 @@ -546,6 +546,13 @@ FS_LEASES=13, /* int: leases enabled */ FS_DIR_NOTIFY=14, /* int: directory notification enabled */ FS_LEASE_TIME=15, /* int: maximum time to wait for a lease break */ + /* 16 == jbd-debug */ + /* 17 == jbd-oom-retry */ + + FS_AIO_NR=18, /* int: current number of aio requests */ + FS_AIO_MAX_NR=19, /* int: max system wide aio requests */ + FS_AIO_MAX_SIZE=20, /* int: max size of read/write chunks */ + FS_AIO_MAX_PINNED=21, /* long: max memory pinned (in pages) */ }; /* CTL_DEBUG names: */ diff -urN v2.4.19/include/linux/tasklet.h aio-2.4.19.diff/include/linux/tasklet.h --- v2.4.19/include/linux/tasklet.h Wed Dec 31 19:00:00 1969 +++ aio-2.4.19.diff/include/linux/tasklet.h Mon Sep 16 21:54:13 2002 @@ -0,0 +1,154 @@ +#ifndef __LINUX__TASKLET_H +#define __LINUX__TASKLET_H + +#include +#include +#include +#include /* for smp_mb */ + +/* Tasklets --- multithreaded analogue of BHs. + + Main feature differing them of generic softirqs: tasklet + is running only on one CPU simultaneously. + + Main feature differing them of BHs: different tasklets + may be run simultaneously on different CPUs. + + Properties: + * If tasklet_schedule() is called, then tasklet is guaranteed + to be executed on some cpu at least once after this. + * If the tasklet is already scheduled, but its excecution is still not + started, it will be executed only once. 
+ * If this tasklet is already running on another CPU (or schedule is called + from tasklet itself), it is rescheduled for later. + * Tasklet is strictly serialized wrt itself, but not + wrt another tasklets. If client needs some intertask synchronization, + he makes it with spinlocks. + */ + +struct tasklet_struct +{ + struct tasklet_struct *next; + unsigned long state; + atomic_t count; + void (*func)(unsigned long); + unsigned long data; + int *unlocked; +}; + +#define DECLARE_TASKLET(name, func, data) \ +struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(0), func, data, NULL } + +#define DECLARE_TASKLET_DISABLED(name, func, data) \ +struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data, NULL } + + +enum +{ + TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ + TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ +}; + +struct tasklet_head +{ + struct tasklet_struct *list; +} __attribute__ ((__aligned__(SMP_CACHE_BYTES))); + +extern struct tasklet_head tasklet_vec[NR_CPUS]; +extern struct tasklet_head tasklet_hi_vec[NR_CPUS]; + +#ifdef CONFIG_SMP +static inline int tasklet_trylock(struct tasklet_struct *t) +{ + return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); +} + +static inline void tasklet_unlock(struct tasklet_struct *t) +{ + smp_mb__before_clear_bit(); + clear_bit(TASKLET_STATE_RUN, &(t)->state); +} + +static inline void tasklet_unlock_self(struct tasklet_struct *t) +{ + *t->unlocked = 1; + t->unlocked = NULL; + tasklet_unlock(t); +} + +static inline void tasklet_unlock_wait(struct tasklet_struct *t) +{ + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } +} +#else +#define tasklet_trylock(t) 1 +#define tasklet_unlock_wait(t) do { } while (0) +#define tasklet_unlock(t) do { } while (0) +#endif + +extern void FASTCALL(__tasklet_schedule(struct tasklet_struct *t)); + +static inline void tasklet_schedule(struct tasklet_struct *t) +{ + if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) + __tasklet_schedule(t); +} + +extern void FASTCALL(__tasklet_hi_schedule(struct tasklet_struct *t)); + +static inline void tasklet_hi_schedule(struct tasklet_struct *t) +{ + if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) + __tasklet_hi_schedule(t); +} + + +static inline void tasklet_disable_nosync(struct tasklet_struct *t) +{ + atomic_inc(&t->count); + smp_mb__after_atomic_inc(); +} + +static inline void tasklet_disable(struct tasklet_struct *t) +{ + tasklet_disable_nosync(t); + tasklet_unlock_wait(t); + smp_mb(); +} + +static inline void tasklet_enable(struct tasklet_struct *t) +{ + smp_mb__before_atomic_dec(); + atomic_dec(&t->count); +} + +static inline void tasklet_hi_enable(struct tasklet_struct *t) +{ + smp_mb__before_atomic_dec(); + atomic_dec(&t->count); +} + +extern void tasklet_kill(struct tasklet_struct *t); +extern void tasklet_init(struct tasklet_struct *t, + void (*func)(unsigned long), unsigned long data); + +#ifdef CONFIG_SMP + +#define SMP_TIMER_NAME(name) name##__thr + +#define SMP_TIMER_DEFINE(name, task) \ +DECLARE_TASKLET(task, name##__thr, 0); \ +static void name (unsigned long dummy) \ +{ \ + tasklet_schedule(&(task)); \ +} + +#else /* CONFIG_SMP */ + +#define SMP_TIMER_NAME(name) name +#define SMP_TIMER_DEFINE(name, task) + +#endif /* CONFIG_SMP */ + + +#endif /* __LINUX__TASKLET_H */ diff -urN v2.4.19/include/linux/timex.h aio-2.4.19.diff/include/linux/timex.h --- v2.4.19/include/linux/timex.h Sat Jun 15 05:08:15 2002 +++ aio-2.4.19.diff/include/linux/timex.h Mon Sep 16 21:54:13 2002 @@ -74,6 +74,10 @@ # define SHIFT_HZ 
9 #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 +#elif HZ >= 1536 && HZ < 3120 +# define SHIFT_HZ 11 +#elif HZ >= 3120 && HZ < 6240 +# define SHIFT_HZ 12 #else # error You lose. #endif diff -urN v2.4.19/include/linux/tqueue.h aio-2.4.19.diff/include/linux/tqueue.h --- v2.4.19/include/linux/tqueue.h Sat Jun 15 05:08:15 2002 +++ aio-2.4.19.diff/include/linux/tqueue.h Mon Sep 16 21:54:13 2002 @@ -67,6 +67,7 @@ #define TQ_ACTIVE(q) (!list_empty(&q)) extern task_queue tq_timer, tq_immediate, tq_disk; +extern struct tq_struct run_disk_tq; /* * To implement your own list of active bottom halfs, use the following diff -urN v2.4.19/include/linux/types.h aio-2.4.19.diff/include/linux/types.h --- v2.4.19/include/linux/types.h Fri Aug 9 13:50:44 2002 +++ aio-2.4.19.diff/include/linux/types.h Mon Sep 16 21:54:13 2002 @@ -127,4 +127,9 @@ char f_fpack[6]; }; +/* kernel typedefs -- they belong here. */ +#ifdef __KERNEL__ +typedef struct kvec_cb kvec_cb_t; +#endif /* __KERNEL__ */ + #endif /* _LINUX_TYPES_H */ diff -urN v2.4.19/include/linux/wait.h aio-2.4.19.diff/include/linux/wait.h --- v2.4.19/include/linux/wait.h Sat Jun 15 05:08:15 2002 +++ aio-2.4.19.diff/include/linux/wait.h Mon Sep 16 21:54:13 2002 @@ -28,17 +28,20 @@ #define WAITQUEUE_DEBUG 0 #endif +typedef struct __wait_queue wait_queue_t; +typedef void (*wait_queue_func_t)(wait_queue_t *wait); + struct __wait_queue { unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 struct task_struct * task; struct list_head task_list; + wait_queue_func_t func; #if WAITQUEUE_DEBUG long __magic; long __waker; #endif }; -typedef struct __wait_queue wait_queue_t; /* * 'dual' spinlock architecture. Can be switched between spinlock_t and @@ -137,6 +140,7 @@ #endif #define __WAITQUEUE_INITIALIZER(name, tsk) { \ + func: NULL, \ task: tsk, \ task_list: { NULL, NULL }, \ __WAITQUEUE_DEBUG_INIT(name)} @@ -174,6 +178,22 @@ #endif q->flags = 0; q->task = p; + q->func = NULL; +#if WAITQUEUE_DEBUG + q->__magic = (long)&q->__magic; +#endif +} + +static inline void init_waitqueue_func_entry(wait_queue_t *q, + wait_queue_func_t func) +{ +#if WAITQUEUE_DEBUG + if (!q || !p) + WQ_BUG(); +#endif + q->flags = 0; + q->task = NULL; + q->func = func; #if WAITQUEUE_DEBUG q->__magic = (long)&q->__magic; #endif @@ -231,6 +251,38 @@ list_del(&old->task_list); } +#define add_wait_queue_cond(q, wait, cond) \ + ({ \ + unsigned long flags; \ + int _raced = 0; \ + wq_write_lock_irqsave(&(q)->lock, flags); \ + (wait)->flags = 0; \ + __add_wait_queue((q), (wait)); \ + rmb(); \ + if (!(cond)) { \ + _raced = 1; \ + __remove_wait_queue((q), (wait)); \ + } \ + wq_write_unlock_irqrestore(&(q)->lock, flags); \ + _raced; \ + }) + +#define add_wait_queue_exclusive_cond(q, wait, cond) \ + ({ \ + unsigned long flags; \ + int _raced = 0; \ + wq_write_lock_irqsave(&(q)->lock, flags); \ + (wait)->flags = WQ_FLAG_EXCLUSIVE; \ + __add_wait_queue_tail((q), (wait)); \ + rmb(); \ + if (!(cond)) { \ + _raced = 1; \ + __remove_wait_queue((q), (wait)); \ + } \ + wq_write_unlock_irqrestore(&(q)->lock, flags); \ + _raced; \ + }) + #endif /* __KERNEL__ */ #endif diff -urN v2.4.19/include/linux/worktodo.h aio-2.4.19.diff/include/linux/worktodo.h --- v2.4.19/include/linux/worktodo.h Wed Dec 31 19:00:00 1969 +++ aio-2.4.19.diff/include/linux/worktodo.h Mon Sep 16 21:54:13 2002 @@ -0,0 +1,77 @@ +/* + * Written by Benjamin LaHaise. + * + * Copyright 2000-2001 Red Hat, Inc. + * + * #include "gpl.h" + * + * Basic design idea from Jeff Merkey. + * Stack based on ideas from Ingo Molnar. 
+ */ +#ifndef __LINUX__WORKTODO_H +#define __LINUX__WORKTODO_H + +#ifndef _LINUX_WAIT_H +#include +#endif +#ifndef _LINUX_TQUEUE_H +#include +#endif + +struct wtd_stack { + void (*fn)(void *data); + void *data; +}; + +struct worktodo { + wait_queue_t wait; + struct tq_struct tq; + + void *data; /* for use by the wtd_ primitives */ + + int sp; + struct wtd_stack stack[3]; +}; + +/* FIXME NOTE: factor from kernel/context.c */ +#define wtd_init(wtd, routine) do { \ + INIT_TQUEUE(&(wtd)->tq, (routine), (wtd)); \ + (wtd)->data = 0; \ + (wtd)->sp = 0; \ +} while (0) + +#define wtd_queue(wtd) schedule_task(&(wtd)->tq) + +#define wtd_push(wtd, action, wtddata) \ +do { \ + (wtd)->stack[(wtd)->sp].fn = (wtd)->tq.routine; \ + (wtd)->stack[(wtd)->sp++].data = (wtd)->tq.data;\ + (wtd)->tq.routine = action; \ + (wtd)->tq.data = wtddata; \ +} while (0) + +static inline void wtd_pop(struct worktodo *wtd) +{ + if (wtd->sp) { + wtd->sp--; + wtd->tq.routine = wtd->stack[wtd->sp].fn; + wtd->tq.data = wtd->stack[wtd->sp].data; + } +} + +#define wtd_set_action(wtd, action, wtddata) INIT_TQUEUE(&(wtd)->tq, action, wtddata) + +struct page; +struct buffer_head; +struct semaphore; +extern int wtd_lock_page(struct worktodo *wtd, struct page *page); +extern int wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh); +extern int wtd_down(struct worktodo *wtd, struct semaphore *sem); + +#if 0 /* not implemented yet */ +extern int wtd_down(struct worktodo *wtd, struct semaphore *sem); +extern void wtd_down_write(struct worktodo *wtd, struct rw_semaphore *sem); +extern void wtd_down_read(struct worktodo *wtd, struct rw_semaphore *sem); +#endif + +#endif /* __LINUX__WORKTODO_H */ diff -urN v2.4.19/include/net/sock.h aio-2.4.19.diff/include/net/sock.h --- v2.4.19/include/net/sock.h Fri Aug 9 13:50:46 2002 +++ aio-2.4.19.diff/include/net/sock.h Mon Sep 16 21:54:13 2002 @@ -105,8 +105,15 @@ #include #include +#include +struct sock_iocb { + struct list_head list; + kvec_cb_t cb; + struct kvec_dst dst; +}; + /* The AF_UNIX specific socket options */ struct unix_opt { struct unix_address *addr; @@ -560,6 +567,9 @@ struct sk_buff *tail; } backlog; + struct list_head kvec_read_list; + struct list_head kvec_write_list; + rwlock_t callback_lock; /* Error queue, rarely used. 
*/ @@ -721,6 +731,8 @@ int (*recvmsg)(struct sock *sk, struct msghdr *msg, int len, int noblock, int flags, int *addr_len); + int (*kvec_read)(struct sock *, kvec_cb_t cb, int len); + int (*kvec_write)(struct sock *, kvec_cb_t cb, int len); int (*bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len); @@ -795,7 +807,7 @@ if ((__sk)->backlog.tail != NULL) \ __release_sock(__sk); \ (__sk)->lock.users = 0; \ - if (waitqueue_active(&((__sk)->lock.wq))) wake_up(&((__sk)->lock.wq)); \ + wake_up(&((__sk)->lock.wq)); \ spin_unlock_bh(&((__sk)->lock.slock)); \ } while(0) diff -urN v2.4.19/include/net/tcp.h aio-2.4.19.diff/include/net/tcp.h --- v2.4.19/include/net/tcp.h Fri Aug 9 13:50:46 2002 +++ aio-2.4.19.diff/include/net/tcp.h Mon Sep 16 21:54:13 2002 @@ -732,6 +732,8 @@ struct msghdr *msg, int len, int nonblock, int flags, int *addr_len); +extern int tcp_kvec_read(struct sock *sk, kvec_cb_t cb, int len); +extern int tcp_kvec_write(struct sock *sk, kvec_cb_t cb, int len); extern int tcp_listen_start(struct sock *sk); diff -urN v2.4.19/kernel/fork.c aio-2.4.19.diff/kernel/fork.c --- v2.4.19/kernel/fork.c Fri Aug 9 13:50:46 2002 +++ aio-2.4.19.diff/kernel/fork.c Mon Sep 16 21:54:13 2002 @@ -48,6 +48,16 @@ wq_write_unlock_irqrestore(&q->lock, flags); } +void add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait) +{ + unsigned long flags; + + wq_write_lock_irqsave(&q->lock, flags); + wait->flags = WQ_FLAG_EXCLUSIVE; + __add_wait_queue(q, wait); + wq_write_unlock_irqrestore(&q->lock, flags); +} + void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait) { unsigned long flags; @@ -228,6 +238,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm) { + mm->ioctx_list = NULL; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); @@ -263,6 +274,8 @@ */ inline void __mmdrop(struct mm_struct *mm) { + if (mm->ioctx_list) + BUG(); BUG_ON(mm == &init_mm); pgd_free(mm->pgd); destroy_context(mm); @@ -281,6 +294,7 @@ list_del(&mm->mmlist); mmlist_nr--; spin_unlock(&mmlist_lock); + exit_aio(mm); exit_mmap(mm); mmdrop(mm); } diff -urN v2.4.19/kernel/sched.c aio-2.4.19.diff/kernel/sched.c --- v2.4.19/kernel/sched.c Fri Aug 9 13:50:46 2002 +++ aio-2.4.19.diff/kernel/sched.c Mon Sep 16 21:54:13 2002 @@ -705,33 +705,44 @@ } /* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything - * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the - * non-exclusive tasks and one exclusive task. + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small + * +ve number) then we wake all the non-exclusive tasks and one exclusive task. * * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero - * in this (rare) case, and we handle it by contonuing to scan the queue. + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by contonuing to scan the queue. 
*/ static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode, int nr_exclusive, const int sync) { - struct list_head *tmp; + struct list_head *tmp, *next; struct task_struct *p; CHECK_MAGIC_WQHEAD(q); WQ_CHECK_LIST_HEAD(&q->task_list); - list_for_each(tmp,&q->task_list) { + list_for_each_safe(tmp, next, &q->task_list) { unsigned int state; - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + wait_queue_func_t func; CHECK_MAGIC(curr->__magic); + func = curr->func; + if (func) { + unsigned flags = curr->flags; + func(curr); + if ((flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + continue; + } p = curr->task; state = p->state; if (state & mode) { WQ_NOTE_WAKER(curr); - if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + if (try_to_wake_up(p, sync) && + (curr->flags & WQ_FLAG_EXCLUSIVE) && + !--nr_exclusive) break; } } diff -urN v2.4.19/kernel/sysctl.c aio-2.4.19.diff/kernel/sysctl.c --- v2.4.19/kernel/sysctl.c Fri Aug 9 13:50:46 2002 +++ aio-2.4.19.diff/kernel/sysctl.c Mon Sep 16 21:54:13 2002 @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -284,6 +285,8 @@ {0} }; +extern int user_pinned_pages; + static ctl_table fs_table[] = { {FS_NRINODE, "inode-nr", &inodes_stat, 2*sizeof(int), 0444, NULL, &proc_dointvec}, @@ -309,6 +312,16 @@ sizeof(int), 0644, NULL, &proc_dointvec}, {FS_LEASE_TIME, "lease-break-time", &lease_break_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {FS_AIO_NR, "aio-nr", &aio_nr, sizeof(aio_nr), + 0444, NULL, &proc_dointvec}, + {FS_AIO_MAX_NR, "aio-max-nr", &aio_max_nr, sizeof(aio_max_nr), + 0644, NULL, &proc_dointvec}, + {FS_AIO_MAX_SIZE, "aio-max-size", &aio_max_size, sizeof(aio_max_size), + 0644, NULL, &proc_dointvec}, + {FS_AIO_MAX_PINNED, "aio-max-pinned", &aio_max_pinned, sizeof(aio_max_pinned), + 0644, NULL, &proc_dointvec}, + {FS_AIO_MAX_PINNED+1, "aio-pinned", &user_pinned_pages, 4, + 0644, NULL, &proc_dointvec}, {0} }; diff -urN v2.4.19/mm/Makefile aio-2.4.19.diff/mm/Makefile --- v2.4.19/mm/Makefile Fri Aug 9 13:50:46 2002 +++ aio-2.4.19.diff/mm/Makefile Mon Sep 16 21:54:13 2002 @@ -17,5 +17,6 @@ shmem.o obj-$(CONFIG_HIGHMEM) += highmem.o +obj-y += wtd.o include $(TOPDIR)/Rules.make diff -urN v2.4.19/mm/filemap.c aio-2.4.19.diff/mm/filemap.c --- v2.4.19/mm/filemap.c Fri Aug 9 13:50:46 2002 +++ aio-2.4.19.diff/mm/filemap.c Mon Sep 16 21:54:13 2002 @@ -29,6 +29,8 @@ #include #include +#include +#include /* * Shared mappings implemented 30.11.1994. It's not fully working yet, @@ -775,7 +777,7 @@ * at a cost of "thundering herd" phenomena during rare hash * collisions. */ -static inline wait_queue_head_t *page_waitqueue(struct page *page) +static inline wait_queue_head_t *__page_waitqueue(struct page *page) { const zone_t *zone = page_zone(page); wait_queue_head_t *wait = zone->wait_table; @@ -806,6 +808,13 @@ return &wait[hash]; } +wait_queue_head_t *page_waitqueue(struct page *page) +{ + return __page_waitqueue(page); +} + +#define page_waitqueue(page) __page_waitqueue(page) + /* * Wait for a page to get unlocked. * @@ -1186,7 +1195,7 @@ static void generic_file_readahead(int reada_ok, struct file * filp, struct inode * inode, - struct page * page) + struct page * page, int flags) { unsigned long end_index; unsigned long index = page->index; @@ -1316,7 +1325,7 @@ * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. 
*/ -void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor) +void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor, int flags) { struct address_space *mapping = filp->f_dentry->d_inode->i_mapping; struct inode *inode = mapping->host; @@ -1325,10 +1334,17 @@ int reada_ok; int error; int max_readahead = get_max_readahead(inode); + loff_t pos; + + pos = *ppos; + if (unlikely(pos < 0)) { + desc->error = -EINVAL; + return; + } cached_page = NULL; - index = *ppos >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + index = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; /* * If the current position is outside the previous read-ahead window, @@ -1375,13 +1391,17 @@ end_index = inode->i_size >> PAGE_CACHE_SHIFT; - if (index > end_index) + if (index > end_index) { + desc->error = 0; break; + } nr = PAGE_CACHE_SIZE; if (index == end_index) { nr = inode->i_size & ~PAGE_CACHE_MASK; - if (nr <= offset) + if (nr <= offset) { + desc->error = 0; break; + } } nr = nr - offset; @@ -1401,7 +1421,7 @@ if (!Page_Uptodate(page)) goto page_not_up_to_date; - generic_file_readahead(reada_ok, filp, inode, page); + generic_file_readahead(reada_ok, filp, inode, page, flags); page_ok: /* If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing @@ -1441,13 +1461,23 @@ * Ok, the page was not immediately readable, so let's try to read ahead while we're at it.. */ page_not_up_to_date: - generic_file_readahead(reada_ok, filp, inode, page); + generic_file_readahead(reada_ok, filp, inode, page, flags); if (Page_Uptodate(page)) goto page_ok; /* Get exclusive access to the page ... */ - lock_page(page); + if (flags & F_ATOMIC) { + if (TryLockPage(page)) { + if (Page_Uptodate(page)) + goto page_ok; + desc->error = -EWOULDBLOCKIO; + page_cache_release(page); + break; + } + printk("page_not_up_to_date: atomic trylock succeeded\n"); + } else + lock_page(page); /* Did it get unhashed before we got the lock? */ if (!page->mapping) { @@ -1471,11 +1501,12 @@ goto page_ok; /* Again, try some read-ahead while waiting for the page to finish.. */ - generic_file_readahead(reada_ok, filp, inode, page); - wait_on_page(page); + generic_file_readahead(reada_ok, filp, inode, page, flags); + if (!(flags & F_ATOMIC)) + wait_on_page(page); if (Page_Uptodate(page)) goto page_ok; - error = -EIO; + error = (flags & F_ATOMIC) ? -EWOULDBLOCKIO : -EIO; } /* UHHUH! A synchronous read error occurred. Report it */ @@ -1484,6 +1515,11 @@ break; no_cached_page: + if (flags & F_ATOMIC) { + spin_unlock(&pagecache_lock); + desc->error = -EWOULDBLOCKIO; + break; + } /* * Ok, it wasn't cached, so we need to create a new * page.. 
@@ -1638,6 +1674,11 @@
  */
 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
 {
+        return generic_file_new_read(filp, buf, count, ppos, 0);
+}
+
+ssize_t generic_file_new_read(struct file * filp, char * buf, size_t count, loff_t *ppos, int flags)
+{
         ssize_t retval;
 
         if ((ssize_t) count < 0)
@@ -1657,7 +1698,7 @@
                         desc.count = count;
                         desc.buf = buf;
                         desc.error = 0;
-                        do_generic_file_read(filp, ppos, &desc, file_read_actor);
+                        do_generic_file_read(filp, ppos, &desc, file_read_actor, flags);
 
                         retval = desc.written;
                         if (!retval)
@@ -1782,7 +1823,7 @@
                 desc.count = count;
                 desc.buf = (char *) out_file;
                 desc.error = 0;
-                do_generic_file_read(in_file, ppos, &desc, file_send_actor);
+                do_generic_file_read(in_file, ppos, &desc, file_send_actor, 0);
 
                 retval = desc.written;
                 if (!retval)
@@ -3178,3 +3219,681 @@
                 panic("Failed to allocate page hash table\n");
         memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
 }
+
+/* address_space_map
+ *      Maps a series of pages from the page cache into the given array.
+ */
+static int address_space_map(struct address_space *as, unsigned long index,
+                int nr, struct page **pages,
+                int *nr_newp, struct page **new_pages)
+{
+        struct page *cached_page = NULL;
+        int nr_new = 0;
+        int ret;
+
+        if (unlikely(nr <= 0)) {
+                *nr_newp = nr_new;
+                return 0;
+        }
+
+        ret = 0;
+
+        spin_lock(&pagecache_lock);
+
+        while (nr > 0) {
+                struct page **hash = page_hash(as, index);
+                struct page *page;
+
+                page = __find_page_nolock(as, index, *hash);
+                if (page) {
+                        page_cache_get(page);
+got_page:
+                        pages[ret++] = page;
+                        index++;
+                        nr--;
+                        continue;
+                }
+
+                if (cached_page) {
+                        __add_to_page_cache(cached_page, as, index, hash);
+                        nr_new++;
+                        *new_pages++ = page = cached_page;
+                        cached_page = NULL;
+                        goto got_page;
+                }
+                spin_unlock(&pagecache_lock);
+
+                cached_page = page_cache_alloc(as);
+                if (!cached_page)
+                        goto out;
+
+                /* Okay, we now have an allocated page.  Retry
+                 * the search and add. */
+                spin_lock(&pagecache_lock);
+        }
+
+        spin_unlock(&pagecache_lock);
+
+out:
+        if (cached_page)
+                page_cache_release(cached_page);
+
+        *nr_newp = nr_new;
+        return ret ? ret : -ENOMEM;
+}
+
+struct iodesc {
+        struct worktodo wtd;
+
+        struct page *good_page; /* the highest Uptodate page */
+        int good_idx;
+        int err;
+        int did_read;
+        int rw;
+        loff_t pos;
+
+        struct page **pages;
+        struct page **new_pages;
+        struct page **cur_pagep;
+        int nr_pages;
+        int nr_new_pages;
+
+        struct address_space *as;
+        struct file *file;
+        kvec_cb_t cb;
+
+        size_t size;
+        unsigned long transferred;
+        unsigned offset;
+        struct kveclet *veclet;
+
+        struct kvec_dst src;
+
+        int sync;
+        unsigned long rlimit_fsize;
+
+#define READDESC_NR_DEF 3
+        struct page *def_pages[READDESC_NR_DEF];
+        struct page *def_new_pages[READDESC_NR_DEF];
+};
+
+static void __iodesc_free(struct iodesc *io, int unlock)
+{
+        kvec_cb_t cb;
+        ssize_t res;
+
+        if (unlock) {
+                unsigned i;
+                for (i=0; i<io->nr_pages; i++) {
+                        struct page *page = io->pages[i];
+                        UnlockPage(page);
+                        page_cache_release(page);
+                }
+        } else {
+                unsigned i;
+                for (i=0; i<io->nr_pages; i++)
+                        page_cache_release(io->pages[i]);
+        }
+
+        if (io->new_pages != io->def_new_pages)
+                kfree(io->new_pages);
+        if (io->pages != io->def_pages)
+                kfree(io->pages);
+
+        cb = io->cb;
+        res = io->transferred ? io->transferred : io->err;
+        kfree(io);
+
+        cb.fn(cb.data, cb.vec, res);
+}
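__iodesc_free() is the single completion point for both directions: it drops the page references and then fires the kvec_cb_t callback with either the byte count or the first error. The exact kvec_cb_t layout lives in the patch's aio headers rather than in this hunk, so the callback prototype below is inferred from the cb.fn(cb.data, cb.vec, res) call site; struct my_request is purely illustrative:

        /* Sketch of a completion callback as __iodesc_free() would invoke it. */
        struct my_request {             /* hypothetical per-request state */
                struct kvec *vec;
                ssize_t result;
                int done;
        };

        static void my_rw_complete(void *data, struct kvec *vec, ssize_t res)
        {
                struct my_request *req = data;

                req->result = res;      /* bytes transferred, or -errno */
                req->done = 1;
                /* the kvec (and the user pages it describes) would be
                 * unmapped and freed here via the patch's kvec helpers */
        }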
+
+/* By the time this function is called, all of the pages prior to
+ * the current good_idx have been released appropriately.  The remaining
+ * duties are to release any remaining pages and to honour O_SYNC.
+ */
+static void __iodesc_finish_write(struct iodesc *io)
+{
+        pr_debug("__iodesc_finish_write(%p)\n", io);
+
+        __iodesc_free(io, WRITE == io->rw);
+}
+
+/* This is mostly ripped from generic_file_write */
+static int __iodesc_write_page(struct iodesc *io, struct page *page)
+{
+        char *kaddr = kmap(page);
+        unsigned long bytes;
+        unsigned long offset;
+        long status;
+        int done = 0;
+
+        offset = io->offset;
+        kaddr += offset;
+
+        bytes = PAGE_CACHE_SIZE - offset;
+        if (io->size < bytes)
+                bytes = io->size;
+
+        pr_debug("__iodesc_write_page(%p (%lu), %lu %lu %lu)\n", page, page->index, offset, bytes);
+
+        io->err = io->as->a_ops->prepare_write(io->file, page,
+                                               offset, offset + bytes);
+        if (unlikely(io->err)) {
+                pr_debug("prepare_write: %d\n", io->err);
+                kunmap(page);
+                return 1;
+        }
+
+        kvec_dst_map(&io->src);
+        memcpy_from_kvec_dst(kaddr, &io->src, bytes);
+        kvec_dst_unmap(&io->src);       /* commit_write may block */
+
+        flush_dcache_page(page);
+        status = io->as->a_ops->commit_write(io->file, page,
+                                             offset, offset+bytes);
+
+        /* We don't handle short writes */
+        if (status > 0 && status != bytes)
+                done = 1;
+
+        if (!status)
+                status = bytes;
+
+        if (likely(status > 0)) {
+                io->transferred += status;
+                io->size -= status;
+                io->offset = (offset + status) & (PAGE_CACHE_SIZE - 1);
+
+                if (io->offset)
+                        done = 1;
+        } else {
+                io->err = status;
+                done = 1;
+        }
+
+        kunmap(page);
+        return done;
+}
+
+void __iodesc_sync_wait_page(void *data)
+{
+        struct iodesc *io = data;
+
+        do {
+                struct buffer_head *bh, *head = io->pages[io->good_idx]->buffers;
+
+                if (!head)
+                        continue;
+
+                bh = head;
+                do {
+                        if (buffer_locked(bh)) {
+                                pr_debug("waiting on bh=%pi io=%p\n", bh, io);
+                                if (!wtd_wait_on_buffer(&io->wtd, bh))
+                                        return;
+                        }
+                        if (buffer_req(bh) && !buffer_uptodate(bh)) {
+                                pr_debug("io err bh=%p (%p)\n", bh, io);
+                                io->err = -EIO;
+                                break;
+                        }
+                } while ((bh = bh->b_this_page) != head);
+        } while (!io->err && ++io->good_idx < io->nr_pages) ;
+
+        pr_debug("finish_write(%p)\n", io);
+        __iodesc_finish_write(io);
+}
+
+static void __iodesc_do_write(void *data)
+{
+        struct iodesc *io = data;
+        unsigned i;
+
+        for (i=0; i<io->nr_pages; i++) {
+                if (__iodesc_write_page(io, io->pages[i]))
+                        break;
+        }
+
+        up(&io->file->f_dentry->d_inode->i_sem);
+
+        if (io->sync) {
+                io->good_idx = 0;
+
+                pr_debug("writing out pages(%p)\n", io);
+                for (i=0; i<io->nr_pages; i++) {
+                        if (io->pages[i]->buffers)
+                                writeout_one_page(io->pages[i]);
+                }
+
+                pr_debug("calling __iodesc_sync_wait_page(%p)\n", io);
+                wtd_set_action(&io->wtd, __iodesc_sync_wait_page, io);
+                __iodesc_sync_wait_page(io);
+                return;
+        }
+
+        __iodesc_finish_write(io);
+}
+
+static void __iodesc_write_lock_next_page(void *data)
+{
+        struct iodesc *io = data;
+        pr_debug("__iodesc_write_next_page(%p)\n", io);
+
+        while (io->good_idx < io->nr_pages) {
+                io->good_page = io->pages[io->good_idx++];
+                if (io->good_page == *io->cur_pagep)
+                        io->cur_pagep++;
+                else {
+                        if (!wtd_lock_page(&io->wtd, io->good_page))
+                                return;
+                }
+        }
+
+        //Is this faster?
__iodesc_do_write(io); + wtd_set_action(&io->wtd, __iodesc_do_write, io); + wtd_queue(&io->wtd); +} + +static void __generic_file_write_iodesc(struct iodesc *io) +{ + struct inode *inode = io->file->f_dentry->d_inode; + time_t now = CURRENT_TIME; + + remove_suid(inode); + if (inode->i_ctime != now || inode->i_mtime != now) { + inode->i_ctime = inode->i_mtime = now; + mark_inode_dirty_sync(inode); + } + + wtd_set_action(&io->wtd, __iodesc_write_lock_next_page, io); + io->sync = !!(io->file->f_flags & O_SYNC); + io->good_idx = 0; + io->cur_pagep = io->new_pages; + __iodesc_write_lock_next_page(io); +} + +static void __iodesc_read_finish(struct iodesc *io) +{ + struct page **src_pagep; + char *dst_addr, *src_addr; + int src_off; + size_t size; + size_t valid; + + struct kveclet *veclet = io->veclet; + struct page *dst_page = veclet->page; + int dst_len = veclet->length; + int dst_off = veclet->offset; + + + pr_debug("__iodesc_read_finish: good_idx = %d\n", io->good_idx); + if (io->good_idx <= 0) + goto no_data; + + size = io->size; + src_off = io->offset; + src_pagep = io->pages; + src_addr = kmap(*src_pagep); + + valid = (size_t)io->good_idx << PAGE_CACHE_SHIFT; + valid -= src_off; + pr_debug("size=%d valid=%d src_off=%d\n", size, valid, src_off); + + if (valid < size) + size = valid; + + dst_addr = kmap(veclet->page); + + while (size > 0) { + int this = PAGE_CACHE_SIZE - src_off; + if ((PAGE_SIZE - dst_off) < this) + this = PAGE_SIZE - dst_off; + if (size < this) + this = size; + pr_debug("this=%d src_off=%d dst_off=%d dst_len=%d\n", + this, src_off, dst_off, dst_len); + memcpy(dst_addr + dst_off, src_addr + src_off, this); + + src_off += this; + dst_off += this; + dst_len -= this; + size -= this; + io->transferred += this; + pr_debug("read_finish: this=%d transferred=%d\n", + this, io->transferred); + + if (size <= 0) + break; + + if (dst_len <= 0) { + kunmap(dst_page); + veclet++; + dst_page = veclet->page; + dst_off = veclet->offset; + dst_len = veclet->length; + dst_addr = kmap(dst_page); + } + + if (src_off >= PAGE_SIZE) { /* FIXME: PAGE_CACHE_SIZE */ + kunmap(*src_pagep); + pr_debug("page(%lu)->count = %d\n", + (*src_pagep)->index, + atomic_read(&(*src_pagep)->count)); + src_pagep++; + src_addr = kmap(*src_pagep); + src_off = 0; + } + } + kunmap(dst_page); + kunmap(*src_pagep); +no_data: + __iodesc_free(io, 0); +} + +static void __iodesc_make_uptodate(void *data) +{ + struct iodesc *io = data; + struct page *page = io->good_page; + int locked = 1; + + pr_debug("__iodesc_make_uptodate: io=%p index=%lu\n", io, page->index); +again: + while (Page_Uptodate(page)) { + pr_debug("page index %lu uptodate\n", page->index); + if (locked) { + UnlockPage(page); + locked = 0; + } + io->did_read = 0; + io->good_idx++; + if (io->good_idx >= io->nr_pages) { + __iodesc_read_finish(io); + return; + } + page = io->good_page = io->pages[io->good_idx]; + pr_debug("__iodesc_make_uptodate: index=%lu\n", page->index); + } + + if (!locked) { + if (!wtd_lock_page(&io->wtd, page)) + return; + locked = 1; + } + + if (!io->did_read) { + /* We haven't tried reading this page before, give it a go. */ + pr_debug("attempting to read %lu\n", page->index); + io->did_read = 1; + locked = 0; + io->err = page->mapping->a_ops->readpage(io->file, page); + if (!io->err) { + if (Page_Uptodate(page)) + goto again; + if (wtd_lock_page(&io->wtd, page)) { + locked = 1; + goto again; + } + return; + } + } + + if (locked) + UnlockPage(page); + + /* We've already read this page before. 
Set err to EIO and quit */
+        if (!io->err)
+                io->err = -EIO;
+        __iodesc_read_finish(io);
+}
+
+static void __wtdgeneric_file_read_iodesc(void *data);
+
+static void __generic_file_read_iodesc(struct iodesc *io, int mayblock)
+{
+        int (*readpage)(struct file *, struct page *);
+        int i;
+
+        wtd_set_action(&io->wtd, __iodesc_make_uptodate, io);
+        readpage = io->as->a_ops->readpage;
+        for (i=0; i<io->nr_new_pages; i++) {
+                int ret;
+                if (!mayblock) {
+                        wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io);
+                        wtd_queue(&io->wtd);
+                        return;
+                }
+                ret = readpage(io->file, io->new_pages[i]);
+                if (ret)
+                        printk(KERN_DEBUG "__generic_file_read_kiovec: readpage(%lu) = %d\n", io->new_pages[i]->index, ret);
+        }
+
+        for (i=0; i<io->nr_pages; i++) {
+                struct page *page = io->pages[i];
+                if (Page_Uptodate(page)) {
+                        pr_debug("__generic_file_read_iodesc: %lu is uptodate\n", page->index);
+                        continue;
+                }
+
+                if (!mayblock) {
+                        wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io);
+                        wtd_queue(&io->wtd);
+                        return;
+                }
+                if (!TryLockPage(page)) {
+                        int ret = readpage(io->file, page);
+                        if (ret)
+                                printk(KERN_DEBUG "__generic_file_read_iodesc: readpage(%lu): %d\n", page->index, ret);
+                }
+
+                if (!Page_Uptodate(page) && io->good_idx == -1) {
+                        pr_debug("first good_idx=%d (%lu)\n", i, page->index);
+                        io->good_idx = i;
+                        io->good_page = page;
+                }
+        }
+
+        /* Whee, all the pages are uptodate! */
+        if (!io->good_page) {
+                pr_debug("all pages uptodate!\n");
+                io->good_idx = io->nr_pages;
+                __iodesc_read_finish(io);
+                return;
+        }
+
+        pr_debug("locking good_page\n");
+        if (wtd_lock_page(&io->wtd, io->good_page))
+                __iodesc_make_uptodate(io);
+        return;
+}
+
+static void __wtdgeneric_file_read_iodesc(void *data)
+{
+        struct iodesc *io = data;
+        __generic_file_read_iodesc(io, 1);
+}
+
+static int generic_file_rw_kvec(struct file *file, int rw, kvec_cb_t cb,
+                                size_t size, loff_t pos);
+
+int generic_file_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+        return generic_file_rw_kvec(file, READ, cb, size, pos);
+}
+
+int generic_file_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+        return generic_file_rw_kvec(file, WRITE, cb, size, pos);
+}
+
+void wtd_rw_kvec_core(void *);
+int rw_kvec_core(struct iodesc *io);
+
+int generic_file_rw_kvec(struct file *file, int rw, kvec_cb_t cb,
+                         size_t size, loff_t pos)
+{
+        struct inode *inode = file->f_dentry->d_inode;
+        int append = file->f_flags & O_APPEND;
+        struct iodesc *io = NULL;
+        int ret;
+
+        ret = -EINVAL;
+        if (unlikely(rw != READ && rw != WRITE))
+                goto out;
+
+        /* Don't check pos when appending, but otherwise do sanity
+         * checks before allocating memory.  Negative offsets are invalid.
+ */ + if (unlikely(!append && pos < 0)) + goto out; + + ret = -ENOMEM; + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) + goto out; + + memset(io, 0, sizeof(*io)); + io->file = file; + io->rw = rw; + io->cb = cb; + io->size = size; + io->pos = pos; + io->rlimit_fsize = current->rlim[RLIMIT_FSIZE].rlim_cur; + wtd_set_action(&io->wtd, wtd_rw_kvec_core, io); + + if ((rw == READ) || (0 == wtd_down(&io->wtd, &inode->i_sem))) + return rw_kvec_core(io); + + return 0; + +out: + if (!ret) + cb.fn(cb.data, cb.vec, ret); + return ret; +} + +void wtd_rw_kvec_core(void *data) +{ + struct iodesc *io = data; + kvec_cb_t cb = io->cb; + int ret = rw_kvec_core(io); + if (ret) + cb.fn(cb.data, cb.vec, ret); +} + +int rw_kvec_core(struct iodesc *io) +{ + int append = io->file->f_flags & O_APPEND; + struct inode *inode = io->file->f_dentry->d_inode; + struct address_space *as = inode->i_mapping; + unsigned long index; + unsigned long eindex; + unsigned long nr_pages; + int ret; + + if (io->rw == WRITE) { + unsigned long long tmp; + loff_t limit; + + /* We've already down'd the inode semaphore */ + if (append) + io->pos = inode->i_size; + + limit = io->rlimit_fsize; + if (likely(RLIM_INFINITY == limit)) + limit = OFFSET_MAX; + + /* Filesystem limits take precedence over user limits */ + if (likely(inode->i_sb->s_maxbytes < limit)) + limit = inode->i_sb->s_maxbytes; + + if (unlikely(io->pos >= limit)) { + pr_debug("maxbytes: %Ld\n", limit); + ret = 0; + if (io->size || io->pos > limit) + ret = -EFBIG; + goto out_io; + } + + /* Clamp writes straddling limit. */ + tmp = io->pos + io->size; + if (unlikely(tmp > (unsigned long long)limit)) + io->size = limit - io->pos; + } + + if (READ == io->rw) { + pr_debug("io->pos=%Ld i_size=%Ld\n", io->pos, inode->i_size); + + if (io->pos > inode->i_size) + io->size = 0; + else if ((io->pos + io->size) > inode->i_size) { + size_t size = inode->i_size - io->pos; + if (size < io->size) + io->size = size; + } + + pr_debug("io->size=%d\n", io->size); + } + + ret = 0; + if (unlikely(!io->size)) + goto out_io; + + index = io->pos >> PAGE_CACHE_SHIFT; + eindex = (io->pos + io->size - 1) >> PAGE_CACHE_SHIFT; + nr_pages = eindex - index + 1; + + pr_debug("nr_pages: %lu\n", nr_pages); + + io->good_idx = -1; + io->good_page = NULL; + io->did_read = 0; + io->err = 0; + io->as = as; + io->offset = (unsigned long)io->pos & (PAGE_CACHE_SIZE - 1); + kvec_dst_init(&io->src, KM_USER0); + kvec_dst_set(&io->src, io->cb.vec->veclet); + io->veclet = io->cb.vec->veclet; + if (nr_pages < READDESC_NR_DEF) { + io->pages = io->def_pages; + io->new_pages = io->def_new_pages; + } else { + io->pages = kmalloc(sizeof(*io->pages) * (nr_pages + 1), GFP_KERNEL); + if (!io->pages) + goto out_io; + + io->new_pages = kmalloc(sizeof(*io->new_pages) * (nr_pages + 1), GFP_KERNEL); + if (!io->new_pages) + goto out_pages; + } + + ret = address_space_map(as, index, nr_pages, io->pages, + &io->nr_new_pages, io->new_pages); + pr_debug("as_map: %d (%d new)\n", ret, io->nr_new_pages); + if (ret <= 0) + goto out_new_pages; + + io->nr_pages = ret; + io->pages[io->nr_pages] = NULL; + io->new_pages[io->nr_new_pages] = NULL; + + if (io->rw == READ) + __generic_file_read_iodesc(io, 0); + else if (io->rw == WRITE) + __generic_file_write_iodesc(io); + + return 0; + +out_new_pages: + if (io->new_pages != io->def_new_pages) + kfree(io->new_pages); +out_pages: + if (io->pages != io->def_pages) + kfree(io->pages); +out_io: + if (io->rw == WRITE) + up(&inode->i_sem); + if (!ret) + io->cb.fn(io->cb.data, io->cb.vec, ret); + 
kfree(io); + return ret; +} diff -urN v2.4.19/mm/filemap.c.old aio-2.4.19.diff/mm/filemap.c.old --- v2.4.19/mm/filemap.c.old Wed Dec 31 19:00:00 1969 +++ aio-2.4.19.diff/mm/filemap.c.old Mon Sep 16 21:54:13 2002 @@ -0,0 +1,3871 @@ +/* + * linux/mm/filemap.c + * + * Copyright (C) 1994-1999 Linus Torvalds + */ + +/* + * This file handles the generic file mmap semantics used by + * most "normal" filesystems (but you don't /have/ to use this: + * the NFS filesystem used to do this differently, for example) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +/* + * Shared mappings implemented 30.11.1994. It's not fully working yet, + * though. + * + * Shared mappings now work. 15.8.1995 Bruno. + * + * finished 'unifying' the page and buffer cache and SMP-threaded the + * page-cache, 21.05.1999, Ingo Molnar + * + * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli + */ + +atomic_t page_cache_size = ATOMIC_INIT(0); +unsigned int page_hash_bits; +struct page **page_hash_table; + +int vm_max_readahead = 31; +int vm_min_readahead = 3; +EXPORT_SYMBOL(vm_max_readahead); +EXPORT_SYMBOL(vm_min_readahead); + + +spinlock_t pagecache_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +/* + * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock + * with the pagecache_lock held. + * + * Ordering: + * swap_lock -> + * pagemap_lru_lock -> + * pagecache_lock + */ +spinlock_t pagemap_lru_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; + +#define CLUSTER_PAGES (1 << page_cluster) +#define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) + +static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p)); +static void add_page_to_hash_queue(struct page * page, struct page **p) +{ + struct page *next = *p; + + *p = page; + page->next_hash = next; + page->pprev_hash = p; + if (next) + next->pprev_hash = &page->next_hash; + if (page->buffers) + PAGE_BUG(page); + atomic_inc(&page_cache_size); +} + +static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page) +{ + struct list_head *head = &mapping->clean_pages; + + mapping->nrpages++; + list_add(&page->list, head); + page->mapping = mapping; +} + +static inline void remove_page_from_inode_queue(struct page * page) +{ + struct address_space * mapping = page->mapping; + + mapping->nrpages--; + list_del(&page->list); + page->mapping = NULL; +} + +static inline void remove_page_from_hash_queue(struct page * page) +{ + struct page *next = page->next_hash; + struct page **pprev = page->pprev_hash; + + if (next) + next->pprev_hash = pprev; + *pprev = next; + page->pprev_hash = NULL; + atomic_dec(&page_cache_size); +} + +/* + * Remove a page from the page cache and free it. Caller has to make + * sure the page is locked and that nobody else uses it - or that usage + * is safe. 
+ */ +void __remove_inode_page(struct page *page) +{ + if (PageDirty(page)) BUG(); + remove_page_from_inode_queue(page); + remove_page_from_hash_queue(page); +} + +void remove_inode_page(struct page *page) +{ + if (!PageLocked(page)) + PAGE_BUG(page); + + spin_lock(&pagecache_lock); + __remove_inode_page(page); + spin_unlock(&pagecache_lock); +} + +static inline int sync_page(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (mapping && mapping->a_ops && mapping->a_ops->sync_page) + return mapping->a_ops->sync_page(page); + return 0; +} + +/* + * Add a page to the dirty page list. + */ +void set_page_dirty(struct page *page) +{ + if (!test_and_set_bit(PG_dirty, &page->flags)) { + struct address_space *mapping = page->mapping; + + if (mapping) { + spin_lock(&pagecache_lock); + list_del(&page->list); + list_add(&page->list, &mapping->dirty_pages); + spin_unlock(&pagecache_lock); + + if (mapping->host) + mark_inode_dirty_pages(mapping->host); + } + } +} + +/** + * invalidate_inode_pages - Invalidate all the unlocked pages of one inode + * @inode: the inode which pages we want to invalidate + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + */ + +void invalidate_inode_pages(struct inode * inode) +{ + struct list_head *head, *curr; + struct page * page; + + head = &inode->i_mapping->clean_pages; + + spin_lock(&pagemap_lru_lock); + spin_lock(&pagecache_lock); + curr = head->next; + + while (curr != head) { + page = list_entry(curr, struct page, list); + curr = curr->next; + + /* We cannot invalidate something in dirty.. */ + if (PageDirty(page)) + continue; + + /* ..or locked */ + if (TryLockPage(page)) + continue; + + if (page->buffers && !try_to_free_buffers(page, 0)) + goto unlock; + + if (page_count(page) != 1) + goto unlock; + + __lru_cache_del(page); + __remove_inode_page(page); + UnlockPage(page); + page_cache_release(page); + continue; +unlock: + UnlockPage(page); + continue; + } + + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); +} + +static int do_flushpage(struct page *page, unsigned long offset) +{ + int (*flushpage) (struct page *, unsigned long); + flushpage = page->mapping->a_ops->flushpage; + if (flushpage) + return (*flushpage)(page, offset); + return block_flushpage(page, offset); +} + +static inline void truncate_partial_page(struct page *page, unsigned partial) +{ + memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + if (page->buffers) + do_flushpage(page, partial); +} + +static void truncate_complete_page(struct page *page) +{ + /* Leave it on the LRU if it gets converted into anonymous buffers */ + if (!page->buffers || do_flushpage(page, 0)) + lru_cache_del(page); + + /* + * We remove the page from the page cache _after_ we have + * destroyed all buffer-cache references to it. Otherwise some + * other process might think this inode page is not in the + * page cache and creates a buffer-cache alias to it causing + * all sorts of fun problems ... 
+ */ + ClearPageDirty(page); + ClearPageUptodate(page); + remove_inode_page(page); + page_cache_release(page); +} + +static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *)); +static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial) +{ + struct list_head *curr; + struct page * page; + int unlocked = 0; + + restart: + curr = head->prev; + while (curr != head) { + unsigned long offset; + + page = list_entry(curr, struct page, list); + offset = page->index; + + /* Is one of the pages to truncate? */ + if ((offset >= start) || (*partial && (offset + 1) == start)) { + int failed; + + page_cache_get(page); + failed = TryLockPage(page); + + list_del(head); + if (!failed) + /* Restart after this page */ + list_add_tail(head, curr); + else + /* Restart on this page */ + list_add(head, curr); + + spin_unlock(&pagecache_lock); + unlocked = 1; + + if (!failed) { + if (*partial && (offset + 1) == start) { + truncate_partial_page(page, *partial); + *partial = 0; + } else + truncate_complete_page(page); + + UnlockPage(page); + } else + wait_on_page(page); + + page_cache_release(page); + + if (current->need_resched) { + __set_current_state(TASK_RUNNING); + schedule(); + } + + spin_lock(&pagecache_lock); + goto restart; + } + curr = curr->prev; + } + return unlocked; +} + + +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from with to truncate + * + * Truncate the page cache at a set offset, removing the pages + * that are beyond that offset (and zeroing out partial pages). + * If any page is locked we wait for it to become unlocked. + */ +void truncate_inode_pages(struct address_space * mapping, loff_t lstart) +{ + unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); + int unlocked; + + spin_lock(&pagecache_lock); + do { + unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial); + unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial); + unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial); + } while (unlocked); + /* Traversed all three lists without dropping the lock */ + spin_unlock(&pagecache_lock); +} + +static inline int invalidate_this_page2(struct page * page, + struct list_head * curr, + struct list_head * head) +{ + int unlocked = 1; + + /* + * The page is locked and we hold the pagecache_lock as well + * so both page_count(page) and page->buffers stays constant here. 
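A quick worked example of the index arithmetic in truncate_inode_pages(): with 4K pages and lstart = 10000, start = (10000 + 4095) >> 12 = 3 and partial = 10000 & 4095 = 1808, so page index 2 keeps only its first 1808 bytes (truncate_partial_page() zeroes and flushes the remainder of that page) while every page from index 3 upward is removed outright by truncate_complete_page().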
+ */ + if (page_count(page) == 1 + !!page->buffers) { + /* Restart after this page */ + list_del(head); + list_add_tail(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + truncate_complete_page(page); + } else { + if (page->buffers) { + /* Restart after this page */ + list_del(head); + list_add_tail(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + block_invalidate_page(page); + } else + unlocked = 0; + + ClearPageDirty(page); + ClearPageUptodate(page); + } + + return unlocked; +} + +static int FASTCALL(invalidate_list_pages2(struct list_head *)); +static int invalidate_list_pages2(struct list_head *head) +{ + struct list_head *curr; + struct page * page; + int unlocked = 0; + + restart: + curr = head->prev; + while (curr != head) { + page = list_entry(curr, struct page, list); + + if (!TryLockPage(page)) { + int __unlocked; + + __unlocked = invalidate_this_page2(page, curr, head); + UnlockPage(page); + unlocked |= __unlocked; + if (!__unlocked) { + curr = curr->prev; + continue; + } + } else { + /* Restart on this page */ + list_del(head); + list_add(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + unlocked = 1; + wait_on_page(page); + } + + page_cache_release(page); + if (current->need_resched) { + __set_current_state(TASK_RUNNING); + schedule(); + } + + spin_lock(&pagecache_lock); + goto restart; + } + return unlocked; +} + +/** + * invalidate_inode_pages2 - Clear all the dirty bits around if it can't + * free the pages because they're mapped. + * @mapping: the address_space which pages we want to invalidate + */ +void invalidate_inode_pages2(struct address_space * mapping) +{ + int unlocked; + + spin_lock(&pagecache_lock); + do { + unlocked = invalidate_list_pages2(&mapping->clean_pages); + unlocked |= invalidate_list_pages2(&mapping->dirty_pages); + unlocked |= invalidate_list_pages2(&mapping->locked_pages); + } while (unlocked); + spin_unlock(&pagecache_lock); +} + +static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page) +{ + goto inside; + + for (;;) { + page = page->next_hash; +inside: + if (!page) + goto not_found; + if (page->mapping != mapping) + continue; + if (page->index == offset) + break; + } + +not_found: + return page; +} + +static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *)) +{ + struct list_head *curr; + struct page *page; + int retval = 0; + + spin_lock(&pagecache_lock); + curr = head->next; + while (curr != head) { + page = list_entry(curr, struct page, list); + curr = curr->next; + if (!page->buffers) + continue; + if (page->index >= end) + continue; + if (page->index < start) + continue; + + page_cache_get(page); + spin_unlock(&pagecache_lock); + lock_page(page); + + /* The buffers could have been free'd while we waited for the page lock */ + if (page->buffers) + retval |= fn(page); + + UnlockPage(page); + spin_lock(&pagecache_lock); + curr = page->list.next; + page_cache_release(page); + } + spin_unlock(&pagecache_lock); + + return retval; +} + +/* + * Two-stage data sync: first start the IO, then go back and + * collect the information.. 
+ */ +int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx) +{ + int retval; + + /* writeout dirty buffers on pages from both clean and dirty lists */ + retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page); + + /* now wait for locked buffers on pages from both clean and dirty lists */ + retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page); + + return retval; +} + +/* + * In-memory filesystems have to fail their + * writepage function - and this has to be + * worked around in the VM layer.. + * + * We + * - mark the page dirty again (but do NOT + * add it back to the inode dirty list, as + * that would livelock in fdatasync) + * - activate the page so that the page stealer + * doesn't try to write it out over and over + * again. + */ +int fail_writepage(struct page *page) +{ + /* Only activate on memory-pressure, not fsync.. */ + if (PageLaunder(page)) { + activate_page(page); + SetPageReferenced(page); + } + + /* Set the page dirty again, unlock */ + SetPageDirty(page); + UnlockPage(page); + return 0; +} + +EXPORT_SYMBOL(fail_writepage); + +/** + * filemap_fdatasync - walk the list of dirty pages of the given address space + * and writepage() all of them. + * + * @mapping: address space structure to write + * + */ +int filemap_fdatasync(struct address_space * mapping) +{ + int ret = 0; + int (*writepage)(struct page *) = mapping->a_ops->writepage; + + spin_lock(&pagecache_lock); + + while (!list_empty(&mapping->dirty_pages)) { + struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list); + + list_del(&page->list); + list_add(&page->list, &mapping->locked_pages); + + if (!PageDirty(page)) + continue; + + page_cache_get(page); + spin_unlock(&pagecache_lock); + + lock_page(page); + + if (PageDirty(page)) { + int err; + ClearPageDirty(page); + err = writepage(page); + if (err && !ret) + ret = err; + } else + UnlockPage(page); + + page_cache_release(page); + spin_lock(&pagecache_lock); + } + spin_unlock(&pagecache_lock); + return ret; +} + +/** + * filemap_fdatawait - walk the list of locked pages of the given address space + * and wait for all of them. + * + * @mapping: address space structure to wait for + * + */ +int filemap_fdatawait(struct address_space * mapping) +{ + int ret = 0; + + spin_lock(&pagecache_lock); + + while (!list_empty(&mapping->locked_pages)) { + struct page *page = list_entry(mapping->locked_pages.next, struct page, list); + + list_del(&page->list); + list_add(&page->list, &mapping->clean_pages); + + if (!PageLocked(page)) + continue; + + page_cache_get(page); + spin_unlock(&pagecache_lock); + + ___wait_on_page(page); + if (PageError(page)) + ret = -EIO; + + page_cache_release(page); + spin_lock(&pagecache_lock); + } + spin_unlock(&pagecache_lock); + return ret; +} + +/* + * Add a page to the inode page cache. + * + * The caller must have locked the page and + * set all the page flags correctly.. 
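filemap_fdatasync() only starts writepage() on the dirty list and filemap_fdatawait() only waits on the locked list, so a caller that needs the data stable on disk runs them as a pair, flushing the inode's private data buffers in between; generic_file_direct_IO() later in this file uses exactly that sequence. A condensed sketch of the same pattern (flush_mapping() is an invented name):

        /* Sketch: make a mapping's data stable before bypassing the cache. */
        static int flush_mapping(struct inode *inode)
        {
                struct address_space *mapping = inode->i_mapping;
                int err;

                err = filemap_fdatasync(mapping);       /* kick off writepage() */
                if (err == 0)
                        err = fsync_inode_data_buffers(inode);
                if (err == 0)
                        err = filemap_fdatawait(mapping);  /* wait for completion */
                return err;
        }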
+ */ +void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index) +{ + if (!PageLocked(page)) + BUG(); + + page->index = index; + page_cache_get(page); + spin_lock(&pagecache_lock); + add_page_to_inode_queue(mapping, page); + add_page_to_hash_queue(page, page_hash(mapping, index)); + spin_unlock(&pagecache_lock); + + lru_cache_add(page); +} + +/* + * This adds a page to the page cache, starting out as locked, + * owned by us, but unreferenced, not uptodate and with no errors. + */ +static inline void __add_to_page_cache(struct page * page, + struct address_space *mapping, unsigned long offset, + struct page **hash) +{ + unsigned long flags; + + flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked); + page->flags = flags | (1 << PG_locked); + page_cache_get(page); + page->index = offset; + add_page_to_inode_queue(mapping, page); + add_page_to_hash_queue(page, hash); +} + +void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset) +{ + spin_lock(&pagecache_lock); + __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset)); + spin_unlock(&pagecache_lock); + lru_cache_add(page); +} + +int add_to_page_cache_unique(struct page * page, + struct address_space *mapping, unsigned long offset, + struct page **hash) +{ + int err; + struct page *alias; + + spin_lock(&pagecache_lock); + alias = __find_page_nolock(mapping, offset, *hash); + + err = 1; + if (!alias) { + __add_to_page_cache(page,mapping,offset,hash); + err = 0; + } + + spin_unlock(&pagecache_lock); + if (!err) + lru_cache_add(page); + return err; +} + +/* + * This adds the requested page to the page cache if it isn't already there, + * and schedules an I/O to read in its contents from disk. + */ +static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); +static int page_cache_read(struct file * file, unsigned long offset) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct page **hash = page_hash(mapping, offset); + struct page *page; + + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, offset, *hash); + spin_unlock(&pagecache_lock); + if (page) + return 0; + + page = page_cache_alloc(mapping); + if (!page) + return -ENOMEM; + + if (!add_to_page_cache_unique(page, mapping, offset, hash)) { + int error = mapping->a_ops->readpage(file, page); + page_cache_release(page); + return error; + } + /* + * We arrive here in the unlikely event that someone + * raced with us and added our page to the cache first. + */ + page_cache_release(page); + return 0; +} + +/* + * Read in an entire cluster at once. A cluster is usually a 64k- + * aligned block that includes the page requested in "offset." + */ +static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset, + unsigned long filesize)); +static int read_cluster_nonblocking(struct file * file, unsigned long offset, + unsigned long filesize) +{ + unsigned long pages = CLUSTER_PAGES; + + offset = CLUSTER_OFFSET(offset); + while ((pages-- > 0) && (offset < filesize)) { + int error = page_cache_read(file, offset); + if (error < 0) + return error; + offset ++; + } + + return 0; +} + +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. 
+ * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. + */ +#if BITS_PER_LONG == 32 +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define GOLDEN_RATIO_PRIME 0x9e370001UL +#elif BITS_PER_LONG == 64 +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL +#else +#error Define GOLDEN_RATIO_PRIME for your wordsize. +#endif + +/* + * In order to wait for pages to become available there must be + * waitqueues associated with pages. By using a hash table of + * waitqueues where the bucket discipline is to maintain all + * waiters on the same queue and wake all when any of the pages + * become available, and for the woken contexts to check to be + * sure the appropriate page became available, this saves space + * at a cost of "thundering herd" phenomena during rare hash + * collisions. + */ +static inline wait_queue_head_t *__page_waitqueue(struct page *page) +{ + const zone_t *zone = page_zone(page); + wait_queue_head_t *wait = zone->wait_table; + unsigned long hash = (unsigned long)page; + +#if BITS_PER_LONG == 64 + /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ + unsigned long n = hash; + n <<= 18; + hash -= n; + n <<= 33; + hash -= n; + n <<= 3; + hash += n; + n <<= 3; + hash -= n; + n <<= 4; + hash += n; + n <<= 2; + hash += n; +#else + /* On some cpus multiply is faster, on others gcc will do shifts */ + hash *= GOLDEN_RATIO_PRIME; +#endif + + hash >>= zone->wait_table_shift; + + return &wait[hash]; +} + +wait_queue_head_t *page_waitqueue(struct page *page) +{ + return __page_waitqueue(page); +} + +#define page_waitqueue(page) __page_waitqueue(page) + +/* + * Wait for a page to get unlocked. + * + * This must be called with the caller "holding" the page, + * ie with increased "page->count" so that the page won't + * go away during the wait.. + */ +void ___wait_on_page(struct page *page) +{ + wait_queue_head_t *waitqueue = page_waitqueue(page); + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue(waitqueue, &wait); + do { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!PageLocked(page)) + break; + sync_page(page); + schedule(); + } while (PageLocked(page)); + __set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(waitqueue, &wait); +} + +/* + * Unlock the page and wake up sleepers in ___wait_on_page. + */ +void unlock_page(struct page *page) +{ + wait_queue_head_t *waitqueue = page_waitqueue(page); + clear_bit(PG_launder, &(page)->flags); + smp_mb__before_clear_bit(); + if (!test_and_clear_bit(PG_locked, &(page)->flags)) + BUG(); + smp_mb__after_clear_bit(); + if (waitqueue_active(waitqueue)) + wake_up_all(waitqueue); +} + +/* + * Get a lock on the page, assuming we need to sleep + * to get it.. 
+ */ +static void __lock_page(struct page *page) +{ + wait_queue_head_t *waitqueue = page_waitqueue(page); + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue_exclusive(waitqueue, &wait); + for (;;) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (PageLocked(page)) { + sync_page(page); + schedule(); + } + if (!TryLockPage(page)) + break; + } + __set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(waitqueue, &wait); +} + +/* + * Get an exclusive lock on the page, optimistically + * assuming it's not locked.. + */ +void lock_page(struct page *page) +{ + if (TryLockPage(page)) + __lock_page(page); +} + +/* + * a rather lightweight function, finding and getting a reference to a + * hashed page atomically. + */ +struct page * __find_get_page(struct address_space *mapping, + unsigned long offset, struct page **hash) +{ + struct page *page; + + /* + * We scan the hash list read-only. Addition to and removal from + * the hash-list needs a held write-lock. + */ + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, offset, *hash); + if (page) + page_cache_get(page); + spin_unlock(&pagecache_lock); + return page; +} + +/* + * Same as above, but trylock it instead of incrementing the count. + */ +struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) +{ + struct page *page; + struct page **hash = page_hash(mapping, offset); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, offset, *hash); + if (page) { + if (TryLockPage(page)) + page = NULL; + } + spin_unlock(&pagecache_lock); + return page; +} + +/* + * Must be called with the pagecache lock held, + * will return with it held (but it may be dropped + * during blocking operations.. + */ +static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *)); +static struct page * __find_lock_page_helper(struct address_space *mapping, + unsigned long offset, struct page *hash) +{ + struct page *page; + + /* + * We scan the hash list read-only. Addition to and removal from + * the hash-list needs a held write-lock. + */ +repeat: + page = __find_page_nolock(mapping, offset, hash); + if (page) { + page_cache_get(page); + if (TryLockPage(page)) { + spin_unlock(&pagecache_lock); + lock_page(page); + spin_lock(&pagecache_lock); + + /* Has the page been re-allocated while we slept? */ + if (page->mapping != mapping || page->index != offset) { + UnlockPage(page); + page_cache_release(page); + goto repeat; + } + } + } + return page; +} + +/* + * Same as the above, but lock the page too, verifying that + * it's still valid once we own it. + */ +struct page * __find_lock_page (struct address_space *mapping, + unsigned long offset, struct page **hash) +{ + struct page *page; + + spin_lock(&pagecache_lock); + page = __find_lock_page_helper(mapping, offset, *hash); + spin_unlock(&pagecache_lock); + return page; +} + +/* + * Same as above, but create the page if required.. 
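find_trylock_page() gives a "never sleep" lookup: it returns the page only if it is present and immediately lockable, and (unlike __find_get_page) it does not take a reference, so the caller just unlocks it when done. This is the same trylock idea the F_ATOMIC read path uses on pages it has already found. A small sketch of the calling pattern, with peek_page_uptodate() as an invented helper name:

        /* Returns 1 if the page at `index' is cached, lockable right now
         * and up to date; never blocks.  No page reference is taken. */
        static int peek_page_uptodate(struct address_space *mapping,
                                      unsigned long index)
        {
                struct page *page = find_trylock_page(mapping, index);
                int ret = 0;

                if (page) {
                        ret = Page_Uptodate(page);
                        UnlockPage(page);
                }
                return ret;
        }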
+ */ +struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask) +{ + struct page *page; + struct page **hash = page_hash(mapping, index); + + spin_lock(&pagecache_lock); + page = __find_lock_page_helper(mapping, index, *hash); + spin_unlock(&pagecache_lock); + if (!page) { + struct page *newpage = alloc_page(gfp_mask); + if (newpage) { + spin_lock(&pagecache_lock); + page = __find_lock_page_helper(mapping, index, *hash); + if (likely(!page)) { + page = newpage; + __add_to_page_cache(page, mapping, index, hash); + newpage = NULL; + } + spin_unlock(&pagecache_lock); + if (newpage == NULL) + lru_cache_add(page); + else + page_cache_release(newpage); + } + } + return page; +} + +/* + * Returns locked page at given index in given cache, creating it if needed. + */ +struct page *grab_cache_page(struct address_space *mapping, unsigned long index) +{ + return find_or_create_page(mapping, index, mapping->gfp_mask); +} + + +/* + * Same as grab_cache_page, but do not wait if the page is unavailable. + * This is intended for speculative data generators, where the data can + * be regenerated if the page couldn't be grabbed. This routine should + * be safe to call while holding the lock for another page. + */ +struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index) +{ + struct page *page, **hash; + + hash = page_hash(mapping, index); + page = __find_get_page(mapping, index, hash); + + if ( page ) { + if ( !TryLockPage(page) ) { + /* Page found and locked */ + /* This test is overly paranoid, but what the heck... */ + if ( unlikely(page->mapping != mapping || page->index != index) ) { + /* Someone reallocated this page under us. */ + UnlockPage(page); + page_cache_release(page); + return NULL; + } else { + return page; + } + } else { + /* Page locked by someone else */ + page_cache_release(page); + return NULL; + } + } + + page = page_cache_alloc(mapping); + if ( unlikely(!page) ) + return NULL; /* Failed to allocate a page */ + + if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) { + /* Someone else grabbed the page already. */ + page_cache_release(page); + return NULL; + } + + return page; +} + +#if 0 +#define PROFILE_READAHEAD +#define DEBUG_READAHEAD +#endif + +/* + * Read-ahead profiling information + * -------------------------------- + * Every PROFILE_MAXREADCOUNT, the following information is written + * to the syslog: + * Percentage of asynchronous read-ahead. + * Average of read-ahead fields context value. + * If DEBUG_READAHEAD is defined, a snapshot of these fields is written + * to the syslog. 
+ */ + +#ifdef PROFILE_READAHEAD + +#define PROFILE_MAXREADCOUNT 1000 + +static unsigned long total_reada; +static unsigned long total_async; +static unsigned long total_ramax; +static unsigned long total_ralen; +static unsigned long total_rawin; + +static void profile_readahead(int async, struct file *filp) +{ + unsigned long flags; + + ++total_reada; + if (async) + ++total_async; + + total_ramax += filp->f_ramax; + total_ralen += filp->f_ralen; + total_rawin += filp->f_rawin; + + if (total_reada > PROFILE_MAXREADCOUNT) { + save_flags(flags); + cli(); + if (!(total_reada > PROFILE_MAXREADCOUNT)) { + restore_flags(flags); + return; + } + + printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n", + total_ramax/total_reada, + total_ralen/total_reada, + total_rawin/total_reada, + (total_async*100)/total_reada); +#ifdef DEBUG_READAHEAD + printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n", + filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend); +#endif + + total_reada = 0; + total_async = 0; + total_ramax = 0; + total_ralen = 0; + total_rawin = 0; + + restore_flags(flags); + } +} +#endif /* defined PROFILE_READAHEAD */ + +/* + * Read-ahead context: + * ------------------- + * The read ahead context fields of the "struct file" are the following: + * - f_raend : position of the first byte after the last page we tried to + * read ahead. + * - f_ramax : current read-ahead maximum size. + * - f_ralen : length of the current IO read block we tried to read-ahead. + * - f_rawin : length of the current read-ahead window. + * if last read-ahead was synchronous then + * f_rawin = f_ralen + * otherwise (was asynchronous) + * f_rawin = previous value of f_ralen + f_ralen + * + * Read-ahead limits: + * ------------------ + * MIN_READAHEAD : minimum read-ahead size when read-ahead. + * MAX_READAHEAD : maximum read-ahead size when read-ahead. + * + * Synchronous read-ahead benefits: + * -------------------------------- + * Using reasonable IO xfer length from peripheral devices increase system + * performances. + * Reasonable means, in this context, not too large but not too small. + * The actual maximum value is: + * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k is CONFIG_READA_SMALL is undefined + * and 32K if defined (4K page size assumed). + * + * Asynchronous read-ahead benefits: + * --------------------------------- + * Overlapping next read request and user process execution increase system + * performance. + * + * Read-ahead risks: + * ----------------- + * We have to guess which further data are needed by the user process. + * If these data are often not really needed, it's bad for system + * performances. + * However, we know that files are often accessed sequentially by + * application programs and it seems that it is possible to have some good + * strategy in that guessing. + * We only try to read-ahead files that seems to be read sequentially. + * + * Asynchronous read-ahead risks: + * ------------------------------ + * In order to maximize overlapping, we must start some asynchronous read + * request from the device, as soon as possible. + * We must be very careful about: + * - The number of effective pending IO read requests. + * ONE seems to be the only reasonable value. + * - The total memory pool usage for the file access stream. + * This maximum memory usage is implicitly 2 IO read chunks: + * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined, + * 64k if defined (4K page size assumed). 
+ */ + +static inline int get_max_readahead(struct inode * inode) +{ + if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)]) + return vm_max_readahead; + return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)]; +} + +static void generic_file_readahead(int reada_ok, + struct file * filp, struct inode * inode, + struct page * page, int flags) +{ + unsigned long end_index; + unsigned long index = page->index; + unsigned long max_ahead, ahead; + unsigned long raend; + int max_readahead = get_max_readahead(inode); + + end_index = inode->i_size >> PAGE_CACHE_SHIFT; + + raend = filp->f_raend; + max_ahead = 0; + +/* + * The current page is locked. + * If the current position is inside the previous read IO request, do not + * try to reread previously read ahead pages. + * Otherwise decide or not to read ahead some pages synchronously. + * If we are not going to read ahead, set the read ahead context for this + * page only. + */ + if (PageLocked(page)) { + if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) { + raend = index; + if (raend < end_index) + max_ahead = filp->f_ramax; + filp->f_rawin = 0; + filp->f_ralen = 1; + if (!max_ahead) { + filp->f_raend = index + filp->f_ralen; + filp->f_rawin += filp->f_ralen; + } + } + } +/* + * The current page is not locked. + * If we were reading ahead and, + * if the current max read ahead size is not zero and, + * if the current position is inside the last read-ahead IO request, + * it is the moment to try to read ahead asynchronously. + * We will later force unplug device in order to force asynchronous read IO. + */ + else if (reada_ok && filp->f_ramax && raend >= 1 && + index <= raend && index + filp->f_ralen >= raend) { +/* + * Add ONE page to max_ahead in order to try to have about the same IO max size + * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE. + * Compute the position of the last page we have tried to read in order to + * begin to read ahead just at the next page. + */ + raend -= 1; + if (raend < end_index) + max_ahead = filp->f_ramax + 1; + + if (max_ahead) { + filp->f_rawin = filp->f_ralen; + filp->f_ralen = 0; + reada_ok = 2; + } + } +/* + * Try to read ahead pages. + * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the + * scheduler, will work enough for us to avoid too bad actuals IO requests. + */ + ahead = 0; + while (ahead < max_ahead) { + ahead ++; + if ((raend + ahead) >= end_index) + break; + if (page_cache_read(filp, raend + ahead) < 0) + break; + } +/* + * If we tried to read ahead some pages, + * If we tried to read ahead asynchronously, + * Try to force unplug of the device in order to start an asynchronous + * read IO request. + * Update the read-ahead context. + * Store the length of the current read-ahead window. + * Double the current max read ahead size. + * That heuristic avoid to do some large IO for files that are not really + * accessed sequentially. + */ + if (ahead) { + filp->f_ralen += ahead; + filp->f_rawin += filp->f_ralen; + filp->f_raend = raend + ahead + 1; + + filp->f_ramax += filp->f_ramax; + + if (filp->f_ramax > max_readahead) + filp->f_ramax = max_readahead; + +#ifdef PROFILE_READAHEAD + profile_readahead((reada_ok == 2), filp); +#endif + } + + return; +} + +/* + * Mark a page as having seen activity. + * + * If it was already so marked, move it + * to the active queue and drop the referenced + * bit. Otherwise, just mark it for future + * action.. 
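The window-growth rule at the tail of generic_file_readahead() is easier to see in isolation: after a successful batch, f_ramax doubles and is then clamped to the per-device (or global) maximum. A sketch of just that heuristic; starting from a 4-page window it yields 8, 16, 31, 31, ... once it reaches the default vm_max_readahead of 31 and no per-device table entry overrides it:

        /* The growth rule from generic_file_readahead(), isolated. */
        static unsigned long next_ramax(unsigned long ramax, int max_readahead)
        {
                ramax += ramax;                 /* filp->f_ramax += filp->f_ramax */
                if (ramax > max_readahead)
                        ramax = max_readahead;  /* e.g. vm_max_readahead == 31 */
                return ramax;
        }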
+ */ +void mark_page_accessed(struct page *page) +{ + if (!PageActive(page) && PageReferenced(page)) { + activate_page(page); + ClearPageReferenced(page); + return; + } + + /* Mark the page referenced, AFTER checking for previous usage.. */ + SetPageReferenced(page); +} + +/* + * This is a generic file read routine, and uses the + * inode->i_op->readpage() function for the actual low-level + * stuff. + * + * This is really ugly. But the goto's actually try to clarify some + * of the logic when it comes to error handling etc. + */ +void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor, int flags) +{ + struct address_space *mapping = filp->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + unsigned long index, offset; + struct page *cached_page; + int reada_ok; + int error; + int max_readahead = get_max_readahead(inode); + loff_t pos; + + pos = *ppos; + if (unlikely(pos < 0)) { + desc->error = -EINVAL; + return; + } + + cached_page = NULL; + index = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; + +/* + * If the current position is outside the previous read-ahead window, + * we reset the current read-ahead context and set read ahead max to zero + * (will be set to just needed value later), + * otherwise, we assume that the file accesses are sequential enough to + * continue read-ahead. + */ + if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) { + reada_ok = 0; + filp->f_raend = 0; + filp->f_ralen = 0; + filp->f_ramax = 0; + filp->f_rawin = 0; + } else { + reada_ok = 1; + } +/* + * Adjust the current value of read-ahead max. + * If the read operation stay in the first half page, force no readahead. + * Otherwise try to increase read ahead max just enough to do the read request. + * Then, at least MIN_READAHEAD if read ahead is ok, + * and at most MAX_READAHEAD in all cases. + */ + if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) { + filp->f_ramax = 0; + } else { + unsigned long needed; + + needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1; + + if (filp->f_ramax < needed) + filp->f_ramax = needed; + + if (reada_ok && filp->f_ramax < vm_min_readahead) + filp->f_ramax = vm_min_readahead; + if (filp->f_ramax > max_readahead) + filp->f_ramax = max_readahead; + } + + for (;;) { + struct page *page, **hash; + unsigned long end_index, nr, ret; + + end_index = inode->i_size >> PAGE_CACHE_SHIFT; + + if (index > end_index) { + desc->error = 0; + break; + } + nr = PAGE_CACHE_SIZE; + if (index == end_index) { + nr = inode->i_size & ~PAGE_CACHE_MASK; + if (nr <= offset) { + desc->error = 0; + break; + } + } + + nr = nr - offset; + + /* + * Try to find the data in the page cache.. + */ + hash = page_hash(mapping, index); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, index, *hash); + if (!page) + goto no_cached_page; +found_page: + page_cache_get(page); + spin_unlock(&pagecache_lock); + + if (!Page_Uptodate(page)) + goto page_not_up_to_date; + generic_file_readahead(reada_ok, filp, inode, page, flags); +page_ok: + /* If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping->i_mmap_shared != NULL) + flush_dcache_page(page); + + /* + * Mark the page accessed if we read the + * beginning or we just did an lseek. 
+ */ + if (!offset || !filp->f_reada) + mark_page_accessed(page); + + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + * + * The actor routine returns how many bytes were actually used.. + * NOTE! This may not be the same as how much of a user buffer + * we filled up (we may be padding etc), so we can only update + * "pos" here (the actor routine has to update the user buffer + * pointers and the remaining count). + */ + ret = actor(desc, page, offset, nr); + offset += ret; + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + + page_cache_release(page); + if (ret == nr && desc->count) + continue; + break; + +/* + * Ok, the page was not immediately readable, so let's try to read ahead while we're at it.. + */ +page_not_up_to_date: + generic_file_readahead(reada_ok, filp, inode, page, flags); + + if (Page_Uptodate(page)) + goto page_ok; + + /* Get exclusive access to the page ... */ + if (flags & F_ATOMIC) { + if (TryLockPage(page)) { + if (Page_Uptodate(page)) + goto page_ok; + desc->error = -EWOULDBLOCKIO; + page_cache_release(page); + break; + } + printk("page_not_up_to_date: atomic trylock succeeded\n"); + } else + lock_page(page); + + /* Did it get unhashed before we got the lock? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + continue; + } + + /* Did somebody else fill it already? */ + if (Page_Uptodate(page)) { + UnlockPage(page); + goto page_ok; + } + +readpage: + /* ... and start the actual read. The read will unlock the page. */ + error = mapping->a_ops->readpage(filp, page); + + if (!error) { + if (Page_Uptodate(page)) + goto page_ok; + + /* Again, try some read-ahead while waiting for the page to finish.. */ + generic_file_readahead(reada_ok, filp, inode, page, flags); + if (!(flags & F_ATOMIC)) + wait_on_page(page); + if (Page_Uptodate(page)) + goto page_ok; + error = (flags & F_ATOMIC) ? -EWOULDBLOCKIO : -EIO; + } + + /* UHHUH! A synchronous read error occurred. Report it */ + desc->error = error; + page_cache_release(page); + break; + +no_cached_page: + if (flags & F_ATOMIC) { + spin_unlock(&pagecache_lock); + desc->error = -EWOULDBLOCKIO; + break; + } + /* + * Ok, it wasn't cached, so we need to create a new + * page.. + * + * We get here with the page cache lock held. + */ + if (!cached_page) { + spin_unlock(&pagecache_lock); + cached_page = page_cache_alloc(mapping); + if (!cached_page) { + desc->error = -ENOMEM; + break; + } + + /* + * Somebody may have added the page while we + * dropped the page cache lock. Check for that. + */ + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, index, *hash); + if (page) + goto found_page; + } + + /* + * Ok, add the new page to the hash-queues... 
+ */ + page = cached_page; + __add_to_page_cache(page, mapping, index, hash); + spin_unlock(&pagecache_lock); + lru_cache_add(page); + cached_page = NULL; + + goto readpage; + } + + *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; + filp->f_reada = 1; + if (cached_page) + page_cache_release(cached_page); + UPDATE_ATIME(inode); +} + +static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset) +{ + ssize_t retval; + int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress; + struct kiobuf * iobuf; + struct address_space * mapping = filp->f_dentry->d_inode->i_mapping; + struct inode * inode = mapping->host; + + new_iobuf = 0; + iobuf = filp->f_iobuf; + if (test_and_set_bit(0, &filp->f_iobuf_lock)) { + /* + * A parallel read/write is using the preallocated iobuf + * so just run slow and allocate a new one. + */ + retval = alloc_kiovec(1, &iobuf); + if (retval) + goto out; + new_iobuf = 1; + } + + blocksize = 1 << inode->i_blkbits; + blocksize_bits = inode->i_blkbits; + blocksize_mask = blocksize - 1; + chunk_size = KIO_MAX_ATOMIC_IO << 10; + + retval = -EINVAL; + if ((offset & blocksize_mask) || (count & blocksize_mask)) + goto out_free; + if (!mapping->a_ops->direct_IO) + goto out_free; + + /* + * Flush to disk exclusively the _data_, metadata must remain + * completly asynchronous or performance will go to /dev/null. + */ + retval = filemap_fdatasync(mapping); + if (retval == 0) + retval = fsync_inode_data_buffers(inode); + if (retval == 0) + retval = filemap_fdatawait(mapping); + if (retval < 0) + goto out_free; + + progress = retval = 0; + while (count > 0) { + iosize = count; + if (iosize > chunk_size) + iosize = chunk_size; + + retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize); + if (retval) + break; + + retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize); + + if (rw == READ && retval > 0) + mark_dirty_kiobuf(iobuf, retval); + + if (retval >= 0) { + count -= retval; + buf += retval; + progress += retval; + } + + unmap_kiobuf(iobuf); + + if (retval != iosize) + break; + } + + if (progress) + retval = progress; + + out_free: + if (!new_iobuf) + clear_bit(0, &filp->f_iobuf_lock); + else + free_kiovec(1, &iobuf); + out: + return retval; +} + +int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) +{ + char *kaddr; + unsigned long left, count = desc->count; + + if (size > count) + size = count; + + kaddr = kmap(page); + left = __copy_to_user(desc->buf, kaddr + offset, size); + kunmap(page); + + if (left) { + size -= left; + desc->error = -EFAULT; + } + desc->count = count - size; + desc->written += size; + desc->buf += size; + return size; +} + +/* + * This is the "read()" routine for all filesystems + * that can use the page cache directly. 
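generic_file_direct_IO() above rejects any request that is not aligned to the logical block size, since the kiobuf is handed to the block layer untouched: both the file offset and the length must be multiples of 1 << inode->i_blkbits. The same check in isolation, with dio_alignment_ok() as an invented name:

        /* Mirror of the -EINVAL alignment test in generic_file_direct_IO(). */
        static int dio_alignment_ok(struct inode *inode, loff_t offset, size_t count)
        {
                int blocksize_mask = (1 << inode->i_blkbits) - 1;

                return !((offset & blocksize_mask) || (count & blocksize_mask));
        }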
+ */
+ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
+{
+	return generic_file_new_read(filp, buf, count, ppos, 0);
+}
+
+ssize_t generic_file_new_read(struct file * filp, char * buf, size_t count, loff_t *ppos, int flags)
+{
+	ssize_t retval;
+
+	if ((ssize_t) count < 0)
+		return -EINVAL;
+
+	if (filp->f_flags & O_DIRECT)
+		goto o_direct;
+
+	retval = -EFAULT;
+	if (access_ok(VERIFY_WRITE, buf, count)) {
+		retval = 0;
+
+		if (count) {
+			read_descriptor_t desc;
+
+			desc.written = 0;
+			desc.count = count;
+			desc.buf = buf;
+			desc.error = 0;
+			do_generic_file_read(filp, ppos, &desc, file_read_actor, flags);
+
+			retval = desc.written;
+			if (!retval)
+				retval = desc.error;
+		}
+	}
+ out:
+	return retval;
+
+ o_direct:
+	{
+		loff_t pos = *ppos, size;
+		struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
+		struct inode *inode = mapping->host;
+
+		retval = 0;
+		if (!count)
+			goto out; /* skip atime */
+		size = inode->i_size;
+		if (pos < size) {
+			if (pos + count > size)
+				count = size - pos;
+			retval = generic_file_direct_IO(READ, filp, buf, count, pos);
+			if (retval > 0)
+				*ppos = pos + retval;
+		}
+		UPDATE_ATIME(filp->f_dentry->d_inode);
+		goto out;
+	}
+}
+
+static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
+{
+	ssize_t written;
+	unsigned long count = desc->count;
+	struct file *file = (struct file *) desc->buf;
+
+	if (size > count)
+		size = count;
+
+	if (file->f_op->sendpage) {
+		written = file->f_op->sendpage(file, page, offset,
+					       size, &file->f_pos, size < count);
+	} else {
+		char *kaddr;
+		mm_segment_t old_fs;
+
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+
+		kaddr = kmap(page);
+		written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);
+		kunmap(page);
+
+		set_fs(old_fs);
+	}
+	if (written < 0) {
+		desc->error = written;
+		written = 0;
+	}
+	desc->count = count - written;
+	desc->written += written;
+	return written;
+}
+
+asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
+{
+	ssize_t retval;
+	struct file * in_file, * out_file;
+	struct inode * in_inode, * out_inode;
+
+	/*
+	 * Get input file, and verify that it is ok..
+	 */
+	retval = -EBADF;
+	in_file = fget(in_fd);
+	if (!in_file)
+		goto out;
+	if (!(in_file->f_mode & FMODE_READ))
+		goto fput_in;
+	retval = -EINVAL;
+	in_inode = in_file->f_dentry->d_inode;
+	if (!in_inode)
+		goto fput_in;
+	if (!in_inode->i_mapping->a_ops->readpage)
+		goto fput_in;
+	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
+	if (retval)
+		goto fput_in;
+
+	/*
+	 * Get output file, and verify that it is ok..
+ */ + retval = -EBADF; + out_file = fget(out_fd); + if (!out_file) + goto fput_in; + if (!(out_file->f_mode & FMODE_WRITE)) + goto fput_out; + retval = -EINVAL; + if (!out_file->f_op || !out_file->f_op->write) + goto fput_out; + out_inode = out_file->f_dentry->d_inode; + retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count); + if (retval) + goto fput_out; + + retval = 0; + if (count) { + read_descriptor_t desc; + loff_t pos = 0, *ppos; + + retval = -EFAULT; + ppos = &in_file->f_pos; + if (offset) { + if (get_user(pos, offset)) + goto fput_out; + ppos = &pos; + } + + desc.written = 0; + desc.count = count; + desc.buf = (char *) out_file; + desc.error = 0; + do_generic_file_read(in_file, ppos, &desc, file_send_actor, 0); + + retval = desc.written; + if (!retval) + retval = desc.error; + if (offset) + put_user(pos, offset); + } + +fput_out: + fput(out_file); +fput_in: + fput(in_file); +out: + return retval; +} + +static ssize_t do_readahead(struct file *file, unsigned long index, unsigned long nr) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + unsigned long max; + + if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) + return -EINVAL; + + /* Limit it to the size of the file.. */ + max = (mapping->host->i_size + ~PAGE_CACHE_MASK) >> PAGE_CACHE_SHIFT; + if (index > max) + return 0; + max -= index; + if (nr > max) + nr = max; + + /* And limit it to a sane percentage of the inactive list.. */ + max = nr_inactive_pages / 2; + if (nr > max) + nr = max; + + while (nr) { + page_cache_read(file, index); + index++; + nr--; + } + return 0; +} + +asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) +{ + ssize_t ret; + struct file *file; + + ret = -EBADF; + file = fget(fd); + if (file) { + if (file->f_mode & FMODE_READ) { + unsigned long start = offset >> PAGE_CACHE_SHIFT; + unsigned long len = (count + ((long)offset & ~PAGE_CACHE_MASK)) >> PAGE_CACHE_SHIFT; + ret = do_readahead(file, start, len); + } + fput(file); + } + return ret; +} + +/* + * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are + * sure this is sequential access, we don't need a flexible read-ahead + * window size -- we can always use a large fixed size window. + */ +static void nopage_sequential_readahead(struct vm_area_struct * vma, + unsigned long pgoff, unsigned long filesize) +{ + unsigned long ra_window; + + ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode); + ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1); + + /* vm_raend is zero if we haven't read ahead in this area yet. */ + if (vma->vm_raend == 0) + vma->vm_raend = vma->vm_pgoff + ra_window; + + /* + * If we've just faulted the page half-way through our window, + * then schedule reads for the next window, and release the + * pages in the previous window. + */ + if ((pgoff + (ra_window >> 1)) == vma->vm_raend) { + unsigned long start = vma->vm_pgoff + vma->vm_raend; + unsigned long end = start + ra_window; + + if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff)) + end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff; + if (start > end) + return; + + while ((start < end) && (start < filesize)) { + if (read_cluster_nonblocking(vma->vm_file, + start, filesize) < 0) + break; + start += CLUSTER_PAGES; + } + run_task_queue(&tq_disk); + + /* if we're far enough past the beginning of this area, + recycle pages that are in the previous window. 
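+	   For example (numbers purely illustrative): with ra_window rounded
+	   up to 32 pages, each time a fault lands in the middle of the
+	   current window we schedule reads for the next 32 pages and
+	   MS_INVALIDATE the 32-page window that now lies two windows behind
+	   the read-ahead edge, so a long sequential scan keeps a roughly
+	   constant footprint in the page cache.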
*/ + if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) { + unsigned long window = ra_window << PAGE_SHIFT; + + end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT); + end -= window + window; + filemap_sync(vma, end - window, window, MS_INVALIDATE); + } + + vma->vm_raend += ra_window; + } + + return; +} + +/* + * filemap_nopage() is invoked via the vma operations vector for a + * mapped memory region to read in file data during a page fault. + * + * The goto's are kind of ugly, but this streamlines the normal case of having + * it in the page cache, and handles the special cases reasonably without + * having a lot of duplicated code. + */ +struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused) +{ + int error; + struct file *file = area->vm_file; + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + struct page *page, **hash; + unsigned long size, pgoff, endoff; + + pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + +retry_all: + /* + * An external ptracer can access pages that normally aren't + * accessible.. + */ + size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if ((pgoff >= size) && (area->vm_mm == current->mm)) + return NULL; + + /* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */ + if (size > endoff) + size = endoff; + + /* + * Do we have something in the page cache already? + */ + hash = page_hash(mapping, pgoff); +retry_find: + page = __find_get_page(mapping, pgoff, hash); + if (!page) + goto no_cached_page; + + /* + * Ok, found a page in the page cache, now we need to check + * that it's up-to-date. + */ + if (!Page_Uptodate(page)) + goto page_not_uptodate; + +success: + /* + * Try read-ahead for sequential areas. + */ + if (VM_SequentialReadHint(area)) + nopage_sequential_readahead(area, pgoff, size); + + /* + * Found the page and have a reference on it, need to check sharing + * and possibly copy it over to another page.. + */ + mark_page_accessed(page); + flush_page_to_ram(page); + return page; + +no_cached_page: + /* + * If the requested offset is within our file, try to read a whole + * cluster of pages at once. + * + * Otherwise, we're off the end of a privately mapped file, + * so we need to map a zero page. + */ + if ((pgoff < size) && !VM_RandomReadHint(area)) + error = read_cluster_nonblocking(file, pgoff, size); + else + error = page_cache_read(file, pgoff); + + /* + * The page we want has now been added to the page cache. + * In the unlikely event that someone removed it in the + * meantime, we'll just come back here and read it again. + */ + if (error >= 0) + goto retry_find; + + /* + * An error return from page_cache_read can result if the + * system is low on memory, or a problem occurs while trying + * to schedule I/O. + */ + if (error == -ENOMEM) + return NOPAGE_OOM; + return NULL; + +page_not_uptodate: + lock_page(page); + + /* Did it get unhashed while we waited for it? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + goto retry_all; + } + + /* Did somebody else get it up-to-date? */ + if (Page_Uptodate(page)) { + UnlockPage(page); + goto success; + } + + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page(page); + if (Page_Uptodate(page)) + goto success; + } + + /* + * Umm, take care of errors if the page isn't up-to-date. + * Try to re-read it _once_. 
We do this synchronously, + * because there really aren't any performance issues here + * and we need to check for errors. + */ + lock_page(page); + + /* Somebody truncated the page on us? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + goto retry_all; + } + + /* Somebody else successfully read it in? */ + if (Page_Uptodate(page)) { + UnlockPage(page); + goto success; + } + ClearPageError(page); + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page(page); + if (Page_Uptodate(page)) + goto success; + } + + /* + * Things didn't work out. Return zero to tell the + * mm layer so, possibly freeing the page cache page first. + */ + page_cache_release(page); + return NULL; +} + +/* Called with mm->page_table_lock held to protect against other + * threads/the swapper from ripping pte's out from under us. + */ +static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + pte_t pte = *ptep; + + if (pte_present(pte)) { + struct page *page = pte_page(pte); + if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) { + flush_tlb_page(vma, address); + set_page_dirty(page); + } + } + return 0; +} + +static inline int filemap_sync_pte_range(pmd_t * pmd, + unsigned long address, unsigned long size, + struct vm_area_struct *vma, unsigned long offset, unsigned int flags) +{ + pte_t * pte; + unsigned long end; + int error; + + if (pmd_none(*pmd)) + return 0; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return 0; + } + pte = pte_offset(pmd, address); + offset += address & PMD_MASK; + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + error = 0; + do { + error |= filemap_sync_pte(pte, vma, address + offset, flags); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); + return error; +} + +static inline int filemap_sync_pmd_range(pgd_t * pgd, + unsigned long address, unsigned long size, + struct vm_area_struct *vma, unsigned int flags) +{ + pmd_t * pmd; + unsigned long offset, end; + int error; + + if (pgd_none(*pgd)) + return 0; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + return 0; + } + pmd = pmd_offset(pgd, address); + offset = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + error = 0; + do { + error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return error; +} + +int filemap_sync(struct vm_area_struct * vma, unsigned long address, + size_t size, unsigned int flags) +{ + pgd_t * dir; + unsigned long end = address + size; + int error = 0; + + /* Aquire the lock early; it may be possible to avoid dropping + * and reaquiring it repeatedly. 
+ */ + spin_lock(&vma->vm_mm->page_table_lock); + + dir = pgd_offset(vma->vm_mm, address); + flush_cache_range(vma->vm_mm, end - size, end); + if (address >= end) + BUG(); + do { + error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + flush_tlb_range(vma->vm_mm, end - size, end); + + spin_unlock(&vma->vm_mm->page_table_lock); + + return error; +} + +static struct vm_operations_struct generic_file_vm_ops = { + nopage: filemap_nopage, +}; + +/* This is used for a general mmap of a disk file */ + +int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { + if (!mapping->a_ops->writepage) + return -EINVAL; + } + if (!mapping->a_ops->readpage) + return -ENOEXEC; + UPDATE_ATIME(inode); + vma->vm_ops = &generic_file_vm_ops; + return 0; +} + +/* + * The msync() system call. + */ + +/* + * MS_SYNC syncs the entire file - including mappings. + * + * MS_ASYNC initiates writeout of just the dirty mapped data. + * This provides no guarantee of file integrity - things like indirect + * blocks may not have started writeout. MS_ASYNC is primarily useful + * where the application knows that it has finished with the data and + * wishes to intelligently schedule its own I/O traffic. + */ +static int msync_interval(struct vm_area_struct * vma, + unsigned long start, unsigned long end, int flags) +{ + int ret = 0; + struct file * file = vma->vm_file; + + if ( (flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED) ) + return -EBUSY; + + if (file && (vma->vm_flags & VM_SHARED)) { + ret = filemap_sync(vma, start, end-start, flags); + + if (!ret && (flags & (MS_SYNC|MS_ASYNC))) { + struct inode * inode = file->f_dentry->d_inode; + + down(&inode->i_sem); + ret = filemap_fdatasync(inode->i_mapping); + if (flags & MS_SYNC) { + int err; + + if (file->f_op && file->f_op->fsync) { + err = file->f_op->fsync(file, file->f_dentry, 1); + if (err && !ret) + ret = err; + } + err = filemap_fdatawait(inode->i_mapping); + if (err && !ret) + ret = err; + } + up(&inode->i_sem); + } + } + return ret; +} + +asmlinkage long sys_msync(unsigned long start, size_t len, int flags) +{ + unsigned long end; + struct vm_area_struct * vma; + int unmapped_error, error = -EINVAL; + + down_read(¤t->mm->mmap_sem); + if (start & ~PAGE_MASK) + goto out; + len = (len + ~PAGE_MASK) & PAGE_MASK; + end = start + len; + if (end < start) + goto out; + if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) + goto out; + if ((flags & MS_ASYNC) && (flags & MS_SYNC)) + goto out; + + error = 0; + if (end == start) + goto out; + /* + * If the interval [start,end) covers some unmapped address ranges, + * just ignore them, but return -ENOMEM at the end. + */ + vma = find_vma(current->mm, start); + unmapped_error = 0; + for (;;) { + /* Still start < end. */ + error = -ENOMEM; + if (!vma) + goto out; + /* Here start < vma->vm_end. */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + } + /* Here vma->vm_start <= start < vma->vm_end. */ + if (end <= vma->vm_end) { + if (start < end) { + error = msync_interval(vma, start, end, flags); + if (error) + goto out; + } + error = unmapped_error; + goto out; + } + /* Here vma->vm_start <= start < vma->vm_end < end. 
*/ + error = msync_interval(vma, start, vma->vm_end, flags); + if (error) + goto out; + start = vma->vm_end; + vma = vma->vm_next; + } +out: + up_read(¤t->mm->mmap_sem); + return error; +} + +static inline void setup_read_behavior(struct vm_area_struct * vma, + int behavior) +{ + VM_ClearReadHint(vma); + switch(behavior) { + case MADV_SEQUENTIAL: + vma->vm_flags |= VM_SEQ_READ; + break; + case MADV_RANDOM: + vma->vm_flags |= VM_RAND_READ; + break; + default: + break; + } + return; +} + +static long madvise_fixup_start(struct vm_area_struct * vma, + unsigned long end, int behavior) +{ + struct vm_area_struct * n; + struct mm_struct * mm = vma->vm_mm; + + n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!n) + return -EAGAIN; + *n = *vma; + n->vm_end = end; + setup_read_behavior(n, behavior); + n->vm_raend = 0; + if (n->vm_file) + get_file(n->vm_file); + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT; + lock_vma_mappings(vma); + spin_lock(&mm->page_table_lock); + vma->vm_start = end; + __insert_vm_struct(mm, n); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(vma); + return 0; +} + +static long madvise_fixup_end(struct vm_area_struct * vma, + unsigned long start, int behavior) +{ + struct vm_area_struct * n; + struct mm_struct * mm = vma->vm_mm; + + n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!n) + return -EAGAIN; + *n = *vma; + n->vm_start = start; + n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT; + setup_read_behavior(n, behavior); + n->vm_raend = 0; + if (n->vm_file) + get_file(n->vm_file); + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + lock_vma_mappings(vma); + spin_lock(&mm->page_table_lock); + vma->vm_end = start; + __insert_vm_struct(mm, n); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(vma); + return 0; +} + +static long madvise_fixup_middle(struct vm_area_struct * vma, + unsigned long start, unsigned long end, int behavior) +{ + struct vm_area_struct * left, * right; + struct mm_struct * mm = vma->vm_mm; + + left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!left) + return -EAGAIN; + right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!right) { + kmem_cache_free(vm_area_cachep, left); + return -EAGAIN; + } + *left = *vma; + *right = *vma; + left->vm_end = start; + right->vm_start = end; + right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT; + left->vm_raend = 0; + right->vm_raend = 0; + if (vma->vm_file) + atomic_add(2, &vma->vm_file->f_count); + + if (vma->vm_ops && vma->vm_ops->open) { + vma->vm_ops->open(left); + vma->vm_ops->open(right); + } + vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT; + vma->vm_raend = 0; + lock_vma_mappings(vma); + spin_lock(&mm->page_table_lock); + vma->vm_start = start; + vma->vm_end = end; + setup_read_behavior(vma, behavior); + __insert_vm_struct(mm, left); + __insert_vm_struct(mm, right); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(vma); + return 0; +} + +/* + * We can potentially split a vm area into separate + * areas, each area with its own behavior. 
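+ *
+ * For instance, applying madvise(MADV_SEQUENTIAL) to the middle of an
+ * existing mapping turns one vma into three:
+ *
+ *	before:	[............. NORMAL .............]
+ *	after:	[ NORMAL ][ SEQUENTIAL ][ NORMAL ]
+ *
+ * madvise_fixup_start/end/middle above handle the three possible split
+ * shapes; only the affected piece gets the new read hint.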
+ */ +static long madvise_behavior(struct vm_area_struct * vma, + unsigned long start, unsigned long end, int behavior) +{ + int error = 0; + + /* This caps the number of vma's this process can own */ + if (vma->vm_mm->map_count > max_map_count) + return -ENOMEM; + + if (start == vma->vm_start) { + if (end == vma->vm_end) { + setup_read_behavior(vma, behavior); + vma->vm_raend = 0; + } else + error = madvise_fixup_start(vma, end, behavior); + } else { + if (end == vma->vm_end) + error = madvise_fixup_end(vma, start, behavior); + else + error = madvise_fixup_middle(vma, start, end, behavior); + } + + return error; +} + +/* + * Schedule all required I/O operations, then run the disk queue + * to make sure they are started. Do not wait for completion. + */ +static long madvise_willneed(struct vm_area_struct * vma, + unsigned long start, unsigned long end) +{ + long error = -EBADF; + struct file * file; + unsigned long size, rlim_rss; + + /* Doesn't work if there's no mapped file. */ + if (!vma->vm_file) + return error; + file = vma->vm_file; + size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (end > vma->vm_end) + end = vma->vm_end; + end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + /* Make sure this doesn't exceed the process's max rss. */ + error = -EIO; + rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur : + LONG_MAX; /* default: see resource.h */ + if ((vma->vm_mm->rss + (end - start)) > rlim_rss) + return error; + + /* round to cluster boundaries if this isn't a "random" area. */ + if (!VM_RandomReadHint(vma)) { + start = CLUSTER_OFFSET(start); + end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1); + + while ((start < end) && (start < size)) { + error = read_cluster_nonblocking(file, start, size); + start += CLUSTER_PAGES; + if (error < 0) + break; + } + } else { + while ((start < end) && (start < size)) { + error = page_cache_read(file, start); + start++; + if (error < 0) + break; + } + } + + /* Don't wait for someone else to push these requests. */ + run_task_queue(&tq_disk); + + return error; +} + +/* + * Application no longer needs these pages. If the pages are dirty, + * it's OK to just throw them away. The app will be more careful about + * data it wants to keep. Be sure to free swap resources too. The + * zap_page_range call sets things up for refill_inactive to actually free + * these pages later if no one else has touched them in the meantime, + * although we could add these pages to a global reuse list for + * refill_inactive to pick up before reclaiming other pages. + * + * NB: This interface discards data rather than pushes it out to swap, + * as some implementations do. This has performance implications for + * applications like large transactional databases which want to discard + * pages in anonymous maps after committing to backing store the data + * that was kept in them. There is no reason to write this data out to + * the swap area if the application is discarding it. + * + * An interface that causes the system to free clean pages and flush + * dirty pages is already available as msync(MS_INVALIDATE). 
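+ *
+ * A minimal sketch of the intended use from user space (illustrative
+ * only, error handling omitted):
+ *
+ *	buf = mmap(NULL, len, PROT_READ|PROT_WRITE,
+ *		   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+ *	... fill and use buf ...
+ *	madvise(buf, len, MADV_DONTNEED);
+ *
+ * after which the kernel is free to drop the pages without writing them
+ * to swap; touching them again yields fresh zero pages.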
+ */ +static long madvise_dontneed(struct vm_area_struct * vma, + unsigned long start, unsigned long end) +{ + if (vma->vm_flags & VM_LOCKED) + return -EINVAL; + + zap_page_range(vma->vm_mm, start, end - start); + return 0; +} + +static long madvise_vma(struct vm_area_struct * vma, unsigned long start, + unsigned long end, int behavior) +{ + long error = -EBADF; + + switch (behavior) { + case MADV_NORMAL: + case MADV_SEQUENTIAL: + case MADV_RANDOM: + error = madvise_behavior(vma, start, end, behavior); + break; + + case MADV_WILLNEED: + error = madvise_willneed(vma, start, end); + break; + + case MADV_DONTNEED: + error = madvise_dontneed(vma, start, end); + break; + + default: + error = -EINVAL; + break; + } + + return error; +} + +/* + * The madvise(2) system call. + * + * Applications can use madvise() to advise the kernel how it should + * handle paging I/O in this VM area. The idea is to help the kernel + * use appropriate read-ahead and caching techniques. The information + * provided is advisory only, and can be safely disregarded by the + * kernel without affecting the correct operation of the application. + * + * behavior values: + * MADV_NORMAL - the default behavior is to read clusters. This + * results in some read-ahead and read-behind. + * MADV_RANDOM - the system should read the minimum amount of data + * on any access, since it is unlikely that the appli- + * cation will need more than what it asks for. + * MADV_SEQUENTIAL - pages in the given range will probably be accessed + * once, so they can be aggressively read ahead, and + * can be freed soon after they are accessed. + * MADV_WILLNEED - the application is notifying the system to read + * some pages ahead. + * MADV_DONTNEED - the application is finished with the given range, + * so the kernel can free resources associated with it. + * + * return values: + * zero - success + * -EINVAL - start + len < 0, start is not page-aligned, + * "behavior" is not a valid value, or application + * is attempting to release locked or shared pages. + * -ENOMEM - addresses in the specified range are not currently + * mapped, or are outside the AS of the process. + * -EIO - an I/O error occurred while paging in data. + * -EBADF - map exists, but area maps something that isn't a file. + * -EAGAIN - a kernel resource was temporarily unavailable. + */ +asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior) +{ + unsigned long end; + struct vm_area_struct * vma; + int unmapped_error = 0; + int error = -EINVAL; + + down_write(¤t->mm->mmap_sem); + + if (start & ~PAGE_MASK) + goto out; + len = (len + ~PAGE_MASK) & PAGE_MASK; + end = start + len; + if (end < start) + goto out; + + error = 0; + if (end == start) + goto out; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + */ + vma = find_vma(current->mm, start); + for (;;) { + /* Still start < end. */ + error = -ENOMEM; + if (!vma) + goto out; + + /* Here start < vma->vm_end. */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + } + + /* Here vma->vm_start <= start < vma->vm_end. */ + if (end <= vma->vm_end) { + if (start < end) { + error = madvise_vma(vma, start, end, + behavior); + if (error) + goto out; + } + error = unmapped_error; + goto out; + } + + /* Here vma->vm_start <= start < vma->vm_end < end. 
*/ + error = madvise_vma(vma, start, vma->vm_end, behavior); + if (error) + goto out; + start = vma->vm_end; + vma = vma->vm_next; + } + +out: + up_write(¤t->mm->mmap_sem); + return error; +} + +/* + * Later we can get more picky about what "in core" means precisely. + * For now, simply check to see if the page is in the page cache, + * and is up to date; i.e. that no page-in operation would be required + * at this time if an application were to map and access this page. + */ +static unsigned char mincore_page(struct vm_area_struct * vma, + unsigned long pgoff) +{ + unsigned char present = 0; + struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping; + struct page * page, ** hash = page_hash(as, pgoff); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(as, pgoff, *hash); + if ((page) && (Page_Uptodate(page))) + present = 1; + spin_unlock(&pagecache_lock); + + return present; +} + +static long mincore_vma(struct vm_area_struct * vma, + unsigned long start, unsigned long end, unsigned char * vec) +{ + long error, i, remaining; + unsigned char * tmp; + + error = -ENOMEM; + if (!vma->vm_file) + return error; + + start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (end > vma->vm_end) + end = vma->vm_end; + end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + error = -EAGAIN; + tmp = (unsigned char *) __get_free_page(GFP_KERNEL); + if (!tmp) + return error; + + /* (end - start) is # of pages, and also # of bytes in "vec */ + remaining = (end - start), + + error = 0; + for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) { + int j = 0; + long thispiece = (remaining < PAGE_SIZE) ? + remaining : PAGE_SIZE; + + while (j < thispiece) + tmp[j++] = mincore_page(vma, start++); + + if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) { + error = -EFAULT; + break; + } + } + + free_page((unsigned long) tmp); + return error; +} + +/* + * The mincore(2) system call. + * + * mincore() returns the memory residency status of the pages in the + * current process's address space specified by [addr, addr + len). + * The status is returned in a vector of bytes. The least significant + * bit of each byte is 1 if the referenced page is in memory, otherwise + * it is zero. + * + * Because the status of a page can change after mincore() checks it + * but before it returns to the application, the returned vector may + * contain stale information. Only locked pages are guaranteed to + * remain in memory. + * + * return values: + * zero - success + * -EFAULT - vec points to an illegal address + * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE, + * or len has a nonpositive value + * -ENOMEM - Addresses in the range [addr, addr + len] are + * invalid for the address space of this process, or + * specify one or more pages which are not currently + * mapped + * -EAGAIN - A kernel resource was temporarily unavailable. + */ +asmlinkage long sys_mincore(unsigned long start, size_t len, + unsigned char * vec) +{ + int index = 0; + unsigned long end; + struct vm_area_struct * vma; + int unmapped_error = 0; + long error = -EINVAL; + + down_read(¤t->mm->mmap_sem); + + if (start & ~PAGE_CACHE_MASK) + goto out; + len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK; + end = start + len; + if (end < start) + goto out; + + error = 0; + if (end == start) + goto out; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + */ + vma = find_vma(current->mm, start); + for (;;) { + /* Still start < end. 
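+		 * (Each pass below handles the overlap with one vma and
+		 * fills in the corresponding bytes of the user's vector.
+		 * From user space the whole range is queried in one call,
+		 * roughly:
+		 *
+		 *	unsigned char vec[npages];
+		 *	mincore(addr, npages * page_size, vec);
+		 *
+		 * where vec[i] & 1 is set if page i is resident.)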
*/ + error = -ENOMEM; + if (!vma) + goto out; + + /* Here start < vma->vm_end. */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + } + + /* Here vma->vm_start <= start < vma->vm_end. */ + if (end <= vma->vm_end) { + if (start < end) { + error = mincore_vma(vma, start, end, + &vec[index]); + if (error) + goto out; + } + error = unmapped_error; + goto out; + } + + /* Here vma->vm_start <= start < vma->vm_end < end. */ + error = mincore_vma(vma, start, vma->vm_end, &vec[index]); + if (error) + goto out; + index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT; + start = vma->vm_end; + vma = vma->vm_next; + } + +out: + up_read(¤t->mm->mmap_sem); + return error; +} + +static inline +struct page *__read_cache_page(struct address_space *mapping, + unsigned long index, + int (*filler)(void *,struct page*), + void *data) +{ + struct page **hash = page_hash(mapping, index); + struct page *page, *cached_page = NULL; + int err; +repeat: + page = __find_get_page(mapping, index, hash); + if (!page) { + if (!cached_page) { + cached_page = page_cache_alloc(mapping); + if (!cached_page) + return ERR_PTR(-ENOMEM); + } + page = cached_page; + if (add_to_page_cache_unique(page, mapping, index, hash)) + goto repeat; + cached_page = NULL; + err = filler(data, page); + if (err < 0) { + page_cache_release(page); + page = ERR_PTR(err); + } + } + if (cached_page) + page_cache_release(cached_page); + return page; +} + +/* + * Read into the page cache. If a page already exists, + * and Page_Uptodate() is not set, try to fill the page. + */ +struct page *read_cache_page(struct address_space *mapping, + unsigned long index, + int (*filler)(void *,struct page*), + void *data) +{ + struct page *page; + int err; + +retry: + page = __read_cache_page(mapping, index, filler, data); + if (IS_ERR(page)) + goto out; + mark_page_accessed(page); + if (Page_Uptodate(page)) + goto out; + + lock_page(page); + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + goto retry; + } + if (Page_Uptodate(page)) { + UnlockPage(page); + goto out; + } + err = filler(data, page); + if (err < 0) { + page_cache_release(page); + page = ERR_PTR(err); + } + out: + return page; +} + +static inline struct page * __grab_cache_page(struct address_space *mapping, + unsigned long index, struct page **cached_page) +{ + struct page *page, **hash = page_hash(mapping, index); +repeat: + page = __find_lock_page(mapping, index, hash); + if (!page) { + if (!*cached_page) { + *cached_page = page_cache_alloc(mapping); + if (!*cached_page) + return NULL; + } + page = *cached_page; + if (add_to_page_cache_unique(page, mapping, index, hash)) + goto repeat; + *cached_page = NULL; + } + return page; +} + +inline void remove_suid(struct inode *inode) +{ + unsigned int mode; + + /* set S_IGID if S_IXGRP is set, and always set S_ISUID */ + mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID; + + /* was any of the uid bits set? */ + mode &= inode->i_mode; + if (mode && !capable(CAP_FSETID)) { + inode->i_mode &= ~mode; + mark_inode_dirty(inode); + } +} + +/* + * Write to a file through the page cache. + * + * We currently put everything into the page cache prior to writing it. + * This is not a problem when writing full pages. With partial pages, + * however, we first have to read the data into the cache, then + * dirty the page, and finally schedule it for writing. 
Alternatively, we + * could write-through just the portion of data that would go into that + * page, but that would kill performance for applications that write data + * line by line, and it's prone to race conditions. + * + * Note that this routine doesn't try to keep track of dirty pages. Each + * file system has to do this all by itself, unfortunately. + * okir@monad.swb.de + */ +ssize_t +generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + loff_t pos; + struct page *page, *cached_page; + ssize_t written; + long status = 0; + int err; + unsigned bytes; + + if ((ssize_t) count < 0) + return -EINVAL; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + cached_page = NULL; + + down(&inode->i_sem); + + pos = *ppos; + err = -EINVAL; + if (pos < 0) + goto out; + + err = file->f_error; + if (err) { + file->f_error = 0; + goto out; + } + + written = 0; + + /* FIXME: this is for backwards compatibility with 2.4 */ + if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) + pos = inode->i_size; + + /* + * Check whether we've reached the file size limit. + */ + err = -EFBIG; + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (pos > 0xFFFFFFFFULL || count > limit - (u32)pos) { + /* send_sig(SIGXFSZ, current, 0); */ + count = limit - (u32)pos; + } + } + + /* + * LFS rule + */ + if ( pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) { + if (pos >= MAX_NON_LFS) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (count > MAX_NON_LFS - (u32)pos) { + /* send_sig(SIGXFSZ, current, 0); */ + count = MAX_NON_LFS - (u32)pos; + } + } + + /* + * Are we about to exceed the fs block limit ? + * + * If we have written data it becomes a short write + * If we have exceeded without writing data we send + * a signal and give them an EFBIG. + * + * Linus frestrict idea will clean these up nicely.. + */ + + if (!S_ISBLK(inode->i_mode)) { + if (pos >= inode->i_sb->s_maxbytes) + { + if (count || pos > inode->i_sb->s_maxbytes) { + send_sig(SIGXFSZ, current, 0); + err = -EFBIG; + goto out; + } + /* zero-length writes at ->s_maxbytes are OK */ + } + + if (pos + count > inode->i_sb->s_maxbytes) + count = inode->i_sb->s_maxbytes - pos; + } else { + if (is_read_only(inode->i_rdev)) { + err = -EPERM; + goto out; + } + if (pos >= inode->i_size) { + if (count || pos > inode->i_size) { + err = -ENOSPC; + goto out; + } + } + + if (pos + count > inode->i_size) + count = inode->i_size - pos; + } + + err = 0; + if (count == 0) + goto out; + + remove_suid(inode); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + mark_inode_dirty_sync(inode); + + if (file->f_flags & O_DIRECT) + goto o_direct; + + do { + unsigned long index, offset; + long page_fault; + char *kaddr; + + /* + * Try to find the page in the cache. If it isn't there, + * allocate a free page. + */ + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. 
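+		 * (Concretely: if buf points into a mmap() of the very page
+		 * being written, the __copy_from_user() further down would
+		 * fault while we hold that page's lock, and filemap_nopage()
+		 * would then block on the same lock.  The dummy __get_user()
+		 * calls below touch the first and last byte of the source so
+		 * the fault is taken before the page is locked.)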
+ */ + { volatile unsigned char dummy; + __get_user(dummy, buf); + __get_user(dummy, buf+bytes-1); + } + + status = -ENOMEM; /* we'll assign it later anyway */ + page = __grab_cache_page(mapping, index, &cached_page); + if (!page) + break; + + /* We have exclusive IO access to the page.. */ + if (!PageLocked(page)) { + PAGE_BUG(page); + } + + kaddr = kmap(page); + status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes); + if (status) + goto sync_failure; + page_fault = __copy_from_user(kaddr+offset, buf, bytes); + flush_dcache_page(page); + status = mapping->a_ops->commit_write(file, page, offset, offset+bytes); + if (page_fault) + goto fail_write; + if (!status) + status = bytes; + + if (status >= 0) { + written += status; + count -= status; + pos += status; + buf += status; + } +unlock: + kunmap(page); + /* Mark it unlocked again and drop the page.. */ + SetPageReferenced(page); + UnlockPage(page); + page_cache_release(page); + + if (status < 0) + break; + } while (count); +done: + *ppos = pos; + + if (cached_page) + page_cache_release(cached_page); + + /* For now, when the user asks for O_SYNC, we'll actually + * provide O_DSYNC. */ + if (status >= 0) { + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) + status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA); + } + +out_status: + err = written ? written : status; +out: + + up(&inode->i_sem); + return err; +fail_write: + status = -EFAULT; + goto unlock; + +sync_failure: + /* + * If blocksize < pagesize, prepare_write() may have instantiated a + * few blocks outside i_size. Trim these off again. + */ + kunmap(page); + UnlockPage(page); + page_cache_release(page); + if (pos + bytes > inode->i_size) + vmtruncate(inode, inode->i_size); + goto done; + +o_direct: + written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos); + if (written > 0) { + loff_t end = pos + written; + if (end > inode->i_size && !S_ISBLK(inode->i_mode)) { + inode->i_size = end; + mark_inode_dirty(inode); + } + *ppos = end; + invalidate_inode_pages2(mapping); + } + /* + * Sync the fs metadata but not the minor inode changes and + * of course not the data as we did direct DMA for the IO. + */ + if (written >= 0 && file->f_flags & O_SYNC) + status = generic_osync_inode(inode, OSYNC_METADATA); + goto out_status; +} + +void __init page_cache_init(unsigned long mempages) +{ + unsigned long htable_size, order; + + htable_size = mempages; + htable_size *= sizeof(struct page *); + for(order = 0; (PAGE_SIZE << order) < htable_size; order++) + ; + + do { + unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *); + + page_hash_bits = 0; + while((tmp >>= 1UL) != 0UL) + page_hash_bits++; + + page_hash_table = (struct page **) + __get_free_pages(GFP_ATOMIC, order); + } while(page_hash_table == NULL && --order > 0); + + printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n", + (1 << page_hash_bits), order, (PAGE_SIZE << order)); + if (!page_hash_table) + panic("Failed to allocate page hash table\n"); + memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *)); +} + +/* address_space_map + * Maps a series of pages from the page cache into the given array. 
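+ *
+ * Up to nr pages starting at index are looked up (or freshly allocated
+ * and inserted); references are returned in pages[].  Pages that had to
+ * be newly added to the cache are also returned in new_pages[], with
+ * their count in *nr_newp, since the caller must still issue readpage()
+ * against them.  Returns the number of pages obtained, or -ENOMEM if
+ * not even the first page could be found or allocated.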
+ */
+static int address_space_map(struct address_space *as, unsigned long index,
+		int nr, struct page **pages,
+		int *nr_newp, struct page **new_pages)
+{
+	struct page *cached_page = NULL;
+	int nr_new = 0;
+	int ret;
+
+	if (unlikely(nr <= 0)) {
+		*nr_newp = nr_new;
+		return 0;
+	}
+
+	ret = 0;
+
+	spin_lock(&pagecache_lock);
+
+	while (nr > 0) {
+		struct page **hash = page_hash(as, index);
+		struct page *page;
+
+		page = __find_page_nolock(as, index, *hash);
+		if (page) {
+			page_cache_get(page);
+got_page:
+			pages[ret++] = page;
+			index++;
+			nr--;
+			continue;
+		}
+
+		if (cached_page) {
+			__add_to_page_cache(cached_page, as, index, hash);
+			nr_new++;
+			*new_pages++ = page = cached_page;
+			cached_page = NULL;
+			goto got_page;
+		}
+		spin_unlock(&pagecache_lock);
+
+		cached_page = page_cache_alloc(as);
+		if (!cached_page)
+			goto out;
+
+		/* Okay, we now have an allocated page.  Retry
+		 * the search and add. */
+		spin_lock(&pagecache_lock);
+	}
+
+	spin_unlock(&pagecache_lock);
+
+out:
+	if (cached_page)
+		page_cache_release(cached_page);
+
+	*nr_newp = nr_new;
+	return ret ? ret : -ENOMEM;
+}
+
+struct iodesc {
+	struct worktodo	wtd;
+
+	struct page	*good_page;	/* the highest Uptodate page */
+	int		good_idx;
+	int		err;
+	int		did_read;
+	int		rw;
+
+	struct page	**pages;
+	struct page	**new_pages;
+	struct page	**cur_pagep;
+	int		nr_pages;
+	int		nr_new_pages;
+
+	struct address_space *as;
+	struct file	*file;
+	kvec_cb_t	cb;
+
+	size_t		size;
+	unsigned long	transferred;
+	unsigned	offset;
+	struct kveclet	*veclet;
+
+	struct kvec_dst	src;
+
+	int		sync;
+
+#define READDESC_NR_DEF	3
+	struct page *def_pages[READDESC_NR_DEF];
+	struct page *def_new_pages[READDESC_NR_DEF];
+};
+
+static void __iodesc_free(struct iodesc *io, int unlock)
+{
+	kvec_cb_t cb;
+	ssize_t res;
+
+	if (unlock) {
+		unsigned i;
+		for (i = 0; i < io->nr_pages; i++) {
+			struct page *page = io->pages[i];
+			UnlockPage(page);
+			page_cache_release(page);
+		}
+	} else {
+		unsigned i;
+		for (i = 0; i < io->nr_pages; i++)
+			page_cache_release(io->pages[i]);
+	}
+
+	if (io->new_pages != io->def_new_pages)
+		kfree(io->new_pages);
+	if (io->pages != io->def_pages)
+		kfree(io->pages);
+
+	cb = io->cb;
+	res = io->transferred ? io->transferred : io->err;
+	kfree(io);
+
+	cb.fn(cb.data, cb.vec, res);
+}
+
+/* By the time this function is called, all of the pages prior to
+ * the current good_idx have been released appropriately.  The remaining
+ * duties are to release any remaining pages and to honour O_SYNC.
+ */
+static void __iodesc_finish_write(struct iodesc *io)
+{
+	pr_debug("__iodesc_finish_write(%p)\n", io);
+
+	__iodesc_free(io, WRITE == io->rw);
+}
+
+/* This is mostly ripped from generic_file_write */
+static int __iodesc_write_page(struct iodesc *io, struct page *page)
+{
+	char *kaddr = kmap(page);
+	unsigned long bytes;
+	unsigned long offset;
+	long status;
+	int done = 0;
+
+	offset = io->offset;
+	kaddr += offset;
+
+	bytes = PAGE_CACHE_SIZE - offset;
+	if (io->size < bytes)
+		bytes = io->size;
+
+	pr_debug("__iodesc_write_page(%p (%lu), %lu %lu %lu)\n", page, page->index, offset, bytes);
+
+	io->err = io->as->a_ops->prepare_write(io->file, page,
+					       offset, offset + bytes);
+	if (unlikely(io->err)) {
+		pr_debug("prepare_write: %d\n", io->err);
+		kunmap(page);
+		return 1;
+	}
+
+	kvec_dst_map(&io->src);
+	memcpy_from_kvec_dst(kaddr, &io->src, bytes);
+	kvec_dst_unmap(&io->src);	/* commit_write may block */
+
+	flush_dcache_page(page);
+	status = io->as->a_ops->commit_write(io->file, page,
+					     offset, offset+bytes);
+
+	/* We don't handle short writes */
+	if (status > 0 && status != bytes)
+		done = 1;
+
+	if (!status)
+		status = bytes;
+
+	if (likely(status > 0)) {
+		io->transferred += status;
+		io->size -= status;
+		io->offset = (offset + status) & (PAGE_CACHE_SIZE - 1);
+
+		if (io->offset)
+			done = 1;
+	} else {
+		io->err = status;
+		done = 1;
+	}
+
+	kunmap(page);
+	return done;
+}
+
+void __iodesc_sync_wait_page(void *data)
+{
+	struct iodesc *io = data;
+
+	do {
+		struct buffer_head *bh, *head = io->pages[io->good_idx]->buffers;
+
+		if (!head)
+			continue;
+
+		bh = head;
+		do {
+			if (buffer_locked(bh)) {
+				pr_debug("waiting on bh=%pi io=%p\n", bh, io);
+				if (!wtd_wait_on_buffer(&io->wtd, bh))
+					return;
+			}
+			if (buffer_req(bh) && !buffer_uptodate(bh)) {
+				pr_debug("io err bh=%p (%p)\n", bh, io);
+				io->err = -EIO;
+				break;
+			}
+		} while ((bh = bh->b_this_page) != head);
+	} while (!io->err && ++io->good_idx < io->nr_pages) ;
+
+	pr_debug("finish_write(%p)\n", io);
+	__iodesc_finish_write(io);
+}
+
+static void __iodesc_do_write(void *data)
+{
+	struct iodesc *io = data;
+	unsigned i;
+
+	for (i = 0; i < io->nr_pages; i++) {
+		if (__iodesc_write_page(io, io->pages[i]))
+			break;
+	}
+
+	up(&io->file->f_dentry->d_inode->i_sem);
+
+	if (io->sync) {
+		io->good_idx = 0;
+
+		pr_debug("writing out pages(%p)\n", io);
+		for (i = 0; i < io->nr_pages; i++) {
+			if (io->pages[i]->buffers)
+				writeout_one_page(io->pages[i]);
+		}
+
+		pr_debug("calling __iodesc_sync_wait_page(%p)\n", io);
+		wtd_set_action(&io->wtd, __iodesc_sync_wait_page, io);
+		__iodesc_sync_wait_page(io);
+		return;
+	}
+
+	__iodesc_finish_write(io);
+}
+
+static void __iodesc_write_lock_next_page(void *data)
+{
+	struct iodesc *io = data;
+	pr_debug("__iodesc_write_next_page(%p)\n", io);
+
+	while (io->good_idx < io->nr_pages) {
+		io->good_page = io->pages[io->good_idx++];
+		if (io->good_page == *io->cur_pagep)
+			io->cur_pagep++;
+		else {
+			if (!wtd_lock_page(&io->wtd, io->good_page))
+				return;
+		}
+	}
+
+	//Is this faster?
__iodesc_do_write(io); + wtd_set_action(&io->wtd, __iodesc_do_write, io); + wtd_queue(&io->wtd); +} + +static void __generic_file_write_iodesc(struct iodesc *io) +{ + struct inode *inode = io->file->f_dentry->d_inode; + time_t now = CURRENT_TIME; + + remove_suid(inode); + if (inode->i_ctime != now || inode->i_mtime != now) { + inode->i_ctime = inode->i_mtime = now; + mark_inode_dirty_sync(inode); + } + + wtd_set_action(&io->wtd, __iodesc_write_lock_next_page, io); + io->sync = !!(io->file->f_flags & O_SYNC); + io->good_idx = 0; + io->cur_pagep = io->new_pages; + __iodesc_write_lock_next_page(io); +} + +static void __iodesc_read_finish(struct iodesc *io) +{ + struct page **src_pagep; + char *dst_addr, *src_addr; + int src_off; + size_t size; + size_t valid; + + struct kveclet *veclet = io->veclet; + struct page *dst_page = veclet->page; + int dst_len = veclet->length; + int dst_off = veclet->offset; + + + pr_debug("__iodesc_read_finish: good_idx = %d\n", io->good_idx); + if (io->good_idx <= 0) + goto no_data; + + size = io->size; + src_off = io->offset; + src_pagep = io->pages; + src_addr = kmap(*src_pagep); + + valid = (size_t)io->good_idx << PAGE_CACHE_SHIFT; + valid -= src_off; + pr_debug("size=%d valid=%d src_off=%d\n", size, valid, src_off); + + if (valid < size) + size = valid; + + dst_addr = kmap(veclet->page); + + while (size > 0) { + int this = PAGE_CACHE_SIZE - src_off; + if ((PAGE_SIZE - dst_off) < this) + this = PAGE_SIZE - dst_off; + if (size < this) + this = size; + pr_debug("this=%d src_off=%d dst_off=%d dst_len=%d\n", + this, src_off, dst_off, dst_len); + memcpy(dst_addr + dst_off, src_addr + src_off, this); + + src_off += this; + dst_off += this; + dst_len -= this; + size -= this; + io->transferred += this; + pr_debug("read_finish: this=%d transferred=%d\n", + this, io->transferred); + + if (size <= 0) + break; + + if (dst_len <= 0) { + kunmap(dst_page); + veclet++; + dst_page = veclet->page; + dst_off = veclet->offset; + dst_len = veclet->length; + dst_addr = kmap(dst_page); + } + + if (src_off >= PAGE_SIZE) { /* FIXME: PAGE_CACHE_SIZE */ + kunmap(*src_pagep); + pr_debug("page(%lu)->count = %d\n", + (*src_pagep)->index, + atomic_read(&(*src_pagep)->count)); + src_pagep++; + src_addr = kmap(*src_pagep); + src_off = 0; + } + } + kunmap(dst_page); + kunmap(*src_pagep); +no_data: + __iodesc_free(io, 0); +} + +static void __iodesc_make_uptodate(void *data) +{ + struct iodesc *io = data; + struct page *page = io->good_page; + int locked = 1; + + pr_debug("__iodesc_make_uptodate: io=%p index=%lu\n", io, page->index); +again: + while (Page_Uptodate(page)) { + pr_debug("page index %lu uptodate\n", page->index); + if (locked) { + UnlockPage(page); + locked = 0; + } + io->did_read = 0; + io->good_idx++; + if (io->good_idx >= io->nr_pages) { + __iodesc_read_finish(io); + return; + } + page = io->good_page = io->pages[io->good_idx]; + pr_debug("__iodesc_make_uptodate: index=%lu\n", page->index); + } + + if (!locked) { + if (!wtd_lock_page(&io->wtd, page)) + return; + locked = 1; + } + + if (!io->did_read) { + /* We haven't tried reading this page before, give it a go. */ + pr_debug("attempting to read %lu\n", page->index); + io->did_read = 1; + locked = 0; + io->err = page->mapping->a_ops->readpage(io->file, page); + if (!io->err) { + if (Page_Uptodate(page)) + goto again; + if (wtd_lock_page(&io->wtd, page)) { + locked = 1; + goto again; + } + return; + } + } + + if (locked) + UnlockPage(page); + + /* We've already read this page before. 
Set err to EIO and quit. */
+	if (!io->err)
+		io->err = -EIO;
+	__iodesc_read_finish(io);
+}
+
+static void __wtdgeneric_file_read_iodesc(void *data);
+
+static void __generic_file_read_iodesc(struct iodesc *io, int mayblock)
+{
+	int (*readpage)(struct file *, struct page *);
+	int i;
+
+	wtd_set_action(&io->wtd, __iodesc_make_uptodate, io);
+	readpage = io->as->a_ops->readpage;
+	for (i = 0; i < io->nr_new_pages; i++) {
+		int ret;
+		if (!mayblock) {
+			wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io);
+			wtd_queue(&io->wtd);
+			return;
+		}
+		ret = readpage(io->file, io->new_pages[i]);
+		if (ret)
+			printk(KERN_DEBUG "__generic_file_read_kiovec: readpage(%lu) = %d\n", io->new_pages[i]->index, ret);
+	}
+
+	for (i = 0; i < io->nr_pages; i++) {
+		struct page *page = io->pages[i];
+		if (Page_Uptodate(page)) {
+			pr_debug("__generic_file_read_iodesc: %lu is uptodate\n", page->index);
+			continue;
+		}
+
+		if (!mayblock) {
+			wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io);
+			wtd_queue(&io->wtd);
+			return;
+		}
+		if (!TryLockPage(page)) {
+			int ret = readpage(io->file, page);
+			if (ret)
+				printk(KERN_DEBUG "__generic_file_read_iodesc: readpage(%lu): %d\n", page->index, ret);
+		}
+
+		if (!Page_Uptodate(page) && io->good_idx == -1) {
+			pr_debug("first good_idx=%d (%lu)\n", i, page->index);
+			io->good_idx = i;
+			io->good_page = page;
+		}
+	}
+
+	/* Whee, all the pages are uptodate! */
+	if (!io->good_page) {
+		pr_debug("all pages uptodate!\n");
+		io->good_idx = io->nr_pages;
+		__iodesc_read_finish(io);
+		return;
+	}
+
+	pr_debug("locking good_page\n");
+	if (wtd_lock_page(&io->wtd, io->good_page))
+		__iodesc_make_uptodate(io);
+	return;
+}
+
+static void __wtdgeneric_file_read_iodesc(void *data)
+{
+	struct iodesc *io = data;
+	__generic_file_read_iodesc(io, 1);
+}
+
+static int generic_file_rw_kvec(struct file *file, int rw, kvec_cb_t cb,
+			 size_t size, loff_t pos);
+
+int generic_file_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	return generic_file_rw_kvec(file, READ, cb, size, pos);
+}
+
+int generic_file_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	return generic_file_rw_kvec(file, WRITE, cb, size, pos);
+}
+
+int generic_file_rw_kvec(struct file *file, int rw, kvec_cb_t cb,
+			 size_t size, loff_t pos)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct address_space *as = inode->i_mapping;
+	unsigned long index;
+	unsigned long eindex;
+	unsigned long nr_pages;
+	struct iodesc *io = NULL;
+	int ret;
+	int append = 0;
+
+	ret = -EINVAL;
+	if (unlikely(rw != READ && rw != WRITE))
+		goto out;
+
+	append = unlikely(0 != (file->f_flags & O_APPEND));
+
+	/* Don't check pos when appending, but otherwise do sanity
+	 * checks before allocating memory.  Negative offsets are invalid.
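+	 *
+	 * (Overview of the calling convention, for reference: the caller
+	 * passes a kvec_cb_t whose cb.vec describes user memory pinned
+	 * with map_user_kvec(), and cb.fn(cb.data, cb.vec, res) is invoked
+	 * once with the byte count transferred or a negative errno when
+	 * the request finishes; if this function itself fails with a
+	 * negative return, no callback is made.  A rough sketch of a
+	 * caller, with the names purely illustrative:
+	 *
+	 *	static void example_done(void *data, struct kvec *vec, ssize_t res)
+	 *	{
+	 *		unmap_kvec(vec, 1);
+	 *		free_kvec(vec);
+	 *	}
+	 *	...
+	 *	cb.vec = map_user_kvec(READ, (unsigned long)buf, size);
+	 *	cb.fn = example_done;
+	 *	cb.data = my_cookie;
+	 *	generic_file_kvec_read(file, cb, size, pos);)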
+ */ + if (unlikely(!append && pos < 0)) + goto out; + + ret = -ENOMEM; + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) + goto out; + + memset(io, 0, sizeof(*io)); + io->size = size; + + /* FIXME: make the down a WTD_op */ + if (rw == WRITE) { + unsigned long long tmp; + loff_t limit; + + down(&inode->i_sem); + if (append) + pos = inode->i_size; + + limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + if (likely(RLIM_INFINITY == limit)) + limit = OFFSET_MAX; + + /* Filesystem limits take precedence over user limits */ + if (likely(inode->i_sb->s_maxbytes < limit)) + limit = inode->i_sb->s_maxbytes; + + if (unlikely(pos >= limit)) { + pr_debug("maxbytes: %Ld\n", limit); + ret = 0; + if (size || pos > limit) + ret = -EFBIG; + goto out_io; + } + + /* Clamp writes straddling limit. */ + tmp = pos + size; + if (unlikely(tmp > (unsigned long long)limit)) + size = limit - pos; + } + + if (READ == rw) { + pr_debug("pos=%Ld i_size=%Ld\n", pos, inode->i_size); + + if (pos > inode->i_size) + size = 0; + else if ((pos + size) > inode->i_size) + size = inode->i_size - pos; + + if (io->size < size) + size = io->size; + else if (size < io->size) + io->size = size; + + pr_debug("io->size=%d size=%d\n", io->size, size); + } + + ret = 0; + if (unlikely(!size)) + goto out_io; + + index = pos >> PAGE_CACHE_SHIFT; + eindex = (pos + size - 1) >> PAGE_CACHE_SHIFT; + nr_pages = eindex - index + 1; + + pr_debug("nr_pages: %lu\n", nr_pages); + + io->good_idx = -1; + io->good_page = NULL; + io->did_read = 0; + io->err = 0; + io->rw = rw; + io->as = as; + io->offset = (unsigned long)pos & (PAGE_CACHE_SIZE - 1); + io->file = file; + io->cb = cb; + kvec_dst_init(&io->src, KM_USER0); + kvec_dst_set(&io->src, cb.vec->veclet); + io->veclet = cb.vec->veclet; + if (nr_pages < READDESC_NR_DEF) { + io->pages = io->def_pages; + io->new_pages = io->def_new_pages; + } else { + io->pages = kmalloc(sizeof(*io->pages) * (nr_pages + 1), GFP_KERNEL); + if (!io->pages) + goto out_io; + + io->new_pages = kmalloc(sizeof(*io->new_pages) * (nr_pages + 1), GFP_KERNEL); + if (!io->new_pages) + goto out_pages; + } + + ret = address_space_map(as, index, nr_pages, io->pages, + &io->nr_new_pages, io->new_pages); + pr_debug("as_map: %d (%d new)\n", ret, io->nr_new_pages); + if (ret <= 0) + goto out_new_pages; + + io->nr_pages = ret; + io->pages[io->nr_pages] = NULL; + io->new_pages[io->nr_new_pages] = NULL; + + if (rw == READ) + __generic_file_read_iodesc(io, 0); + else if (rw == WRITE) + __generic_file_write_iodesc(io); + + return 0; + +out_new_pages: + if (io->new_pages != io->def_new_pages) + kfree(io->new_pages); +out_pages: + if (io->pages != io->def_pages) + kfree(io->pages); +out_io: + kfree(io); + + if (rw == WRITE) + up(&inode->i_sem); +out: + if (!ret) + cb.fn(cb.data, cb.vec, ret); + return ret; +} diff -urN v2.4.19/mm/memory.c aio-2.4.19.diff/mm/memory.c --- v2.4.19/mm/memory.c Fri Aug 9 13:50:46 2002 +++ aio-2.4.19.diff/mm/memory.c Mon Sep 16 21:54:13 2002 @@ -45,6 +45,8 @@ #include #include #include +#include +#include #include #include @@ -425,6 +427,8 @@ pte = *ptep; if (pte_present(pte)) { + struct page *page = pte_page(pte); + prefetch(page); if (!write || (pte_write(pte) && pte_dirty(pte))) return pte_page(pte); @@ -1495,3 +1499,272 @@ } return page; } + +/* + * Force in an entire range of pages from the current process's user VA, + * and pin them in physical memory. + * FIXME: some architectures need to flush the cache based on user addresses + * here. Someone please provide a better macro than flush_cache_page. 
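+ *
+ * The returned kvec must later be released with unmap_kvec() (pass
+ * dirtied != 0 if the pages were written to, e.g. after a READ) and
+ * then freed with free_kvec().  On failure an ERR_PTR() is returned;
+ * -EAGAIN means the global aio_max_pinned limit on pinned pages has
+ * been reached.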
+ */ + +#define dprintk(x...) +atomic_t user_pinned_pages = ATOMIC_INIT(0); + +struct kvec *map_user_kvec(int rw, unsigned long ptr, size_t len) +{ + return mm_map_user_kvec(current->mm, rw, ptr, len); +} + +struct kvec *mm_map_user_kvec(struct mm_struct *mm, int rw, unsigned long ptr, + size_t len) +{ + struct kvec *vec; + struct kveclet *veclet; + unsigned long end; + int err; + struct vm_area_struct * vma = 0; + int i; + int datain = (rw == READ); + unsigned nr_pages; + + end = ptr + len; + if (unlikely(end < ptr)) + return ERR_PTR(-EINVAL); + + nr_pages = (ptr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + nr_pages -= ptr >> PAGE_SHIFT; + nr_pages ++; + + atomic_add(nr_pages, &user_pinned_pages); + err = -EAGAIN; + if (unlikely(atomic_read(&user_pinned_pages) >= aio_max_pinned)) + goto out_adjust; + + vec = kmalloc(sizeof(struct kvec) + nr_pages * sizeof(struct kveclet), + GFP_KERNEL); + err = -ENOMEM; + if (unlikely(!vec)) + goto out_adjust; + + vec->nr = 0; + vec->max_nr = nr_pages; + veclet = vec->veclet; + + /* Make sure the iobuf is not already mapped somewhere. */ + dprintk ("map_user_kiobuf: begin\n"); + + down_read(&mm->mmap_sem); + + err = -EFAULT; + + i = 0; + + /* + * First of all, try to fault in all of the necessary pages + */ + while (ptr < end) { + struct page *map; + veclet->offset = ptr & ~PAGE_MASK; + veclet->length = PAGE_SIZE - veclet->offset; + if (len < veclet->length) + veclet->length = len; + ptr &= PAGE_MASK; + len -= veclet->length; + + if (!vma || ptr >= vma->vm_end) { + vma = find_vma(mm, ptr); + if (unlikely(!vma)) + goto out_unlock; + if (unlikely(vma->vm_start > ptr)) { + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) + goto out_unlock; + if (unlikely(expand_stack(vma, ptr))) + goto out_unlock; + } + if (unlikely(((datain) && (!(vma->vm_flags & VM_WRITE))) || + (!(vma->vm_flags & VM_READ)))) { + err = -EFAULT; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + while (unlikely(!(map = follow_page(mm, ptr, datain)))) { + int ret; + + spin_unlock(&mm->page_table_lock); + ret = handle_mm_fault(mm, vma, ptr, datain); + if (ret <= 0) { + if (!ret) + goto out_unlock; + else { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + } + map = get_page_map(map); + if (likely(map != NULL)) { + flush_dcache_page(map); + atomic_inc(&map->count); + } else + printk (KERN_INFO "Mapped page missing [%d]\n", i); + spin_unlock(&mm->page_table_lock); + veclet->page = map; + veclet++; + + ptr += PAGE_SIZE; + vec->nr = ++i; + } + + veclet->page = NULL; /* dummy for the prefetch in free_kvec */ + veclet->length = 0; /* bug checking ;-) */ + + up_read(&mm->mmap_sem); + dprintk ("map_user_kiobuf: end OK\n"); + return vec; + + out_unlock: + up_read(&mm->mmap_sem); + unmap_kvec(vec, 0); + kfree(vec); + dprintk("map_user_kvec: err(%d) rw=%d\n", err, rw); + return ERR_PTR(err); + + out_adjust: + atomic_sub(nr_pages, &user_pinned_pages); + dprintk("map_user_kvec: err(%d) rw=%d\n", err, rw); + return ERR_PTR(err); +} + +/* + * Unmap all of the pages referenced by a kiobuf. We release the pages, + * and unlock them if they were locked. 
+ */ + +void unmap_kvec (struct kvec *vec, int dirtied) +{ + struct kveclet *veclet = vec->veclet; + struct kveclet *end = vec->veclet + vec->nr; + struct page *map = veclet->page; + + prefetchw(map); + for (; vecletpage) { + prefetchw(veclet[1].page); + if (likely(map != NULL) && !PageReserved(map)) { + if (dirtied) { + SetPageDirty(map); + flush_dcache_page(map); /* FIXME */ + } + __free_page(map); + } + } + + atomic_sub(vec->max_nr, &user_pinned_pages); + vec->nr = 0; +} + +void free_kvec(struct kvec *vec) +{ + if (unlikely(vec->nr)) + BUG(); + kfree(vec); +} + +/* kvec memory copy helper: appends len bytes in from to dst. + */ +void memcpy_to_kvec_dst(struct kvec_dst *dst, const char *from, long len) +{ + if (unlikely(len < 0)) + BUG(); + do { + int cnt = len; + if (dst->space < cnt) + cnt = dst->space; + + memcpy(dst->dst, from, cnt); + from += cnt; + dst->space -= cnt; + dst->dst += cnt; + len -= cnt; + if (!dst->space && len) { + kvec_dst_unmap(dst); + dst->let++; + dst->offset = 0; + kvec_dst_map(dst); + if (unlikely(!dst->space)) + BUG(); + } + } while (len); +} + +/* kvec memory copy helper: copies and consumes len bytes in from to dst. + */ +void memcpy_from_kvec_dst(char *to, struct kvec_dst *from, long len) +{ + if (unlikely(len < 0)) + BUG(); + do { + int cnt = len; + if (from->space < cnt) + cnt = from->space; + + memcpy(to, from->dst, cnt); + to += cnt; + from->space -= cnt; + from->dst += cnt; + len -= cnt; + if (unlikely(!from->space && len)) { + kvec_dst_unmap(from); + from->let++; + from->offset = 0; + kvec_dst_map(from); + if (unlikely(!from->space)) + BUG(); + } + } while (len); +} + +/* + */ +int copy_user_to_kvec(struct kvec *to, size_t offset, const char *from, size_t len) +{ + struct kveclet *let = to->veclet; + int ret = 0; + + if ((ssize_t)len < 0) + BUG(); + + while (offset) { + if (offset < let->length) + break; + offset -= let->length; + let++; + + if ((let - to->veclet) > to->nr) + BUG(); + } + + /* FIXME: kmap deadlockage */ + while (len && !ret) { + char *dst = kmap(let->page); + size_t this; + + this = let->length - offset; + if (len < this) + this = len; + + offset += let->offset; + if (copy_from_user(dst+offset, from, this)) + ret = -EFAULT; + + from += this; + len -= this; + kunmap(let->page); + offset = 0; + let ++; + } + + return ret; +} + diff -urN v2.4.19/mm/mmap.c aio-2.4.19.diff/mm/mmap.c --- v2.4.19/mm/mmap.c Fri Aug 9 13:50:46 2002 +++ aio-2.4.19.diff/mm/mmap.c Mon Sep 16 21:54:13 2002 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff -urN v2.4.19/mm/wtd.c aio-2.4.19.diff/mm/wtd.c --- v2.4.19/mm/wtd.c Wed Dec 31 19:00:00 1969 +++ aio-2.4.19.diff/mm/wtd.c Mon Sep 16 21:54:13 2002 @@ -0,0 +1,73 @@ +#include +#include +#include +#include + +static void __wtd_lock_page_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct page *page = (struct page *)wtd->data; + + if (!TryLockPage(page)) { + __remove_wait_queue(page_waitqueue(page), &wtd->wait); + wtd_queue(wtd); + } else + schedule_task(&run_disk_tq); +} + +int wtd_lock_page(struct worktodo *wtd, struct page *page) +{ + if (TryLockPage(page)) { + wtd->data = page; + init_waitqueue_func_entry(&wtd->wait, __wtd_lock_page_waiter); + + /* Wakeups may race with TryLockPage, so try again within the wait + * queue spinlock. + */ + if (!add_wait_queue_cond(page_waitqueue(page), &wtd->wait, + TryLockPage(page))) { + /* Page is still locked. Kick the disk queue... 
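The kvec_dst helpers above implement a cursor over a veclet chain: kvec_dst_set() points it at the first veclet, kvec_dst_map()/kvec_dst_unmap() bracket the kmap of the current page, and the copy helpers advance dst/space, hopping to the next veclet whenever the current mapping is exhausted. A rough usage sketch, mirroring how skb_copy_datagram_kvec() below drives the same machinery (illustrative; kvec_drain_to_buffer() is a made-up name):

/* Illustrative only: copy the first len bytes described by vec into a
 * flat kernel buffer.  len must not exceed the kvec's total length, and
 * because the mapping is an atomic kmap (KM_USER0) nothing in here may
 * sleep between map and unmap.
 */
static void kvec_drain_to_buffer(struct kvec *vec, char *buf, size_t len)
{
	struct kvec_dst src;

	kvec_dst_init(&src, KM_USER0);
	kvec_dst_set(&src, vec->veclet);
	kvec_dst_map(&src);			/* kmap the first page */
	memcpy_from_kvec_dst(buf, &src, len);	/* hops veclets as needed */
	kvec_dst_unmap(&src);			/* drop the last mapping */
}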
*/ + run_task_queue(&tq_disk); + return 0; + } + } + + return 1; +} + +static void __wtd_bh_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct buffer_head *bh = (struct buffer_head *)wtd->data; + + if (!buffer_locked(bh)) { + __remove_wait_queue(&bh->b_wait, &wtd->wait); + wtd_queue(wtd); + } else { + schedule_task(&run_disk_tq); + } +} + +int wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh) +{ + if (!buffer_locked(bh)) { + return 1; + } + wtd->data = bh; + init_waitqueue_func_entry(&wtd->wait, __wtd_bh_waiter); + if (add_wait_queue_cond(&bh->b_wait, &wtd->wait, buffer_locked(bh))) + return 1; + run_task_queue(&tq_disk); + return 0; +} + +void do_run_tq_disk(void *data) +{ + run_task_queue(&tq_disk); +} + +struct tq_struct run_disk_tq = { + routine: do_run_tq_disk, + data: NULL +}; + diff -urN v2.4.19/net/core/datagram.c aio-2.4.19.diff/net/core/datagram.c --- v2.4.19/net/core/datagram.c Tue Jan 1 14:09:35 2002 +++ aio-2.4.19.diff/net/core/datagram.c Mon Sep 16 21:54:13 2002 @@ -8,6 +8,8 @@ * * Authors: Alan Cox . (datagram_poll() from old udp.c code) * + * Portions Copyright 2001 Red Hat, Inc. + * * Fixes: * Alan Cox : NULL return from skb_peek_copy() understood * Alan Cox : Rewrote skb_read_datagram to avoid the skb_peek_copy stuff. @@ -21,6 +23,7 @@ * Darryl Miles : Fixed non-blocking SOCK_STREAM. * Alan Cox : POSIXisms * Pete Wyckoff : Unconnected accept() fix. + * Benjamin LaHaise: added kvec operations * */ @@ -37,6 +40,7 @@ #include #include #include +#include #include #include @@ -446,3 +450,321 @@ return mask; } + +/* + */ +static inline void skb_copy_datagram_kvec_dst(const struct sk_buff *skb, + int offset, struct kvec_dst *dst, int len) +{ + int i, copy; + int start = skb->len - skb->data_len; + + /* Copy header. */ + if ((copy = start-offset) > 0) { + if (copy > len) + copy = len; + memcpy_to_kvec_dst(dst, skb->data + offset, copy); + if ((len -= copy) == 0) + return; + offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i=0; inr_frags; i++) { + int end; + + BUG_TRAP(start <= offset+len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end-offset) > 0) { + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + vaddr = kmap_atomic(page, KM_USER1); + memcpy_to_kvec_dst(dst, vaddr + frag->page_offset + + offset-start, copy); + kunmap_atomic(vaddr, KM_USER1); + if (!(len -= copy)) + return; + offset += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list; + + for (list = skb_shinfo(skb)->frag_list; list; list=list->next) { + int end; + + BUG_TRAP(start <= offset+len); + + end = start + list->len; + if ((copy = end-offset) > 0) { + if (copy > len) + copy = len; + skb_copy_datagram_kvec_dst(list, offset-start, dst, copy); + if ((len -= copy) == 0) + return; + offset += copy; + } + start = end; + } + } +} + +void skb_copy_datagram_kvec(const struct sk_buff *skb, int offset, + struct kvec *vec, int len) +{ + struct kvec_dst dst; + kvec_dst_init(&dst, KM_USER0); + kvec_dst_set(&dst, vec->veclet); + kvec_dst_map(&dst); + skb_copy_datagram_kvec_dst(skb, offset, &dst, len); + kvec_dst_unmap(&dst); +} + +/* C++ would be better for this. Please don't torture me with this code + * ever again. 
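wtd_lock_page() and wtd_wait_on_buffer() above share one convention: they return 1 when the page or buffer was acquired immediately and the caller may continue inline, or 0 when the worktodo has been parked on the wait queue and its action routine will run later, once the resource is released. A sketch of a state-machine step written against that convention (hypothetical names; assumes the worktodo was already set up with wtd_init() by its owner):

/* Hypothetical async step: take the page lock, then continue in
 * do_step2() either directly or from the deferred worktodo.
 */
static void do_step2(void *data);

static void do_step1(struct worktodo *wtd, struct page *page)
{
	wtd_set_action(wtd, do_step2, wtd);	/* what wtd_queue() will run */
	wtd->data = page;			/* state for the next step */

	if (!wtd_lock_page(wtd, page))
		return;			/* parked: do_step2() runs later */

	do_step2(wtd);			/* got the lock right away */
}

static void do_step2(void *data)
{
	struct worktodo *wtd = data;
	struct page *page = wtd->data;

	/* ... operate on the now-locked page ... */
	UnlockPage(page);
}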
+ */ +static inline unsigned int csum_and_copy_to_dst(struct kvec_dst *dst, + const char *from, int len, unsigned int csum) +{ + do { + int cnt = len; + if (dst->space < cnt) + cnt = dst->space; + + memcpy(dst->dst, from, cnt); + csum = csum_partial_copy_nocheck(from, dst->dst, cnt, csum); + from += cnt; + dst->space -= cnt; + dst->dst += cnt; + len -= cnt; + if (!dst->space && len) { + kvec_dst_unmap(dst); + dst->let++; + dst->offset = 0; + kvec_dst_map(dst); + if (!dst->space) + BUG(); + } + } while (len); + return csum; +} + +static inline void skb_copy_and_csum_datagram_kvec_dst(const struct sk_buff *skb, int offset, struct kvec_dst *dst, int len, unsigned int *csump) +{ + int i, copy; + int start = skb->len - skb->data_len; + int pos = 0; + + /* Copy header. */ + if ((copy = start-offset) > 0) { + if (copy > len) + copy = len; + *csump = csum_and_copy_to_dst(dst, skb->data+offset, copy, *csump); + if ((len -= copy) == 0) + return; + offset += copy; + pos = copy; + } + + for (i=0; inr_frags; i++) { + int end; + + BUG_TRAP(start <= offset+len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end-offset) > 0) { + unsigned int csum2; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + vaddr = kmap_atomic(page, KM_USER1); + csum2 = csum_and_copy_to_dst(dst, + vaddr + frag->page_offset + offset-start, + copy, 0); + kunmap_atomic(vaddr, KM_USER1); + *csump = csum_block_add(*csump, csum2, pos); + if (!(len -= copy)) + return; + offset += copy; + pos += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list; + + for (list = skb_shinfo(skb)->frag_list; list; list=list->next) { + int end; + + BUG_TRAP(start <= offset+len); + + end = start + list->len; + if ((copy = end-offset) > 0) { + unsigned int csum2 = 0; + if (copy > len) + copy = len; + skb_copy_and_csum_datagram_kvec_dst(list, offset-start, dst, copy, &csum2); + *csump = csum_block_add(*csump, csum2, pos); + if ((len -= copy) == 0) + return; + offset += copy; + pos += copy; + } + start = end; + } + } +} + +int skb_copy_and_csum_datagram_kvec(const struct sk_buff *skb, int offset, + struct kvec *vec, int len) +{ + unsigned int csum; + struct kvec_dst dst; + + csum = csum_partial(skb->data, offset, skb->csum); + + kvec_dst_init(&dst, KM_USER0); + kvec_dst_set(&dst, vec->veclet); + kvec_dst_map(&dst); + skb_copy_and_csum_datagram_kvec_dst(skb, offset, &dst, len, &csum); + kvec_dst_unmap(&dst); + + if ((unsigned short)csum_fold(csum)) + return -EINVAL; + return 0; +} + +struct skb_async_info { + struct worktodo wtd; + struct sock *sk; + int len; + void (*finish)(struct sock *sk, kvec_cb_t cb, int len, struct sk_buff *skb); + kvec_cb_t cb; +}; +static void skb_async_read_worker(void *_data); + +int skb_kvec_recv_datagram(struct sock * sk, kvec_cb_t cb, int len, + void (*finish)(struct sock *sk, kvec_cb_t cb, int len, struct sk_buff *skb)) +{ + struct skb_async_info *info = kmalloc(sizeof(struct skb_async_info), GFP_KERNEL); + if (info) { + wtd_set_action(&info->wtd, skb_async_read_worker, info); + info->sk = sk; + info->len = len; + info->finish = finish; + info->cb = cb; + skb_async_read_worker(info); + return 0; + } + return -EAGAIN; +} + +static void skb_async_read_waiter(wait_queue_t *wait) +{ + struct skb_async_info *info = (void *)wait; + __remove_wait_queue(info->sk->sleep, &info->wtd.wait); + wtd_queue(&info->wtd); +} + +static void skb_async_read_worker(void *_data) +{ + struct skb_async_info *info = 
_data; + struct sock *sk = info->sk; + struct sk_buff *skb; + int error; + + /* Caller is allowed not to check sk->err before skb_recv_datagram() */ + error = sock_error(sk); + if (error) + goto no_packet; + + + init_waitqueue_func_entry(&info->wtd.wait, skb_async_read_waiter); + + /* Attempted to dequeue and process any skbs that already arrived. + * Note that add_wait_queue_cond is used to check against a race + * where an skb is added to the queue after we checked but before + * the callback is added to the wait queue. + */ + do { + skb = skb_dequeue(&sk->receive_queue); + if (skb) { + info->finish(sk, info->cb, info->len, skb); + kfree(info); + return; + } + } while ( add_wait_queue_cond( sk->sleep, &info->wtd.wait, + (!(error = sock_error(sk)) && + skb_queue_empty(&sk->receive_queue)) ) + && !error); + + if (!error) + return; + +no_packet: + info->cb.fn(info->cb.data, info->cb.vec, error); + kfree(info); + return; +} + +#if 0 +static void skb_async_read_worker(void *_data) +{ + struct skb_async_info *info = _data; + int error; + + /* Socket errors? */ + error = sock_error(sk); + if (error) + goto out_err; + + if (!skb_queue_empty(&sk->receive_queue)) + goto ready; + + /* Socket shut down? */ + if (sk->shutdown & RCV_SHUTDOWN) + goto out_noerr; + + /* Sequenced packets can come disconnected. If so we report the problem */ + error = -ENOTCONN; + if(connection_based(sk) && !(sk->state==TCP_ESTABLISHED || sk->state==TCP_LISTEN)) + goto out_err; + + /* handle signals */ + if (signal_pending(current)) + goto interrupted; + + /* here: queue sleep */ + *timeo_p = schedule_timeout(*timeo_p); + return; + +ready: + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + return 0; + +interrupted: + error = sock_intr_errno(*timeo_p); +out_err: + *err = error; +out: + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + return error; +out_noerr: + *err = 0; + error = 1; + goto out; +} +#endif diff -urN v2.4.19/net/core/sock.c aio-2.4.19.diff/net/core/sock.c --- v2.4.19/net/core/sock.c Fri Aug 9 13:50:46 2002 +++ aio-2.4.19.diff/net/core/sock.c Mon Sep 16 21:54:13 2002 @@ -587,6 +587,8 @@ if(sk && zero_it) { memset(sk, 0, sizeof(struct sock)); sk->family = family; + INIT_LIST_HEAD(&sk->kvec_read_list); + INIT_LIST_HEAD(&sk->kvec_write_list); sock_lock_init(sk); } @@ -1117,7 +1119,7 @@ void sock_def_wakeup(struct sock *sk) { read_lock(&sk->callback_lock); - if (sk->sleep && waitqueue_active(sk->sleep)) + if (sk->sleep) wake_up_interruptible_all(sk->sleep); read_unlock(&sk->callback_lock); } @@ -1125,7 +1127,7 @@ void sock_def_error_report(struct sock *sk) { read_lock(&sk->callback_lock); - if (sk->sleep && waitqueue_active(sk->sleep)) + if (sk->sleep) wake_up_interruptible(sk->sleep); sk_wake_async(sk,0,POLL_ERR); read_unlock(&sk->callback_lock); @@ -1134,7 +1136,7 @@ void sock_def_readable(struct sock *sk, int len) { read_lock(&sk->callback_lock); - if (sk->sleep && waitqueue_active(sk->sleep)) + if (sk->sleep) wake_up_interruptible(sk->sleep); sk_wake_async(sk,1,POLL_IN); read_unlock(&sk->callback_lock); @@ -1148,7 +1150,7 @@ * progress. 
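skb_async_read_worker() above leans on add_wait_queue_cond() to close the classic check-then-sleep race: as the wtd_lock_page() comment earlier puts it, the condition is re-checked inside the wait-queue spinlock, so a packet (or error) arriving between the empty-queue test and the queueing of the waiter cannot be lost. The same pattern, reduced to a skeleton; the return-value convention (non-zero meaning the condition failed and the entry was not left queued) is inferred from the call sites in this patch, and poll_or_park() is a made-up name:

/* Skeleton of the race-free "poll once, then park" pattern used above.
 * Returns 0 if an skb was handled now, 1 if the waiter was left queued
 * and its wait-queue function will fire on the next wakeup.
 */
static int poll_or_park(struct sock *sk, wait_queue_t *wait,
			void (*handle)(struct sk_buff *))
{
	for (;;) {
		struct sk_buff *skb = skb_dequeue(&sk->receive_queue);

		if (skb) {
			handle(skb);
			return 0;
		}

		/* Re-check emptiness under the wait-queue lock; if an skb
		 * slipped in, the entry is dropped again and we retry.
		 */
		if (add_wait_queue_cond(sk->sleep, wait,
					skb_queue_empty(&sk->receive_queue)))
			continue;	/* raced with an arrival */

		return 1;		/* parked */
	}
}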
--DaveM */ if((atomic_read(&sk->wmem_alloc) << 1) <= sk->sndbuf) { - if (sk->sleep && waitqueue_active(sk->sleep)) + if (sk->sleep) wake_up_interruptible(sk->sleep); /* Should agree with poll, otherwise some programs break */ diff -urN v2.4.19/net/ipv4/af_inet.c aio-2.4.19.diff/net/ipv4/af_inet.c --- v2.4.19/net/ipv4/af_inet.c Fri Aug 9 13:50:46 2002 +++ aio-2.4.19.diff/net/ipv4/af_inet.c Mon Sep 16 21:54:13 2002 @@ -729,6 +729,19 @@ } +int inet_kvec_read(struct socket *sock, kvec_cb_t cb, size_t len) +{ + struct sock *sk = sock->sk; + + return sk->prot->kvec_read(sk, cb, len); +} + +int inet_kvec_write(struct socket *sock, kvec_cb_t cb, size_t len) +{ + struct sock *sk = sock->sk; + + return sk->prot->kvec_write(sk, cb, len); +} int inet_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags, struct scm_cookie *scm) @@ -960,7 +973,9 @@ sendmsg: inet_sendmsg, recvmsg: inet_recvmsg, mmap: sock_no_mmap, - sendpage: tcp_sendpage + sendpage: tcp_sendpage, + kvec_read: inet_kvec_read, + kvec_write: inet_kvec_write, }; struct proto_ops inet_dgram_ops = { @@ -982,6 +997,8 @@ recvmsg: inet_recvmsg, mmap: sock_no_mmap, sendpage: sock_no_sendpage, + kvec_read: inet_kvec_read, + kvec_write: inet_kvec_write, }; struct net_proto_family inet_family_ops = { diff -urN v2.4.19/net/ipv4/tcp.c aio-2.4.19.diff/net/ipv4/tcp.c --- v2.4.19/net/ipv4/tcp.c Fri Aug 9 13:50:47 2002 +++ aio-2.4.19.diff/net/ipv4/tcp.c Mon Sep 16 21:54:13 2002 @@ -252,6 +252,7 @@ #include #include #include +#include #include #include @@ -677,11 +678,266 @@ return 0; } +struct tcp_write_async_info { + struct worktodo wtd; + struct sock *sk; + int len; + int done; + int offset; + struct kveclet *cur_let; + kvec_cb_t cb; + spinlock_t lock; +}; + +static void async_lock_sock_wait(wait_queue_t *wait) +{ + struct tcp_write_async_info *info = (void *)wait; + printk("async_lock_sock_wait(%p)\n", info); + if (!info->sk->lock.users) { + printk("async_lock_sock_wait: queuing\n"); + __remove_wait_queue(info->sk->sleep, &info->wtd.wait); + wtd_queue(&info->wtd); + } +} + +static void async_lock_sock(void *data) +{ + struct tcp_write_async_info *info = data; + struct sock *sk; + printk(KERN_DEBUG "async_lock_sock(%p)\n", info); + sk = info->sk; + spin_lock_bh(&sk->lock.slock); + if (sk->lock.users) { + printk(KERN_DEBUG "async_lock_sock: waiting\n"); + wtd_push(&info->wtd, async_lock_sock, info); + init_waitqueue_func_entry(&info->wtd.wait, async_lock_sock_wait); + if (!add_wait_queue_cond(sk->sleep, &info->wtd.wait, !sk->lock.users)) { + spin_unlock_bh(&sk->lock.slock); + return; + } + wtd_pop(&info->wtd); + } + printk(KERN_DEBUG "async_lock_sock: locking\n"); + sk->lock.users = 1; + spin_unlock_bh(&sk->lock.slock); + wtd_queue(&info->wtd); +} + +static void async_wait_for_tcp_connect(void *data); +int tcp_kvec_write(struct sock *sk, kvec_cb_t cb, int len) +{ + struct tcp_write_async_info *info; + info = kmalloc(sizeof(*info), GFP_KERNEL); + printk(KERN_DEBUG "tcp_kvec_write: %p\n", info); + if (!info) + return -ENOMEM; + wtd_init(&info->wtd, async_wait_for_tcp_connect); + info->sk = sk; + info->len = len; + info->done = 0; + info->offset = 0; + info->cur_let = cb.vec->veclet; + info->cb = cb; + spin_lock_init(&info->lock); + async_lock_sock(info); + return 0; +} + +static void async_cn_wait_task(void *data) +{ + struct tcp_write_async_info *info = (void *)data; + async_lock_sock(info); +} + +static void async_cn_wait(wait_queue_t *wait) +{ + struct tcp_write_async_info *info = (void *)wait; + __remove_wait_queue(info->sk->sleep, 
&info->wtd.wait); + wtd_set_action(&info->wtd, async_cn_wait_task, info); + wtd_queue(&info->wtd); +} + +/* sock_get_iocb + * Attempts to allocate a local socket iocb, which allows high + * performance for the common cases of a small number of ios + * outstanding per socket. + */ +struct sock_iocb *sock_get_iocb(struct sock *sk) +{ + struct sock_iocb *iocb; + + iocb = kmalloc(sizeof(*iocb), GFP_KERNEL); + return iocb; +} + +void sock_put_iocb(struct sock_iocb *iocb) +{ + kfree(iocb); +} + +/* tcp_kvec_read_kick + * Attempts to process an async read request. Must be called with + * the socket lock held. + */ +void tcp_kvec_read_kick(struct sock *sk, struct sock_iocb *iocb) +{ + TCP_CHECK_TIMER(sk); +#if 0 + if (unlikely(TCP_LISTEN == sk->state)) + goto out; +#endif + return; +} + +/* tcp_kvec_read + * Queues an async read request on a socket. If there were + & no outstanding read requests, kicks the backlog processing. + */ +int tcp_kvec_read(struct sock *sk, kvec_cb_t cb, int size) +{ + struct sock_iocb *iocb; + printk("tcp_kvec_read(%p, %d): blah", sk, size); + + iocb = sock_get_iocb(sk); + if (unlikely(NULL == iocb)) + return -ENOMEM; + + iocb->cb = cb; + kvec_dst_init(&iocb->dst, KM_USER0); + + spin_lock_bh(&sk->lock.slock); + if (sk->lock.users != 0 || !list_empty(&sk->kvec_read_list)) { + list_add_tail(&iocb->list, &sk->kvec_read_list); + spin_unlock_bh(&sk->lock.slock); + return 0; + } + spin_unlock_bh(&sk->lock.slock); + + /* We're the head read request and now own the socket lock; + * attempt to kick off processing. + */ + tcp_kvec_read_kick(sk, iocb); + release_sock(sk); + return 0; +} + +static void tcp_kvec_write_worker(struct tcp_write_async_info *info); +static void async_wait_for_tcp_connect(void *data) +{ + struct tcp_write_async_info *info = data; + struct sock *sk = info->sk; + int err; + /* At this point the socket is locked for us. */ + while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { + if (sk->err) { + err = sock_error(sk); + goto error; + } + if ((1 << sk->state) & + ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { + err = -EPIPE; + goto error; + } + + sk->tp_pinfo.af_tcp.write_pending++; + init_waitqueue_func_entry(&info->wtd.wait, async_cn_wait); + + /* Add our worker to the socket queue, but make sure the socket + * state isn't changed from when we checked while we do so. + */ + if (!add_wait_queue_cond(sk->sleep, &info->wtd.wait, + ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + )) { + release_sock(sk); + return; + } + } + /* sk is now locked *and* the connection is established, let's + * proceed to the data transfer stage. 
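Every stage of the async write path that terminates a request follows the same rule: report the number of bytes already pushed if there were any, otherwise the error code, and drop the socket lock before the submitter's callback is invoked. The patch open-codes this in async_wait_for_tcp_connect(), async_wait_for_tcp_memory() and tcp_kvec_write_worker(); a hypothetical helper expressing the rule in one place would look like this (tcp_kvec_write_finish() is a made-up name):

/* Hypothetical completion helper matching the open-coded exits above:
 * prefer partial progress over the error code, release the socket, and
 * only then call back into the submitter.
 */
static void tcp_kvec_write_finish(struct tcp_write_async_info *info, int err)
{
	kvec_cb_t cb = info->cb;
	int res = info->done ? info->done : err;

	release_sock(info->sk);
	kfree(info);
	cb.fn(cb.data, cb.vec, res);
}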
+ */ + tcp_kvec_write_worker(info); + return; + +error: + release_sock(sk); + info->cb.fn(info->cb.data, info->cb.vec, err); + kfree(info); +} + static inline int tcp_memory_free(struct sock *sk) { return sk->wmem_queued < sk->sndbuf; } +static void async_wait_for_tcp_memory(struct tcp_write_async_info *info); +static void async_wait_for_tcp_memory_done(void *data) +{ + struct tcp_write_async_info *info = data; + info->sk->tp_pinfo.af_tcp.write_pending--; + if (tcp_memory_free(info->sk)) + tcp_kvec_write_worker(info); + else + async_wait_for_tcp_memory(info); +} + +static void async_wait_for_tcp_memory_waiting(void *data) +{ + struct tcp_write_async_info *info = data; + wtd_set_action(&info->wtd, async_wait_for_tcp_memory_done, info); + async_lock_sock(info); +} + +static void async_wait_for_tcp_memory_wake(wait_queue_t *wait) +{ + struct tcp_write_async_info *info = (void *)wait; + __remove_wait_queue(info->sk->sleep, &info->wtd.wait); + wtd_set_action(&info->wtd, async_wait_for_tcp_memory_waiting, info); + wtd_queue(&info->wtd); +} + +static void async_wait_for_tcp_memory(struct tcp_write_async_info *info) +{ + struct sock *sk = info->sk; + ssize_t res; + kvec_cb_t cb; + int raced = 0; + + printk("async_wait_for_tcp_memory(%p)\n", info); + res = -EPIPE; + if (sk->err || (sk->shutdown & SEND_SHUTDOWN)) + goto err; + + if (tcp_memory_free(sk)) + printk("async_wait_for_tcp_memory: spinning?\n"); + + init_waitqueue_func_entry(&info->wtd.wait, async_wait_for_tcp_memory_wake); + clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); + set_bit(SOCK_NOSPACE, &sk->socket->flags); + raced = add_wait_queue_cond( sk->sleep, &info->wtd.wait, + !(sk->err || (sk->shutdown & SEND_SHUTDOWN) || tcp_memory_free(sk)) ); + + sk->tp_pinfo.af_tcp.write_pending++; + if (raced) { + /* Requeue to be run here: this allows other tasks to + * get rescheduled in case of bugs + */ + wtd_set_action(&info->wtd, async_wait_for_tcp_memory_done, info); + wtd_queue(&info->wtd); + return; + } + + release_sock(sk); + return; + +err: + printk("async_wait_for_tcp_memory: err %ld\n", (long)res); + if (info->done) + res = info->done; + cb = info->cb; + kfree(info); + cb.fn(cb.data, cb.vec, res); +} + /* * Wait for more memory for a socket */ @@ -692,9 +948,17 @@ long current_timeo = *timeo; DECLARE_WAITQUEUE(wait, current); + if (sk->err || (sk->shutdown & SEND_SHUTDOWN)) + return -EPIPE; + if (tcp_memory_free(sk)) current_timeo = vm_wait = (net_random()%(HZ/5))+2; + if (!*timeo) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); + return -EAGAIN; + } + add_wait_queue(sk->sleep, &wait); for (;;) { set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); @@ -745,7 +1009,7 @@ goto out; } -ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags); +ssize_t do_tcp_sendpages(struct sock *sk, struct kveclet *let, int poffset, size_t psize, int flags); static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page, int off) @@ -824,7 +1088,7 @@ return err; } -ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) +ssize_t do_tcp_sendpages(struct sock *sk, struct kveclet *let, int poffset, size_t psize, int flags) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int mss_now; @@ -851,9 +1115,14 @@ int offset, size, copy, i; struct page *page; - page = pages[poffset/PAGE_SIZE]; - offset = poffset % PAGE_SIZE; - size = min_t(size_t, psize, PAGE_SIZE-offset); + while (poffset >= let->length) { + poffset -= let->length; + let++; + } + + page = 
let->page; + offset = let->offset + poffset; + size = min_t(unsigned int, psize, let->length); if (tp->send_head==NULL || (copy = mss_now - skb->len) <= 0) { new_segment: @@ -893,6 +1162,10 @@ copied += copy; poffset += copy; + if (poffset >= let->length) { + poffset = 0; + let++; + } if (!(psize -= copy)) goto out; @@ -932,6 +1205,7 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) { + struct kveclet let = { page, offset, size }; ssize_t res; struct sock *sk = sock->sk; @@ -941,16 +1215,54 @@ !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) return sock_no_sendpage(sock, page, offset, size, flags); -#undef TCP_ZC_CSUM_FLAGS lock_sock(sk); TCP_CHECK_TIMER(sk); - res = do_tcp_sendpages(sk, &page, offset, size, flags); + res = do_tcp_sendpages(sk, &let, 0, size, flags); TCP_CHECK_TIMER(sk); release_sock(sk); return res; } +static void tcp_kvec_write_worker(struct tcp_write_async_info *info) +{ + struct sock *sk = info->sk; + int res; + if (!(sk->route_caps & NETIF_F_SG) || + !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) + BUG(); + + res = do_tcp_sendpages(sk, info->cur_let, info->offset, info->len - info->done, MSG_DONTWAIT); + if (res > 0) + info->done += res; + + if (res == -EAGAIN) { + printk("tcp_kvec_write_worker: -EAGAIN: queuing\n"); + goto requeue; + } + + while (res > info->cur_let->length) { + res -= info->cur_let->length; + info->cur_let++; + } + + if (res <= 0 || (info->done >= info->len)) { + kvec_cb_t cb = info->cb; + printk("tcp_kvec_write_worker: error(%d)\n", res); + if (info->done) + res = info->done; + release_sock(sk); + kfree(info); + cb.fn(cb.data, cb.vec, res); + return; + } + +requeue: + async_wait_for_tcp_memory(info); +} + +#undef TCP_ZC_CSUM_FLAGS + #define TCP_PAGE(sk) (sk->tp_pinfo.af_tcp.sndmsg_page) #define TCP_OFF(sk) (sk->tp_pinfo.af_tcp.sndmsg_off) diff -urN v2.4.19/net/ipv4/tcp_ipv4.c aio-2.4.19.diff/net/ipv4/tcp_ipv4.c --- v2.4.19/net/ipv4/tcp_ipv4.c Fri Aug 9 13:50:47 2002 +++ aio-2.4.19.diff/net/ipv4/tcp_ipv4.c Mon Sep 16 21:54:13 2002 @@ -2298,6 +2298,8 @@ hash: tcp_v4_hash, unhash: tcp_unhash, get_port: tcp_v4_get_port, + kvec_read: tcp_kvec_read, + kvec_write: tcp_kvec_write, }; diff -urN v2.4.19/net/ipv4/udp.c aio-2.4.19.diff/net/ipv4/udp.c --- v2.4.19/net/ipv4/udp.c Fri Aug 9 13:50:47 2002 +++ aio-2.4.19.diff/net/ipv4/udp.c Mon Sep 16 21:54:13 2002 @@ -93,6 +93,7 @@ #include #include #include +#include /* * Snmp MIB for the UDP layer @@ -619,6 +620,74 @@ __udp_checksum_complete(skb); } +void udp_kvec_read_finish(struct sock *sk, kvec_cb_t cb, int len, struct sk_buff *skb) +{ + struct sockaddr_in *sin = NULL; + int msg_flags = 0; + int copied, err; + + if (!skb) + BUG(); + + copied = skb->len - sizeof(struct udphdr); + if (copied > len) { + copied = len; + msg_flags |= MSG_TRUNC; + } + + err = 0; + + if (skb->ip_summed==CHECKSUM_UNNECESSARY) { + skb_copy_datagram_kvec(skb, sizeof(struct udphdr), + cb.vec, copied); + } else if (msg_flags&MSG_TRUNC) { + err = -EAGAIN; + if (unlikely(__udp_checksum_complete(skb))) { + UDP_INC_STATS_BH(UdpInErrors); + goto out_free; + } + err = 0; + skb_copy_datagram_kvec(skb, sizeof(struct udphdr), + cb.vec, copied); + } else { + err = skb_copy_and_csum_datagram_kvec(skb, + sizeof(struct udphdr), cb.vec, copied); + } + + if (err) + goto out_free; + + //sock_recv_timestamp(msg, sk, skb); + + /* Copy the address. 
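With the switch from a bare page array to kveclets, do_tcp_sendpages() above can no longer locate its position with pages[poffset/PAGE_SIZE] and poffset % PAGE_SIZE; instead it walks the veclet list, subtracting each fragment's length until the residual offset falls inside the current veclet. That lookup, pulled out as a stand-alone helper for illustration (kvec_find_offset() is a made-up name):

/* Illustrative only: translate a byte offset within a kvec into the
 * veclet containing it plus the offset remaining inside that veclet.
 * Mirrors the walk at the top of the do_tcp_sendpages() loop above.
 */
static struct kveclet *kvec_find_offset(struct kveclet *let, size_t *poffset)
{
	while (*poffset >= let->length) {
		*poffset -= let->length;
		let++;
	}
	return let;
}

For example, with veclets of 1000, 4096 and 4096 bytes, an offset of 1500 resolves to the second veclet with 500 bytes still to skip inside it.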
*/ + if (sin) + { + sin->sin_family = AF_INET; + sin->sin_port = skb->h.uh->source; + sin->sin_addr.s_addr = skb->nh.iph->saddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + //if (sk->protinfo.af_inet.cmsg_flags) + // ip_cmsg_recv(msg, skb); + err = copied; + +out_free: + skb_free_datagram(sk, skb); + cb.fn(cb.data, cb.vec, err); + return; +} + +static int udp_kvec_read(struct sock *sk, kvec_cb_t cb, int len) +{ + return skb_kvec_recv_datagram(sk, cb, len, udp_kvec_read_finish); +} + +static int udp_kvec_write(struct sock *sk, kvec_cb_t cb, int len) +{ + return -EINVAL; /* TODO: someone please write ;-) */ +} + + /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -1037,6 +1106,8 @@ getsockopt: ip_getsockopt, sendmsg: udp_sendmsg, recvmsg: udp_recvmsg, + kvec_read: udp_kvec_read, + kvec_write: udp_kvec_write, backlog_rcv: udp_queue_rcv_skb, hash: udp_v4_hash, unhash: udp_v4_unhash, diff -urN v2.4.19/net/khttpd/datasending.c aio-2.4.19.diff/net/khttpd/datasending.c --- v2.4.19/net/khttpd/datasending.c Mon Sep 24 02:16:05 2001 +++ aio-2.4.19.diff/net/khttpd/datasending.c Mon Sep 16 21:54:13 2002 @@ -127,7 +127,7 @@ desc.count = ReadSize; desc.buf = (char *) CurrentRequest->sock; desc.error = 0; - do_generic_file_read(CurrentRequest->filp, ppos, &desc, sock_send_actor); + do_generic_file_read(CurrentRequest->filp, ppos, &desc, sock_send_actor, 0); if (desc.written>0) { CurrentRequest->BytesSent += desc.written; diff -urN v2.4.19/net/socket.c aio-2.4.19.diff/net/socket.c --- v2.4.19/net/socket.c Fri Aug 9 13:50:47 2002 +++ aio-2.4.19.diff/net/socket.c Mon Sep 16 21:54:13 2002 @@ -44,6 +44,7 @@ * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0) * Tigran Aivazian : Made listen(2) backlog sanity checks * protocol-independent + * Benjamin LaHaise: real aio support. * * * This program is free software; you can redistribute it and/or @@ -104,6 +105,8 @@ unsigned long count, loff_t *ppos); static ssize_t sock_sendpage(struct file *file, struct page *page, int offset, size_t size, loff_t *ppos, int more); +static int sock_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos); +static int sock_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos); /* @@ -123,7 +126,11 @@ fasync: sock_fasync, readv: sock_readv, writev: sock_writev, - sendpage: sock_sendpage + sendpage: sock_sendpage, + aio_read: generic_sock_aio_read, + aio_write: generic_file_aio_write, + kvec_read: sock_kvec_read, + kvec_write: sock_kvec_write, }; /* @@ -533,13 +540,14 @@ static ssize_t sock_read(struct file *file, char *ubuf, size_t size, loff_t *ppos) { + int read_flags = 0; struct socket *sock; struct iovec iov; struct msghdr msg; int flags; - if (ppos != &file->f_pos) - return -ESPIPE; + if (read_flags & ~F_ATOMIC) + return -EINVAL; if (size==0) /* Match SYS5 behaviour */ return 0; @@ -554,6 +562,8 @@ iov.iov_base=ubuf; iov.iov_len=size; flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; + if (read_flags & F_ATOMIC) + flags |= MSG_DONTWAIT; return sock_recvmsg(sock, &msg, size, flags); } @@ -567,12 +577,13 @@ static ssize_t sock_write(struct file *file, const char *ubuf, size_t size, loff_t *ppos) { + int flags = 0; struct socket *sock; struct msghdr msg; struct iovec iov; - - if (ppos != &file->f_pos) - return -ESPIPE; + + if (flags & ~F_ATOMIC) + return -EINVAL; if(size==0) /* Match SYS5 behaviour */ return 0; @@ -585,6 +596,8 @@ msg.msg_control=NULL; msg.msg_controllen=0; msg.msg_flags=!(file->f_flags & O_NONBLOCK) ? 
0 : MSG_DONTWAIT; + if (flags & F_ATOMIC) + msg.msg_flags = MSG_DONTWAIT; if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; iov.iov_base=(void *)ubuf; @@ -611,6 +624,29 @@ return sock->ops->sendpage(sock, page, offset, size, flags); } +static int sock_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + struct socket *sock; + sock = socki_lookup(file->f_dentry->d_inode); + if ((int)size < 0 || (size_t)(int)size != size) + return -EINVAL; + if (sock->ops->kvec_read) + return sock->ops->kvec_read(sock, cb, size); + return -EOPNOTSUPP; +} + +static int sock_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + struct socket *sock; + sock = socki_lookup(file->f_dentry->d_inode); + if ((int)size < 0 || (size_t)(int)size != size) + return -EINVAL; + if (sock->ops->kvec_write) + return sock->ops->kvec_write(sock, cb, size); + return -EOPNOTSUPP; +} + + int sock_readv_writev(int type, struct inode * inode, struct file * file, const struct iovec * iov, long count, long size) {