diff -urN v2.4.19-pre5/AIO-NOTES linux.diff/AIO-NOTES --- v2.4.19-pre5/AIO-NOTES Wed Dec 31 19:00:00 1969 +++ linux.diff/AIO-NOTES Tue Apr 2 19:24:02 2002 @@ -0,0 +1,11 @@ +20020102 + - ABI adjustment to io_getevents: the prototype is now + long io_getevents(aio_context_t ctx_id, + long nr, + struct io_event *events, + const struct timespec *timeout) + which does not affect binary compatibility on x86, only + for 64 bit machines. + + + diff -urN v2.4.19-pre5/Documentation/Configure.help linux.diff/Documentation/Configure.help --- v2.4.19-pre5/Documentation/Configure.help Wed Apr 3 21:04:25 2002 +++ linux.diff/Documentation/Configure.help Tue Apr 30 17:29:31 2002 @@ -17983,6 +17983,26 @@ contains more information and the location of the joystick package that you'll need. +/dev/epoll support +CONFIG_EVENTPOLL + This option will allow for the creation of a '/dev/epoll' character + device, with major number 10 (MISC_MAJOR) and minor number 124 + (EVENTPOLL_MINOR). + + This device can be used to very efficiently handle incoming events on a + socket, much more so than select() or poll(). There is a paper that + describes this device and how to program for it (as well as including + some very impressive benchmarks) at the following URL: + http://www.xmailserver.org/linux-patches/nio-improve.html + + If you are writing very scalable servers and wish to code against + /dev/epoll for enhanced speed, say 'Y' or 'M' here. If you have + software in hand that requires (or can make use of) /dev/epoll, + also say 'Y' or 'M' here. + + The vast majority of the planet can very safely say 'N' here + and breathe easily. + Game port support CONFIG_INPUT_GAMEPORT Gameport support is for the standard 15-pin PC gameport. 
If you diff -urN v2.4.19-pre5/MAINTAINERS linux.diff/MAINTAINERS --- v2.4.19-pre5/MAINTAINERS Wed Apr 3 21:04:25 2002 +++ linux.diff/MAINTAINERS Tue Apr 2 18:56:58 2002 @@ -228,6 +228,12 @@ L: linux-net@vger.kernel.org S: Maintained +ASYNC IO +P: Benjamin LaHaise +M: bcrl@redhat.com +L: linux-aio@kvack.org +S: Maintained + AX.25 NETWORK LAYER P: Matthias Welwarsky M: dg2fef@afthd.tu-darmstadt.de diff -urN v2.4.19-pre5/Makefile linux.diff/Makefile --- v2.4.19-pre5/Makefile Wed Apr 3 21:04:25 2002 +++ linux.diff/Makefile Fri Apr 19 20:57:16 2002 @@ -226,7 +226,7 @@ drivers/sound/pndsperm.c \ drivers/sound/pndspini.c \ drivers/atm/fore200e_*_fw.c drivers/atm/.fore200e_*.fw \ - .version .config* config.in config.old \ + .uniquebytes .version .config* config.in config.old \ scripts/tkparse scripts/kconfig.tk scripts/kconfig.tmp \ scripts/lxdialog/*.o scripts/lxdialog/lxdialog \ .menuconfig.log \ @@ -268,6 +268,7 @@ --end-group \ -o vmlinux $(NM) vmlinux | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw] \)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' | sort > System.map + @$(MAKE) -C ulib symlinks: rm -f include/asm @@ -296,7 +297,7 @@ linuxsubdirs: $(patsubst %, _dir_%, $(SUBDIRS)) -$(patsubst %, _dir_%, $(SUBDIRS)) : dummy include/linux/version.h include/config/MARKER +$(patsubst %, _dir_%, $(SUBDIRS)) : dummy include/linux/compile.h include/config/MARKER $(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" -C $(patsubst _dir_%, %, $@) $(TOPDIR)/include/linux/version.h: include/linux/version.h @@ -322,6 +323,11 @@ echo \#define LINUX_COMPILE_DOMAIN ; \ fi >> .ver @echo \#define LINUX_COMPILER \"`$(CC) $(CFLAGS) -v 2>&1 | tail -1`\" >> .ver + @rm -f .uniquebytes + @dd if=/dev/urandom of=.uniquebytes bs=1 count=16 + @echo -n \#"define LINUX_UNIQUE_BYTES " >>.ver + @hexdump -v -e '1/1 "0x%02x, "' .uniquebytes | sed -e 's/, $$//g' >>.ver + @echo "" >>.ver @mv -f .ver $@ include/linux/version.h: ./Makefile @@ -404,6 +410,8 @@ .PHONY: $(patsubst %, _modinst_%, $(SUBDIRS)) $(patsubst %, _modinst_%, 
$(SUBDIRS)) : $(MAKE) -C $(patsubst _modinst_%, %, $@) modules_install + mkdir -p $(INSTALL_MOD_PATH)/lib/kernel/$(KERNELRELEASE)/ + install -m 755 ulib/libredhat-kernel.so.1.0.1 $(INSTALL_MOD_PATH)/lib/kernel/$(KERNELRELEASE)/ # modules disabled.... @@ -423,6 +431,7 @@ rm -f $(CLEAN_FILES) rm -rf $(CLEAN_DIRS) $(MAKE) -C Documentation/DocBook clean + $(MAKE) -C ulib clean mrproper: clean archmrproper find . \( -size 0 -o -name .depend \) -type f -print | xargs rm -f diff -urN v2.4.19-pre5/arch/i386/Makefile linux.diff/arch/i386/Makefile --- v2.4.19-pre5/arch/i386/Makefile Thu May 3 11:22:07 2001 +++ linux.diff/arch/i386/Makefile Tue Apr 2 18:56:58 2002 @@ -98,7 +98,7 @@ DRIVERS += arch/i386/math-emu/math.o endif -arch/i386/kernel: dummy +arch/i386/kernel: dummy include/linux/compile.h $(MAKE) linuxsubdirs SUBDIRS=arch/i386/kernel arch/i386/mm: dummy diff -urN v2.4.19-pre5/arch/i386/kernel/Makefile linux.diff/arch/i386/kernel/Makefile --- v2.4.19-pre5/arch/i386/kernel/Makefile Mon Nov 26 23:43:07 2001 +++ linux.diff/arch/i386/kernel/Makefile Tue Apr 2 18:56:58 2002 @@ -30,6 +30,10 @@ endif endif +obj-y += vsysdata.o vunique.o dynamic_syscall.o + +vunique.o: $(TOPDIR)/include/linux/compile.h + obj-$(CONFIG_MCA) += mca.o obj-$(CONFIG_MTRR) += mtrr.o obj-$(CONFIG_X86_MSR) += msr.o diff -urN v2.4.19-pre5/arch/i386/kernel/dynamic_syscall.c linux.diff/arch/i386/kernel/dynamic_syscall.c --- v2.4.19-pre5/arch/i386/kernel/dynamic_syscall.c Wed Dec 31 19:00:00 1969 +++ linux.diff/arch/i386/kernel/dynamic_syscall.c Tue Apr 2 18:56:58 2002 @@ -0,0 +1,90 @@ +/* arch/i386/kernel/dynamic_syscall.c + * Entry code for dynamic syscalls on i386. 
+ */ +#include +#include +#include +#include +#include +#include + +struct dummy_args { + long data[8]; +}; + +extern struct vsyscall_entry { + long eip; + long (*call)(struct dummy_args args); +} vsyscall_list_begin, vsyscall_list_end; + +long sys_dynamic_syscall(struct pt_regs regs) __attribute__((regparm(0))); + +long sys_dynamic_syscall(struct pt_regs regs) +{ + struct dummy_args dummy_args; + struct vsyscall_entry *ent = (void *)regs.edx; + void *args = (void *)regs.ecx; + long ret; + + pr_debug("ent = %p args = %p\n", ent, args); + pr_debug("eip = 0x%08lx\n", regs.eip); + + if (unlikely(!current->mm->vsys_mapped)) + goto err; + + /* The pointer must be aligned in the table. */ + if (unlikely((long)ent & (sizeof(*ent) - 1))) { + pr_debug("unaligned\n"); + goto err; + } + + /* Bounds checking... */ + if (unlikely(ent < &vsyscall_list_begin) || + unlikely(ent >= &vsyscall_list_end)) { + pr_debug("out of range %p <= %p < %p\n", + &vsyscall_list_begin, ent, + &vsyscall_list_end); + goto err; + } + /* The entry should be valid now. Verify that the caller's eip + * is correct. 
+ */ + if (unlikely(ent->eip != regs.eip)) { + pr_debug("eip mismatch (0x%lx vs 0x%lx)\n", ent->eip, regs.eip); + goto err; + } + + pr_debug("ent->call = %p\n", ent->call); + + if (unlikely(verify_area(VERIFY_READ, args, sizeof(dummy_args)))) + return -EFAULT; + + __asm__ volatile ( + " cld \n" + " sub $0x20, %%esp \n" + " movl %%esp, %%edi \n" + " movl $0x8, %%ecx \n" + "1: rep movsl \n" + " call %%edx \n" + "2: add $0x20, %%esp \n" + +/* the exception handling: just return -EFAULT */ + ".section .fixup, \"ax\" \n" + "3: movl $0xfffffff2, %%eax \n" /* -EFAULT */ + " jmp 2b \n" + ".previous \n" + ".section __ex_table,\"a\" \n" + " .align 4 \n" + " .long 1b, 3b \n" + ".previous \n" + + : "=a" (ret) + : "S" (args), "d" (ent->call) + : "%edi", "%ecx" ); + + pr_debug("ret = 0x%08lx\n", ret); + + return ret; +err: + return -ENOSYS; +} diff -urN v2.4.19-pre5/arch/i386/kernel/entry.S linux.diff/arch/i386/kernel/entry.S --- v2.4.19-pre5/arch/i386/kernel/entry.S Wed Apr 3 21:04:26 2002 +++ linux.diff/arch/i386/kernel/entry.S Tue Apr 2 18:56:58 2002 @@ -45,6 +45,7 @@ #include #include #include +#include EBX = 0x00 ECX = 0x04 @@ -636,6 +637,11 @@ .long SYMBOL_NAME(sys_ni_syscall) /* reserved for fremovexattr */ .long SYMBOL_NAME(sys_tkill) + .rept __NR_sys_dynamic_syscall-(.-sys_call_table)/4 + .long SYMBOL_NAME(sys_ni_syscall) + .endr + .long SYMBOL_NAME(sys_dynamic_syscall) + .rept NR_syscalls-(.-sys_call_table)/4 .long SYMBOL_NAME(sys_ni_syscall) .endr diff -urN v2.4.19-pre5/arch/i386/kernel/irq.c linux.diff/arch/i386/kernel/irq.c --- v2.4.19-pre5/arch/i386/kernel/irq.c Mon Nov 12 17:49:47 2001 +++ linux.diff/arch/i386/kernel/irq.c Fri Apr 5 18:28:20 2002 @@ -577,7 +577,17 @@ irq_desc_t *desc = irq_desc + irq; struct irqaction * action; unsigned int status; + long esp; + /* Debugging check for stack overflow: is there less than 2KB free? 
*/ + __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (8191)); + if (esp < (sizeof(struct task_struct) + 2048)) { + printk("do_IRQ: stack overflow: %ld\n", + esp - sizeof(struct task_struct)); + __asm__ __volatile__("movl %%esp,%0" : "=r" (esp)); + show_stack((void *)esp); + } + kstat.irqs[cpu][irq]++; spin_lock(&desc->lock); desc->handler->ack(irq); diff -urN v2.4.19-pre5/arch/i386/kernel/vsysdata.c linux.diff/arch/i386/kernel/vsysdata.c --- v2.4.19-pre5/arch/i386/kernel/vsysdata.c Wed Dec 31 19:00:00 1969 +++ linux.diff/arch/i386/kernel/vsysdata.c Tue Apr 2 18:56:58 2002 @@ -0,0 +1,11 @@ +/* vsysdata.c - declarations for variables shared with the kernel + * + * Items placed in .data.vsyscall have a kernel virtual address + * and are read/write from kernel space only. The copy placed + * in .vsyscall_data are linked at a userspace address and are + * read only accessible from userland. + */ +#include + +union vsys_union user_vsys_cpudata[256] __attribute__((section(".vsyscall_data"))); +//asm(".globl vsys_cpudata ; bobbob = user_vsys_cpudata - vsyscall_text_begin ; vsys_cpudata = bobbob + VSYSCALL_text"); diff -urN v2.4.19-pre5/arch/i386/kernel/vunique.S linux.diff/arch/i386/kernel/vunique.S --- v2.4.19-pre5/arch/i386/kernel/vunique.S Wed Dec 31 19:00:00 1969 +++ linux.diff/arch/i386/kernel/vunique.S Tue Apr 2 18:56:58 2002 @@ -0,0 +1,7 @@ +#include + + .section .first_vsyscall_text,"xa" + .globl signature +signature: + .byte LINUX_UNIQUE_BYTES + .size signature,.-signature diff -urN v2.4.19-pre5/arch/i386/mm/fault.c linux.diff/arch/i386/mm/fault.c --- v2.4.19-pre5/arch/i386/mm/fault.c Wed Apr 3 21:04:26 2002 +++ linux.diff/arch/i386/mm/fault.c Tue Apr 9 18:03:05 2002 @@ -27,6 +27,8 @@ extern void die(const char *,struct pt_regs *,long); +spinlock_t oops_lock = SPIN_LOCK_UNLOCKED; + /* * Ugly, ugly, but the goto's result in better assembly.. */ @@ -306,7 +308,7 @@ * Oops. The kernel tried to access some bad page. 
We'll have to * terminate things with extreme prejudice. */ - + spin_lock(&oops_lock); bust_spinlocks(1); if (address < PAGE_SIZE) @@ -327,6 +329,7 @@ } die("Oops", regs, error_code); bust_spinlocks(0); + spin_unlock(&oops_lock); do_exit(SIGKILL); /* diff -urN v2.4.19-pre5/arch/i386/vmlinux.lds linux.diff/arch/i386/vmlinux.lds --- v2.4.19-pre5/arch/i386/vmlinux.lds Thu Mar 7 16:39:56 2002 +++ linux.diff/arch/i386/vmlinux.lds Tue Apr 2 18:56:58 2002 @@ -14,6 +14,27 @@ *(.gnu.warning) } = 0x9090 + /* Note: most of these declarations are in kernel/vsysdata.c,vsyscall.S. + * We use two segments for the data linked at a kernel virtual address + * (.data.vsyscall) and user virtual address (.vsyscall_data). + * .vsyscall_text is linked at a kernel virtual address + */ + . = ALIGN(4096); + VSYSCALL_text = .; + VSYSCALL 0xbfff0000 : AT ( VSYSCALL_text ) { + vsyscall_text_begin = .; + *(.first_vsyscall_text) + *(.vsyscall_text) + . = ALIGN(4096); + vsyscall_text_end = .; + *(.vsyscall_data) + . = ALIGN(4096); + vsyscall_data_end = .; + } + vsys_cpudata = user_vsys_cpudata - vsyscall_text_begin + VSYSCALL_text; + . = VSYSCALL_text + SIZEOF(VSYSCALL); + VSYSCALL_text_end = .; + _etext = .; /* End of text section */ .rodata : { *(.rodata) *(.rodata.*) } @@ -30,6 +51,17 @@ .data : { /* Data */ *(.data) + + . = ALIGN(8); + vsyscall_list_begin = .; + *(.data.vsyscall_list) + vsyscall_list_end = .; + . = ALIGN(4096); + kernel_vsyscall_data_begin = .; + *(.data.vsyscall) + . = ALIGN(4096); + kernel_vsyscall_data_end = .; + CONSTRUCTORS } @@ -79,4 +111,15 @@ .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } +/* +# VSYSCALL : { +# / * vsyscall area *i / +# __vsyscall_begin = .; +# *(vsyscall_text) +# . = ALIGN(4096); +# *(.data.vsyscall) +# . 
= ALIGN(4096); +# __vsyscall_end = .; +# } >vsyscall_area +*/ } diff -urN v2.4.19-pre5/drivers/block/loop.c linux.diff/drivers/block/loop.c --- v2.4.19-pre5/drivers/block/loop.c Wed Apr 3 21:04:30 2002 +++ linux.diff/drivers/block/loop.c Tue Apr 2 18:56:57 2002 @@ -283,7 +283,7 @@ spin_lock_irq(&lo->lo_lock); file = lo->lo_backing_file; spin_unlock_irq(&lo->lo_lock); - do_generic_file_read(file, &pos, &desc, lo_read_actor); + do_generic_file_read(file, &pos, &desc, lo_read_actor, 0); return desc.error; } diff -urN v2.4.19-pre5/drivers/char/Config.in linux.diff/drivers/char/Config.in --- v2.4.19-pre5/drivers/char/Config.in Wed Apr 3 21:04:30 2002 +++ linux.diff/drivers/char/Config.in Tue Apr 30 17:29:31 2002 @@ -220,6 +220,7 @@ dep_tristate 'AMD 768 Random Number Generator support' CONFIG_AMD_RNG $CONFIG_PCI dep_tristate 'Intel i8x0 Random Number Generator support' CONFIG_INTEL_RNG $CONFIG_PCI tristate '/dev/nvram support' CONFIG_NVRAM +tristate '/dev/epoll - Efficent file event polling method' CONFIG_EVENTPOLL tristate 'Enhanced Real Time Clock Support' CONFIG_RTC if [ "$CONFIG_IA64" = "y" ]; then bool 'EFI Real Time Clock Services' CONFIG_EFI_RTC diff -urN v2.4.19-pre5/drivers/char/Makefile linux.diff/drivers/char/Makefile --- v2.4.19-pre5/drivers/char/Makefile Wed Apr 3 21:04:30 2002 +++ linux.diff/drivers/char/Makefile Tue Apr 30 17:29:31 2002 @@ -208,6 +208,7 @@ ifeq ($(CONFIG_PPC),) obj-$(CONFIG_NVRAM) += nvram.o endif +obj-$(CONFIG_EVENTPOLL) += eventpoll.o obj-$(CONFIG_TOSHIBA) += toshiba.o obj-$(CONFIG_I8K) += i8k.o obj-$(CONFIG_DS1620) += ds1620.o diff -urN v2.4.19-pre5/drivers/char/eventpoll.c linux.diff/drivers/char/eventpoll.c --- v2.4.19-pre5/drivers/char/eventpoll.c Wed Dec 31 19:00:00 1969 +++ linux.diff/drivers/char/eventpoll.c Tue Apr 30 17:29:31 2002 @@ -0,0 +1,800 @@ +/* + * drivers/char/eventpoll.c + * + * Copyright (C) 2001, Davide Libenzi + * + * Efficent event polling implementation + */ + +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + + + + +#define DEBUG 0 +#ifdef DEBUG +#define DPRINTK(x) printk x +#define DNPRINTK(n,x) if (n <= DEBUG) printk x +#else +#define DPRINTK(x) +#define DNPRINTK(n,x) +#endif + +#define DEBUG_DPI 0 + +#if DEBUG_DPI +#define DPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */) +#else +#define DPI_SLAB_DEBUG 0 +#endif + +#define INITIAL_HASH_BITS 7 +#define MAX_HASH_BITS 18 +#define RESIZE_LENGTH 2 + +#define dpi_mem_alloc() (struct epitem *) kmem_cache_alloc(dpi_cache, SLAB_KERNEL) +#define dpi_mem_free(p) kmem_cache_free(dpi_cache, p) + + + + + +typedef unsigned long long event_version_t; + +struct eventpoll { + rwlock_t lock; + wait_queue_head_t wq; + wait_queue_head_t poll_wait; + struct list_head *hash; + unsigned int hbits; + unsigned int hmask; + atomic_t hents; + atomic_t resize; + int numpages; + char **pages; + char *pages0[MAX_EVENTPOLL_PAGES]; + char *pages1[MAX_EVENTPOLL_PAGES]; + atomic_t mmapped; + int eventcnt; + event_version_t ver; +}; + +struct epitem { + struct list_head llink; + struct eventpoll *ep; + struct file *file; + struct pollfd pfd; + int index; + event_version_t ver; +}; + + + + + + +static int ep_alloc_pages(char **pages, int numpages); +static int ep_free_pages(char **pages, int numpages); +static int ep_init(struct eventpoll *ep); +static void ep_free(struct eventpoll *ep); +static inline struct epitem *ep_find_nl(struct eventpoll *ep, int fd); +static struct epitem *ep_find(struct eventpoll *ep, int fd); +static int ep_hashresize(struct eventpoll *ep, unsigned long *kflags); +static int ep_insert(struct eventpoll *ep, struct pollfd *pfd); +static int ep_remove(struct eventpoll *ep, struct epitem *dpi); +static void notify_proc(struct file *file, void *data, unsigned long *local, long *event); 
+static int open_eventpoll(struct inode *inode, struct file *file); +static int close_eventpoll(struct inode *inode, struct file *file); +static unsigned int poll_eventpoll(struct file *file, poll_table *wait); +static int write_eventpoll(struct file *file, const char *buffer, size_t count, + loff_t *ppos); +static int ep_poll(struct eventpoll *ep, void *arg); +static int ioctl_eventpoll(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); +static void eventpoll_mm_open(struct vm_area_struct * vma); +static void eventpoll_mm_close(struct vm_area_struct * vma); +static int mmap_eventpoll(struct file *file, struct vm_area_struct *vma); + + + + +static kmem_cache_t *dpi_cache; + +static struct file_operations eventpoll_fops = { + write: write_eventpoll, + ioctl: ioctl_eventpoll, + mmap: mmap_eventpoll, + open: open_eventpoll, + release: close_eventpoll, + poll: poll_eventpoll +}; + +static struct vm_operations_struct eventpoll_mmap_ops = { + open: eventpoll_mm_open, + close: eventpoll_mm_close, +}; + +static struct miscdevice eventpoll = { + EVENTPOLL_MINOR, "eventpoll", &eventpoll_fops +}; + + + + +static int ep_alloc_pages(char **pages, int numpages) +{ + int ii; + + for (ii = 0; ii < numpages; ii++) { + pages[ii] = (char *) __get_free_pages(GFP_KERNEL, 0); + if (!pages[ii]) { + for (--ii; ii >= 0; ii--) { + clear_bit(PG_reserved, &virt_to_page(pages[ii])->flags); + free_pages((unsigned long) pages[ii], 0); + } + return -ENOMEM; + } + set_bit(PG_reserved, &virt_to_page(pages[ii])->flags); + } + return 0; +} + + +static int ep_free_pages(char **pages, int numpages) +{ + int ii; + + for (ii = 0; ii < numpages; ii++) { + clear_bit(PG_reserved, &virt_to_page(pages[ii])->flags); + free_pages((unsigned long) pages[ii], 0); + } + return 0; +} + + +static int ep_init(struct eventpoll *ep) +{ + int ii, hentries; + + rwlock_init(&ep->lock); + init_waitqueue_head(&ep->wq); + init_waitqueue_head(&ep->poll_wait); + ep->hbits = INITIAL_HASH_BITS; + 
ep->hmask = (1 << ep->hbits) - 1; + atomic_set(&ep->hents, 0); + atomic_set(&ep->resize, 0); + atomic_set(&ep->mmapped, 0); + ep->numpages = 0; + ep->pages = ep->pages0; + ep->eventcnt = 0; + ep->ver = 1; + + hentries = ep->hmask + 1; + if (!(ep->hash = (struct list_head *) vmalloc(hentries * sizeof(struct list_head)))) + return -ENOMEM; + + for (ii = 0; ii < hentries; ii++) + INIT_LIST_HEAD(&ep->hash[ii]); + + return 0; +} + + +static void ep_free(struct eventpoll *ep) +{ + int ii; + struct list_head *lnk; + + lock_kernel(); + for (ii = 0; ii <= ep->hmask; ii++) { + while ((lnk = list_first(&ep->hash[ii]))) { + struct epitem *dpi = list_entry(lnk, struct epitem, llink); + + file_notify_delcb(dpi->file, notify_proc); + list_del(lnk); + dpi_mem_free(dpi); + } + } + vfree(ep->hash); + if (ep->numpages > 0) { + ep_free_pages(ep->pages0, ep->numpages); + ep_free_pages(ep->pages1, ep->numpages); + } + unlock_kernel(); +} + + +static inline struct epitem *ep_find_nl(struct eventpoll *ep, int fd) +{ + struct epitem *dpi = NULL; + struct list_head *lsthead, *lnk; + + lsthead = &ep->hash[fd & ep->hmask]; + list_for_each(lnk, lsthead) { + dpi = list_entry(lnk, struct epitem, llink); + + if (dpi->pfd.fd == fd) break; + dpi = NULL; + } + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ep_find(%d) -> %p\n", current, fd, dpi)); + + return dpi; +} + + +static struct epitem *ep_find(struct eventpoll *ep, int fd) +{ + struct epitem *dpi; + unsigned long flags; + + read_lock_irqsave(&ep->lock, flags); + + dpi = ep_find_nl(ep, fd); + + read_unlock_irqrestore(&ep->lock, flags); + + return dpi; +} + + +static int ep_hashresize(struct eventpoll *ep, unsigned long *kflags) +{ + struct list_head *hash, *oldhash; + unsigned int hbits = ep->hbits + 1; + unsigned int hmask = (1 << hbits) - 1; + int ii, res, hentries = hmask + 1; + unsigned long flags = *kflags; + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ep_hashresize(%p) bits=%u\n", current, ep, hbits)); + + write_unlock_irqrestore(&ep->lock, 
flags); + + res = -ENOMEM; + if (!(hash = (struct list_head *) vmalloc(hentries * sizeof(struct list_head)))) { + write_lock_irqsave(&ep->lock, flags); + goto out; + } + + for (ii = 0; ii < hentries; ii++) + INIT_LIST_HEAD(&hash[ii]); + + write_lock_irqsave(&ep->lock, flags); + + oldhash = ep->hash; + for (ii = 0; ii <= ep->hmask; ii++) { + struct list_head *oldhead = &oldhash[ii], *lnk; + + while ((lnk = list_first(oldhead))) { + struct epitem *dpi = list_entry(lnk, struct epitem, llink); + + list_del(lnk); + list_add(lnk, &hash[dpi->pfd.fd & hmask]); + } + } + + ep->hash = hash; + ep->hbits = hbits; + ep->hmask = hmask; + + write_unlock_irqrestore(&ep->lock, flags); + vfree(oldhash); + write_lock_irqsave(&ep->lock, flags); + + res = 0; +out: + *kflags = flags; + atomic_dec(&ep->resize); + return res; +} + + +static int ep_insert(struct eventpoll *ep, struct pollfd *pfd) +{ + struct epitem *dpi; + struct file *file; + unsigned long flags; + + if (atomic_read(&ep->hents) >= (ep->numpages * POLLFD_X_PAGE)) + return -E2BIG; + + if (!(file = fcheck(pfd->fd))) + return -EINVAL; + + if (!(dpi = dpi_mem_alloc())) + return -ENOMEM; + + INIT_LIST_HEAD(&dpi->llink); + dpi->ep = ep; + dpi->file = file; + dpi->pfd = *pfd; + dpi->index = -1; + dpi->ver = ep->ver - 1; + + write_lock_irqsave(&ep->lock, flags); + + list_add(&dpi->llink, &ep->hash[pfd->fd & ep->hmask]); + atomic_inc(&ep->hents); + + if (!atomic_read(&ep->resize) && + (atomic_read(&ep->hents) >> ep->hbits) > RESIZE_LENGTH && + ep->hbits < MAX_HASH_BITS) { + atomic_inc(&ep->resize); + ep_hashresize(ep, &flags); + } + + write_unlock_irqrestore(&ep->lock, flags); + + file_notify_addcb(file, notify_proc, dpi); + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ep_insert(%p, %d)\n", current, ep, pfd->fd)); + + return 0; +} + + +static int ep_remove(struct eventpoll *ep, struct epitem *dpi) +{ + int fd = dpi->pfd.fd; + unsigned long flags; + struct pollfd *pfd, *lpfd; + struct epitem *ldpi; + + file_notify_delcb(dpi->file, 
notify_proc); + + write_lock_irqsave(&ep->lock, flags); + + list_del(&dpi->llink); + atomic_dec(&ep->hents); + + if (dpi->index >= 0 && dpi->ver == ep->ver && dpi->index < ep->eventcnt) { + pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] + + EVENT_PAGE_OFFSET(dpi->index)); + if (pfd->fd == dpi->pfd.fd && dpi->index < --ep->eventcnt) { + lpfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(ep->eventcnt)] + + EVENT_PAGE_OFFSET(ep->eventcnt)); + *pfd = *lpfd; + + if ((ldpi = ep_find_nl(ep, pfd->fd))) ldpi->index = dpi->index; + } + } + + write_unlock_irqrestore(&ep->lock, flags); + + dpi_mem_free(dpi); + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ep_remove(%p, %d)\n", current, ep, fd)); + + return 0; +} + + +static void notify_proc(struct file *file, void *data, unsigned long *local, long *event) +{ + struct epitem *dpi = (struct epitem *) data; + struct eventpoll *ep = dpi->ep; + struct pollfd *pfd; + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: notify(%p, %p, %ld, %ld) ep=%p\n", + current, file, data, event[0], event[1], ep)); + + write_lock(&ep->lock); + if (!(dpi->pfd.events & event[1])) + goto out; + + if (dpi->index < 0 || dpi->ver != ep->ver) { + if (ep->eventcnt >= (ep->numpages * POLLFD_X_PAGE)) + goto out; + dpi->index = ep->eventcnt++; + dpi->ver = ep->ver; + pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] + + EVENT_PAGE_OFFSET(dpi->index)); + *pfd = dpi->pfd; + } else { + pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] + + EVENT_PAGE_OFFSET(dpi->index)); + if (pfd->fd != dpi->pfd.fd) { + if (ep->eventcnt >= (ep->numpages * POLLFD_X_PAGE)) + goto out; + dpi->index = ep->eventcnt++; + pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] + + EVENT_PAGE_OFFSET(dpi->index)); + *pfd = dpi->pfd; + } + } + + pfd->revents |= (pfd->events & event[1]); + + if (waitqueue_active(&ep->wq)) + wake_up(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + wake_up(&ep->poll_wait); +out: + write_unlock(&ep->lock); +} 
+
+
+/*
+ * open_eventpoll - allocate and initialise the per-open eventpoll context.
+ *
+ * The context is stored in file->private_data.  Returns 0 on success,
+ * -ENOMEM if the context cannot be allocated, or the error from ep_init().
+ */
+static int open_eventpoll(struct inode *inode, struct file *file)
+{
+	int res;
+	struct eventpoll *ep;
+
+	if (!(ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL)))
+		return -ENOMEM;
+
+	memset(ep, 0, sizeof(*ep));
+	if ((res = ep_init(ep))) {
+		kfree(ep);
+		return res;
+	}
+
+	file->private_data = ep;
+
+	MOD_INC_USE_COUNT;
+
+	DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: open() ep=%p\n", current, ep));
+	return 0;
+}
+
+
+/*
+ * close_eventpoll - release the context created by open_eventpoll().
+ *
+ * ep_free() tears down the hash and the event pages, then the context
+ * itself is freed.  Always returns 0.
+ */
+static int close_eventpoll(struct inode *inode, struct file *file)
+{
+	struct eventpoll *ep = file->private_data;
+
+	ep_free(ep);
+
+	kfree(ep);
+
+	MOD_DEC_USE_COUNT;
+
+	/* Only the pointer value is printed below; ep is not dereferenced. */
+	DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: close() ep=%p\n", current, ep));
+	return 0;
+}
+
+
+/*
+ * poll_eventpoll - poll() on the eventpoll file itself.
+ *
+ * Registers on ep->poll_wait and reports POLLIN | POLLRDNORM while
+ * ep->eventcnt is non-zero, 0 otherwise.
+ */
+static unsigned int poll_eventpoll(struct file *file, poll_table *wait)
+{
+	struct eventpoll *ep = file->private_data;
+
+	poll_wait(file, &ep->poll_wait, wait);
+	if (ep->eventcnt)
+		return POLLIN | POLLRDNORM;
+
+	return 0;
+}
+
+
+/*
+ * write_eventpoll - update the interest set from an array of struct pollfd.
+ *
+ * For every entry in the user buffer:
+ *   - an fd that is negative, beyond the caller's fd table or not open is
+ *     treated as a POLLREMOVE request;
+ *   - POLLREMOVE removes the item if it is present;
+ *   - otherwise an existing item gets its event mask replaced and an
+ *     unknown fd is inserted.
+ *
+ * Returns the number of bytes of entries that were applied, -EINVAL if
+ * count is not a multiple of sizeof(struct pollfd), the verify_area()
+ * error, or -EFAULT if a copy faults before any entry was applied.
+ */
+static int write_eventpoll(struct file *file, const char *buffer, size_t count,
+			   loff_t *ppos)
+{
+	int res, rcount;
+	struct eventpoll *ep = file->private_data;
+	struct epitem *dpi;
+	struct pollfd pfd;
+
+	DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: write(%p, %d)\n", current, ep, count));
+
+	if (count % sizeof(struct pollfd))
+		return -EINVAL;
+
+	if ((res = verify_area(VERIFY_READ, buffer, count)))
+		return res;
+
+	rcount = 0;
+
+	lock_kernel();
+
+	while (count > 0) {
+		/*
+		 * verify_area() only checked the address range; the copy can
+		 * still fault.  Never act on a partially copied entry.
+		 */
+		if (__copy_from_user(&pfd, buffer, sizeof(pfd))) {
+			if (!rcount)
+				rcount = -EFAULT;
+			break;
+		}
+
+		dpi = ep_find(ep, pfd.fd);
+
+		/* A negative fd must be rejected before it indexes the fd table. */
+		if (pfd.fd < 0 || pfd.fd >= current->files->max_fds ||
+		    !current->files->fd[pfd.fd])
+			pfd.events = POLLREMOVE;
+		if (pfd.events & POLLREMOVE) {
+			if (dpi) {
+				ep_remove(ep, dpi);
+				rcount += sizeof(pfd);
+			}
+		}
+		else if (dpi) {
+			dpi->pfd.events = pfd.events;
+			rcount += sizeof(pfd);
+		} else {
+			pfd.revents = 0;
+			if (!ep_insert(ep, &pfd))
+				rcount += sizeof(pfd);
+		}
+
+		buffer += sizeof(pfd);
+		count -= sizeof(pfd);
+	}
+
+	unlock_kernel();
+
+	return rcount;
+}
+
+
+/*
+ * ep_poll - worker for the EP_POLL ioctl.
+ *
+ * Reads a struct evpoll from user space and, if no events are pending,
+ * sleeps on ep->wq until an event arrives, the timeout expires or a signal
+ * is pending.  ep_timeout == -1 (or a value above MAX_SCHEDULE_TIMEOUT/HZ)
+ * waits without limit; otherwise it is converted with ep_timeout * HZ / 1000.
+ *
+ * When events are available the filled page set is handed to the caller:
+ * eventcnt is reset, ep->ver is bumped, ep->pages is switched to the other
+ * page set and ep_resoff is set to the offset of the filled set inside the
+ * mapping (pages0 at 0, pages1 at numpages * PAGE_SIZE).
+ *
+ * Returns the number of events, 0 on timeout, -EINTR if interrupted,
+ * -EINVAL if the event area is not mmapped and -EFAULT if the struct
+ * evpoll cannot be read or written back.
+ */
+static int ep_poll(struct eventpoll *ep, void *arg)
+{
+	int res = 0;
+	long timeout;
+	unsigned long flags;
+	struct evpoll dvp;
+	wait_queue_t wait;
+
+	if (copy_from_user(&dvp, arg, sizeof(struct evpoll)))
+		return -EFAULT;
+
+	if (!atomic_read(&ep->mmapped))
+		return -EINVAL;
+
+	DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ioctl(%p, EP_POLL, %d)\n", current, ep, dvp.ep_timeout));
+
+	write_lock_irqsave(&ep->lock, flags);
+
+	res = 0;
+	if (!ep->eventcnt) {
+		init_waitqueue_entry(&wait, current);
+		add_wait_queue(&ep->wq, &wait);
+		timeout = dvp.ep_timeout == -1 || dvp.ep_timeout > MAX_SCHEDULE_TIMEOUT/HZ ?
+			MAX_SCHEDULE_TIMEOUT: (dvp.ep_timeout * HZ) / 1000;
+		for (;;) {
+			if (ep->eventcnt || !timeout)
+				break;
+			if (signal_pending(current)) {
+				res = -EINTR;
+				break;
+			}
+
+			/*
+			 * The task state is changed while ep->lock is still
+			 * held; notify_proc() takes the same lock before it
+			 * calls wake_up().
+			 */
+			set_current_state(TASK_INTERRUPTIBLE);
+
+			write_unlock_irqrestore(&ep->lock, flags);
+			timeout = schedule_timeout(timeout);
+			write_lock_irqsave(&ep->lock, flags);
+		}
+		remove_wait_queue(&ep->wq, &wait);
+
+		set_current_state(TASK_RUNNING);
+	}
+
+	if (!res && ep->eventcnt) {
+		res = ep->eventcnt;
+		ep->eventcnt = 0;
+		++ep->ver;
+		if (ep->pages == ep->pages0) {
+			ep->pages = ep->pages1;
+			dvp.ep_resoff = 0;
+		} else {
+			ep->pages = ep->pages0;
+			dvp.ep_resoff = ep->numpages * PAGE_SIZE;
+		}
+	}
+
+	write_unlock_irqrestore(&ep->lock, flags);
+
+	/*
+	 * Without ep_resoff the caller cannot tell which page set holds the
+	 * events, so a failed write-back is reported instead of the count.
+	 */
+	if (res > 0 && copy_to_user(arg, &dvp, sizeof(struct evpoll)))
+		res = -EFAULT;
+
+	DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ioctl(%p, EP_POLL, %d) == %d\n", current, ep, dvp.ep_timeout, res));
+	return res;
+}
+
+
+/*
+ * ioctl_eventpoll - ioctl entry point.
+ *
+ *   EP_ALLOC     grow both event page sets to EP_FDS_PAGES(arg) pages;
+ *                -EBUSY while mmapped, -EINVAL above MAX_EVENTPOLL_PAGES.
+ *   EP_FREE      free both page sets; -EBUSY while mmapped, -EINVAL if
+ *                nothing is allocated.
+ *   EP_POLL      see ep_poll().
+ *   EP_ISPOLLED  look up the fd of the user supplied struct pollfd; returns
+ *                1 and copies the stored pollfd back if it is in the
+ *                interest set, 0 if not, -EFAULT on a bad pointer.
+ *
+ * Any other command returns -EINVAL.
+ */
+static int ioctl_eventpoll(struct inode *inode, struct file *file,
+			   unsigned int cmd, unsigned long arg)
+{
+	int res, numpages;
+	struct eventpoll *ep = file->private_data;
+	struct epitem *dpi;
+	unsigned long flags;
+	struct pollfd pfd;
+
+	switch (cmd) {
+	case EP_ALLOC:
+		if (atomic_read(&ep->mmapped))
+			return -EBUSY;
+
+		numpages = EP_FDS_PAGES(arg);
+		if (numpages > MAX_EVENTPOLL_PAGES)
+			return -EINVAL;
+
+		res = 0;
+		/*
+		 * NOTE(review): ep_alloc_pages() allocates with GFP_KERNEL
+		 * while ep->lock is held with interrupts disabled; that
+		 * allocation may sleep.  Needs a sleeping lock or a
+		 * pre-allocation scheme - confirm before relying on this path.
+		 */
+		write_lock_irqsave(&ep->lock, flags);
+		if (numpages > ep->numpages) {
+			if (!(res = ep_alloc_pages(&ep->pages0[ep->numpages], numpages - ep->numpages))) {
+				if (!(res = ep_alloc_pages(&ep->pages1[ep->numpages], numpages - ep->numpages))) {
+					ep->numpages = numpages;
+				} else {
+					/* Undo the first set so both stay the same size. */
+					ep_free_pages(&ep->pages0[ep->numpages], numpages - ep->numpages);
+				}
+			}
+		}
+		write_unlock_irqrestore(&ep->lock, flags);
+
+		DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ioctl(%p, EP_ALLOC, %lu) == %d\n",
+			     current, ep, arg, res));
+		return res;
+
+	case EP_FREE:
+		if (atomic_read(&ep->mmapped))
+			return -EBUSY;
+
+		res = -EINVAL;
+		write_lock_irqsave(&ep->lock, flags);
+		if (ep->numpages > 0) {
+			ep_free_pages(ep->pages0, ep->numpages);
+			ep_free_pages(ep->pages1, ep->numpages);
+			ep->numpages = 0;
+			ep->pages = ep->pages0;
+			res = 0;
+		}
+		write_unlock_irqrestore(&ep->lock, flags);
+
+		DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ioctl(%p, EP_FREE) == %d\n",
+			     current, ep, res));
+		return res;
+
+	case EP_POLL:
+		return ep_poll(ep, (void *) arg);
+
+	case EP_ISPOLLED:
+		/* A bad pointer is an error, not "fd is not polled". */
+		if (copy_from_user(&pfd, (void *) arg, sizeof(struct pollfd)))
+			return -EFAULT;
+
+		read_lock_irqsave(&ep->lock, flags);
+
+		res = 0;
+		if (!(dpi = ep_find_nl(ep, pfd.fd)))
+			goto out_ispolled;
+
+		/* Copy under the lock; the item may go away once it is dropped. */
+		pfd = dpi->pfd;
+		res = 1;
+
+	out_ispolled:
+		read_unlock_irqrestore(&ep->lock, flags);
+
+		if (res && copy_to_user((void *) arg, &pfd, sizeof(struct pollfd)))
+			res = -EFAULT;
+
+		DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ioctl(%p, EP_ISPOLLED, %d) == %d\n",
+			     current, ep, pfd.fd, res));
+		return res;
+	}
+
+	return -EINVAL;
+}
+
+
+/*
+ * eventpoll_mm_open - vm_operations open hook: one more VMA refers to the
+ * event pages, so bump ep->mmapped.
+ */
+static void eventpoll_mm_open(struct vm_area_struct * vma)
+{
+	struct file *file = vma->vm_file;
+	struct eventpoll *ep = file->private_data;
+
+	if (ep)
+		atomic_inc(&ep->mmapped);
+
+	DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: mm_open(%p)\n", current, ep));
+}
+
+
+/*
+ * eventpoll_mm_close - vm_operations close hook: a VMA referring to the
+ * event pages went away, so drop ep->mmapped.
+ */
+static void eventpoll_mm_close(struct vm_area_struct * vma)
+{
+	struct file *file = vma->vm_file;
+	struct eventpoll *ep = file->private_data;
+
+	if (ep)
+		atomic_dec(&ep->mmapped);
+
+	DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: mm_close(%p)\n", current, ep));
+}
+
+
+/*
+ * mmap_eventpoll - map both event page sets into the caller.
+ *
+ * The mapping must start at file offset 0 and cover exactly
+ * 2 * ep->numpages pages: pages0 first, pages1 right after it.
+ * Returns 0 on success and -EINVAL otherwise (including a failed
+ * remap_page_range()).
+ */
+static int mmap_eventpoll(struct file *file, struct vm_area_struct *vma)
+{
+	struct eventpoll *ep = file->private_data;
+	unsigned long start, flags;
+	int ii, res;
+	int numpages;
+	size_t mapsize;
+
+	DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: mmap(%p, %lx, %lx)\n",
+		     current, ep, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT));
+
+	if ((vma->vm_pgoff << PAGE_SHIFT) != 0)
+		return -EINVAL;
+
+	mapsize = PAGE_ALIGN(vma->vm_end - vma->vm_start);
+	numpages = mapsize >> PAGE_SHIFT;
+
+	/*
+	 * NOTE(review): remap_page_range() is called below with ep->lock held
+	 * and interrupts disabled; it presumably can allocate page tables and
+	 * sleep - confirm.
+	 */
+	write_lock_irqsave(&ep->lock, flags);
+
+	res = -EINVAL;
+	if (numpages != (2 * ep->numpages))
+		goto out;
+
+	start = vma->vm_start;
+	for (ii = 0; ii < ep->numpages; ii++) {
+		if (remap_page_range(start, __pa(ep->pages0[ii]),
+				     PAGE_SIZE, vma->vm_page_prot))
+			goto out;
+		start += PAGE_SIZE;
+	}
+	for (ii = 0; ii < ep->numpages; ii++) {
+		if (remap_page_range(start, __pa(ep->pages1[ii]),
+				     PAGE_SIZE, vma->vm_page_prot))
+			goto out;
+		start += PAGE_SIZE;
+	}
+	vma->vm_ops = &eventpoll_mmap_ops;
+	/*
+	 * Count this mapping instead of overwriting the counter:
+	 * eventpoll_mm_close() decrements once per VMA, so setting the value
+	 * to 1 here would let it reach zero while another mapping of the same
+	 * file is still live, and EP_FREE would free pages that are mapped.
+	 */
+	atomic_inc(&ep->mmapped);
+	res = 0;
+out:
+	write_unlock_irqrestore(&ep->lock, flags);
+
+	DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: mmap(%p, %lx, %lx) == %d\n",
+		     current, ep, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT, res));
+	return res;
+}
+
+
+/*
+ * eventpoll_init - create the epitem slab cache and register the misc
+ * device.  Returns 0, -ENOMEM if the cache cannot be created, or the
+ * misc_register() error (the cache is destroyed again in that case).
+ */
+int __init eventpoll_init(void)
+{
+	int res;
+
+	dpi_cache = kmem_cache_create("eventpoll",
+				      sizeof(struct epitem),
+				      __alignof__(struct epitem),
+				      DPI_SLAB_DEBUG, NULL, NULL);
+	if (!dpi_cache) {
+		printk(KERN_INFO "[%p] /dev/epoll: driver install failed.\n", current);
+		return -ENOMEM;
+	}
+
+	res = misc_register(&eventpoll);
+	if (res) {
+		printk(KERN_INFO "[%p] /dev/epoll: driver install failed.\n", current);
+		kmem_cache_destroy(dpi_cache);
+		return res;
+	}
+
+	printk(KERN_INFO "[%p] /dev/epoll: driver installed.\n", current);
+
+	return 0;
+}
+
+
+module_init(eventpoll_init);
+
+#ifdef MODULE
+
+/* cleanup_module - undo eventpoll_init(): unregister, then drop the cache. */
+void cleanup_module(void)
+{
+	misc_deregister(&eventpoll);
+	kmem_cache_destroy(dpi_cache);
+}
+
+#endif
+
+MODULE_LICENSE("GPL");
+
diff -urN v2.4.19-pre5/drivers/char/mem.c linux.diff/drivers/char/mem.c
--- v2.4.19-pre5/drivers/char/mem.c Wed Apr
3 21:04:30 2002
+++ linux.diff/drivers/char/mem.c Tue Apr 2 18:56:58 2002
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include
 
 #ifdef CONFIG_I2C
 extern int i2c_init_all(void);
@@ -572,6 +573,108 @@
 	write:		write_full,
 };
 
+/*
+ * vsys_mmap_close - vm_operations close hook for the vsyscall mapping.
+ * Clears mm->vsys_mapped, the flag sys_dynamic_syscall() checks before
+ * it will dispatch a dynamic syscall.
+ */
+void vsys_mmap_close(struct vm_area_struct *area)
+{
+	area->vm_mm->vsys_mapped = 0;
+}
+
+static struct vm_operations_struct vsys_mmap_ops = {
+	close:	vsys_mmap_close,
+};
+
+/*
+ * vsys_mmap - mmap handler for the vsys device (minor 10 of the mem major).
+ * The VMA is moved to the address the vsyscall text was linked at
+ * (vsyscall_text_begin + file offset) and backed by the kernel copy at
+ * VSYSCALL_text; the length is clipped to the size of the vsyscall text.
+ *
+ * Returns 0 on success, -EINVAL if the resulting range is empty, falls
+ * outside the vsyscall text or overlaps an existing mapping, -EPERM for
+ * a shared writable mapping and -EAGAIN if remap_page_range() fails.
+ */
+int vsys_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	extern unsigned char vsyscall_text_begin, vsyscall_text_end, VSYSCALL_text[];
+	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long len = vma->vm_end - vma->vm_start;
+	unsigned long actual_len = &vsyscall_text_end - &vsyscall_text_begin;
+
+	if ((offset + len) > actual_len)
+		len = actual_len - offset;
+
+	pr_debug("len = 0x%lx, actual_len = 0x%lx\n", len, actual_len);
+
+	/* Ignore the address picked by mmap(); use the link-time address. */
+	vma->vm_start = (unsigned long)&vsyscall_text_begin + offset;
+	vma->vm_end = vma->vm_start + len;
+	vma->vm_flags |= VM_RESERVED;
+
+	pr_debug("vm_start = 0x%lx, vm_end = 0x%lx\n",
+		vma->vm_start, vma->vm_end);
+	pr_debug("va=%p pa=0x%lx\n",
+		VSYSCALL_text + offset,
+		__pa(VSYSCALL_text) + offset);
+
+	if (vma->vm_start < (unsigned long)&vsyscall_text_begin) {
+		pr_debug("vsys_mmap: start < begin\n");
+		return -EINVAL;
+	}
+
+	if (vma->vm_end < (unsigned long)&vsyscall_text_begin) {
+		pr_debug("vsys_mmap: end < begin\n");
+		return -EINVAL;
+	}
+
+	if (vma->vm_end > (unsigned long)&vsyscall_text_end) {
+		pr_debug("vsys_mmap: end(%lx) > text_end(%p)\n",
+			vma->vm_end, &vsyscall_text_end);
+		return -EINVAL;
+	}
+
+	if (vma->vm_start >= vma->vm_end) {
+		pr_debug("vsys_mmap: end\n");
+		return -EINVAL;
+	}
+
+	if (find_vma_intersection(current->mm, vma->vm_start, vma->vm_end)) {
+		pr_debug("vsyscall: mapping collision\n");
+		return -EINVAL;
+	}
+
+	if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == (VM_SHARED | VM_WRITE)) {
+		pr_debug("vsyscall: attempt to write to mapping\n");
+		return -EPERM;
+	}
+
+	if (remap_page_range(vma->vm_start,
+			__pa(VSYSCALL_text) + offset,
+			vma->vm_end-vma->vm_start,
+			vma->vm_page_prot))
+		return -EAGAIN;
+
+	pr_debug("VSYSCALL_text(%p): %02x %02x %02x %02x\n",
+		VSYSCALL_text,
+		VSYSCALL_text[0], VSYSCALL_text[1],
+		VSYSCALL_text[2], VSYSCALL_text[3]);
+
+	/*
+	 * Hook up the close operation: without it vsys_mmap_close() is never
+	 * called and vsys_mapped stays set after the area is unmapped.
+	 */
+	vma->vm_ops = &vsys_mmap_ops;
+	current->mm->vsys_mapped = 1;
+	return 0;
+}
+
+static struct file_operations vsys_fops = {
+	mmap:	vsys_mmap,
+};
+
 static int memory_open(struct inode * inode, struct file * filp)
 {
 	switch (MINOR(inode->i_rdev)) {
@@ -601,6 +704,9 @@
 		case 9:
 			filp->f_op = &urandom_fops;
 			break;
+		case 10:
+			filp->f_op = &vsys_fops;
+			break;
 		default:
 			return -ENXIO;
 	}
@@ -627,7 +733,8 @@
 		{5, "zero", S_IRUGO | S_IWUGO, &zero_fops},
 		{7, "full", S_IRUGO | S_IWUGO, &full_fops},
 		{8, "random", S_IRUGO | S_IWUSR, &random_fops},
-		{9, "urandom", S_IRUGO | S_IWUSR, &urandom_fops}
+		{9, "urandom", S_IRUGO | S_IWUSR, &urandom_fops},
+		{10,"vsys", S_IRUGO, &vsys_fops},
 	};
 	int i;
 
diff -urN v2.4.19-pre5/drivers/char/raw.c linux.diff/drivers/char/raw.c
--- v2.4.19-pre5/drivers/char/raw.c Mon Sep 24 02:16:03 2001
+++ linux.diff/drivers/char/raw.c Tue Apr 2 18:56:58 2002
@@ -16,6 +16,8 @@
 #include
 #include
 #include
+#include
+#include
 
 #define dprintk(x...)
 
@@ -34,13 +36,18 @@ int raw_open(struct inode *, struct file *); int raw_release(struct inode *, struct file *); int raw_ctl_ioctl(struct inode *, struct file *, unsigned int, unsigned long); - +int raw_kvec_read(struct file *filp, kvec_cb_t cb, size_t size, loff_t pos); +int raw_kvec_write(struct file *filp, kvec_cb_t cb, size_t size, loff_t pos); static struct file_operations raw_fops = { read: raw_read, write: raw_write, open: raw_open, release: raw_release, + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, + kvec_read: raw_kvec_read, + kvec_write: raw_kvec_write, }; static struct file_operations raw_ctl_fops = { @@ -250,7 +257,6 @@ } - ssize_t raw_read(struct file *filp, char * buf, size_t size, loff_t *offp) { @@ -381,3 +387,99 @@ out: return err; } + +static int raw_kvec_rw(struct file *filp, int rw, kvec_cb_t cb, size_t size, loff_t pos); +int raw_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + return raw_kvec_rw(file, READ, cb, size, pos); +} + +int raw_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + return raw_kvec_rw(file, WRITE, cb, size, pos); +} + +int raw_kvec_rw(struct file *filp, int rw, kvec_cb_t cb, size_t size, loff_t pos) +{ + int err; + unsigned minor; + kdev_t dev; + unsigned long limit, blocknr, blocks; + + unsigned sector_size, sector_bits, sector_mask; + unsigned max_sectors; + unsigned i; + + pr_debug("raw_kvec_rw: %p %d %d %p %d %d %Lu\n", filp, rw, nr, kiovec, flags, size, pos); + /* + * First, a few checks on device size limits + */ + + minor = MINOR(filp->f_dentry->d_inode->i_rdev); + dev = to_kdev_t(raw_devices[minor].binding->bd_dev); + sector_size = raw_devices[minor].sector_size; + sector_bits = raw_devices[minor].sector_bits; + sector_mask = sector_size- 1; + max_sectors = 25000; //KIO_MAX_SECTORS >> (sector_bits - 9); + + if (blk_size[MAJOR(dev)]) + limit = (((loff_t) blk_size[MAJOR(dev)][MINOR(dev)]) << BLOCK_SIZE_BITS) >> sector_bits; + else + limit = 
INT_MAX; + pr_debug ("raw_kvec_rw: dev %d:%d (+%d)\n", + MAJOR(dev), MINOR(dev), limit); + + /* EOF at the end */ + err = 0; + if (!size || (pos >> sector_bits) == limit) { + pr_debug("raw_kvec_rw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits); + cb.fn(cb.data, cb.vec, err); + return 0; + } + + /* ENXIO for io beyond the end */ + err = -ENXIO; + if ((pos >> sector_bits) >= limit) { + pr_debug("raw_kvec_rw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits); + goto out; + } + + err = -EINVAL; + if ((pos < 0) || (pos & sector_mask) || (size & sector_mask)) { + pr_debug("pos(%Ld)/size(%lu) wrong(%d)\n", pos, size, sector_mask); + goto out; + } + + /* Verify that the scatter-gather list is sector aligned. */ + for (i=0; inr; i++) + if ((cb.vec->veclet[i].offset & sector_mask) || + (cb.vec->veclet[i].length & sector_mask)) { + pr_debug("veclet offset/length wrong"); + goto out; + } + + /* + * Split the IO into KIO_MAX_SECTORS chunks, mapping and + * unmapping the single kiobuf as we go to perform each chunk of + * IO. 
+ */ + + blocknr = pos >> sector_bits; + blocks = size >> sector_bits; + if (blocks > max_sectors) + blocks = max_sectors; + if (blocks > limit - blocknr) + blocks = limit - blocknr; + err = -ENXIO; + if (!blocks) { + pr_debug("raw: !blocks %d %ld %ld\n", max_sectors, limit, blocknr); + goto out; + } + + err = brw_kvec_async(rw, cb, dev, blocks, blocknr, sector_bits); +out: + if (err) + printk(KERN_DEBUG "raw_kvec_rw: ret is %d\n", err); + return err; +} + diff -urN v2.4.19-pre5/fs/Makefile linux.diff/fs/Makefile --- v2.4.19-pre5/fs/Makefile Thu Mar 7 16:40:03 2002 +++ linux.diff/fs/Makefile Tue Apr 30 17:29:31 2002 @@ -7,12 +7,12 @@ O_TARGET := fs.o -export-objs := filesystems.o open.o dcache.o buffer.o +export-objs := filesystems.o open.o dcache.o buffer.o fcblist.o mod-subdirs := nls obj-y := open.o read_write.o devices.o file_table.o buffer.o \ super.o block_dev.o char_dev.o stat.o exec.o pipe.o namei.o \ - fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ + fcntl.o ioctl.o readdir.o select.o fifo.o locks.o fcblist.o \ dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \ filesystems.o namespace.o seq_file.o @@ -22,6 +22,9 @@ obj-y += noquot.o endif +obj-y += aio.o +export-objs += aio.o + subdir-$(CONFIG_PROC_FS) += proc subdir-y += partitions diff -urN v2.4.19-pre5/fs/aio.c linux.diff/fs/aio.c --- v2.4.19-pre5/fs/aio.c Wed Dec 31 19:00:00 1969 +++ linux.diff/fs/aio.c Mon Apr 29 18:04:24 2002 @@ -0,0 +1,1224 @@ +/* fs/aio.c + * An async IO implementation for Linux + * Written by Benjamin LaHaise + * + * Implements an efficient asynchronous io interface. + * + * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +//#define DEBUG 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#if DEBUG > 1 +#define dprintk printk +#else +#define dprintk(x...) do { ; } while (0) +#endif + +/*------ sysctl variables----*/ +unsigned aio_nr; /* current system wide number of aio requests */ +unsigned aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ +unsigned aio_max_size = 0x20000; /* 128KB per chunk */ +unsigned aio_max_pinned; /* set to mem/4 in aio_setup */ +/*----end sysctl variables---*/ + +static kmem_cache_t *kiocb_cachep; +static kmem_cache_t *kioctx_cachep; + +/* tunable. Needs to be added to sysctl. */ +int max_aio_reqs = 0x10000; + +/* Used for rare fput completion. */ +static void aio_fput_routine(void *); +static struct tq_struct fput_tqueue = { + routine: aio_fput_routine, +}; + +static spinlock_t fput_lock = SPIN_LOCK_UNLOCKED; +LIST_HEAD(fput_head); + +/* forward prototypes */ +static void generic_aio_complete_read(void *_iocb, struct kvec *vec, ssize_t res); +static void generic_aio_complete_write(void *_iocb, struct kvec *vec, ssize_t res); + +/* aio_setup + * Creates the slab caches used by the aio routines, panic on + * failure as this is done early during the boot sequence. 
+ */ +static int __init aio_setup(void) +{ + kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kiocb_cachep) + panic("unable to create kiocb cache\n"); + + kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kioctx_cachep) + panic("unable to create kioctx cache"); + + aio_max_pinned = num_physpages/4; + + printk(KERN_NOTICE "aio_setup: num_physpages = %u\n", aio_max_pinned); + printk(KERN_NOTICE "aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); + + return 0; +} + +static void ioctx_free_reqs(struct kioctx *ctx) +{ + struct list_head *pos, *next; + list_for_each_safe(pos, next, &ctx->free_reqs) { + struct kiocb *iocb = list_kiocb(pos); + list_del(&iocb->list); + kmem_cache_free(kiocb_cachep, iocb); + } +} + +/* ioctx_alloc + * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. + */ +static struct kioctx *ioctx_alloc(unsigned nr_reqs) +{ + struct kioctx *ctx; + unsigned i; + long size; + + /* Round off to a power of 2. Needed for cheap mask operations */ + for (i=1; i (0x10000000U / sizeof(struct io_event))) || + (nr_reqs > (0x10000000U / sizeof(struct kiocb)))) { + pr_debug("ENOMEM: nr_reqs too high\n"); + return ERR_PTR(-EINVAL); + } + + if (nr_reqs > aio_max_nr) + return ERR_PTR(-EAGAIN); + + ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + memset(ctx, 0, sizeof(*ctx)); + ctx->max_reqs = nr_reqs; + ctx->mm = current->mm; + atomic_inc(&ctx->mm->mm_count); + + atomic_set(&ctx->users, 1); + spin_lock_init(&ctx->lock); + spin_lock_init(&ctx->ring_lock); + init_waitqueue_head(&ctx->wait); + + INIT_LIST_HEAD(&ctx->free_reqs); + INIT_LIST_HEAD(&ctx->active_reqs); + + /* Allocate nr_reqs iocbs for io. Free iocbs are on the + * ctx->free_reqs list. When active they migrate to the + * active_reqs list. 
During completion and cancellation + * the request may temporarily not be on any list. + */ + for (i=0; ikey = i; + iocb->users = 0; + list_add(&iocb->list, &ctx->free_reqs); + } + + /* Compensate for the ring buffer's head/tail overlap entry */ + nr_reqs *= 2; + + size = sizeof(struct aio_ring); + size += sizeof(struct io_event) * nr_reqs; + + /* Try to use alloc_pages first... */ + for (i=0; (PAGE_SIZE << i) < size; i++) + ; + ctx->ring_order = i; + ctx->ring = (void *)__get_free_pages(GFP_KERNEL, i); + + if (!ctx->ring) { + /* vmalloc it is... */ + ctx->ring = vmalloc(size); + if (!ctx->ring) + goto out_freectx; + ctx->ring_was_vmallocd = 1; + } else + ctx->ring_was_vmallocd = 0; + + memset(ctx->ring, 0, size); + ctx->ring_mask = nr_reqs - 1; /* trusted copy */ + ctx->ring->mask = ctx->ring_mask; /* user copy */ + + /* now link into global list. kludge. FIXME */ + br_write_lock(BR_AIO_REQ_LOCK); + if (unlikely(aio_nr + ctx->max_reqs > aio_max_nr)) + goto out_cleanup; + aio_nr += ctx->max_reqs; /* undone by __put_ioctx */ + ctx->ring->id = ctx->user_id = ++current->mm->new_ioctx_id; + ctx->next = current->mm->ioctx_list; + current->mm->ioctx_list = ctx; + br_write_unlock(BR_AIO_REQ_LOCK); + + dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", + ctx, ctx->user_id, current->mm, ctx->ring->mask); + return ctx; + +out_cleanup: + br_write_unlock(BR_AIO_REQ_LOCK); + ctx->max_reqs = 0; /* prevent __put_ioctx from sub'ing aio_nr */ + __put_ioctx(ctx); + return ERR_PTR(-EAGAIN); + +out_freereqs: + ioctx_free_reqs(ctx); +out_freectx: + kmem_cache_free(kioctx_cachep, ctx); + ctx = ERR_PTR(-ENOMEM); + + dprintk("aio: error allocating ioctx %p\n", ctx); + return ctx; +} + +/* aio_cancel_all + * Cancels all outstanding aio requests on an aio context. Used + * when the processes owning a context have all exited to encourage + * the rapid destruction of the kioctx. 
+ */
+static void aio_cancel_all(struct kioctx *ctx)
+{
+	int (*cancel)(struct kiocb *);
+	spin_lock_irq(&ctx->lock);
+	while (!list_empty(&ctx->active_reqs)) {
+		struct list_head *pos = ctx->active_reqs.next;
+		struct kiocb *iocb = list_kiocb(pos);
+		list_del_init(&iocb->list);
+		cancel = iocb->cancel;
+		if (cancel)
+			iocb->users++;	/* bumped under ctx->lock, before the unlocked callback */
+		spin_unlock_irq(&ctx->lock);	/* the cancel callback runs without ctx->lock */
+		if (cancel)
+			cancel(iocb);
+		spin_lock_irq(&ctx->lock);	/* re-taken before the list is tested again */
+	}
+	spin_unlock_irq(&ctx->lock);
+}
+
+/* exit_aio: called when the last user of mm goes away.  At this point,
+ * there is no way for any new requests to be submitted or any of the
+ * io_* syscalls to be called on the context.  However, there may be
+ * outstanding requests which hold references to the context; as they
+ * go away, they will call put_ioctx and release any pinned memory
+ * associated with the request (held via struct page * references).
+ *
+ * The whole list is detached from the mm first; each context is then
+ * unlinked, has its active requests cancelled and has put_ioctx()
+ * called on it once.
+ */
+void exit_aio(struct mm_struct *mm)
+{
+	struct kioctx *ctx = mm->ioctx_list;
+	mm->ioctx_list = NULL;
+	while (ctx) {
+		struct kioctx *next = ctx->next;	/* saved before the link is cleared */
+		ctx->next = NULL;
+		aio_cancel_all(ctx);
+		/* Diagnostic only: the put below happens whatever the count is. */
+		if (1 != atomic_read(&ctx->users))
+			printk(KERN_DEBUG
+				"exit_aio:ioctx still alive: %d %d %d\n",
+				atomic_read(&ctx->users), ctx->dead,
+				ctx->reqs_active);
+		put_ioctx(ctx);
+		ctx = next;
+	}
+}
+
+/* __put_ioctx
+ *	Called when the last user of an aio context has gone away,
+ *	and the struct needs to be freed.
+ *
+ *	Drops the mm reference, releases the event ring with the
+ *	allocator recorded in ring_was_vmallocd, frees the iocbs via
+ *	ioctx_free_reqs() and the kioctx itself, and finally subtracts
+ *	the context's max_reqs from aio_nr under the BR_AIO_REQ_LOCK
+ *	write lock.
+ */
+void __put_ioctx(struct kioctx *ctx)
+{
+	unsigned nr_reqs = ctx->max_reqs;	/* read now: ctx is freed before aio_nr is adjusted */
+
+	mmdrop(ctx->mm);
+	ctx->mm = NULL;
+	pr_debug("__put_ioctx: freeing %p\n", ctx);
+	if (ctx->ring_was_vmallocd)
+		vfree(ctx->ring);
+	else
+		free_pages((unsigned long)ctx->ring, ctx->ring_order);
+
+	ioctx_free_reqs(ctx);
+	kmem_cache_free(kioctx_cachep, ctx);
+
+	br_write_lock(BR_AIO_REQ_LOCK);
+	aio_nr -= nr_reqs;
+	br_write_unlock(BR_AIO_REQ_LOCK);
+}
+
+/* aio_get_req
+ *	Allocate a slot for an aio request. 
Increments the users count + * of the kioctx so that the kioctx stays around until all requests are + * complete. Returns -EAGAIN if no requests are free. + */ +static inline struct kiocb *__aio_get_req(struct kioctx *ctx) +{ + struct kiocb *req = NULL; + + /* Use cmpxchg instead of spin_lock? */ + spin_lock_irq(&ctx->lock); + if (!list_empty(&ctx->free_reqs) && + (ctx->reqs_active < aio_ring_avail(ctx->ring))) { + req = list_kiocb(ctx->free_reqs.next); + list_del(&req->list); + list_add(&req->list, &ctx->active_reqs); + ctx->reqs_active++; + req->user_obj = NULL; + get_ioctx(ctx); + + if (req->ctx) + BUG(); + req->ctx = ctx; + if (req->users) + BUG(); + req->users = 1; + } + spin_unlock_irq(&ctx->lock); + + return req; +} + +static inline struct kiocb *aio_get_req(struct kioctx *ctx) +{ + struct kiocb *req; + /* Handle a potential starvation case -- should be exceedingly rare as + * requests will be stuck on fput_head only if the aio_fput_routine is + * delayed and the requests were the last user of the struct file. + */ + req = __aio_get_req(ctx); + if (unlikely(NULL == ctx)) { + aio_fput_routine(NULL); + req = __aio_get_req(ctx); + } + return req; +} + +static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) +{ + req->ctx = NULL; + req->filp = NULL; + req->user_obj = NULL; + ctx->reqs_active--; + list_add(&req->list, &ctx->free_reqs); +} + +static void aio_fput_routine(void *data) +{ + spin_lock_irq(&fput_lock); + while (likely(!list_empty(&fput_head))) { + struct kiocb *req = list_kiocb(fput_head.next); + struct kioctx *ctx = req->ctx; + + list_del(&req->list); + spin_unlock_irq(&fput_lock); + + /* Complete the fput */ + __fput(req->filp); + + /* Link the iocb into the context's free list */ + spin_lock_irq(&ctx->lock); + really_put_req(ctx, req); + spin_unlock_irq(&ctx->lock); + + put_ioctx(ctx); + spin_lock_irq(&fput_lock); + } + spin_unlock_irq(&fput_lock); +} + +/* __aio_put_req + * Returns true if this put was the last user of the request. 
+ */
+static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
+{
+	dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n",
+		req, atomic_read(&req->filp->f_count));
+
+	req->users --;
+	if (unlikely(req->users < 0))
+		BUG();
+	if (req->users)
+		return 0;	/* the request still has another user */
+	list_del(&req->list);	/* remove from active_reqs */
+	req->cancel = NULL;
+
+	/* Must be done under the lock to serialise against cancellation.
+	 * Call this aio_fput as it duplicates fput via the fput_tqueue.
+	 * If this drops the last reference on the file, the request is
+	 * parked on fput_head with an extra ioctx reference and
+	 * fput_tqueue is scheduled to finish the job; otherwise the
+	 * request goes straight back onto the context's free list.
+	 */
+	if (unlikely(atomic_dec_and_test(&req->filp->f_count))) {
+		get_ioctx(ctx);
+		spin_lock(&fput_lock);
+		list_add(&req->list, &fput_head);
+		spin_unlock(&fput_lock);
+		schedule_task(&fput_tqueue);
+	} else
+		really_put_req(ctx, req);
+	return 1;
+}
+
+/* aio_put_req
+ *	Returns true if this put was the last user of the kiocb,
+ *	false if the request is still in use.
+ *
+ *	Takes ctx->lock around __aio_put_req() itself; when the put was
+ *	the last one, put_ioctx() is also called on ctx after the lock
+ *	has been released.
+ */
+int aio_put_req(struct kioctx *ctx, struct kiocb *req)
+{
+	int ret;
+	spin_lock_irq(&ctx->lock);
+	ret = __aio_put_req(ctx, req);
+	spin_unlock_irq(&ctx->lock);
+	if (ret)
+		put_ioctx(ctx);
+	return ret;
+}
+
+/* Lookup an ioctx id.  The current mm's ioctx_list is walked under the
+ * BR_AIO_REQ_LOCK read lock.  Returns the first context whose user_id
+ * matches and which is not marked ->dead, after calling get_ioctx() on
+ * it, or NULL if there is none.
+ * FIXME: this is O(n) and is only suitable for development.
+ */
+static inline struct kioctx *lookup_ioctx(unsigned long ctx_id)
+{
+	struct kioctx *ioctx;
+	struct mm_struct *mm;
+
+	br_read_lock(BR_AIO_REQ_LOCK);
+	mm = current->mm;
+	for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next)
+		if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) {
+			get_ioctx(ioctx);
+			break;
+		}
+	br_read_unlock(BR_AIO_REQ_LOCK);
+
+	return ioctx;	/* NULL when the loop ran off the end of the list */
+}
+
+/* aio_complete
+ *	Called when the io request on the given iocb is complete.
+ *	Returns true if this is the last user of the request.  The 
+ *	only other user of the request can be the cancellation code. 
+ */
+int aio_complete(struct kiocb *iocb, long res, long res2)
+{
+	struct kioctx	*ctx = iocb->ctx;
+	struct aio_ring	*ring = ctx->ring;
+	struct io_event	*event;
+	unsigned long	flags;
+	unsigned long	tail;
+	int		ret;
+
+	/* add a completion event to the ring buffer.
+	 * must be done holding ctx->lock to prevent
+	 * other code from messing with the tail
+	 * pointer since we might be called from irq
+	 * context.
+	 */
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	tail = ring->tail;
+	event = &ring->io_events[tail];
+	/* Wrap with the kernel-private copy of the mask, as aio_read_evt()
+	 * does for the head.  ring->mask is the copy that lives inside the
+	 * ring itself and is meant for the consumer of the ring.
+	 */
+	tail = (tail + 1) & ctx->ring_mask;
+
+	event->obj = (u64)(unsigned long)iocb->user_obj;
+	event->data = iocb->user_data;
+	event->res = res;
+	event->res2 = res2;
+
+	dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
+		ctx, tail, iocb, iocb->user_obj, iocb->user_data, res, res2);
+
+	/* after flagging the request as done, we
+	 * must never even look at it again
+	 *
+	 * The event contents have to be visible before the new tail is:
+	 * aio_read_evt() compares head and tail without holding ctx->lock,
+	 * so a compiler-only barrier() does not order the stores for
+	 * another CPU.
+	 */
+	wmb();
+
+	ring->tail = tail;
+
+	wmb();
+	if (!ring->woke)
+		ring->woke = 1;
+
+	pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+
+	/* everything turned out well, dispose of the aiocb. */
+	ret = __aio_put_req(ctx, iocb);
+
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	/* Wake waiters before the context reference that belonged to the
+	 * request is dropped below.
+	 */
+	wake_up(&ctx->wait);
+	if (ret)
+		put_ioctx(ctx);
+
+	return ret;
+}
+
+/* aio_read_evt
+ *	Pull an event off of the ioctx's event ring.  Returns the number of 
+ *	events fetched (0 or 1 ;-)
+ *	FIXME: make this use cmpxchg.
+ *	TODO: make the ringbuffer user mmap()able (requires FIXME). 
+ */
+static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
+{
+	struct aio_ring *ring = ioctx->ring;
+	unsigned long head;
+	int ret = 0;
+
+	dprintk("in aio_read_evt h%lu t%lu m%lu\n",
+		 (unsigned long)ring->head, (unsigned long)ring->tail,
+		 (unsigned long)ring->mask);
+	barrier();
+	if (ring->head == ring->tail)	/* unlocked peek; re-checked under ring_lock below */
+		goto out;
+
+	spin_lock(&ioctx->ring_lock);
+
+	head = ring->head;
+	if (head != ring->tail) {
+		*ent = ring->io_events[head];	/* copy the event out before moving head */
+		head = (head + 1) & ioctx->ring_mask;
+		barrier();
+		ring->head = head;
+		ret = 1;
+	}
+	spin_unlock(&ioctx->ring_lock);
+
+out:
+	dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
+		 (unsigned long)ring->head, (unsigned long)ring->tail);
+	return ret;
+}
+/* struct timeout
+ *	A timer together with the flag and wait queue that timeout_func()
+ *	uses to report that it fired.
+ */
+struct timeout {
+	struct timer_list	timer;
+	int			timed_out;
+	wait_queue_head_t	wait;
+};
+/* timeout_func
+ *	Timer callback installed by init_timeout(): data is the struct
+ *	timeout.  Sets timed_out and wakes anything on its wait queue.
+ */
+static void timeout_func(unsigned long data)
+{
+	struct timeout *to = (struct timeout *)data;
+
+	to->timed_out = 1;
+	wake_up(&to->wait);
+}
+/* init_timeout
+ *	Prepare a struct timeout: timer initialised and pointed at
+ *	timeout_func(), flag cleared, wait queue initialised.  The timer
+ *	is not armed here.
+ */
+static inline void init_timeout(struct timeout *to)
+{
+	init_timer(&to->timer);
+	to->timer.data = (unsigned long)to;
+	to->timer.function = timeout_func;
+	to->timed_out = 0;
+	init_waitqueue_head(&to->wait);
+}
+/* set_timeout
+ *	Arm the timer for the interval in *ts, which is a kernel copy.
+ *	An all-zero timespec is treated as "already expired": timed_out
+ *	is set and no timer is added.
+ */
+static inline void set_timeout(struct timeout *to, const struct timespec *ts)
+{
+	unsigned long how_long;
+
+	if (!ts->tv_sec && !ts->tv_nsec) {
+		to->timed_out = 1;
+		return;
+	}
+
+	how_long = ts->tv_sec * HZ;
+#define HZ_NS (1000000000 / HZ)
+	how_long += (ts->tv_nsec + HZ_NS - 1) / HZ_NS;	/* nanoseconds rounded up to whole jiffies */
+
+	to->timer.expires = jiffies + how_long;
+	add_timer(&to->timer);
+}
+/* clear_timeout
+ *	Remove the timer with del_timer_sync().
+ */
+static inline void clear_timeout(struct timeout *to)
+{
+	del_timer_sync(&to->timer);
+}
+/* read_events
+ *	Copy up to nr completion events from ctx's ring to the user array
+ *	event.  The fast path drains whatever is already in the ring; if
+ *	nothing was there it sleeps interruptibly until an event arrives,
+ *	the optional user-supplied timeout expires or a signal is pending.
+ *	Returns the number of events copied if any were; otherwise 0 (no
+ *	event before the timeout), -EINTR or -EFAULT.
+ */
+static int read_events(struct kioctx *ctx, int nr, struct io_event *event,
+			const struct timespec *timeout)
+{
+	struct task_struct	*tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+	DECLARE_WAITQUEUE(to_wait, tsk);
+	int			ret;
+	int			i = 0;
+	struct io_event		ent;
+	struct timeout		to;
+
+	/* needed to zero any padding within an entry (there shouldn't be
+	 * any, but C is fun!)
+	 */
+	memset(&ent, 0, sizeof(ent));
+	ret = 0;
+
+	while (likely(i < nr)) {
+		ret = aio_read_evt(ctx, &ent);
+		if (unlikely(ret <= 0))
+			break;
+
+		dprintk("read event: %Lx %Lx %Lx %Lx\n",
+			ent.data, ent.obj, ent.res, ent.res2);
+
+		/* FIXME: split checks in two */
+		ret = -EFAULT;
+		if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
+			dprintk("aio: lost an event due to EFAULT.\n");
+			break;
+		}
+		ret = 0;
+
+		/* Good, event copied to userland, update counts. */
+		event ++;
+		i ++;
+	}
+
+	if (i)
+		return i;	/* something was delivered: never sleep */
+	if (ret)
+		return ret;	/* -EFAULT on the very first event */
+
+	/* End fast path */
+
+	init_timeout(&to);
+	if (timeout) {
+		struct timespec	ts;
+		ret = -EFAULT;
+		if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
+			goto out;
+
+		set_timeout(&to, &ts);
+		if (to.timed_out)
+			timeout = 0;	/* zero timespec: no timer was added, so skip clear_timeout() */
+	}
+
+	while (likely(i < nr)) {
+		add_wait_queue_exclusive_lifo(&ctx->wait, &wait);
+		add_wait_queue(&to.wait, &to_wait);
+		do {
+			set_task_state(tsk, TASK_INTERRUPTIBLE);
+
+			ret = aio_read_evt(ctx, &ent);
+			if (ret)
+				break;
+			if (i)	/* already have events: return them rather than sleep */
+				break;
+			ret = 0;
+			if (to.timed_out)	/* Only check after read evt */
+				break;
+			schedule();
+			if (signal_pending(tsk)) {
+				ret = -EINTR;
+				break;
+			}
+			/*ret = aio_read_evt(ctx, &ent);*/
+		} while (1) ;
+
+		set_task_state(tsk, TASK_RUNNING);
+		remove_wait_queue(&ctx->wait, &wait);
+		remove_wait_queue(&to.wait, &to_wait);
+
+		if (unlikely(ret <= 0))
+			break;
+
+		ret = -EFAULT;
+		if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
+			dprintk("aio: lost an event due to EFAULT.\n");
+			break;
+		}
+
+		/* Good, event copied to userland, update counts. */
+		event ++;
+		i ++;
+	}
+
+	if (timeout)
+		clear_timeout(&to);
+out:
+	return i ? i : ret;
+}
+
+/* Take an ioctx and remove it from the list of ioctx's.  Protects 
+ * against races with itself via ->dead. 
+ */ +static void io_destroy(struct kioctx *ioctx) +{ + struct kioctx **tmp; + int was_dead; + + /* delete the entry from the list is someone else hasn't already */ + br_write_lock(BR_AIO_REQ_LOCK); + was_dead = ioctx->dead; + ioctx->dead = 1; + for (tmp = ¤t->mm->ioctx_list; *tmp && *tmp != ioctx; + tmp = &(*tmp)->next) + ; + if (*tmp) + *tmp = ioctx->next; + br_write_unlock(BR_AIO_REQ_LOCK); + + dprintk("aio_release(%p)\n", ioctx); + put_ioctx(ioctx); /* once for the lookup */ + if (likely(!was_dead)) + put_ioctx(ioctx); /* twice for the list */ +} + +asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t *ctxp) +{ + struct kioctx *ioctx = NULL; + unsigned long ctx; + long ret; + + ret = get_user(ctx, ctxp); + if (unlikely(ret)) + goto out; + + ret = -EINVAL; + if (unlikely(ctx || !nr_reqs || (int)nr_reqs < 0)) { + pr_debug("EINVAL: io_setup: ctx or nr_reqs > max\n"); + goto out; + } + + ret = -EAGAIN; + if (unlikely(nr_reqs > max_aio_reqs)) + goto out; + + ioctx = ioctx_alloc(nr_reqs); + ret = PTR_ERR(ioctx); + if (!IS_ERR(ioctx)) { + ret = put_user(ioctx->user_id, ctxp); + if (!ret) + return 0; + io_destroy(ioctx); + } + +out: + return ret; +} + +/* aio_release + * Release the kioctx associated with the userspace handle. + */ +asmlinkage long sys_io_destroy(aio_context_t ctx) +{ + struct kioctx *ioctx = lookup_ioctx(ctx); + if (likely(NULL != ioctx)) { + io_destroy(ioctx); + return 0; + } + pr_debug("EINVAL: io_destroy: invalid context id\n"); + return -EINVAL; +} + +int generic_aio_poll(struct file *file, struct kiocb *req, struct iocb iocb) +{ + unsigned events = iocb.aio_buf; + + /* Did the user set any bits they weren't supposed to? (The + * above is actually a cast. + */ + if (unlikely(events != iocb.aio_buf)) + return -EINVAL; + + return async_poll(req, events); +} + +/* sys_io_submit + * Copy an aiocb from userspace into kernel space, then convert it to + * a kiocb, submit and repeat until done. 
Error codes on copy/submit + * only get returned for the first aiocb copied as otherwise the size + * of aiocbs copied is returned (standard write sematics). + */ +asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp) +{ + struct kioctx *ctx; + long ret = 0; + int i; + + if (unlikely(nr < 0)) + return -EINVAL; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) { + pr_debug("EINVAL: io_submit: invalid context id\n"); + return -EINVAL; + } + + for (i=0; ifilp = file; + tmp.aio_key = req->key; + ret = put_user(tmp.aio_key, &iocbp->aio_key); + if (unlikely(ret)) { + dprintk("EFAULT: aio_key\n"); + goto out_put_req; + } + + req->user_obj = iocbp; + req->user_data = tmp.aio_data; + req->buf = tmp.aio_buf; + req->pos = tmp.aio_offset; + req->size = tmp.aio_nbytes; + + switch (tmp.aio_lio_opcode) { + case IOCB_CMD_PREAD: + op = file->f_op->aio_read; + ret = -EBADF; + if (!(file->f_mode & FMODE_READ)) + goto out_put_req; + break; + case IOCB_CMD_PREADX: + op = file->f_op->aio_readx; + ret = -EBADF; + if (!(file->f_mode & FMODE_READ)) + goto out_put_req; + break; + case IOCB_CMD_PWRITE: + op = file->f_op->aio_write; + ret = -EBADF; + if (!(file->f_mode & FMODE_WRITE)) + goto out_put_req; + break; + case IOCB_CMD_FSYNC: + op = file->f_op->aio_fsync; + break; + //case IOCB_CMD_POLL: + // op = generic_aio_poll; + // break; + default: + op = NULL; + break; + } + + ret = -EINVAL; + if (unlikely(!op)) { + pr_debug("EINVAL: io_submit: no operation provided\n"); + goto out_put_req; + } + + ret = op(file, req, tmp); + if (likely(!ret)) + continue; + + pr_debug("io_submit: op returned %ld\n", ret); + aio_complete(req, ret, 0); + ret = 0; /* A completion event was sent, so + * submit is a success. */ + continue; + + out_put_req: + aio_put_req(ctx, req); + break; + } + + put_ioctx(ctx); + run_task_queue(&tq_disk); + return i ? 
i : ret; +} + +static void generic_aio_next_chunk(void *_iocb) +{ + int (*kvec_op)(struct file *, kvec_cb_t, size_t, loff_t); + struct kiocb *iocb = _iocb; + int rw = iocb->this_size; + unsigned long buf = iocb->buf; + kvec_cb_t cb; + ssize_t res; + + iocb->this_size = iocb->size - iocb->nr_transferred; + if (iocb->this_size > aio_max_size) + iocb->this_size = aio_max_size; + + buf += iocb->nr_transferred; + cb.vec = mm_map_user_kvec(iocb->ctx->mm, rw, buf, iocb->this_size); + cb.fn = (rw == READ) ? generic_aio_complete_read + : generic_aio_complete_write; + cb.data = iocb; + + dprintk("generic_aio_rw: cb.vec=%p\n", cb.vec); + if (unlikely(IS_ERR(cb.vec))) + goto done; + + kvec_op = (rw == READ) ? iocb->filp->f_op->kvec_read + : iocb->filp->f_op->kvec_write; + dprintk("submit: %d %d %d\n", iocb->this_size, iocb->nr_transferred, iocb->size); + res = kvec_op(iocb->filp, cb, iocb->this_size, + iocb->pos + iocb->nr_transferred); + if (!res) { + dprintk("submit okay\n"); + return; + } + dprintk("submit failed: %d\n", res); + + cb.fn(cb.data, cb.vec, res); + return; + +done: + if (!iocb->nr_transferred) + BUG(); + aio_complete(iocb, iocb->nr_transferred, 0); +} + +static void generic_aio_complete_rw(int rw, void *_iocb, struct kvec *vec, ssize_t res) +{ + struct kiocb *iocb = _iocb; + + unmap_kvec(vec, rw == READ); + free_kvec(vec); + + if (res > 0) + iocb->nr_transferred += res; + + /* Was this chunk successful? Is there more left to transfer? */ + if (res == iocb->this_size && iocb->nr_transferred < iocb->size) { + /* We may be in irq context, so queue processing in + * process context. + */ + iocb->this_size = rw; + INIT_TQUEUE(&iocb->tq, generic_aio_next_chunk, iocb); + schedule_task(&iocb->tq); + return; + } + + aio_complete(iocb, iocb->nr_transferred ? 
iocb->nr_transferred : res, + 0); +} + +static void generic_aio_complete_read(void *_iocb, struct kvec *vec, ssize_t res) +{ + generic_aio_complete_rw(READ, _iocb, vec, res); +} + +static void generic_aio_complete_write(void *_iocb, struct kvec *vec, ssize_t res) +{ + generic_aio_complete_rw(WRITE, _iocb, vec, res); +} + +ssize_t generic_aio_rw(int rw, struct file *file, struct kiocb *req, struct iocb iocb, size_t min_size) +{ + int (*kvec_op)(struct file *, kvec_cb_t, size_t, loff_t); + unsigned long buf = iocb.aio_buf; + size_t size = iocb.aio_nbytes; + size_t nr_read = 0; + loff_t pos = iocb.aio_offset; + kvec_cb_t cb; + ssize_t res; + +#if 0 + if (likely(NULL != file->f_op->new_read)) { + nr_read = file->f_op->new_read(file, (void *)buf, size, + &pos, F_ATOMIC); + dprintk("from new_read: nr_read: %ld\n", (long)nr_read); + if ((-EAGAIN == nr_read) || (-EWOULDBLOCKIO == nr_read)) + nr_read = 0; + else if ((nr_read >= min_size) || (nr_read < 0)) { + dprintk("returning nr_read: %ld\n", (long)nr_read); + return nr_read; + } + } + dprintk("nr_read: %ld\n", (long)nr_read); +#endif + + req->nr_transferred = nr_read; + size -= nr_read; + if (size > aio_max_size) + /* We have to split up the request. Pin the mm + * struct for further use with map_user_kvec later. + */ + size = aio_max_size; + else + req->buf = 0; + + req->this_size = size; + + buf += nr_read; + cb.vec = map_user_kvec(rw, buf, size); + cb.fn = (rw == READ) ? generic_aio_complete_read + : generic_aio_complete_write; + cb.data = req; + + dprintk("generic_aio_rw: cb.vec=%p\n", cb.vec); + if (IS_ERR(cb.vec)) + return nr_read ? nr_read : PTR_ERR(cb.vec); + + kvec_op = (rw == READ) ? file->f_op->kvec_read : file->f_op->kvec_write; + + res = kvec_op(file, cb, size, pos); + if (unlikely(res != 0)) { + /* If the first chunk was successful, we have to run + * the callback to attempt the rest of the io. 
+ */ + if (res == size && req->buf) { + cb.fn(cb.data, cb.vec, res); + return 0; + } + + unmap_kvec(cb.vec, rw == READ); + free_kvec(cb.vec); + if (nr_read) { + if (res < 0) + res = 0; + res += nr_read; + } + } + return res; +} + +ssize_t generic_file_aio_read(struct file *file, struct kiocb *req, struct iocb iocb) +{ + return generic_aio_rw(READ, file, req, iocb, iocb.aio_nbytes); +} + +ssize_t generic_sock_aio_read(struct file *file, struct kiocb *req, struct iocb iocb) +{ + return generic_aio_rw(READ, file, req, iocb, 1); +} + +ssize_t generic_aio_write(struct file *file, struct kiocb *req, struct iocb iocb, size_t min_size) +{ + return generic_aio_rw(WRITE, file, req, iocb, 1); +#if 0 + unsigned long buf = iocb.aio_buf; + size_t size = iocb.aio_nbytes; + loff_t pos = iocb.aio_offset; + ssize_t nr_written = 0; + kvec_cb_t cb; + long res; +#if 0 + if (likely(NULL != file->f_op->new_write)) { + nr_written = file->f_op->new_write(file, (void *)buf, size, + &pos, F_ATOMIC); + pr_debug("generic_aio_write: new_write: %ld\n", (long)nr_written); + if (-EAGAIN == nr_written) + nr_written = 0; + if ((nr_written >= min_size) || (nr_written < 0)) + return nr_written; + } +#endif + + req->nr_transferred = nr_written; + size -= nr_written; + if (size > aio_max_size) + size = aio_max_size; + req->this_size = size; + buf += nr_written; + cb.vec = map_user_kvec(WRITE, buf, size); + cb.fn = generic_aio_complete_write; + cb.data = req; + + if (IS_ERR(cb.vec)) { + pr_debug("generic_aio_write: map_user_kvec: %ld\n", PTR_ERR(cb.vec)); + return nr_written ? 
nr_written : PTR_ERR(cb.vec); + } + + res = file->f_op->kvec_write(file, cb, size, iocb.aio_offset); + pr_debug("generic_aio_write: kvec_write: %ld\n", res); + if (unlikely(res != 0)) { + unmap_kvec(cb.vec, 0); + free_kvec(cb.vec); + if (nr_written) { + if (res < 0) + res = 0; + res += nr_written; + } + } + return res; +#endif +} + +ssize_t generic_file_aio_write(struct file *file, struct kiocb *req, struct iocb iocb) +{ + return generic_aio_write(file, req, iocb, iocb.aio_nbytes); +} + +/* lookup_kiocb + * Finds a given iocb for cancellation. + * MUST be called with ctx->lock held. + */ +struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb *iocb, u32 key) +{ + struct list_head *pos; + /* TODO: use a hash or array, this sucks. */ + list_for_each(pos, &ctx->free_reqs) { + struct kiocb *kiocb = list_kiocb(pos); + if (kiocb->user_obj == iocb && kiocb->key == key) + return kiocb; + } + return NULL; +} + +asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb *iocb) +{ + int (*cancel)(struct kiocb *iocb); + struct kioctx *ctx; + struct kiocb *kiocb; + u32 key; + int ret; + + ret = get_user(key, &iocb->aio_key); + if (unlikely(ret)) + return ret; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) + return -EINVAL; + + spin_lock_irq(&ctx->lock); + ret = -EAGAIN; + kiocb = lookup_kiocb(ctx, iocb, key); + if (kiocb && kiocb->cancel) { + cancel = kiocb->cancel; + kiocb->users ++; + } else + cancel = NULL; + spin_unlock_irq(&ctx->lock); + + if (NULL != cancel) + ret = cancel(kiocb); + + put_ioctx(ctx); + + return ret; +} + +asmlinkage long sys_io_wait(aio_context_t ctx_id, struct iocb *iocb, const struct timespec *timeout) +{ +#if 0 /* FIXME. later. 
*/ + struct kioctx *ioctx; + long ret = -EINVAL; + unsigned key; + long obj = (long)iocb; + + ioctx = lookup_ioctx(ctx_id); + if (!ioctx) + goto out; + + ret = get_user(key, &iocb->aio_key); + if (ret) + goto out; + + ret = __aio_complete(ioctx, key, obj, !!timeout); + put_ioctx(ioctx); + +out: + return ret; +#endif + return -ENOSYS; +} + +asmlinkage long sys_io_getevents(aio_context_t ctx_id, + long nr, + struct io_event *events, + const struct timespec *timeout) /* Looks up ctx_id and returns whatever read_events() returns; -EINVAL when the context does not exist. */ +{ + struct kioctx *ioctx = lookup_ioctx(ctx_id); + long ret = -EINVAL; + + if (likely(NULL != ioctx)) { + ret = read_events(ioctx, nr, events, timeout); + put_ioctx(ioctx); + } + + return ret; +} + +__initcall(aio_setup); + +add_dynamic_syscall(sys_io_setup); +add_dynamic_syscall(sys_io_destroy); +add_dynamic_syscall(sys_io_submit); +add_dynamic_syscall(sys_io_cancel); +add_dynamic_syscall(sys_io_wait); +add_dynamic_syscall(sys_io_getevents); +EXPORT_SYMBOL_GPL(generic_file_kvec_read); +EXPORT_SYMBOL_GPL(generic_file_aio_read); +EXPORT_SYMBOL_GPL(generic_file_kvec_write); +EXPORT_SYMBOL_GPL(generic_file_aio_write); +EXPORT_SYMBOL_GPL(generic_file_new_read); diff -urN v2.4.19-pre5/fs/buffer.c linux.diff/fs/buffer.c --- v2.4.19-pre5/fs/buffer.c Wed Apr 3 21:04:36 2002 +++ linux.diff/fs/buffer.c Tue Apr 2 18:56:57 2002 @@ -3084,3 +3084,220 @@ module_init(bdflush_init) +/* async kio interface: one brw_cb is shared by every buffer_head of a single brw_kvec_async() call */ +struct brw_cb { + kvec_cb_t cb; /* completion callback, run from brw_cb_put() */ + atomic_t io_count; /* set to nr + 1 at submit time: one per buffer_head plus the submitter */ + int nr; /* number of valid entries in bh[] */ + struct buffer_head *bh[1]; /* really variable length; extra slots are added at kmalloc time */ +}; + +static inline void brw_cb_put(struct brw_cb *brw_cb) /* Drop one io_count reference; the final put totals the result, frees the buffer_heads, runs cb.fn and frees brw_cb. */ +{ + if (atomic_dec_and_test(&brw_cb->io_count)) { + ssize_t res = 0, err = 0; + int nr; + + /* Walk the buffer heads associated with this kiobuf + * checking for errors and freeing them as we go. 
+ */ + for (nr=0; nr < brw_cb->nr; nr++) { + struct buffer_head *bh = brw_cb->bh[nr]; + if (!err && buffer_uptodate(bh)) + res += bh->b_size; + else + err = -EIO; + kmem_cache_free(bh_cachep, bh); + } + + if (!res) + res = err; + + brw_cb->cb.fn(brw_cb->cb.data, brw_cb->cb.vec, res); + + kfree(brw_cb); + } +} + +/* + * IO completion routine for a buffer_head being used for kiobuf IO: we + * can't dispatch the kiobuf callback until io_count reaches 0. + */ + +static void end_buffer_io_kiobuf_async(struct buffer_head *bh, int uptodate) +{ + struct brw_cb *brw_cb; + + mark_buffer_uptodate(bh, uptodate); + + brw_cb = bh->b_private; + unlock_buffer(bh); + + brw_cb_put(brw_cb); +} + + +/* + * Start I/O on a physical range of kernel memory, defined by a vector + * of kiobuf structs (much like a user-space iovec list). + * + * The kiobuf must already be locked for IO. IO is submitted + * asynchronously: you need to check page->locked, page->uptodate, and + * maybe wait on page->wait. + * + * It is up to the caller to make sure that there are enough blocks + * passed in to completely map the iobufs to disk. Returns 0 once every buffer_head has been submitted (completion is reported through cb.fn), -EINVAL for a misaligned veclet or blocks == 0, and -ENOMEM when an allocation fails. + */ + +int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned blocks, unsigned long blknr, int sector_shift) +{ + struct kvec *vec = cb.vec; + struct kveclet *veclet; + int err; + int length; + unsigned sector_size = 1 << sector_shift; + int i; + + struct brw_cb *brw_cb; + + if (!vec->nr) + BUG(); + + /* + * First, do some alignment and validity checks + */ + length = 0; + for (veclet=vec->veclet, i=0; i < vec->nr; i++,veclet++) { + length += veclet->length; + if ((veclet->offset & (sector_size-1)) || + (veclet->length & (sector_size-1))) { + printk("brw_kiovec_async: tuple[%d]->offset=0x%x length=0x%x sector_size: 0x%x\n", i, veclet->offset, veclet->length, sector_size); + return -EINVAL; + } + } + + if (length < (blocks << sector_shift)) + BUG(); + + /* + * OK to walk down the iovec doing page IO on each page we find. 
+ */ + err = 0; + + if (!blocks) { + printk("brw_kiovec_async: !i\n"); + return -EINVAL; + } + + /* FIXME: tie into userbeans here */ + brw_cb = kmalloc(sizeof(*brw_cb) + (blocks * sizeof(struct buffer_head *)), GFP_KERNEL); + if (!brw_cb) + return -ENOMEM; + + brw_cb->cb = cb; + brw_cb->nr = 0; + + /* This is ugly. FIXME. */ + for (i=0, veclet=vec->veclet; i < vec->nr; i++,veclet++) { + struct page *page = veclet->page; + unsigned offset = veclet->offset; + unsigned length = veclet->length; + + if (!page) + BUG(); + + while (length > 0) { + struct buffer_head *tmp; + tmp = kmem_cache_alloc(bh_cachep, GFP_NOIO); + err = -ENOMEM; + if (!tmp) + goto error; + + tmp->b_dev = B_FREE; + tmp->b_size = sector_size; + set_bh_page(tmp, page, offset); + tmp->b_this_page = tmp; + + init_buffer(tmp, end_buffer_io_kiobuf_async, NULL); + tmp->b_dev = dev; + tmp->b_blocknr = blknr++; + tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) + | (1 << BH_Req); + tmp->b_private = brw_cb; + + if (rw == WRITE) { + set_bit(BH_Uptodate, &tmp->b_state); + clear_bit(BH_Dirty, &tmp->b_state); + } + + brw_cb->bh[brw_cb->nr++] = tmp; + length -= sector_size; + offset += sector_size; + + if (offset >= PAGE_SIZE) { /* NOTE(review): this break skips the nr >= blocks test below, so when the veclets hold more than blocks sectors an extra buffer_head can be queued; confirm bh[] cannot be overrun when sector_size == PAGE_SIZE */ + offset = 0; + break; + } + + if (brw_cb->nr >= blocks) + goto submit; + } /* End of block loop */ + } /* End of page loop */ + +submit: + atomic_set(&brw_cb->io_count, brw_cb->nr+1); + /* okay, we've setup all our io requests, now fire them off! */ + for (i=0; i < brw_cb->nr; i++) + submit_bh(rw, brw_cb->bh[i]); + brw_cb_put(brw_cb); + + return 0; + +error: + /* Walk brw_cb_table freeing all the goop associated with each kiobuf */ + if (brw_cb) { + /* We got an error allocating the bh'es. Just free the current + buffer_heads and exit. 
*/ + for (i=0; i < brw_cb->nr; i++) + kmem_cache_free(bh_cachep, brw_cb->bh[i]); + kfree(brw_cb); + } + + return err; +} +#if 0 +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int i; + int transferred = 0; + int err = 0; + + if (!nr) + return 0; + + /* queue up and trigger the io */ + err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size); + if (err) + goto out; + + /* wait on the last iovec first -- it's more likely to finish last */ + for (i=nr; --i >= 0; ) + kiobuf_wait_for_io(iovec[i]); + + run_task_queue(&tq_disk); + + /* okay, how much data actually got through? */ + for (i=0; i < nr; i++) { + if (iovec[i]->errno) { + if (!err) + err = iovec[i]->errno; + break; + } + transferred += iovec[i]->length; + } + +out: + return transferred ? transferred : err; +} +#endif diff -urN v2.4.19-pre5/fs/exec.c linux.diff/fs/exec.c --- v2.4.19-pre5/fs/exec.c Wed Apr 3 21:04:36 2002 +++ linux.diff/fs/exec.c Mon Apr 29 15:54:22 2002 @@ -397,6 +397,7 @@ old_mm = current->mm; if (old_mm && atomic_read(&old_mm->mm_users) == 1) { mm_release(); + exit_aio(old_mm); exit_mmap(old_mm); return 0; } diff -urN v2.4.19-pre5/fs/ext2/file.c linux.diff/fs/ext2/file.c --- v2.4.19-pre5/fs/ext2/file.c Thu Nov 1 16:40:02 2001 +++ linux.diff/fs/ext2/file.c Sun Apr 7 18:47:48 2002 @@ -40,6 +40,8 @@ */ struct file_operations ext2_file_operations = { llseek: generic_file_llseek, + kvec_read: generic_file_kvec_read, + kvec_write: generic_file_kvec_write, read: generic_file_read, write: generic_file_write, ioctl: ext2_ioctl, @@ -47,6 +49,8 @@ open: generic_file_open, release: ext2_release_file, fsync: ext2_sync_file, + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, }; struct inode_operations ext2_file_inode_operations = { diff -urN v2.4.19-pre5/fs/ext3/file.c linux.diff/fs/ext3/file.c --- v2.4.19-pre5/fs/ext3/file.c Mon Nov 26 23:43:08 2001 +++ linux.diff/fs/ext3/file.c Sun Apr 7 18:47:59 2002 @@ -78,6 +78,8 @@ struct 
file_operations ext3_file_operations = { llseek: generic_file_llseek, /* BKL held */ + kvec_read: generic_file_kvec_read, + kvec_write: generic_file_kvec_write, /* FIXME: attributes */ read: generic_file_read, /* BKL not held. Don't need */ write: ext3_file_write, /* BKL not held. Don't need */ ioctl: ext3_ioctl, /* BKL held */ @@ -85,6 +87,8 @@ open: ext3_open_file, /* BKL not held. Don't need */ release: ext3_release_file, /* BKL not held. Don't need */ fsync: ext3_sync_file, /* BKL held */ + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, }; struct inode_operations ext3_file_inode_operations = { diff -urN v2.4.19-pre5/fs/fcblist.c linux.diff/fs/fcblist.c --- v2.4.19-pre5/fs/fcblist.c Wed Dec 31 19:00:00 1969 +++ linux.diff/fs/fcblist.c Tue Apr 30 17:29:31 2002 @@ -0,0 +1,130 @@ +/* + * linux/fs/fcblist.c + * + * Copyright (C) 2001, Davide Libenzi + * + * Handle file callbacks + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +long ion_band_table[NSIGPOLL] = { + ION_IN, /* POLL_IN */ + ION_OUT, /* POLL_OUT */ + ION_IN, /* POLL_MSG */ + ION_ERR, /* POLL_ERR */ + 0, /* POLL_PRI */ + ION_HUP /* POLL_HUP */ +}; +EXPORT_SYMBOL(ion_band_table); + +long poll_band_table[NSIGPOLL] = { + POLLIN | POLLRDNORM, /* POLL_IN */ + POLLOUT | POLLWRNORM | POLLWRBAND, /* POLL_OUT */ + POLLIN | POLLRDNORM | POLLMSG, /* POLL_MSG */ + POLLERR, /* POLL_ERR */ + POLLPRI | POLLRDBAND, /* POLL_PRI */ + POLLHUP | POLLERR /* POLL_HUP */ +}; +EXPORT_SYMBOL(poll_band_table); + + +void file_notify_event(struct file *filep, long *event) /* Call every callback registered on filep, passing it the event vector; the callbacks run under the f_cblock read lock taken with read_lock_irqsave(). */ +{ + unsigned long flags; + struct list_head *lnk, *lsthead; + + fcblist_read_lock(filep, flags); + + lsthead = &filep->f_cblist; + list_for_each(lnk, lsthead) { + struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, llink); + + fcbp->cbproc(filep, fcbp->data, fcbp->local, event); + } + + fcblist_read_unlock(filep, flags); +} +EXPORT_SYMBOL(file_notify_event); + + +int 
file_notify_addcb(struct file *filep, + void (*cbproc)(struct file *, void *, unsigned long *, long *), void *data) /* Append a zero-initialised callback node (cbproc, data) to filep's list. Returns 0, or -ENOMEM when the GFP_KERNEL allocation fails. */ +{ + unsigned long flags; + struct fcb_struct *fcbp; + + if (!(fcbp = (struct fcb_struct *) kmalloc(sizeof(struct fcb_struct), GFP_KERNEL))) + return -ENOMEM; + + memset(fcbp, 0, sizeof(struct fcb_struct)); + fcbp->cbproc = cbproc; + fcbp->data = data; + + fcblist_write_lock(filep, flags); + list_add_tail(&fcbp->llink, &filep->f_cblist); + fcblist_write_unlock(filep, flags); + + return 0; +} +EXPORT_SYMBOL(file_notify_addcb); + + +int file_notify_delcb(struct file *filep, + void (*cbproc)(struct file *, void *, unsigned long *, long *)) /* Unlink and free the first node whose callback pointer equals cbproc. Returns 0 on success, -ENOENT when no such node exists. */ +{ + unsigned long flags; + struct list_head *lnk, *lsthead; + + fcblist_write_lock(filep, flags); + + lsthead = &filep->f_cblist; + list_for_each(lnk, lsthead) { + struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, llink); + + if (fcbp->cbproc == cbproc) { + list_del(lnk); + fcblist_write_unlock(filep, flags); + kfree(fcbp); + return 0; + } + } + + fcblist_write_unlock(filep, flags); + + return -ENOENT; +} +EXPORT_SYMBOL(file_notify_delcb); + + +void file_notify_cleanup(struct file *filep) /* Unlink and free every callback node on filep; the write lock is released around each kfree(). NOTE(review): list_first() is not defined in this chunk -- the loop assumes it yields NULL for an empty list. */ +{ + unsigned long flags; + struct list_head *lnk, *lsthead; + + fcblist_write_lock(filep, flags); + + lsthead = &filep->f_cblist; + while ((lnk = list_first(lsthead))) { + struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, llink); + + list_del(lnk); + fcblist_write_unlock(filep, flags); + kfree(fcbp); + fcblist_write_lock(filep, flags); + } + + fcblist_write_unlock(filep, flags); +} +EXPORT_SYMBOL(file_notify_cleanup); + diff -urN v2.4.19-pre5/fs/file_table.c linux.diff/fs/file_table.c --- v2.4.19-pre5/fs/file_table.c Mon Sep 24 02:16:04 2001 +++ linux.diff/fs/file_table.c Tue Apr 30 17:29:31 2002 @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,7 @@ f->f_uid = current->fsuid; f->f_gid = current->fsgid; list_add(&f->f_list, &anon_list); + file_notify_init(f); 
file_list_unlock(); return f; } @@ -91,6 +93,7 @@ filp->f_uid = current->fsuid; filp->f_gid = current->fsgid; filp->f_op = dentry->d_inode->i_fop; + file_notify_init(filp); if (filp->f_op->open) return filp->f_op->open(dentry->d_inode, filp); else @@ -99,31 +102,36 @@ void fput(struct file * file) { + if (atomic_dec_and_test(&file->f_count)) + __fput(file); +} + +void __fput(struct file * file) +{ struct dentry * dentry = file->f_dentry; struct vfsmount * mnt = file->f_vfsmnt; struct inode * inode = dentry->d_inode; - if (atomic_dec_and_test(&file->f_count)) { - locks_remove_flock(file); + file_notify_cleanup(file); + locks_remove_flock(file); - if (file->f_iobuf) - free_kiovec(1, &file->f_iobuf); + if (file->f_iobuf) + free_kiovec(1, &file->f_iobuf); - if (file->f_op && file->f_op->release) - file->f_op->release(inode, file); - fops_put(file->f_op); - if (file->f_mode & FMODE_WRITE) - put_write_access(inode); - file_list_lock(); - file->f_dentry = NULL; - file->f_vfsmnt = NULL; - list_del(&file->f_list); - list_add(&file->f_list, &free_list); - files_stat.nr_free_files++; - file_list_unlock(); - dput(dentry); - mntput(mnt); - } + if (file->f_op && file->f_op->release) + file->f_op->release(inode, file); + fops_put(file->f_op); + if (file->f_mode & FMODE_WRITE) + put_write_access(inode); + file_list_lock(); + file->f_dentry = NULL; + file->f_vfsmnt = NULL; + list_del(&file->f_list); + list_add(&file->f_list, &free_list); + files_stat.nr_free_files++; + file_list_unlock(); + dput(dentry); + mntput(mnt); } struct file * fget(unsigned int fd) diff -urN v2.4.19-pre5/fs/locks.c linux.diff/fs/locks.c --- v2.4.19-pre5/fs/locks.c Thu Nov 1 16:40:02 2001 +++ linux.diff/fs/locks.c Mon Apr 8 16:46:00 2002 @@ -440,7 +440,7 @@ while (!list_empty(&blocker->fl_block)) { struct file_lock *waiter = list_entry(blocker->fl_block.next, struct file_lock, fl_block); - if (wait) { + if (0) { locks_notify_blocked(waiter); /* Let the blocked process remove waiter from the * block list when 
it gets scheduled. diff -urN v2.4.19-pre5/fs/nfs/file.c linux.diff/fs/nfs/file.c --- v2.4.19-pre5/fs/nfs/file.c Thu Mar 7 16:40:04 2002 +++ linux.diff/fs/nfs/file.c Tue Apr 2 18:56:58 2002 @@ -39,9 +39,13 @@ static ssize_t nfs_file_write(struct file *, const char *, size_t, loff_t *); static int nfs_file_flush(struct file *); static int nfs_fsync(struct file *, struct dentry *dentry, int datasync); +static int nfs_kvec_write(struct file *file, kvec_cb_t cb, size_t count, loff_t pos); +static int nfs_kvec_read(struct file *file, kvec_cb_t cb, size_t count, loff_t pos); struct file_operations nfs_file_operations = { llseek: generic_file_llseek, + kvec_read: nfs_kvec_read, + kvec_write: nfs_kvec_write, read: nfs_file_read, write: nfs_file_write, mmap: nfs_file_mmap, @@ -50,6 +54,8 @@ release: nfs_release, fsync: nfs_fsync, lock: nfs_lock, + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, }; struct inode_operations nfs_file_inode_operations = { @@ -88,6 +94,28 @@ return status; } +static int nfs_kvec_write(struct file *file, kvec_cb_t cb, size_t count, loff_t pos) /* Revalidate the inode, then defer to generic_file_kvec_write(); a revalidation error is returned unchanged. */ +{ + struct dentry * dentry = file->f_dentry; + struct inode * inode = dentry->d_inode; + int ret; + ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (!ret) + return generic_file_kvec_write(file, cb, count, pos); + return ret; +} + +static int nfs_kvec_read(struct file *file, kvec_cb_t cb, size_t count, loff_t pos) /* Revalidate the inode, then defer to generic_file_kvec_read(); a revalidation error is returned unchanged. */ +{ + struct dentry * dentry = file->f_dentry; + struct inode * inode = dentry->d_inode; + int ret; + ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (!ret) + return generic_file_kvec_read(file, cb, count, pos); + return ret; +} + static ssize_t nfs_file_read(struct file * file, char * buf, size_t count, loff_t *ppos) { diff -urN v2.4.19-pre5/fs/pipe.c linux.diff/fs/pipe.c --- v2.4.19-pre5/fs/pipe.c Wed Apr 3 21:04:37 2002 +++ linux.diff/fs/pipe.c Tue Apr 30 17:29:31 2002 @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -40,6 
+41,7 @@ pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; + int pfull; ssize_t size, read, ret; /* Seeks are not allowed on pipes. */ @@ -72,6 +74,7 @@ PIPE_WAITING_READERS(*inode)++; pipe_wait(inode); PIPE_WAITING_READERS(*inode)--; + pfull = PIPE_FULL(*inode); ret = -ERESTARTSYS; if (signal_pending(current)) goto out; @@ -82,6 +85,8 @@ goto out; } } + else + pfull = PIPE_FULL(*inode); /* Read what data is available. */ ret = -EFAULT; @@ -104,6 +109,9 @@ count -= chars; buf += chars; } + /* Send notification message */ + if (pfull && !PIPE_FULL(*inode) && PIPE_WRITEFILE(*inode)) + file_send_notify(PIPE_WRITEFILE(*inode), ION_OUT, POLLOUT | POLLWRNORM | POLLWRBAND); /* Cache behaviour optimization */ if (!PIPE_LEN(*inode)) @@ -138,6 +146,7 @@ pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; + int pempty; ssize_t free, written, ret; /* Seeks are not allowed on pipes. */ @@ -182,6 +191,7 @@ } /* Copy into available space. */ + pempty = PIPE_EMPTY(*inode); ret = -EFAULT; while (count > 0) { int space; @@ -210,6 +220,9 @@ break; do { + /* Send notification message */ + if (pempty && !PIPE_EMPTY(*inode) && PIPE_READFILE(*inode)) + file_send_notify(PIPE_READFILE(*inode), ION_IN, POLLIN | POLLRDNORM); /* * Synchronous wake-up: it knows that this process * is going to give up this CPU, so it doesnt have @@ -219,6 +232,7 @@ PIPE_WAITING_WRITERS(*inode)++; pipe_wait(inode); PIPE_WAITING_WRITERS(*inode)--; + pempty = PIPE_EMPTY(*inode); if (signal_pending(current)) goto out; if (!PIPE_READERS(*inode)) @@ -227,6 +241,9 @@ ret = -EFAULT; } + /* Send notification message */ + if (pempty && !PIPE_EMPTY(*inode) && PIPE_READFILE(*inode)) + file_send_notify(PIPE_READFILE(*inode), ION_IN, POLLIN | POLLRDNORM); /* Signal readers asynchronously that there is more data. 
*/ + wake_up_interruptible(PIPE_WAIT(*inode)); @@ -299,9 +316,22 @@ static int pipe_release(struct inode *inode, int decr, int decw) { + struct file *rdfile, *wrfile; down(PIPE_SEM(*inode)); PIPE_READERS(*inode) -= decr; PIPE_WRITERS(*inode) -= decw; + rdfile = PIPE_READFILE(*inode); + wrfile = PIPE_WRITEFILE(*inode); + if (decr && !PIPE_READERS(*inode)) { + PIPE_READFILE(*inode) = NULL; + if (wrfile) + file_send_notify(wrfile, ION_HUP, POLLHUP); + } + if (decw && !PIPE_WRITERS(*inode)) { + PIPE_WRITEFILE(*inode) = NULL; + if (rdfile) + file_send_notify(rdfile, ION_HUP, POLLHUP); + } if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) { struct pipe_inode_info *info = inode->i_pipe; inode->i_pipe = NULL; @@ -454,6 +484,7 @@ PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0; PIPE_WAITING_READERS(*inode) = PIPE_WAITING_WRITERS(*inode) = 0; PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1; + PIPE_READFILE(*inode) = PIPE_WRITEFILE(*inode) = NULL; return inode; fail_page: @@ -561,6 +592,9 @@ f2->f_mode = 2; f2->f_version = 0; + PIPE_READFILE(*inode) = f1; + PIPE_WRITEFILE(*inode) = f2; + fd_install(i, f1); fd_install(j, f2); fd[0] = i; diff -urN v2.4.19-pre5/fs/select.c linux.diff/fs/select.c --- v2.4.19-pre5/fs/select.c Mon Sep 24 02:16:05 2001 +++ linux.diff/fs/select.c Thu Apr 11 16:17:49 2002 @@ -12,6 +12,12 @@ * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). + * June 2001 + * Added async_poll implementation. -bcrl + * Nov 2001 + * Async poll improvements from Suparna Bhattacharya + * April 2002 + * smp safe async poll plus cancellation. 
-bcrl */ #include @@ -19,6 +25,8 @@ #include #include /* for STICKY_TIMEOUTS */ #include +#include +#include #include @@ -26,19 +34,36 @@ #define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM) struct poll_table_entry { - struct file * filp; wait_queue_t wait; wait_queue_head_t * wait_address; + struct file * filp; /* moved below wait: async_poll_waiter() casts the wait_queue_t pointer straight back to the entry, so wait has to be the first member */ + poll_table * p; /* owning poll table; async_poll_waiter() casts it to async_poll_table */ }; struct poll_table_page { + unsigned long size; /* PAGE_SIZE for page-backed tables, sizeof(the header) for the one embedded in async_poll_table */ struct poll_table_page * next; struct poll_table_entry * entry; struct poll_table_entry entries[0]; }; #define POLL_TABLE_FULL(table) \ - ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) + ((unsigned long)((table)->entry+1) > \ + (table)->size + (unsigned long)(table)) + +/* async poll uses only one entry per poll table as it is linked to an iocb */ +typedef struct async_poll_table_struct { + poll_table pt; /* must stay first: the async code casts between poll_table * and async_poll_table * */ + struct worktodo wtd; /* runs async_poll_complete() when queued by the waiter */ + int events; /* event mask for async poll */ + int wake; /* set by async_poll_waiter(), cleared by async_poll_complete() before it re-polls */ + long sync; /* bit 0 guards queueing of wtd and the immediate completion in async_poll() */ + struct poll_table_page pt_page; /* one poll table page hdr */ + struct poll_table_entry entries[1]; /* space for a single entry */ +} async_poll_table; + + +static kmem_cache_t *async_poll_table_cache; /* * Ok, Peter made a complicated, but straightforward multiple_wait() function. 
@@ -61,6 +86,8 @@ struct poll_table_page *old; entry = p->entry; + if (entry == p->entries) /* may happen with async poll */ + break; do { entry--; remove_wait_queue(entry->wait_address,&entry->wait); @@ -68,8 +95,98 @@ } while (entry > p->entries); old = p; p = p->next; - free_page((unsigned long) old); + if (old->size == PAGE_SIZE) + free_page((unsigned long) old); } + if (pt->iocb) + kmem_cache_free(async_poll_table_cache, pt); +} + +void async_poll_complete(void *data) /* Worktodo action: re-polls the file and, once a requested event (or POLLERR/POLLHUP) is set, tears down the wait queues and completes the iocb with that mask. */ +{ + async_poll_table *pasync = data; + poll_table *p = data; + struct kiocb *iocb = p->iocb; + unsigned int mask; + + pasync->wake = 0; + wmb(); + do { + mask = iocb->filp->f_op->poll(iocb->filp, p); + mask &= pasync->events | POLLERR | POLLHUP; + if (mask) { + poll_freewait(p); + aio_complete(iocb, mask, 0); + return; + } + pasync->sync = 0; + wmb(); + } while (pasync->wake); +} + +static void async_poll_waiter(wait_queue_t *wait) /* Wait-queue callback: records the wakeup and queues the worktodo at most once, guarded by bit 0 of sync. */ +{ + struct poll_table_entry *entry = (struct poll_table_entry *)wait; + async_poll_table *pasync = (async_poll_table *)(entry->p); + + /* avoid writes to the cacheline if possible for SMP */ + if (!pasync->wake) { + pasync->wake = 1; + /* ensure only one wake up queues the wtd */ + if (!pasync->sync && !test_and_set_bit(0, &pasync->sync)) + wtd_queue(&pasync->wtd); + } +} + +int async_poll_cancel(struct kiocb *iocb) /* Cancellation stub: clears iocb->cancel and returns -EAGAIN; nothing is torn down here and the local p is unused. */ +{ + poll_table *p = iocb->data; + iocb->cancel = NULL; + //wtd_queue(); + return -EAGAIN; +} + +int async_poll(struct kiocb *iocb, int events) /* Start an asynchronous poll for events on iocb->filp. Returns 0 once the request is set up (the result is delivered through aio_complete()), or -ENOMEM. */ +{ + unsigned int mask; + async_poll_table *pasync; + poll_table *p; + + pasync = kmem_cache_alloc(async_poll_table_cache, SLAB_KERNEL); + if (!pasync) + return -ENOMEM; + + p = (poll_table *)pasync; + poll_initwait(p); + wtd_set_action(&pasync->wtd, async_poll_complete, pasync); + p->iocb = iocb; + pasync->wake = 0; + pasync->sync = 0; + pasync->events = events; + pasync->pt_page.entry = pasync->pt_page.entries; + pasync->pt_page.size = sizeof(pasync->pt_page); /* NOTE(review): with this size POLL_TABLE_FULL() is already true for the first entry (entry+1 lies past header + size) -- confirm whether the embedded entries[1] was meant to be counted */ + p->table = &pasync->pt_page; + + iocb->data = p; + 
wmb(); + iocb->cancel = async_poll_cancel; + + mask = DEFAULT_POLLMASK; +#warning broken + iocb->users ++; /* hold a reference across the initial poll; released by aio_put_req() below */ + if (iocb->filp->f_op && iocb->filp->f_op->poll) + mask = iocb->filp->f_op->poll(iocb->filp, p); + mask &= events | POLLERR | POLLHUP; + if (mask && !test_and_set_bit(0, &pasync->sync)) + aio_complete(iocb, mask, 0); + + if (aio_put_req(iocb->ctx, iocb)) + /* Must be freed after aio_complete to synchronise with + * cancellation of the request. + */ + poll_freewait(p); + + return 0; } void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) @@ -85,6 +202,7 @@ __set_current_state(TASK_RUNNING); return; } + new_table->size = PAGE_SIZE; new_table->entry = new_table->entries; new_table->next = table; p->table = new_table; @@ -98,7 +216,11 @@ get_file(filp); entry->filp = filp; entry->wait_address = wait_address; - init_waitqueue_entry(&entry->wait, current); + entry->p = p; + if (p->iocb) /* async poll */ + init_waitqueue_func_entry(&entry->wait, async_poll_waiter); + else + init_waitqueue_entry(&entry->wait, current); add_wait_queue(wait_address,&entry->wait); } } @@ -494,3 +616,14 @@ poll_freewait(&table); return err; } + +static int __init async_poll_init(void) /* Create the slab cache used for async_poll_table objects; panics if the cache cannot be created. */ +{ + async_poll_table_cache = kmem_cache_create("async poll table", + sizeof(async_poll_table), 0, 0, NULL, NULL); + if (!async_poll_table_cache) + panic("unable to alloc poll_table_cache"); + return 0; +} + +module_init(async_poll_init); diff -urN v2.4.19-pre5/include/asm-i386/a.out.h linux.diff/include/asm-i386/a.out.h --- v2.4.19-pre5/include/asm-i386/a.out.h Fri Jun 16 14:33:06 1995 +++ linux.diff/include/asm-i386/a.out.h Tue Apr 2 18:56:58 2002 @@ -19,7 +19,9 @@ #ifdef __KERNEL__ -#define STACK_TOP TASK_SIZE +#define VSYSCALL_SIZE 0x10000 /* 64KB for vsyscalls */ +#define STACK_GUARD_SIZE 0x02000 /* 8KB guard area */ +#define STACK_TOP (TASK_SIZE - VSYSCALL_SIZE - STACK_GUARD_SIZE) #endif diff -urN v2.4.19-pre5/include/asm-i386/poll.h linux.diff/include/asm-i386/poll.h 
--- v2.4.19-pre5/include/asm-i386/poll.h Thu Jan 23 14:01:28 1997 +++ linux.diff/include/asm-i386/poll.h Tue Apr 30 17:29:31 2002 @@ -15,6 +15,7 @@ #define POLLWRNORM 0x0100 #define POLLWRBAND 0x0200 #define POLLMSG 0x0400 +#define POLLREMOVE 0x1000 struct pollfd { int fd; diff -urN v2.4.19-pre5/include/asm-i386/unistd.h linux.diff/include/asm-i386/unistd.h --- v2.4.19-pre5/include/asm-i386/unistd.h Wed Apr 3 21:04:38 2002 +++ linux.diff/include/asm-i386/unistd.h Tue Apr 2 18:56:58 2002 @@ -245,6 +245,8 @@ #define __NR_tkill 238 +#define __NR_sys_dynamic_syscall 250 + /* user-visible error numbers are in the range -1 - -124: see */ #define __syscall_return(type, res) \ diff -urN v2.4.19-pre5/include/asm-i386/vsyscall.h linux.diff/include/asm-i386/vsyscall.h --- v2.4.19-pre5/include/asm-i386/vsyscall.h Wed Dec 31 19:00:00 1969 +++ linux.diff/include/asm-i386/vsyscall.h Mon Apr 22 11:49:21 2002 @@ -0,0 +1,42 @@ +#ifndef __ASM__VSYSCALL_H +#define __ASM__VSYSCALL_H +/* include/asm-i386/vsyscall.h + * Copyright 2002 Red Hat, Inc. + */ +#include +#include + +/* We call sys_dynamic_syscall(long nr, void *args) using regparm(2) + * convention. The .text.vsyscall section is mapped into userspace, + * whereas .data.vsyscall_list is a kernel-only array of the vsyscalls + * and the valid userspace address to call them from. All vsyscalls + * are called with C calling convention (ie args on the stack for x86). + * + * Note: the layout of .data.vsyscall_list must match the entries in + * dynamic_syscall.c. 
+ */ +#define STRINGIFYa(x) #x +#define STRINGIFY(x) STRINGIFYa(x) +#define NR_dyn_sys STRINGIFY(__NR_sys_dynamic_syscall) +#define add_dynamic_syscall(name) \ + __asm__(" \n\ + .section .vsyscall_text, \"xa\" \n\ + .globl v" #name " \n\ + v" #name ": \n\ + push %ecx \n\ + push %edx \n\ + movl $" NR_dyn_sys ",%eax \n\ + movl $2f,%edx \n\ + leal 12(%esp),%ecx \n\ + int $0x80 \n\ + 1: \n\ + popl %edx \n\ + popl %ecx \n\ + ret \n\ + .size v" #name ",.-v" #name " \n\ + .previous \n\ + .section .data.vsyscall_list,\"a\" \n\ + 2: .long 1b," #name " \n\ + .previous") + +#endif diff -urN v2.4.19-pre5/include/linux/aio.h linux.diff/include/linux/aio.h --- v2.4.19-pre5/include/linux/aio.h Wed Dec 31 19:00:00 1969 +++ linux.diff/include/linux/aio.h Tue Apr 30 17:56:55 2002 @@ -0,0 +1,97 @@ +#ifndef __LINUX__AIO_H +#define __LINUX__AIO_H + +#include +#include +#include +#include + +#define AIO_MAXSEGS 4 +#define AIO_KIOGRP_NR_ATOMIC 8 + +struct kioctx; + +/* Notes on cancelling a kiocb: + * If a kiocb is cancelled, aio_complete may return 0 to indicate + * that cancel has not yet disposed of the kiocb. All cancel + * operations *must* call aio_put_req to dispose of the kiocb + * to guard against races with the completion code. + */ +#define KIOCB_C_CANCELLED 0x01 +#define KIOCB_C_COMPLETE 0x02 + +struct kiocb { + struct list_head list; /* list linkage; list_kiocb() maps a node back to its kiocb */ + struct file *filp; + struct kioctx *ctx; + void *user_obj; /* user-space iocb pointer; compared by lookup_kiocb() */ + __u64 user_data; + loff_t pos; + unsigned long buf; + size_t nr_transferred; /* used for chunking */ + size_t size; + size_t this_size; + unsigned key; /* id of this request */ + int (*cancel)(struct kiocb *kiocb); /* cancel method, or NULL when the request cannot be cancelled */ + void *data; /* for use by the async op */ + int users; /* reference count; raised by sys_io_cancel() and async_poll() */ + struct tq_struct tq; /* argh. 
*/ +}; + +#define AIO_RING_PAGES 8 +#define AIO_RING_PAGES 8 /* NOTE(review): exact duplicate of the preceding define; harmless but redundant */ +struct kioctx { + atomic_t users; /* reference count managed by get_ioctx()/put_ioctx() */ + int dead; + struct mm_struct *mm; + + /* This needs improving */ + unsigned long user_id; + struct kioctx *next; + + wait_queue_head_t wait; + + spinlock_t lock; + + int reqs_active; + struct list_head free_reqs; + struct list_head active_reqs; /* used for cancellation */ + + unsigned max_reqs; + unsigned ring_mask; + struct aio_ring *ring; + spinlock_t ring_lock; + int ring_was_vmallocd; + int ring_order; +}; + +extern struct file_operations aio_fops; + +extern int FASTCALL(aio_put_req(struct kioctx *ctx, struct kiocb *iocb)); +extern int FASTCALL(aio_complete(struct kiocb *iocb, long res, long res2)); +extern void __put_ioctx(struct kioctx *ctx); +struct mm_struct; +extern void exit_aio(struct mm_struct *mm); + +#define get_ioctx(kioctx) do { if (unlikely(atomic_read(&(kioctx)->users) <= 0)) BUG(); atomic_inc(&(kioctx)->users); } while (0) /* take a reference; BUGs if the context has no users left */ +#define put_ioctx(kioctx) do { if (unlikely(atomic_dec_and_test(&(kioctx)->users))) __put_ioctx(kioctx); else if (unlikely(atomic_read(&(kioctx)->users) < 0)) BUG(); } while (0) /* drop a reference; the final put calls __put_ioctx() */ + +#include + +static inline struct kiocb *list_kiocb(struct list_head *h) /* Map a list node back to the kiocb that embeds it through its list member. */ +{ + return list_entry(h, struct kiocb, list); +} + +struct file; +extern int generic_aio_poll(struct file *file, struct kiocb *req, struct iocb iocb); +extern ssize_t generic_aio_read(struct file *file, struct kiocb *req, struct iocb iocb, size_t min_size); +extern ssize_t generic_aio_write(struct file *file, struct kiocb *req, struct iocb iocb, size_t min_size); +extern ssize_t generic_file_aio_read(struct file *file, struct kiocb *req, struct iocb iocb); +extern ssize_t generic_file_aio_write(struct file *file, struct kiocb *req, struct iocb iocb); +extern ssize_t generic_sock_aio_read(struct file *file, struct kiocb *req, struct iocb iocb); + +/* for sysctl: */ +extern unsigned aio_nr, aio_max_nr, aio_max_size, aio_max_pinned; + +#endif /* __LINUX__AIO_H */ diff -urN 
v2.4.19-pre5/include/linux/aio_abi.h linux.diff/include/linux/aio_abi.h --- v2.4.19-pre5/include/linux/aio_abi.h Wed Dec 31 19:00:00 1969 +++ linux.diff/include/linux/aio_abi.h Thu Apr 11 19:12:13 2002 @@ -0,0 +1,102 @@ +/* linux/aio_abi.h + * + * Copyright 2000,2001,2002 Red Hat. + * + * Written by Benjamin LaHaise + * + * Permission to use, copy, modify, and distribute this software and its + * documentation is hereby granted, provided that the above copyright + * notice appears in all copies. This software is provided without any + * warranty, express or implied. Red Hat makes no representations about + * the suitability of this software for any purpose. + * + * IN NO EVENT SHALL RED HAT BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, + * SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF + * THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF RED HAT HAS BEEN ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * RED HAT DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND + * RED HAT HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, + * ENHANCEMENTS, OR MODIFICATIONS. + */ +#ifndef __LINUX__AIO_ABI_H +#define __LINUX__AIO_ABI_H + +#include + +typedef unsigned long aio_context_t; + +enum { + IOCB_CMD_PREAD = 0, + IOCB_CMD_PWRITE = 1, + IOCB_CMD_FSYNC = 2, + IOCB_CMD_FDSYNC = 3, + IOCB_CMD_PREADX = 4, + IOCB_CMD_POLL = 5, + IOCB_CMD_NOOP = 6, +}; + +/* read() from /dev/aio returns these structures. */ +struct io_event { + __u64 data; /* the data field from the iocb */ + __u64 obj; /* what iocb this event came from */ + __s64 res; /* result code for this event */ + __s64 res2; /* secondary result */ +}; + +#if defined(__LITTLE_ENDIAN) +#define PADDED(x,y) x, y /* declares two 32-bit fields ordered so that x is the low-order half of the 64-bit slot on either byte order */ +#elif defined(__BIG_ENDIAN) +#define PADDED(x,y) y, x +#else +#error edit for your odd byteorder. 
+#endif + +struct aio_ring { + __u32 PADDED(id, pad1); /* kernel internal index number */ + __u32 PADDED(mask, pad2); /* number of io_events - 1 */ + __u32 PADDED(head, pad3); + __u32 PADDED(tail, pad4); + + __u32 PADDED(woke, pad5); /* set when a wakeup was sent */ + + __u32 pad6[22]; /* pad out to 128 bytes */ + + struct io_event io_events[0]; +}; /* 128 bytes + ring size */ + +#define aio_ring_avail(ring) (((ring)->head + (ring)->mask - (ring)->tail) & (ring)->mask) /* evaluates to mask when head == tail and to 0 when tail is one slot behind head */ + +/* + * we always use a 64bit off_t when communicating + * with userland. it is up to libraries to do the + * proper padding and aio_error abstraction + */ + +struct iocb { + /* these are internal to the kernel/libc. */ + __u64 aio_data; /* data to be returned in event's data */ + __u32 PADDED(aio_key, aio_reserved1); + /* the kernel sets aio_key to the req # */ + + /* common fields */ + __u16 aio_lio_opcode; /* see IOCB_CMD_ above */ + __s16 aio_reqprio; + __u32 aio_fildes; + + __u64 aio_buf; + __u64 aio_nbytes; + __s64 aio_offset; + + /* extra parameters */ + __u64 aio_reserved2; + __u64 aio_reserved3; +}; /* 64 bytes */ + +#undef IFBIG +#undef IFLITTLE /* NOTE(review): IFBIG and IFLITTLE are never defined in this header; PADDED is the macro that stays defined */ + +#endif /* __LINUX__AIO_ABI_H */ + diff -urN v2.4.19-pre5/include/linux/brlock.h linux.diff/include/linux/brlock.h --- v2.4.19-pre5/include/linux/brlock.h Wed Apr 3 21:10:30 2002 +++ linux.diff/include/linux/brlock.h Thu Apr 11 17:35:53 2002 @@ -34,6 +34,7 @@ enum brlock_indices { BR_GLOBALIRQ_LOCK, BR_NETPROTO_LOCK, + BR_AIO_REQ_LOCK, __BR_END }; diff -urN v2.4.19-pre5/include/linux/errno.h linux.diff/include/linux/errno.h --- v2.4.19-pre5/include/linux/errno.h Tue Nov 6 20:40:27 2001 +++ linux.diff/include/linux/errno.h Tue Apr 2 18:56:57 2002 @@ -21,6 +21,9 @@ #define EBADTYPE 527 /* Type not supported by server */ #define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ +/* Defined for TUX async IO */ +#define EWOULDBLOCKIO 530 /* Would block due to block-IO */ + #endif #endif diff -urN 
v2.4.19-pre5/include/linux/eventpoll.h linux.diff/include/linux/eventpoll.h --- v2.4.19-pre5/include/linux/eventpoll.h Wed Dec 31 19:00:00 1969 +++ linux.diff/include/linux/eventpoll.h Tue Apr 30 17:29:31 2002 @@ -0,0 +1,43 @@ +/* + * include/linux/eventpoll.h + * + * Copyright (C) 2001, Davide Libenzi + * + * Efficient event polling implementation + */ + + +#ifndef _LINUX_EVENTPOLL_H +#define _LINUX_EVENTPOLL_H + + + + +#define EVENTPOLL_MINOR 124 +#define POLLFD_X_PAGE (PAGE_SIZE / sizeof(struct pollfd)) +#define MAX_FDS_IN_EVENTPOLL (1024 * 128) +#define MAX_EVENTPOLL_PAGES (MAX_FDS_IN_EVENTPOLL / POLLFD_X_PAGE) +#define EVENT_PAGE_INDEX(n) ((n) / POLLFD_X_PAGE) +#define EVENT_PAGE_REM(n) ((n) % POLLFD_X_PAGE) +#define EVENT_PAGE_OFFSET(n) (((n) % POLLFD_X_PAGE) * sizeof(struct pollfd)) +#define EP_FDS_PAGES(n) (((n) + POLLFD_X_PAGE - 1) / POLLFD_X_PAGE) +#define EP_MAP_SIZE(n) (EP_FDS_PAGES(n) * PAGE_SIZE * 2) + + + + + +struct evpoll { + int ep_timeout; + unsigned long ep_resoff; +}; + +#define EP_ALLOC _IOR('P', 1, int) +#define EP_POLL _IOWR('P', 2, struct evpoll) +#define EP_FREE _IO('P', 3) +#define EP_ISPOLLED _IOWR('P', 4, struct pollfd) + + + +#endif + diff -urN v2.4.19-pre5/include/linux/fcblist.h linux.diff/include/linux/fcblist.h --- v2.4.19-pre5/include/linux/fcblist.h Wed Dec 31 19:00:00 1969 +++ linux.diff/include/linux/fcblist.h Tue Apr 30 17:29:31 2002 @@ -0,0 +1,67 @@ +/* + * include/linux/fcblist.h + * + * Copyright (C) 2001, Davide Libenzi + * + * Handle file callbacks + */ + +#ifndef __LINUX_FCBLIST_H +#define __LINUX_FCBLIST_H + +#include +#include +#include +#include + + + +/* file callback notification events */ +#define ION_IN 1 +#define ION_OUT 2 +#define ION_HUP 3 +#define ION_ERR 4 + +#define FCB_LOCAL_SIZE 4 + +#define fcblist_read_lock(fp, fl) read_lock_irqsave(&(fp)->f_cblock, fl) +#define fcblist_read_unlock(fp, fl) read_unlock_irqrestore(&(fp)->f_cblock, fl) +#define fcblist_write_lock(fp, fl) write_lock_irqsave(&(fp)->f_cblock,
fl) +#define fcblist_write_unlock(fp, fl) write_unlock_irqrestore(&(fp)->f_cblock, fl) + +struct fcb_struct { + struct list_head llink; + void (*cbproc)(struct file *, void *, unsigned long *, long *); + void *data; + unsigned long local[FCB_LOCAL_SIZE]; +}; + + +extern long ion_band_table[]; +extern long poll_band_table[]; + + +void file_notify_event(struct file *filep, long *event); + +int file_notify_addcb(struct file *filep, + void (*cbproc)(struct file *, void *, unsigned long *, long *), void *data); + +int file_notify_delcb(struct file *filep, + void (*cbproc)(struct file *, void *, unsigned long *, long *)); + +void file_notify_cleanup(struct file *filep); + + /* Initialise a file's callback lock (f_cblock) and its empty callback list (f_cblist). */ +static inline void file_notify_init(struct file *filep) +{ + rwlock_init(&filep->f_cblock); + INIT_LIST_HEAD(&filep->f_cblist); +} + +static inline void file_send_notify(struct file *filep, long ioevt, long plevt) { + long event[] = { ioevt, plevt, -1 }; /* NOTE(review): the trailing -1 looks like a list terminator - confirm against file_notify_event() */ + + file_notify_event(filep, event); +} + +#endif diff -urN v2.4.19-pre5/include/linux/file.h linux.diff/include/linux/file.h --- v2.4.19-pre5/include/linux/file.h Wed Apr 3 21:04:40 2002 +++ linux.diff/include/linux/file.h Tue Apr 2 18:56:57 2002 @@ -5,6 +5,7 @@ #ifndef __LINUX_FILE_H #define __LINUX_FILE_H +extern void FASTCALL(__fput(struct file *)); extern void FASTCALL(fput(struct file *)); extern struct file * FASTCALL(fget(unsigned int fd)); diff -urN v2.4.19-pre5/include/linux/fs.h linux.diff/include/linux/fs.h --- v2.4.19-pre5/include/linux/fs.h Wed Apr 3 21:12:53 2002 +++ linux.diff/include/linux/fs.h Tue Apr 30 17:56:55 2002 @@ -196,6 +196,8 @@ #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ #ifdef __KERNEL__ +#include +#include #include #include @@ -536,6 +538,10 @@ /* needed for tty driver, and maybe others */ void *private_data; + /* file callback list */ + rwlock_t f_cblock; + struct list_head f_cblist; + /* preallocated helper kiobuf to speedup O_DIRECT */ struct kiobuf *f_iobuf; long f_iobuf_lock; @@ -823,6
+829,10 @@ * read, write, poll, fsync, readv, writev can be called * without the big kernel lock held in all filesystems. */ + +#define F_ATOMIC 0x0001 +#define F_OFFSETOK 0x0002 + struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); @@ -842,6 +852,16 @@ ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + + /* in-kernel fully async api */ + int (*kvec_read)(struct file *, kvec_cb_t, size_t, loff_t); + int (*kvec_write)(struct file *, kvec_cb_t, size_t, loff_t); + + /* userland aio ops */ + ssize_t (*aio_read)(struct file *, struct kiocb *, struct iocb); + ssize_t (*aio_readx)(struct file *, struct kiocb *, struct iocb); + ssize_t (*aio_write)(struct file *, struct kiocb *, struct iocb); + ssize_t (*aio_fsync)(struct file *, struct kiocb *, struct iocb); }; struct inode_operations { @@ -1420,12 +1440,16 @@ extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *); +extern ssize_t generic_file_new_read(struct file *, char *, size_t, loff_t *, int); extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *); -extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t); -extern loff_t no_llseek(struct file *file, loff_t offset, int origin); -extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); +extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t, int); +extern int generic_file_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos); +extern int generic_file_kvec_write(struct file 
*file, kvec_cb_t cb, size_t size, loff_t pos); + extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *); extern int generic_file_open(struct inode * inode, struct file * filp); +extern loff_t no_llseek(struct file *file, loff_t offset, int origin); +extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); extern struct file_operations generic_ro_fops; diff -urN v2.4.19-pre5/include/linux/iobuf.h linux.diff/include/linux/iobuf.h --- v2.4.19-pre5/include/linux/iobuf.h Wed Apr 3 21:12:55 2002 +++ linux.diff/include/linux/iobuf.h Tue Apr 30 17:56:55 2002 @@ -53,8 +53,10 @@ /* Dynamic state for IO completion: */ atomic_t io_count; /* IOs still in progress */ + int transferred; /* Number of bytes of completed IO at the beginning of the buffer */ int errno; /* Status of completed IO */ void (*end_io) (struct kiobuf *); /* Completion callback */ + void *end_io_data; wait_queue_head_t wait_queue; }; @@ -80,6 +82,8 @@ /* fs/buffer.c */ +int brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int size); int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], kdev_t dev, unsigned long b[], int size); diff -urN v2.4.19-pre5/include/linux/kiovec.h linux.diff/include/linux/kiovec.h --- v2.4.19-pre5/include/linux/kiovec.h Wed Dec 31 19:00:00 1969 +++ linux.diff/include/linux/kiovec.h Tue Apr 30 17:56:55 2002 @@ -0,0 +1,123 @@ +#ifndef __LINUX__KIOVEC_H +#define __LINUX__KIOVEC_H + +struct page; +#include + +struct kveclet { /* one fragment: `length` bytes starting `offset` bytes into `page` (this is how kvec_dst_map() uses the fields) */ + struct page *page; + unsigned offset; + unsigned length; +}; + +struct kvec { + unsigned max_nr; + unsigned nr; + struct kveclet veclet[0]; /* trailing variable-length array of fragments */ +}; + +struct kvec_cb { + struct kvec *vec; + void (*fn)(void *data, struct kvec *vec, ssize_t res); + void *data; +}; + +struct kvec_cb_list { + struct list_head list; + struct kvec_cb cb; +}; + +#ifndef _LINUX_TYPES_H +#include +#endif +#ifndef _LINUX_KDEV_T_H +#include +#endif +#ifndef _ASM_KMAP_TYPES_H +#include +#endif +
+extern struct kvec *map_user_kvec(int rw, unsigned long va, size_t len); +extern struct kvec *mm_map_user_kvec(struct mm_struct *, int rw, + unsigned long va, size_t len); +extern void unmap_kvec(struct kvec *, int dirtied); +extern void free_kvec(struct kvec *); + +/* brw_kvec_async: + * Performs direct io to/from disk into cb.vec. Count is the number + * of sectors to read, sector_shift is the blocksize (which must be + * compatible with the kernel's current idea of the device's sector + * size) in log2. blknr is the starting sector offset on dev. + * + */ +extern int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned count, + unsigned long blknr, int sector_shift); + +/* Memory copy helpers usage: + * void foo(... struct kveclet *veclet...) + * + * struct kvec_dst dst; + * + * kvec_dst_init(&dst, KM_USER0); -- resets type + * kvec_dst_set(&dst, veclet); -- set target & clear offset + * kvec_dst_map(&dst); -- activates kmap + * for (...) + * memcpy_to_kvec_dst(&dst, data, size); -- each copy appends + * kvec_dst_unmap(&dst); -- releases kmap + * + * Note that scheduling is not permitted between kvec_dst_map() and + * kvec_dst_unmap(). This is because internally the routines make use + * of an atomic kmap. Also note that kvec_dst_init() expands its Xdst argument once per assignment, so pass it an expression without side effects.
+ */ +struct kvec_dst { + char *addr; /* base address returned by kmap_atomic() in kvec_dst_map(); NULL after kvec_dst_unmap() */ + char *dst; /* current position inside the mapping: addr + veclet offset + saved offset */ + struct kveclet *let; + int space; /* veclet length minus the offset already consumed */ + int offset; + enum km_type type; +}; + + +#define kvec_dst_set(Xdst, Xlet) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + struct kveclet *_let = (Xlet); \ + _dst->let = _let; \ + _dst->space = _let->length; \ + _dst->offset = 0; \ + } while(0) + +#define kvec_dst_map(Xdst) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + struct kveclet *_let = _dst->let; \ + _dst->dst = _dst->addr = kmap_atomic(_let->page, _dst->type);\ + _dst->dst += _let->offset + _dst->offset; \ + _dst->space = _let->length - _dst->offset; \ + _dst->offset = 0; \ + } while(0) + +#define kvec_dst_init(Xdst, Xtype) \ + do { \ + (Xdst)->space = 0; \ + (Xdst)->addr = 0; \ + (Xdst)->offset = 0; \ + (Xdst)->type = Xtype; \ + } while(0) + +#define kvec_dst_unmap(Xdst) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + kunmap_atomic(_dst->addr, _dst->type); \ + _dst->offset = _dst->dst - _dst->addr; \ + _dst->offset -= _dst->let->offset; \ + _dst->addr = NULL; \ + } while(0) + +extern void FASTCALL(memcpy_to_kvec_dst(struct kvec_dst *dst, + const char *from, long len)); +extern void FASTCALL(memcpy_from_kvec_dst(char *to, + struct kvec_dst *from, long len)); + +#endif /* __LINUX__KIOVEC_H */ diff -urN v2.4.19-pre5/include/linux/list.h linux.diff/include/linux/list.h --- v2.4.19-pre5/include/linux/list.h Wed Apr 3 21:12:49 2002 +++ linux.diff/include/linux/list.h Tue Apr 30 17:29:31 2002 @@ -172,6 +172,11 @@ pos = pos->prev, prefetch(pos->prev)) +#define list_first(head) (((head)->next != (head)) ? (head)->next: (struct list_head *) 0) +#define list_last(head) (((head)->prev != (head)) ? (head)->prev: (struct list_head *) 0) +#define list_next(pos, head) (((pos)->next != (head)) ? (pos)->next: (struct list_head *) 0) +#define list_prev(pos, head) (((pos)->prev != (head)) ?
(pos)->prev: (struct list_head *) 0) + #endif /* __KERNEL__ || _LVM_H_INCLUDE */ #endif diff -urN v2.4.19-pre5/include/linux/net.h linux.diff/include/linux/net.h --- v2.4.19-pre5/include/linux/net.h Wed Apr 3 21:12:49 2002 +++ linux.diff/include/linux/net.h Tue Apr 30 17:56:55 2002 @@ -83,6 +83,9 @@ struct scm_cookie; struct vm_area_struct; struct page; +struct iocb; +struct kioctx; +#include /* shut gcc up */ struct proto_ops { int family; @@ -110,6 +113,8 @@ int (*recvmsg) (struct socket *sock, struct msghdr *m, int total_len, int flags, struct scm_cookie *scm); int (*mmap) (struct file *file, struct socket *sock, struct vm_area_struct * vma); ssize_t (*sendpage) (struct socket *sock, struct page *page, int offset, size_t size, int flags); + int (*kvec_read) (struct socket *sock, kvec_cb_t cb, size_t size); + int (*kvec_write) (struct socket *sock, kvec_cb_t cb, size_t size); }; struct net_proto_family diff -urN v2.4.19-pre5/include/linux/pagemap.h linux.diff/include/linux/pagemap.h --- v2.4.19-pre5/include/linux/pagemap.h Wed Apr 3 21:12:55 2002 +++ linux.diff/include/linux/pagemap.h Tue Apr 30 17:56:55 2002 @@ -88,6 +88,7 @@ extern void add_to_page_cache(struct page * page, struct address_space *mapping, unsigned long index); extern void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index); extern int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long index, struct page **hash); +extern wait_queue_head_t *FASTCALL(page_waitqueue(struct page *page)); extern void ___wait_on_page(struct page *); diff -urN v2.4.19-pre5/include/linux/pipe_fs_i.h linux.diff/include/linux/pipe_fs_i.h --- v2.4.19-pre5/include/linux/pipe_fs_i.h Thu May 3 11:22:20 2001 +++ linux.diff/include/linux/pipe_fs_i.h Tue Apr 30 17:29:31 2002 @@ -13,6 +13,8 @@ unsigned int waiting_writers; unsigned int r_counter; unsigned int w_counter; + struct file *rdfile; + struct file *wrfile; }; /* Differs from PIPE_BUF in 
that PIPE_SIZE is the length of the actual @@ -30,6 +32,8 @@ #define PIPE_WAITING_WRITERS(inode) ((inode).i_pipe->waiting_writers) #define PIPE_RCOUNTER(inode) ((inode).i_pipe->r_counter) #define PIPE_WCOUNTER(inode) ((inode).i_pipe->w_counter) +#define PIPE_READFILE(inode) ((inode).i_pipe->rdfile) +#define PIPE_WRITEFILE(inode) ((inode).i_pipe->wrfile) #define PIPE_EMPTY(inode) (PIPE_LEN(inode) == 0) #define PIPE_FULL(inode) (PIPE_LEN(inode) == PIPE_SIZE) diff -urN v2.4.19-pre5/include/linux/poll.h linux.diff/include/linux/poll.h --- v2.4.19-pre5/include/linux/poll.h Wed Apr 3 21:12:55 2002 +++ linux.diff/include/linux/poll.h Tue Apr 30 17:56:56 2002 @@ -9,12 +9,15 @@ #include #include #include +#include struct poll_table_page; +struct kiocb; typedef struct poll_table_struct { - int error; - struct poll_table_page * table; + int error; + struct poll_table_page *table; + struct kiocb *iocb; /* iocb for async poll */ } poll_table; extern void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p); @@ -29,8 +32,11 @@ { pt->error = 0; pt->table = NULL; + pt->iocb = NULL; } + extern void poll_freewait(poll_table* pt); +extern int async_poll(struct kiocb *iocb, int events); /* diff -urN v2.4.19-pre5/include/linux/sched.h linux.diff/include/linux/sched.h --- v2.4.19-pre5/include/linux/sched.h Wed Apr 3 21:12:55 2002 +++ linux.diff/include/linux/sched.h Tue Apr 30 17:56:55 2002 @@ -203,6 +203,7 @@ extern int max_map_count; +struct kioctx; struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ rb_root_t mm_rb; @@ -231,6 +232,10 @@ /* Architecture-specific MM context */ mm_context_t context; + + struct kioctx *ioctx_list; + unsigned long new_ioctx_id; + int vsys_mapped; }; extern int mmlist_nr; @@ -243,6 +248,7 @@ mm_count: ATOMIC_INIT(1), \ mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem), \ page_table_lock: SPIN_LOCK_UNLOCKED, \ + vsys_mapped: 0, \ mmlist: LIST_HEAD_INIT(name.mmlist), \ } @@ -794,6 +800,7 @@ extern void 
FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); +extern void FASTCALL(add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); #define __wait_event(wq, condition) \ diff -urN v2.4.19-pre5/include/linux/skbuff.h linux.diff/include/linux/skbuff.h --- v2.4.19-pre5/include/linux/skbuff.h Wed Apr 3 21:12:55 2002 +++ linux.diff/include/linux/skbuff.h Tue Apr 30 17:56:56 2002 @@ -1128,6 +1128,15 @@ extern unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len, unsigned int csum); extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); +/* skb <-> kvec helpers */ +extern void skb_copy_datagram_kvec(const struct sk_buff *skb, int offset, + struct kvec *vec, int len); +extern int skb_copy_and_csum_datagram_kvec(const struct sk_buff *skb, + int offset, struct kvec *vec, int len); +extern int skb_kvec_recv_datagram(struct sock * sk, kvec_cb_t cb, int len, + void (*finish)(struct sock *sk, kvec_cb_t cb, int len, struct sk_buff *skb)); + + extern void skb_init(void); extern void skb_add_mtu(int mtu); diff -urN v2.4.19-pre5/include/linux/sysctl.h linux.diff/include/linux/sysctl.h --- v2.4.19-pre5/include/linux/sysctl.h Wed Apr 3 21:12:49 2002 +++ linux.diff/include/linux/sysctl.h Tue Apr 30 17:56:56 2002 @@ -546,6 +546,13 @@ FS_LEASES=13, /* int: leases enabled */ FS_DIR_NOTIFY=14, /* int: directory notification enabled */ FS_LEASE_TIME=15, /* int: maximum time to wait for a lease break */ + /* 16 == jbd-debug */ + /* 17 == jbd-oom-retry */ + + FS_AIO_NR=18, /* int: current number of aio requests */ + FS_AIO_MAX_NR=19, /* int: max system wide aio requests */ + FS_AIO_MAX_SIZE=20, /* int: max size of read/write chunks */ + FS_AIO_MAX_PINNED=21, /* long: max memory pinned (in pages) */ }; /* CTL_DEBUG names: */ diff -urN 
v2.4.19-pre5/include/linux/tasklet.h linux.diff/include/linux/tasklet.h --- v2.4.19-pre5/include/linux/tasklet.h Wed Dec 31 19:00:00 1969 +++ linux.diff/include/linux/tasklet.h Wed Apr 10 17:06:48 2002 @@ -0,0 +1,154 @@ +#ifndef __LINUX__TASKLET_H +#define __LINUX__TASKLET_H + +#include +#include +#include +#include /* for smp_mb */ + +/* Tasklets --- multithreaded analogue of BHs. + + Main feature distinguishing them from generic softirqs: a tasklet + runs on only one CPU at a time. + + Main feature distinguishing them from BHs: different tasklets + may be run simultaneously on different CPUs. + + Properties: + * If tasklet_schedule() is called, then tasklet is guaranteed + to be executed on some cpu at least once after this. + * If the tasklet is already scheduled, but its execution is still not + started, it will be executed only once. + * If this tasklet is already running on another CPU (or schedule is called + from tasklet itself), it is rescheduled for later. + * Tasklet is strictly serialized wrt itself, but not + wrt other tasklets. If a client needs some intertask synchronization, + it must provide it with spinlocks.
+ */ + +struct tasklet_struct +{ + struct tasklet_struct *next; + unsigned long state; + atomic_t count; + void (*func)(unsigned long); + unsigned long data; + int *unlocked; +}; + +#define DECLARE_TASKLET(name, func, data) \ +struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(0), func, data, NULL } + +#define DECLARE_TASKLET_DISABLED(name, func, data) \ +struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data, NULL } + + +enum +{ + TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ + TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ +}; + +struct tasklet_head +{ + struct tasklet_struct *list; +} __attribute__ ((__aligned__(SMP_CACHE_BYTES))); + +extern struct tasklet_head tasklet_vec[NR_CPUS]; +extern struct tasklet_head tasklet_hi_vec[NR_CPUS]; + +#ifdef CONFIG_SMP +static inline int tasklet_trylock(struct tasklet_struct *t) +{ + return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); +} + +static inline void tasklet_unlock(struct tasklet_struct *t) +{ + smp_mb__before_clear_bit(); + clear_bit(TASKLET_STATE_RUN, &(t)->state); +} + +static inline void tasklet_unlock_self(struct tasklet_struct *t) +{ + *t->unlocked = 1; /* NOTE(review): dereferences t->unlocked, which DECLARE_TASKLET() initialises to NULL - the caller must point it at a valid int first */ + t->unlocked = NULL; + tasklet_unlock(t); +} + +static inline void tasklet_unlock_wait(struct tasklet_struct *t) +{ + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } +} +#else /* NOTE(review): no !CONFIG_SMP definition of tasklet_unlock_self() is provided in this branch */ +#define tasklet_trylock(t) 1 +#define tasklet_unlock_wait(t) do { } while (0) +#define tasklet_unlock(t) do { } while (0) +#endif + +extern void FASTCALL(__tasklet_schedule(struct tasklet_struct *t)); + +static inline void tasklet_schedule(struct tasklet_struct *t) +{ + if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) + __tasklet_schedule(t); +} + +extern void FASTCALL(__tasklet_hi_schedule(struct tasklet_struct *t)); + +static inline void tasklet_hi_schedule(struct tasklet_struct *t) +{ + if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) + __tasklet_hi_schedule(t); +} + + +static inline void
tasklet_disable_nosync(struct tasklet_struct *t) +{ + atomic_inc(&t->count); + smp_mb__after_atomic_inc(); +} + +static inline void tasklet_disable(struct tasklet_struct *t) +{ + tasklet_disable_nosync(t); + tasklet_unlock_wait(t); + smp_mb(); +} + +static inline void tasklet_enable(struct tasklet_struct *t) +{ + smp_mb__before_atomic_dec(); + atomic_dec(&t->count); +} + +static inline void tasklet_hi_enable(struct tasklet_struct *t) +{ /* same body as tasklet_enable() */ + smp_mb__before_atomic_dec(); + atomic_dec(&t->count); +} + +extern void tasklet_kill(struct tasklet_struct *t); +extern void tasklet_init(struct tasklet_struct *t, + void (*func)(unsigned long), unsigned long data); + +#ifdef CONFIG_SMP + +#define SMP_TIMER_NAME(name) name##__thr + +#define SMP_TIMER_DEFINE(name, task) \ +DECLARE_TASKLET(task, name##__thr, 0); \ +static void name (unsigned long dummy) \ +{ \ + tasklet_schedule(&(task)); \ +} + +#else /* CONFIG_SMP */ + +#define SMP_TIMER_NAME(name) name +#define SMP_TIMER_DEFINE(name, task) + +#endif /* CONFIG_SMP */ + + +#endif /* __LINUX__TASKLET_H */ diff -urN v2.4.19-pre5/include/linux/tqueue.h linux.diff/include/linux/tqueue.h --- v2.4.19-pre5/include/linux/tqueue.h Wed Apr 3 21:12:49 2002 +++ linux.diff/include/linux/tqueue.h Tue Apr 30 17:56:55 2002 @@ -67,6 +67,7 @@ #define TQ_ACTIVE(q) (!list_empty(&q)) extern task_queue tq_timer, tq_immediate, tq_disk; +extern struct tq_struct run_disk_tq; /* * To implement your own list of active bottom halfs, use the following diff -urN v2.4.19-pre5/include/linux/types.h linux.diff/include/linux/types.h --- v2.4.19-pre5/include/linux/types.h Wed Apr 3 21:10:29 2002 +++ linux.diff/include/linux/types.h Tue Apr 2 19:14:27 2002 @@ -127,4 +127,9 @@ char f_fpack[6]; }; +/* kernel typedefs -- they belong here.
*/ +#ifdef __KERNEL__ +typedef struct kvec_cb kvec_cb_t; +#endif /* __KERNEL__ */ + #endif /* _LINUX_TYPES_H */ diff -urN v2.4.19-pre5/include/linux/vsyscall.h linux.diff/include/linux/vsyscall.h --- v2.4.19-pre5/include/linux/vsyscall.h Wed Dec 31 19:00:00 1969 +++ linux.diff/include/linux/vsyscall.h Tue Apr 2 18:56:58 2002 @@ -0,0 +1,20 @@ +#ifndef _LINUX__VSYSCALL_H +#define _LINUX__VSYSCALL_H + +struct vsys_cpudata { + unsigned long context_switches; + unsigned long tv_sec; + unsigned long tsc_low, tsc_high; + unsigned long cycles_per_sec; +}; + +union vsys_union { + struct vsys_cpudata data; + char pad[128]; +}; + +extern union vsys_union vsys_cpudata[256] __attribute__((section(".data.vsyscall"))); + +#define vsys_data(cpu) (&vsys_cpudata[cpu].data) + +#endif /*ndef _LINUX__VSYSCALL_H*/ diff -urN v2.4.19-pre5/include/linux/wait.h linux.diff/include/linux/wait.h --- v2.4.19-pre5/include/linux/wait.h Wed Apr 3 21:12:49 2002 +++ linux.diff/include/linux/wait.h Tue Apr 30 17:56:55 2002 @@ -28,17 +28,20 @@ #define WAITQUEUE_DEBUG 0 #endif +typedef struct __wait_queue wait_queue_t; +typedef void (*wait_queue_func_t)(wait_qu