diff -urNp --exclude CVS --exclude BitKeeper x-ref/fs/aio.c x/fs/aio.c
--- x-ref/fs/aio.c	2003-08-31 03:54:47.000000000 +0200
+++ x/fs/aio.c	2003-08-31 03:56:22.000000000 +0200
@@ -902,6 +902,19 @@ asmlinkage long sys_io_destroy(aio_conte
 	return -EINVAL;
 }
 
+ssize_t generic_aio_poll(struct file *file, struct kiocb *req, struct iocb *iocb)
+{
+	unsigned events = iocb->aio_buf;
+
+	/* Did the user set any bits they weren't supposed to?  (The
+	 * assignment above is actually a truncating cast.)
+	 */
+	if (unlikely(events != iocb->aio_buf))
+		return -EINVAL;
+
+	return async_poll(req, events);
+}
+
 static inline int io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
 				struct iocb *iocb)
 {
@@ -978,6 +991,9 @@ static inline int io_submit_one(struct k
 	case IOCB_CMD_FSYNC:
 		op = file->f_op->aio_fsync;
 		break;
+	case IOCB_CMD_POLL:
+		op = generic_aio_poll;
+		break;
 	default:
 		dprintk("EINVAL: io_submit: no operation %d provided by aio\n",
 			iocb->aio_lio_opcode);
diff -urNp --exclude CVS --exclude BitKeeper x-ref/fs/select.c x/fs/select.c
--- x-ref/fs/select.c	2003-08-31 03:54:49.000000000 +0200
+++ x/fs/select.c	2003-08-31 03:55:44.000000000 +0200
@@ -12,6 +12,12 @@
  * 24 January 2000
  *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
  *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
+ * June 2001
+ *     Added async_poll implementation. -bcrl
+ * Nov 2001
+ *     Async poll improvements from Suparna Bhattacharya
+ * April 2002
+ *     SMP-safe async poll plus cancellation. -bcrl
  */
 
 #include <linux/slab.h>
@@ -20,6 +26,8 @@
 #include <linux/smp_lock.h>
 #include <linux/poll.h>
 #include <linux/personality.h> /* for STICKY_TIMEOUTS */
 #include <linux/file.h>
+#include <linux/aio.h>
+#include <linux/init.h>
 
 #include <asm/uaccess.h>
@@ -27,19 +35,36 @@
 #define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
 
 struct poll_table_entry {
-	struct file * filp;
 	wait_queue_t wait;
 	wait_queue_head_t * wait_address;
+	struct file * filp;
+	struct poll_wqueues * p;
};
 
 struct poll_table_page {
+	unsigned long size;
 	struct poll_table_page * next;
 	struct poll_table_entry * entry;
 	struct poll_table_entry entries[0];
 };
 
 #define POLL_TABLE_FULL(table) \
-	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
+	((unsigned long)((table)->entry+1) > \
+	 (table)->size + (unsigned long)(table))
+
+/* async poll uses only one entry per poll table as it is linked to an iocb */
+typedef struct async_poll_table_struct {
+	struct poll_wqueues	pwq;
+	struct worktodo		wtd;
+	int			events;		/* event mask for async poll */
+	int			wake;
+	long			sync;
+	struct poll_table_page	pt_page;	/* one poll table page hdr */
+	struct poll_table_entry	entries[1];	/* space for a single entry */
+} async_poll_table;
+
+
+static kmem_cache_t *async_poll_table_cache;
 
 /*
  * Ok, Peter made a complicated, but straightforward multiple_wait() function.
@@ -60,9 +85,10 @@ void poll_initwait(struct poll_wqueues *
 	init_poll_funcptr(&pwq->pt, __pollwait);
 	pwq->error = 0;
 	pwq->table = NULL;
+	pwq->iocb = NULL;
 }
 
-void poll_freewait(struct poll_wqueues *pwq)
+void __poll_freewait(struct poll_wqueues *pwq, wait_queue_t *wait)
 {
 	struct poll_table_page * p = pwq->table;
 	while (p) {
@@ -70,15 +96,142 @@ void poll_freewait(struct poll_wqueues *
 		struct poll_table_page *old;
 
 		entry = p->entry;
+		if (entry == p->entries) /* may happen with async poll */
+			break;
 		do {
 			entry--;
-			remove_wait_queue(entry->wait_address,&entry->wait);
-			fput(entry->filp);
+			if (wait != &entry->wait)
+				remove_wait_queue(entry->wait_address,&entry->wait);
+			else
+				__remove_wait_queue(entry->wait_address,&entry->wait);
+			fput(entry->filp);
 		} while (entry > p->entries);
 		old = p;
 		p = p->next;
-		free_page((unsigned long) old);
+		if (old->size == PAGE_SIZE)
+			free_page((unsigned long) old);
 	}
+	if (pwq->iocb)
+		kmem_cache_free(async_poll_table_cache, pwq);
+}
+
+void poll_freewait(struct poll_wqueues* pwq)
+{
+	__poll_freewait(pwq, NULL);
+}
+
+void async_poll_complete(void *data)
+{
+	async_poll_table *pasync = data;
+	struct poll_wqueues *p = data;
+	struct kiocb *iocb = p->iocb;
+	unsigned int mask;
+	poll_table *wait = &p->pt;
+
+	pasync->wake = 0;
+	wmb();
+	do {
+		mask = iocb->filp->f_op->poll(iocb->filp, wait);
+		mask &= pasync->events | POLLERR | POLLHUP;
+		if (mask) {
+			struct poll_wqueues *p2 = xchg(&iocb->data, NULL);
+			if (p2) {
+				poll_freewait(p2);
+				aio_complete(iocb, mask, 0);
+			}
+			return;
+		}
+		pasync->sync = 0;
+		wmb();
+	} while (pasync->wake);
+}
+
+static void async_poll_waiter(wait_queue_t *wait,
+			      int thrway1, int thrway2)
+{
+	struct poll_table_entry *entry = (struct poll_table_entry *)wait;
+	async_poll_table *pasync = (async_poll_table *)(entry->p);
+	struct kiocb *iocb = pasync->pwq.iocb;
+	unsigned int mask;
+
+	mask = iocb->filp->f_op->poll(iocb->filp, NULL);
+	mask &= pasync->events | POLLERR | POLLHUP;
+	if (mask) {
+		struct poll_wqueues *p2 = xchg(&iocb->data, NULL);
+		if (p2) {
+			__poll_freewait(p2, wait);
+			aio_complete(iocb, mask, 0);
+		}
+		return;
+	}
+}
+
+int async_poll_cancel(struct kiocb *iocb, struct io_event *res)
+{
+	struct poll_wqueues *p;
+
+	p = xchg(&iocb->data, NULL);
+	aio_put_req(iocb);
+	if (p) {
+		poll_freewait(p);
+		/*
+		 * Since poll_freewait() locks the wait queue, we know that
+		 * async_poll_waiter() is either not going to be run or has
+		 * finished all its work.
+		 */
+		aio_put_req(iocb);
+		return 0;
+	}
+	return -EAGAIN;
+}
+
+int async_poll(struct kiocb *iocb, int events)
+{
+	unsigned int mask;
+	async_poll_table *pasync;
+	struct poll_wqueues *p;
+	poll_table *wait;
+
+	/* Fast path */
+	if (iocb->filp->f_op && iocb->filp->f_op->poll) {
+		mask = iocb->filp->f_op->poll(iocb->filp, NULL);
+		mask &= events | POLLERR | POLLHUP;
+		if (mask & events)
+			return mask;
+	}
+
+	pasync = kmem_cache_alloc(async_poll_table_cache, SLAB_KERNEL);
+	if (!pasync)
+		return -ENOMEM;
+
+	p = (struct poll_wqueues *)pasync;
+	poll_initwait(p);
+	wtd_set_action(&pasync->wtd, async_poll_complete, pasync);
+	p->iocb = iocb;
+	pasync->wake = 0;
+	pasync->sync = 0;
+	pasync->events = events;
+	pasync->pt_page.entry = pasync->pt_page.entries;
+	pasync->pt_page.size = sizeof(pasync->pt_page) + sizeof(pasync->entries);
+	p->table = &pasync->pt_page;
+
+	iocb->data = p;
+	iocb->users++;
+	wmb();
+
+	mask = DEFAULT_POLLMASK;
+	wait = &p->pt;
+	if (iocb->filp->f_op && iocb->filp->f_op->poll)
+		mask = iocb->filp->f_op->poll(iocb->filp, wait);
+	mask &= events | POLLERR | POLLHUP;
+	if (mask && xchg(&iocb->data, NULL)) {
+		poll_freewait(p);
+		aio_complete(iocb, mask, 0);
+	}
+
+	iocb->cancel = async_poll_cancel;
+	aio_put_req(iocb);
+	return 0;
 }
 
 void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *_p)
@@ -95,6 +248,7 @@ void __pollwait(struct file *filp, wait_
 			__set_current_state(TASK_RUNNING);
 			return;
 		}
+		new_table->size = PAGE_SIZE;
 		new_table->entry = new_table->entries;
 		new_table->next = table;
 		p->table = new_table;
@@ -108,7 +262,11 @@ void __pollwait(struct file *filp, wait_
 		get_file(filp);
 		entry->filp = filp;
 		entry->wait_address = wait_address;
-		init_waitqueue_entry(&entry->wait, current);
+		entry->p = p;
+		if (p->iocb) /* async poll */
+			init_waitqueue_func_entry(&entry->wait, async_poll_waiter);
+		else
+			init_waitqueue_entry(&entry->wait, current);
 		add_wait_queue(wait_address,&entry->wait);
 		smp_mb();
 	}
@@ -507,3 +665,14 @@ out:
 	poll_freewait(&table);
 	return err;
 }
+
+static int __init async_poll_init(void)
+{
+	async_poll_table_cache = kmem_cache_create("async poll table",
+			sizeof(async_poll_table), 0, 0, NULL, NULL);
+	if (!async_poll_table_cache)
+		panic("unable to alloc poll_table_cache");
+	return 0;
+}
+
+module_init(async_poll_init);
diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/aio_abi.h x/include/linux/aio_abi.h
--- x-ref/include/linux/aio_abi.h	2003-08-31 03:54:47.000000000 +0200
+++ x/include/linux/aio_abi.h	2003-08-31 03:55:44.000000000 +0200
@@ -36,10 +36,11 @@ enum {
 	IOCB_CMD_PWRITE = 1,
 	IOCB_CMD_FSYNC = 2,
 	IOCB_CMD_FDSYNC = 3,
-	/* These two are experimental.
+	/*
+	 * Experimental:
 	 * IOCB_CMD_PREADX = 4,
-	 * IOCB_CMD_POLL = 5,
 	 */
+	IOCB_CMD_POLL = 5,
 	IOCB_CMD_NOOP = 6,
 };
diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/fs.h x/include/linux/fs.h
--- x-ref/include/linux/fs.h	2003-08-31 03:54:49.000000000 +0200
+++ x/include/linux/fs.h	2003-08-31 03:55:44.000000000 +0200
@@ -950,6 +950,10 @@ struct block_device_operations {
  * read, write, poll, fsync, readv, writev can be called
  *   without the big kernel lock held in all filesystems.
  */
+
+#define F_ATOMIC	0x0001
+#define F_OFFSETOK	0x0002
+
 struct file_operations {
 	struct module *owner;
 	loff_t (*llseek) (struct file *, loff_t, int);
diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/pipe_fs_i.h x/include/linux/pipe_fs_i.h
--- x-ref/include/linux/pipe_fs_i.h	2003-03-15 03:25:14.000000000 +0100
+++ x/include/linux/pipe_fs_i.h	2003-08-31 03:55:44.000000000 +0200
@@ -13,6 +13,9 @@ struct pipe_inode_info {
 	unsigned int waiting_writers;
 	unsigned int r_counter;
 	unsigned int w_counter;
+	spinlock_t pipe_aio_lock;
+	struct list_head read_iocb_list;
+	struct list_head write_iocb_list;
 };
 
 /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/poll.h x/include/linux/poll.h
--- x-ref/include/linux/poll.h	2003-08-31 03:54:49.000000000 +0200
+++ x/include/linux/poll.h	2003-08-31 03:55:44.000000000 +0200
@@ -9,8 +9,10 @@
 #include <linux/wait.h>
 #include <linux/string.h>
 #include <linux/mm.h>
+#include <linux/worktodo.h>
 
 struct poll_table_struct;
+struct kiocb;
 
 /*
  * structures and helpers for f_op->poll implementations
@@ -39,10 +41,12 @@ struct poll_wqueues {
 	poll_table pt;
 	struct poll_table_page * table;
 	int error;
+	struct kiocb *iocb;		/* iocb for async poll */
 };
 
 extern void poll_initwait(struct poll_wqueues *pwq);
 extern void poll_freewait(struct poll_wqueues *pwq);
+extern int async_poll(struct kiocb *iocb, int events);
 
 /*
  * Scaleable version of the fd_set.
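
For testing, here is a minimal userspace sketch of the new opcode (not part
of the patch).  It assumes a kernel with this patch applied and an
architecture where the __NR_io_setup/__NR_io_submit/__NR_io_getevents
numbers are wired up; the io_setup/io_submit/io_getevents wrappers below
are local helpers, not libaio.  Per generic_aio_poll() above, the poll
event mask rides in iocb->aio_buf, and the ready mask comes back in
io_event.res via aio_complete().

/*
 * aio-poll-test.c -- one-shot IOCB_CMD_POLL on a pipe.
 * Build: gcc -Wall -o aio-poll-test aio-poll-test.c
 */
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

/* Raw syscall wrappers; glibc does not expose these. */
static long io_setup(unsigned nr, aio_context_t *ctxp)
{
	return syscall(__NR_io_setup, nr, ctxp);
}

static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
{
	return syscall(__NR_io_submit, ctx, nr, iocbpp);
}

static long io_getevents(aio_context_t ctx, long min_nr, long nr,
			 struct io_event *events, struct timespec *timeout)
{
	return syscall(__NR_io_getevents, ctx, min_nr, nr, events, timeout);
}

int main(void)
{
	aio_context_t ctx = 0;
	struct iocb cb;
	struct iocb *cbs[1] = { &cb };
	struct io_event ev;
	int fds[2];

	if (pipe(fds) || io_setup(1, &ctx) < 0) {
		perror("setup");
		return 1;
	}

	/* Poll for readability on the pipe's read end.  The event mask
	 * is passed in aio_buf, which generic_aio_poll() reads back out
	 * of the iocb. */
	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_POLL;
	cb.aio_fildes = fds[0];
	cb.aio_buf = POLLIN;

	if (io_submit(ctx, 1, cbs) != 1) {
		perror("io_submit");
		return 1;
	}

	/* Make the fd readable; async_poll_waiter() fires off the wait
	 * queue and completes the iocb. */
	if (write(fds[1], "x", 1) != 1) {
		perror("write");
		return 1;
	}

	if (io_getevents(ctx, 1, 1, &ev, NULL) != 1) {
		perror("io_getevents");
		return 1;
	}

	/* res carries the revents mask passed to aio_complete(). */
	printf("revents = 0x%llx\n", (unsigned long long)ev.res);
	return 0;
}

Note that the poll is one-shot: each completion consumes the iocb, so a
caller that wants to keep watching the fd submits a fresh IOCB_CMD_POLL
after every event.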