From: Chris Mason We need aio poll for sles8 compatibility, so I whipped up a quick and dirty 2.6 aio poll patch. Eventually I'll probably add some simple socket support to aio-stress, but I'm hoping to validate the code a little now before putting it into our CVS. I've attached the patch, it's not quite as obvious as the pipe code, but not too bad. I'm not sure if I'm using struct kiocb->private the way it was intended, but I don't see any other code touching it, so it should be ok. On Mon 2004-02-23 at 14:05, Suparna Bhattacharya wrote: > I was wondering if a particular fop->poll routine, could possibly > invoke __pollwait for more than one wait queue (I don't know if such > a case even exists). That kind of a thing would work OK with the existing > poll logic, but not in our case, because we'd end up queueing the same > wait queue on two queues which would be a problem. Oh, I see what you mean. I looked at a few of the poll_wait callers, and it seems safe, but there are too many for a full audit right now. The attached patch fixes the page allocation problem and adds a check to make sure we don't abuse current->io_wait. An oops is better than random corruption at least. I ran it through my basic test and pipetest, the pipetest results are below. The pipetest epoll usage needs updating, so I can only compare against regular poll. ./pipetest --aio-poll 10000 1 5 using 10000 pipe pairs, 1 message threads, 5 generations, 12 bufsize Ok! Mode aio-poll: 5 passes in 0.000073 seconds passes_per_sec: 68493.15 coffee:/usr/src/aio # ./pipetest 10000 1 5 using 10000 pipe pairs, 1 message threads, 5 generations, 12 bufsize Ok! Mode poll: 5 passes in 0.083066 seconds passes_per_sec: 60.19 Here are some optimizations. aio-poll-3 avoids wake_up when it can use finish_wait instead, and adds a fast path to aio-poll for when data is already available. 
Index: linux.lkcd/fs/aio.c
diff -upN reference/fs/aio.c current/fs/aio.c
--- reference/fs/aio.c	2004-04-29 10:39:25.000000000 -0700
+++ current/fs/aio.c	2004-04-29 10:39:25.000000000 -0700
@@ -1387,6 +1387,16 @@ static ssize_t aio_fsync(struct kiocb *i
 }
 
 /*
+ * Retry method for aio_poll (also used for first time submit)
+ * Responsible for updating iocb state as retries progress
+ */
+static ssize_t aio_poll(struct kiocb *iocb)
+{
+	unsigned events = (unsigned)(iocb->ki_buf);
+	return generic_aio_poll(iocb, events);
+}
+
+/*
  * aio_setup_iocb:
  *	Performs the initial checks and aio retry method
  *	setup for the kiocb at the time of io submission.
@@ -1431,6 +1441,13 @@ ssize_t aio_setup_iocb(struct kiocb *kio
 		if (file->f_op->aio_fsync)
 			kiocb->ki_retry = aio_fsync;
 		break;
+	case IOCB_CMD_POLL:
+		ret = -EINVAL;
+		if (file->f_op->poll) {
+			memset(kiocb->private, 0, sizeof(kiocb->private));
+			kiocb->ki_retry = aio_poll;
+		}
+		break;
 	default:
 		dprintk("EINVAL: io_submit: no operation provided\n");
 		ret = -EINVAL;
diff -upN reference/fs/select.c current/fs/select.c
--- reference/fs/select.c	2003-10-14 15:50:30.000000000 -0700
+++ current/fs/select.c	2004-04-29 10:39:25.000000000 -0700
@@ -21,6 +21,7 @@
 #include <linux/personality.h> /* for STICKY_TIMEOUTS */
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/aio.h>
 
 #include <asm/uaccess.h>
@@ -39,6 +40,12 @@ struct poll_table_page {
 	struct poll_table_entry entries[0];
 };
 
+struct aio_poll_table {
+	int init;
+	struct poll_wqueues wq;
+	struct poll_table_page table;
+};
+
 #define POLL_TABLE_FULL(table) \
 	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
@@ -109,12 +116,34 @@ void __pollwait(struct file *filp, wait_
 	/* Add a new entry */
 	{
 		struct poll_table_entry * entry = table->entry;
+		wait_queue_t *wait;
+		wait_queue_t *aio_wait = current->io_wait;
+
+		if (aio_wait) {
+			/* for aio, there can only be one wait_address.
+			 * we might be adding it again via a retry call
+			 * if so, just return.
+			 * if not, bad things are happening
+			 */
+			if (table->entry != table->entries) {
+				if (table->entries[0].wait_address != wait_address)
+					BUG();
+				return;
+			}
+		}
+
 		table->entry = entry+1;
 		get_file(filp);
 		entry->filp = filp;
		entry->wait_address = wait_address;
 		init_waitqueue_entry(&entry->wait, current);
-		add_wait_queue(wait_address,&entry->wait);
+
+		/* if we're in aioland, use current->io_wait */
+		if (aio_wait)
+			wait = aio_wait;
+		else
+			wait = &entry->wait;
+		add_wait_queue(wait_address,wait);
 	}
 }
@@ -533,3 +562,76 @@ out_fds:
 	poll_freewait(&table);
 	return err;
 }
+
+static void aio_poll_freewait(struct aio_poll_table *ap, struct kiocb *iocb)
+{
+	struct poll_table_page * p = ap->wq.table;
+	if (p) {
+		struct poll_table_entry * entry = p->entry;
+		if (entry > p->entries) {
+			/*
+			 * there is only one entry for aio polls
+			 */
+			entry = p->entries;
+			if (iocb)
+				finish_wait(entry->wait_address,&iocb->ki_wait);
+			else
+				wake_up(entry->wait_address);
+			fput(entry->filp);
+		}
+	}
+	ap->init = 0;
+}
+
+static int
+aio_poll_cancel(struct kiocb *iocb, struct io_event *evt)
+{
+	struct aio_poll_table *aio_table;
+	aio_table = (struct aio_poll_table *)iocb->private;
+
+	evt->obj = (u64)(unsigned long)iocb->ki_user_obj;
+	evt->data = iocb->ki_user_data;
+	evt->res = iocb->ki_nbytes - iocb->ki_left;
+	if (evt->res == 0)
+		evt->res = -EINTR;
+	evt->res2 = 0;
+	if (aio_table->init)
+		aio_poll_freewait(aio_table, NULL);
+	aio_put_req(iocb);
+	return 0;
+}
+
+ssize_t generic_aio_poll(struct kiocb *iocb, unsigned events)
+{
+	struct aio_poll_table *aio_table;
+	unsigned mask;
+	struct file *file = iocb->ki_filp;
+	aio_table = (struct aio_poll_table *)iocb->private;
+
+	/* fast path */
+	mask = file->f_op->poll(file, NULL);
+	mask &= events | POLLERR | POLLHUP;
+	if (mask)
+		return mask;
+
+	if ((sizeof(*aio_table) + sizeof(struct poll_table_entry)) >
+	    sizeof(iocb->private))
+		BUG();
+
+	if (!aio_table->init) {
+		aio_table->init = 1;
+		poll_initwait(&aio_table->wq);
+		aio_table->wq.table = &aio_table->table;
+		aio_table->table.next = NULL;
+		aio_table->table.entry = aio_table->table.entries;
+	}
+	iocb->ki_cancel = aio_poll_cancel;
+
+	mask = file->f_op->poll(file, &aio_table->wq.pt);
+	mask &= events | POLLERR | POLLHUP;
+	if (mask) {
+		aio_poll_freewait(aio_table, iocb);
+		return mask;
+	}
+	return -EIOCBRETRY;
+}
diff -upN reference/include/linux/aio.h current/include/linux/aio.h
--- reference/include/linux/aio.h	2004-04-29 10:39:24.000000000 -0700
+++ current/include/linux/aio.h	2004-04-29 10:39:25.000000000 -0700
@@ -198,4 +198,5 @@ static inline struct kiocb *list_kiocb(s
 extern atomic_t aio_nr;
 extern unsigned aio_max_nr;
 
+extern ssize_t generic_aio_poll(struct kiocb *, unsigned);
 #endif /* __LINUX__AIO_H */
diff -upN reference/include/linux/aio_abi.h current/include/linux/aio_abi.h
--- reference/include/linux/aio_abi.h	2002-12-09 18:45:44.000000000 -0800
+++ current/include/linux/aio_abi.h	2004-04-29 10:39:25.000000000 -0700
@@ -38,8 +38,8 @@ enum {
 	IOCB_CMD_FDSYNC = 3,
 	/* These two are experimental.
 	 * IOCB_CMD_PREADX = 4,
-	 * IOCB_CMD_POLL = 5,
 	 */
+	IOCB_CMD_POLL = 5,
 	IOCB_CMD_NOOP = 6,
 };