From: Nick Piggin

Anyway, this last patch should be stable now; just fixed a leak in
as_may_queue that looks like it's been there for a while.

 drivers/block/as-iosched.c |  254 ++++++++++++++++-----------------------------
 drivers/block/ll_rw_blk.c  |   88 +++++++++++++++
 include/linux/blkdev.h     |   44 +++++++
 include/linux/sched.h      |    6 -
 kernel/exit.c              |    4 
 kernel/fork.c              |    2 
 6 files changed, 233 insertions(+), 165 deletions(-)

diff -puN drivers/block/as-iosched.c~generic-io-contexts drivers/block/as-iosched.c
--- 25/drivers/block/as-iosched.c~generic-io-contexts	2003-06-18 00:17:21.000000000 -0700
+++ 25-akpm/drivers/block/as-iosched.c	2003-06-18 00:17:21.000000000 -0700
@@ -59,14 +59,6 @@
  */
 #define default_antic_expire ((HZ / 150) ? HZ / 150 : 1)
 
-/*
- * This is the per-process anticipatory I/O scheduler state. It is refcounted
- * and kmalloc'ed.
- *
- * There is no locking protecting the contents of this structure! Pointers
- * to a single as_io_context may appear in multiple queues at once.
- */
-
 /*
  * Keep track of up to 20ms thinktimes. We can go as big as we like here,
  * however huge values tend to interfere and not decay fast enough. A program
@@ -82,28 +74,6 @@ enum as_io_states {
 	AS_TASK_IORUNNING,	/* Process has completed some IO */
 };
 
-struct as_io_context {
-	atomic_t refcount;
-	pid_t pid;
-	unsigned long state;
-	atomic_t nr_queued; /* queued reads & sync writes */
-	atomic_t nr_dispatched; /* number of requests gone to the drivers */
-
-	spinlock_t lock;
-
-	/* IO History tracking */
-	/* Thinktime */
-	unsigned long last_end_request;
-	unsigned long ttime_total;
-	unsigned long ttime_samples;
-	unsigned long ttime_mean;
-	/* Layout pattern */
-	long seek_samples;
-	sector_t last_request_pos;
-	sector_t seek_total;
-	sector_t seek_mean;
-};
-
 enum anticipation_status {
 	ANTIC_OFF=0,		/* Not anticipating (normal operation) */
 	ANTIC_WAIT_REQ,		/* The last read has not yet completed */
@@ -144,8 +114,8 @@ struct as_data {
 	unsigned long antic_start;	/* jiffies: when it started */
 	struct timer_list antic_timer;	/* anticipatory scheduling timer */
 	struct work_struct antic_work;	/* Deferred unplugging */
-	struct as_io_context *as_io_context;/* Identify the expected process */
-	int aic_finished; /* IO associated with as_io_context finished */
+	struct io_context *io_context;	/* Identify the expected process */
+	int ioc_finished;	/* IO associated with io_context is finished */
 
 	int nr_dispatched;
 
 	/*
@@ -178,7 +148,7 @@ struct as_rq {
 	struct request *request;
 
-	struct as_io_context *as_io_context;	/* The submitting task */
+	struct io_context *io_context;	/* The submitting task */
 
 	/*
 	 * request hash, key is the ending offset (for back merge lookup)
 	 */
@@ -206,99 +176,55 @@ static kmem_cache_t *arq_pool;
 
 /* Debug */
 static atomic_t nr_as_io_requests = ATOMIC_INIT(0);
 
-static void put_as_io_context(struct as_io_context **paic)
+/* Called to deallocate the as_io_context */
+static void free_as_io_context(struct as_io_context *aic)
 {
-	struct as_io_context *aic = *paic;
-
-	if (aic == NULL)
-		return;
-
-	BUG_ON(atomic_read(&aic->refcount) == 0);
-
-	if (atomic_dec_and_test(&aic->refcount)) {
-		WARN_ON(atomic_read(&nr_as_io_requests) == 0);
-		atomic_dec(&nr_as_io_requests);
-		kfree(aic);
-	}
+	atomic_dec(&nr_as_io_requests);
+	kfree(aic);
 }
 
-/* Called by the exitting task */
-void exit_as_io_context(void)
+/* Called when the task exits */
+static void exit_as_io_context(struct as_io_context *aic)
 {
-	unsigned long flags;
-	struct as_io_context *aic;
-
-	local_irq_save(flags);
-	aic = current->as_io_context;
-	if (aic) {
-		clear_bit(AS_TASK_RUNNING, &aic->state);
-		put_as_io_context(&aic);
-		current->as_io_context = NULL;
-	}
-	local_irq_restore(flags);
+	clear_bit(AS_TASK_RUNNING, &aic->state);
}
 
-/*
- * If the current task has no IO context then create one and initialise it.
- * If it does have a context, take a ref on it.
- *
- * This is always called in the context of the task which submitted the I/O.
- * But weird things happen, so we disable local interrupts to ensure exclusive
- * access to *current.
- */
-static struct as_io_context *get_as_io_context(void)
+static struct as_io_context *alloc_as_io_context(void)
 {
-	struct task_struct *tsk = current;
-	unsigned long flags;
 	struct as_io_context *ret;
 
-	local_irq_save(flags);
-	ret = tsk->as_io_context;
-	if (ret == NULL) {
-		ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
-		if (ret) {
-			atomic_inc(&nr_as_io_requests);
-			atomic_set(&ret->refcount, 1);
-			ret->pid = tsk->pid;
-			ret->state = 1 << AS_TASK_RUNNING;
-			atomic_set(&ret->nr_queued, 0);
-			atomic_set(&ret->nr_dispatched, 0);
-			spin_lock_init(&ret->lock);
-			ret->ttime_total = 0;
-			ret->ttime_samples = 0;
-			ret->ttime_mean = 0;
-			ret->seek_total = 0;
-			ret->seek_samples = 0;
-			ret->seek_mean = 0;
-			tsk->as_io_context = ret;
-		}
+	ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
+	if (ret) {
+		atomic_inc(&nr_as_io_requests);
+		ret->dtor = free_as_io_context;
+		ret->exit = exit_as_io_context;
+		ret->state = 1 << AS_TASK_RUNNING;
+		atomic_set(&ret->nr_queued, 0);
+		atomic_set(&ret->nr_dispatched, 0);
+		spin_lock_init(&ret->lock);
+		ret->ttime_total = 0;
+		ret->ttime_samples = 0;
+		ret->ttime_mean = 0;
+		ret->seek_total = 0;
+		ret->seek_samples = 0;
+		ret->seek_mean = 0;
 	}
-	local_irq_restore(flags);
-	atomic_inc(&ret->refcount);
+
 	return ret;
 }
 
-static void
-copy_as_io_context(struct as_io_context **pdst, struct as_io_context **psrc)
+/*
+ * If the current task has no AS IO context then create one and initialise it.
+ * Then take a ref on the task's io context and return it.
+ */
+static struct io_context *as_get_io_context(void)
 {
-	struct as_io_context *src = *psrc;
-
-	if (src) {
-		BUG_ON(atomic_read(&src->refcount) == 0);
-		atomic_inc(&src->refcount);
-		put_as_io_context(pdst);
-		*pdst = src;
-	}
+	struct io_context *ioc = get_io_context();
+	if (ioc && !ioc->aic)
+		ioc->aic = alloc_as_io_context();
+	return ioc;
 }
 
-static void
-swap_as_io_context(struct as_io_context **aic1, struct as_io_context **aic2)
-{
-	struct as_io_context *temp;
-	temp = *aic1;
-	*aic1 = *aic2;
-	*aic2 = temp;
-}
 
 /*
  * the back merge hash support functions
 */
@@ -662,7 +588,7 @@ static void as_antic_waitreq(struct as_d
 {
 	BUG_ON(ad->antic_status == ANTIC_FINISHED);
 	if (ad->antic_status == ANTIC_OFF) {
-		if (!ad->as_io_context || ad->aic_finished)
+		if (!ad->io_context || ad->ioc_finished)
 			as_antic_waitnext(ad);
 		else
 			ad->antic_status = ANTIC_WAIT_REQ;
@@ -715,7 +641,7 @@ static int as_close_req(struct as_data *
 	sector_t next = arq->request->sector;
 	sector_t delta;	/* acceptable close offset (in sectors) */
 
-	if (ad->antic_status == ANTIC_OFF || !ad->aic_finished)
+	if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished)
 		delay = 0;
 	else
 		delay = ((jiffies - ad->antic_start) * 1000) / HZ;
@@ -745,6 +671,7 @@ static int as_close_req(struct as_data *
  */
 static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq)
 {
+	struct io_context *ioc;
 	struct as_io_context *aic;
 
 	if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, arq)) {
@@ -752,7 +679,7 @@ static int as_can_break_anticipation(str
 		return 1;
 	}
 
-	if (ad->aic_finished && as_antic_expired(ad)) {
+	if (ad->ioc_finished && as_antic_expired(ad)) {
 		/*
 		 * In this situation status should really be FINISHED,
 		 * however the timer hasn't had the chance to run yet.
@@ -760,13 +687,17 @@ static int as_can_break_anticipation(str
 		return 1;
 	}
 
-	aic = ad->as_io_context;
-	BUG_ON(!aic);
+	ioc = ad->io_context;
+	BUG_ON(!ioc);
 
-	if (arq && aic == arq->as_io_context) {
+	if (arq && ioc == arq->io_context) {
 		/* request from same process */
 		return 1;
 	}
+
+	aic = ioc->aic;
+	if (!aic)
+		return 0;
 
 	if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
 		/* process anticipated on has exitted */
@@ -810,7 +741,7 @@ static int as_can_break_anticipation(str
  */
 static int as_can_anticipate(struct as_data *ad, struct as_rq *arq)
 {
-	if (!ad->as_io_context)
+	if (!ad->io_context)
 		/*
 		 * Last request submitted was a write
 		 */
@@ -973,12 +904,10 @@ static void as_completed_request(request
 {
 	struct as_data *ad = q->elevator.elevator_data;
 	struct as_rq *arq = RQ_DATA(rq);
-	struct as_io_context *aic = arq->as_io_context;
+	struct as_io_context *aic;
 
-	if (unlikely(!blk_fs_request(rq))) {
-		WARN_ON(aic);
+	if (unlikely(!blk_fs_request(rq)))
 		return;
-	}
 
 	WARN_ON(blk_fs_request(rq) && arq->state == AS_RQ_NEW);
@@ -1004,18 +933,12 @@ static void as_completed_request(request
 		ad->changed_batch = 0;
 	}
 
-	if (!aic)
+	if (!arq->io_context)
 		return;
 
-	spin_lock(&aic->lock);
-	if (arq->is_sync == REQ_SYNC) {
-		set_bit(AS_TASK_IORUNNING, &aic->state);
-		aic->last_end_request = jiffies;
-	}
-
-	if (ad->as_io_context == aic) {
+	if (ad->io_context == arq->io_context) {
 		ad->antic_start = jiffies;
-		ad->aic_finished = 1;
+		ad->ioc_finished = 1;
 		if (ad->antic_status == ANTIC_WAIT_REQ) {
 			/*
 			 * We were waiting on this request, now anticipate
@@ -1024,9 +947,19 @@ static void as_completed_request(request
 			as_antic_waitnext(ad);
 		}
 	}
+
+	aic = arq->io_context->aic;
+	if (!aic)
+		return;
+
+	spin_lock(&aic->lock);
+	if (arq->is_sync == REQ_SYNC) {
+		set_bit(AS_TASK_IORUNNING, &aic->state);
+		aic->last_end_request = jiffies;
+	}
 	spin_unlock(&aic->lock);
 
-	put_as_io_context(&arq->as_io_context);
+	put_io_context(arq->io_context);
 }
@@ -1047,9 +980,9 @@ static void as_remove_queued_request(req
 
 	WARN_ON(arq->state != AS_RQ_QUEUED);
 
-	if (arq->as_io_context) {
-		BUG_ON(!atomic_read(&arq->as_io_context->nr_queued));
-		atomic_dec(&arq->as_io_context->nr_queued);
+	if (arq->io_context && arq->io_context->aic) {
+		BUG_ON(!atomic_read(&arq->io_context->aic->nr_queued));
+		atomic_dec(&arq->io_context->aic->nr_queued);
 	}
 
 	/*
@@ -1082,10 +1015,12 @@ static void as_remove_dispatched_request
 	WARN_ON(arq->state != AS_RQ_DISPATCHED);
 	WARN_ON(ON_RB(&arq->rb_node));
 
-	aic = arq->as_io_context;
-	if (aic) {
-		WARN_ON(!atomic_read(&aic->nr_dispatched));
-		atomic_dec(&aic->nr_dispatched);
+	if (arq->io_context && arq->io_context->aic) {
+		aic = arq->io_context->aic;
+		if (aic) {
+			WARN_ON(!atomic_read(&aic->nr_dispatched));
+			atomic_dec(&aic->nr_dispatched);
+		}
 	}
 }
 
 /*
@@ -1180,17 +1115,17 @@ static void as_move_to_dispatch(struct a
 
 	if (data_dir == REQ_SYNC) {
 		/* In case we have to anticipate after this */
-		copy_as_io_context(&ad->as_io_context, &arq->as_io_context);
+		copy_io_context(&ad->io_context, &arq->io_context);
 	} else {
-		if (ad->as_io_context) {
-			put_as_io_context(&ad->as_io_context);
-			ad->as_io_context = NULL;
+		if (ad->io_context) {
+			put_io_context(ad->io_context);
+			ad->io_context = NULL;
 		}
 
 		if (ad->current_write_count != 0)
 			ad->current_write_count--;
 	}
-	ad->aic_finished = 0;
+	ad->ioc_finished = 0;
 
 	ad->next_arq[data_dir] = as_find_next_arq(ad, arq);
@@ -1199,8 +1134,8 @@ static void as_move_to_dispatch(struct a
 	 */
 	as_remove_queued_request(ad->q, arq->request);
 	list_add_tail(&arq->request->queuelist, ad->dispatch);
-	if (arq->as_io_context)
-		atomic_inc(&arq->as_io_context->nr_dispatched);
+	if (arq->io_context && arq->io_context->aic)
+		atomic_inc(&arq->io_context->aic->nr_dispatched);
 
 	WARN_ON(arq->state != AS_RQ_QUEUED);
 	arq->state = AS_RQ_DISPATCHED;
@@ -1355,11 +1290,11 @@ static void as_add_request(struct as_dat
 		arq->is_sync = 0;
 	data_dir = arq->is_sync;
 
-	arq->as_io_context = get_as_io_context();
+	arq->io_context = as_get_io_context();
 
-	if (arq->as_io_context) {
-		atomic_inc(&arq->as_io_context->nr_queued);
-		as_update_iohist(arq->as_io_context, arq->request);
+	if (arq->io_context && arq->io_context->aic) {
+		atomic_inc(&arq->io_context->aic->nr_queued);
+		as_update_iohist(arq->io_context->aic, arq->request);
 	}
 
 	as_add_arq_rb(ad, arq);
@@ -1575,8 +1510,7 @@ as_merged_requests(request_queue_t *q, s
 		 * Don't copy here but swap, because when anext is
 		 * removed below, it must contain the unused context
 		 */
-		swap_as_io_context(&arq->as_io_context,
-				&anext->as_io_context);
+		swap_io_context(&arq->io_context, &anext->io_context);
 	}
 }
@@ -1584,7 +1518,7 @@ as_merged_requests(request_queue_t *q, s
 	 * kill knowledge of next, this one is a goner
 	 */
 	as_remove_queued_request(q, next);
-	put_as_io_context(&anext->as_io_context);
+	put_io_context(anext->io_context);
 }
 
 /*
@@ -1630,7 +1564,7 @@ static int as_set_request(request_queue_
 		RB_CLEAR(&arq->rb_node);
 		arq->request = rq;
 		arq->state = AS_RQ_NEW;
-		arq->as_io_context = NULL;
+		arq->io_context = NULL;
 		INIT_LIST_HEAD(&arq->hash);
 		arq->hash_valid_count = 0;
 		INIT_LIST_HEAD(&arq->fifo);
@@ -1643,16 +1577,18 @@ static int as_set_request(request_queue_
 
 static int as_may_queue(request_queue_t *q, int rw)
 {
+	int ret = 0;
 	struct as_data *ad = q->elevator.elevator_data;
-	struct as_io_context *aic;
+	struct io_context *ioc;
 
 	if (ad->antic_status == ANTIC_WAIT_REQ ||
 			ad->antic_status == ANTIC_WAIT_NEXT) {
-		aic = get_as_io_context();
-		if (ad->as_io_context == aic)
-			return 1;
+		ioc = as_get_io_context();
+		if (ad->io_context == ioc)
+			ret = 1;
+		put_io_context(ioc);
 	}
 
-	return 0;
+	return ret;
 }
 
 static void as_exit(request_queue_t *q, elevator_t *e)
@@ -1666,7 +1602,7 @@ static void as_exit(request_queue_t *q,
 	BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC]));
 
 	mempool_destroy(ad->arq_pool);
-	put_as_io_context(&ad->as_io_context);
+	put_io_context(ad->io_context);
 	kfree(ad->hash);
 	kfree(ad);
 }
diff -puN drivers/block/ll_rw_blk.c~generic-io-contexts drivers/block/ll_rw_blk.c
--- 25/drivers/block/ll_rw_blk.c~generic-io-contexts	2003-06-18 00:17:21.000000000 -0700
+++ 25-akpm/drivers/block/ll_rw_blk.c	2003-06-18 00:17:21.000000000 -0700
@@ -1322,6 +1322,7 @@ get_request(request_queue_t *q, int rw,
 	spin_lock_irq(q->queue_lock);
 	if (rl->count[rw] == q->nr_requests)
 		blk_set_queue_full(q, rw);
+
 	if (blk_queue_full(q, rw) && !force && !elv_may_queue(q, rw)) {
 		spin_unlock_irq(q->queue_lock);
 		goto out;
@@ -2381,6 +2382,93 @@ int __init blk_dev_init(void)
 	return 0;
 }
 
+
+/*
+ * IO Context helper functions
+ */
+void put_io_context(struct io_context *ioc)
+{
+	if (ioc == NULL)
+		return;
+
+	BUG_ON(atomic_read(&ioc->refcount) == 0);
+
+	if (atomic_dec_and_test(&ioc->refcount)) {
+		if (ioc->aic && ioc->aic->dtor)
+			ioc->aic->dtor(ioc->aic);
+		kfree(ioc);
+	}
+}
+
+/* Called by the exitting task */
+void exit_io_context(void)
+{
+	unsigned long flags;
+	struct io_context *ioc;
+
+	local_irq_save(flags);
+	ioc = current->io_context;
+	if (ioc) {
+		if (ioc->aic && ioc->aic->exit)
+			ioc->aic->exit(ioc->aic);
+		put_io_context(ioc);
+		current->io_context = NULL;
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * If the current task has no IO context then create one and initialise it.
+ * If it does have a context, take a ref on it.
+ *
+ * This is always called in the context of the task which submitted the I/O.
+ * But weird things happen, so we disable local interrupts to ensure exclusive
+ * access to *current.
+ */
+struct io_context *get_io_context(void)
+{
+	struct task_struct *tsk = current;
+	unsigned long flags;
+	struct io_context *ret;
+
+	local_irq_save(flags);
+	ret = tsk->io_context;
+	if (ret == NULL) {
+		ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
+		if (ret) {
+			atomic_set(&ret->refcount, 1);
+			ret->pid = tsk->pid;
+			ret->aic = NULL;
+			tsk->io_context = ret;
+		}
+	}
+	local_irq_restore(flags);
+	atomic_inc(&ret->refcount);
+	return ret;
+}
+
+void copy_io_context(struct io_context **pdst, struct io_context **psrc)
+{
+	struct io_context *src = *psrc;
+	struct io_context *dst = *pdst;
+
+	if (src) {
+		BUG_ON(atomic_read(&src->refcount) == 0);
+		atomic_inc(&src->refcount);
+		put_io_context(dst);
+		*pdst = src;
+	}
+}
+
+void swap_io_context(struct io_context **ioc1, struct io_context **ioc2)
+{
+	struct io_context *temp;
+	temp = *ioc1;
+	*ioc1 = *ioc2;
+	*ioc2 = temp;
+}
+
+
 /*
  * sysfs parts below
  */
diff -puN include/linux/blkdev.h~generic-io-contexts include/linux/blkdev.h
--- 25/include/linux/blkdev.h~generic-io-contexts	2003-06-18 00:17:21.000000000 -0700
+++ 25-akpm/include/linux/blkdev.h	2003-06-18 00:17:21.000000000 -0700
@@ -24,6 +24,50 @@ struct request_pm_state;
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
 
+/*
+ * This is the per-process anticipatory I/O scheduler state.
+ */
+struct as_io_context {
+	spinlock_t lock;
+
+	void (*dtor)(struct as_io_context *aic); /* destructor */
+	void (*exit)(struct as_io_context *aic); /* called on task exit */
+
+	unsigned long state;
+	atomic_t nr_queued; /* queued reads & sync writes */
+	atomic_t nr_dispatched; /* number of requests gone to the drivers */
+
+	/* IO History tracking */
+	/* Thinktime */
+	unsigned long last_end_request;
+	unsigned long ttime_total;
+	unsigned long ttime_samples;
+	unsigned long ttime_mean;
+	/* Layout pattern */
+	long seek_samples;
+	sector_t last_request_pos;
+	sector_t seek_total;
+	sector_t seek_mean;
+};
+
+/*
+ * This is the per-process I/O subsystem state. It is refcounted and
+ * kmalloc'ed. Currently all fields are modified in process io context
+ * (apart from the atomic refcount), so require no locking.
+ */
+struct io_context {
+	atomic_t refcount;
+	pid_t pid;
+
+	struct as_io_context *aic;
+};
+
+void put_io_context(struct io_context *ioc);
+void exit_io_context(void);
+struct io_context *get_io_context(void);
+void copy_io_context(struct io_context **pdst, struct io_context **psrc);
+void swap_io_context(struct io_context **ioc1, struct io_context **ioc2);
+
 struct request_list {
 	int count[2];
 	mempool_t *rq_pool;
diff -puN include/linux/sched.h~generic-io-contexts include/linux/sched.h
--- 25/include/linux/sched.h~generic-io-contexts	2003-06-18 00:17:21.000000000 -0700
+++ 25-akpm/include/linux/sched.h	2003-06-18 00:17:21.000000000 -0700
@@ -321,8 +321,8 @@ struct k_itimer {
 };
 
-struct as_io_context;		/* Anticipatory scheduler */
-void exit_as_io_context(void);
+struct io_context;		/* See blkdev.h */
+void exit_io_context(void);
 
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
@@ -452,7 +452,7 @@ struct task_struct {
 	struct dentry *proc_dentry;
 	struct backing_dev_info *backing_dev_info;
 
-	struct as_io_context *as_io_context;
+	struct io_context *io_context;
 
 	unsigned long ptrace_message;
 	siginfo_t *last_siginfo; /* For ptrace use. */
diff -puN kernel/exit.c~generic-io-contexts kernel/exit.c
--- 25/kernel/exit.c~generic-io-contexts	2003-06-18 00:17:21.000000000 -0700
+++ 25-akpm/kernel/exit.c	2003-06-18 00:17:21.000000000 -0700
@@ -680,8 +680,8 @@ NORET_TYPE void do_exit(long code)
 		panic("Attempted to kill the idle task!");
 	if (unlikely(tsk->pid == 1))
 		panic("Attempted to kill init!");
-	if (tsk->as_io_context)
-		exit_as_io_context();
+	if (tsk->io_context)
+		exit_io_context();
 
 	tsk->flags |= PF_EXITING;
 	del_timer_sync(&tsk->real_timer);
diff -puN kernel/fork.c~generic-io-contexts kernel/fork.c
--- 25/kernel/fork.c~generic-io-contexts	2003-06-18 00:17:21.000000000 -0700
+++ 25-akpm/kernel/fork.c	2003-06-18 00:17:21.000000000 -0700
@@ -864,7 +864,7 @@ struct task_struct *copy_process(unsigne
 	p->lock_depth = -1;		/* -1 = no lock */
 	p->start_time = get_jiffies_64();
 	p->security = NULL;
-	p->as_io_context = NULL;
+	p->io_context = NULL;
 
 	retval = -ENOMEM;
 	if ((retval = security_task_alloc(p)))
_