From: Nick Piggin

Performance on my tests with mm7 is good.  tiobench sequential reads still
aren't really good (single processor).  I think this is because of the
seek_mean logic.  I think tiobench files aren't laid out in a very real-world
manner, are they?

Anyway, the following patch changes the whole notion of READ/WRITE to
SYNC/ASYNC.  This is the "simple" way because it does not also keep a list in
READ/WRITE order for merging.  I think this is actually alright because there
probably isn't a lot of wasted merging opportunity.  The generic block layer
shouldn't mind that we might offer a READ request as a candidate to merge with
a WRITE: it checks for this and will just disallow the merge.

Anyway, OraSim does not change much, and pgbench gains about 15%.  The biggest
difference is just in the artificial:

Bench 7 - 2 threads, 1 reading, 1 doing write+fsync

2.5.69-mm7:       IO Rate: 51.21 MB/s,  Sync writes per read: 0.0005
2.5.69-mm7+this:  IO Rate: 36.19 MB/s,  Sync writes per read: 0.8674

This is because we now anticipate on reads.  It will probably help more with
the ext3 + kjournald stuff.  WimMark would be interesting.
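To make the new classification concrete before the patch: a request is treated
as sync if it is a read, or if it is a write submitted while the task has
PF_SYNCWRITE set (fsync/fdatasync/generic_osync_inode set and clear that flag
below).  Here is a minimal stand-alone sketch of that rule, using user-space
stand-ins for the kernel constants; it is illustrative only and not part of
the patch:

/*
 * Sketch of the sync/async classification rule applied in as_add_request().
 * READ/WRITE/PF_SYNCWRITE are stand-ins here, not the kernel headers.
 */
#include <stdio.h>

#define REQ_SYNC	1
#define REQ_ASYNC	0

#define READ		0		/* stand-in for block layer data direction */
#define WRITE		1
#define PF_SYNCWRITE	0x00100000	/* value added to sched.h by this patch */

/* A request is "sync" if it is a read, or a write issued while the
 * submitting task is inside fsync/fdatasync/O_SYNC writeout. */
static int classify(int data_dir, unsigned long task_flags)
{
	if (data_dir == READ || (task_flags & PF_SYNCWRITE))
		return REQ_SYNC;
	return REQ_ASYNC;
}

int main(void)
{
	printf("plain write       -> %s\n",
	       classify(WRITE, 0) == REQ_SYNC ? "SYNC" : "ASYNC");
	printf("write under fsync -> %s\n",
	       classify(WRITE, PF_SYNCWRITE) == REQ_SYNC ? "SYNC" : "ASYNC");
	printf("read              -> %s\n",
	       classify(READ, 0) == REQ_SYNC ? "SYNC" : "ASYNC");
	return 0;
}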
 drivers/block/as-iosched.c |  144 ++++++++++++++++++++++++---------------------
 fs/buffer.c                |    4 +
 fs/fs-writeback.c          |    2 
 include/linux/sched.h      |    1 
 4 files changed, 85 insertions(+), 66 deletions(-)

diff -puN drivers/block/as-iosched.c~as-sync-async drivers/block/as-iosched.c
--- 25/drivers/block/as-iosched.c~as-sync-async	2003-06-02 21:10:26.000000000 -0700
+++ 25-akpm/drivers/block/as-iosched.c	2003-06-02 21:10:26.000000000 -0700
@@ -22,6 +22,9 @@
 #include
 #include
 
+#define REQ_SYNC	1
+#define REQ_ASYNC	0
+
 /*
  * See Documentation/as-iosched.txt
  */
@@ -115,13 +118,13 @@ struct as_data {
 	struct list_head fifo_list[2];
 
 	struct as_rq *next_arq[2];	/* next in sort order */
-	sector_t last_sector[2];	/* last READ and WRITE sectors */
+	sector_t last_sector[2];	/* last REQ_SYNC and REQ_ASYNC sectors */
 	struct list_head *dispatch;	/* driver dispatch queue */
 	struct list_head *hash;		/* request hash */
 	unsigned long hash_valid_count;	/* barrier hash count */
 	unsigned long current_batch_expires;
 	unsigned long last_check_fifo[2];
-	int batch_data_dir;		/* current/last batch READ or WRITE */
+	int batch_data_dir;		/* current/last batch REQ_SYNC or REQ_ASYNC */
 
 	mempool_t *arq_pool;
 
 	int antic_status;
@@ -184,6 +187,7 @@ struct as_rq {
 	struct list_head fifo;
 	unsigned long expires;
 
+	int is_sync;
 	enum arq_state state; /* debug only */
 };
 
@@ -256,9 +260,9 @@ static struct as_io_context *get_as_io_c
 		ret->ttime_total = 0;
 		ret->ttime_samples = 0;
 		ret->ttime_mean = 0;
-		ret->dir_after_read[READ] = 0;
-		ret->dir_after_read[WRITE] = 0;
-		ret->mean_dir_after_read = READ;
+		ret->dir_after_read[REQ_SYNC] = 0;
+		ret->dir_after_read[REQ_ASYNC] = 0;
+		ret->mean_dir_after_read = REQ_SYNC;
 		ret->seek_total = 0;
 		ret->seek_samples = 0;
 		ret->seek_mean = 0;
@@ -379,7 +383,7 @@ static struct request *as_find_arq_hash(
 #define ON_RB(node)	((node)->rb_color != RB_NONE)
 #define RB_CLEAR(node)	((node)->rb_color = RB_NONE)
 #define rb_entry_arq(node)	rb_entry((node), struct as_rq, rb_node)
-#define ARQ_RB_ROOT(ad, arq)	(&(ad)->sort_list[rq_data_dir((arq)->request)])
+#define ARQ_RB_ROOT(ad, arq)	(&(ad)->sort_list[(arq)->is_sync])
 #define rq_rb_key(rq)		(rq)->sector
 
 /*
@@ -499,13 +503,13 @@ as_choose_req(struct as_data *ad, struct
 	if (arq2 == NULL)
 		return arq1;
 
-	data_dir = rq_data_dir(arq1->request);
+	data_dir = arq1->is_sync;
 
 	last = ad->last_sector[data_dir];
 	s1 = arq1->request->sector;
 	s2 = arq2->request->sector;
 
-	BUG_ON(data_dir != rq_data_dir(arq2->request));
+	BUG_ON(data_dir != arq2->is_sync);
 
 	/*
 	 * Strict one way elevator _except_ in the case where we allow
@@ -563,7 +567,7 @@ as_choose_req(struct as_data *ad, struct
  */
 static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last)
 {
-	const int data_dir = rq_data_dir(last->request);
+	const int data_dir = last->is_sync;
 	struct as_rq *ret;
 	struct rb_node *rbnext = rb_next(&last->rb_node);
 	struct rb_node *rbprev = rb_prev(&last->rb_node);
@@ -724,7 +728,7 @@ static int as_can_break_anticipation(str
 {
 	struct as_io_context *aic;
 
-	if (arq && rq_data_dir(arq->request) == READ && as_close_req(ad, arq)) {
+	if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, arq)) {
 		/* close request */
 		return 1;
 	}
@@ -766,17 +770,17 @@ static int as_can_break_anticipation(str
 		return 1;
 	}
 
-	if (aic->mean_dir_after_read != READ) {
+	if (aic->mean_dir_after_read != REQ_SYNC) {
 		/* next request from this process will probably be a write */
 		return 1;
 	}
 
 	if (arq && aic->seek_samples) {
 		sector_t s;
-		if (ad->last_sector[READ] < arq->request->sector)
-			s = arq->request->sector - ad->last_sector[READ];
+		if (ad->last_sector[REQ_SYNC] < arq->request->sector)
+			s = arq->request->sector - ad->last_sector[REQ_SYNC];
 		else
-			s = ad->last_sector[READ] - arq->request->sector;
+			s = ad->last_sector[REQ_SYNC] - arq->request->sector;
 		if (aic->seek_mean > s)
 			/* this request is better than what we're expecting */
 			return 1;
@@ -828,14 +832,15 @@ static int as_can_anticipate(struct as_d
  */
 static void as_update_iohist(struct as_io_context *aic, struct request *rq)
 {
-	int data_dir = rq_data_dir(rq);
+	struct as_rq *arq = RQ_DATA(rq);
+	int data_dir = arq->is_sync;
 	unsigned long thinktime;
 	sector_t seek_dist;
 
 	if (aic == NULL)
 		return;
 
-	if (data_dir == READ) {
+	if (data_dir == REQ_SYNC) {
 		if (test_bit(AS_TASK_IORUNNING, &aic->state)) {
 			/* Calculate read -> read thinktime */
 			thinktime = jiffies - aic->last_end_request;
@@ -876,19 +881,19 @@ static void as_update_iohist(struct as_i
 	}
 
 	/* Calculate read/write pattern */
-	if (aic->last_data_dir == READ) {
+	if (aic->last_data_dir == REQ_SYNC) {
 		unsigned long rprob, wprob;
 		aic->dir_after_read[data_dir] += 256;
-		rprob = aic->dir_after_read[READ];
-		wprob = aic->dir_after_read[WRITE];
+		rprob = aic->dir_after_read[REQ_SYNC];
+		wprob = aic->dir_after_read[REQ_ASYNC];
 		if (rprob*4 >= wprob*5)
-			aic->mean_dir_after_read = READ;
+			aic->mean_dir_after_read = REQ_SYNC;
 		else
-			aic->mean_dir_after_read = WRITE;
+			aic->mean_dir_after_read = REQ_ASYNC;
 
-		aic->dir_after_read[READ] = (rprob>>1) + (rprob>>2);
-		aic->dir_after_read[WRITE] = (wprob>>1) + (wprob>>2);
+		aic->dir_after_read[REQ_SYNC] = (rprob>>1) + (rprob>>2);
+		aic->dir_after_read[REQ_ASYNC] = (wprob>>1) + (wprob>>2);
 	}
 	aic->last_data_dir = data_dir;
 }
@@ -900,7 +905,7 @@ static void as_update_iohist(struct as_i
  */
 static void as_update_arq(struct as_data *ad, struct as_rq *arq)
 {
-	const int data_dir = rq_data_dir(arq->request);
+	const int data_dir = arq->is_sync;
 
 	/* keep the next_arq cache up to date */
 	ad->next_arq[data_dir] = as_choose_req(ad, arq, ad->next_arq[data_dir]);
@@ -937,7 +942,7 @@ static void as_completed_request(request
 	if (!aic)
 		return;
 
-	if (rq_data_dir(arq->request) == READ) {
+	if (arq->is_sync == REQ_SYNC) {
 		set_bit(AS_TASK_IORUNNING, &aic->state);
 		aic->last_end_request = jiffies;
 	}
@@ -970,7 +975,7 @@ static void as_remove_queued_request(req
 	if (!arq)
 		BUG();
 	else {
-		const int data_dir = rq_data_dir(arq->request);
+		const int data_dir = arq->is_sync;
 		struct as_data *ad = q->elevator.elevator_data;
 
 		WARN_ON(arq->state != AS_RQ_QUEUED);
@@ -1070,8 +1075,8 @@ static int as_fifo_expired(struct as_dat
 static inline int as_batch_expired(struct as_data *ad)
 {
 	return time_after(jiffies, ad->current_batch_expires) &&
-		(ad->batch_data_dir == WRITE ||
-			time_after(jiffies, ad->fifo_expire[READ]));
+		(ad->batch_data_dir == REQ_ASYNC ||
+			time_after(jiffies, ad->fifo_expire[REQ_SYNC]));
 }
 
 /*
@@ -1079,7 +1084,7 @@ static inline int as_batch_expired(struc
  */
 static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq)
 {
-	const int data_dir = rq_data_dir(arq->request);
+	const int data_dir = arq->is_sync;
 
 	BUG_ON(!ON_RB(&arq->rb_node));
 
@@ -1093,7 +1098,7 @@ static void as_move_to_dispatch(struct a
 	ad->last_sector[data_dir] = arq->request->sector + arq->request->nr_sectors;
 
-	if (data_dir == READ) {
+	if (data_dir == REQ_SYNC) {
 		/* In case we have to anticipate after this */
 		copy_as_io_context(&ad->as_io_context, &arq->as_io_context);
 	} else
@@ -1122,8 +1127,8 @@ static void as_move_to_dispatch(struct a
 static int as_dispatch_request(struct as_data *ad)
 {
 	struct as_rq *arq;
-	const int reads = !list_empty(&ad->fifo_list[READ]);
-	const int writes = !list_empty(&ad->fifo_list[WRITE]);
+	const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]);
+	const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]);
 
 	if (!(reads || writes))
 		return 0;
@@ -1134,8 +1139,8 @@ static int as_dispatch_request(struct as
 	 */
 	arq = ad->next_arq[ad->batch_data_dir];
 
-	if (ad->batch_data_dir == READ && ad->antic_expire) {
-		if (as_fifo_expired(ad, READ))
+	if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) {
+		if (as_fifo_expired(ad, REQ_SYNC))
 			goto fifo_expired;
 
 		if (as_can_anticipate(ad, arq)) {
@@ -1148,7 +1153,7 @@ static int as_dispatch_request(struct as
 			/* we have a "next request" */
 			if (reads && !writes)
 				ad->current_batch_expires =
-					jiffies + ad->batch_expire[READ];
+					jiffies + ad->batch_expire[REQ_SYNC];
 			goto dispatch_request;
 		}
 	}
@@ -1159,15 +1164,15 @@ static int as_dispatch_request(struct as
 	 */
 
 	if (reads) {
-		BUG_ON(RB_EMPTY(&ad->sort_list[READ]));
+		BUG_ON(RB_EMPTY(&ad->sort_list[REQ_SYNC]));
 
-		if (writes && ad->batch_data_dir == READ)
+		if (writes && ad->batch_data_dir == REQ_SYNC)
 			/*
 			 * Last batch was a read, switch to writes
 			 */
 			goto dispatch_writes;
 
-		ad->batch_data_dir = READ;
+		ad->batch_data_dir = REQ_SYNC;
 		arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
 		ad->current_batch_expires = jiffies +
 			ad->batch_expire[ad->batch_data_dir];
@@ -1181,9 +1186,9 @@ static int as_dispatch_request(struct as
 
 	if (writes) {
 dispatch_writes:
-		BUG_ON(RB_EMPTY(&ad->sort_list[WRITE]));
+		BUG_ON(RB_EMPTY(&ad->sort_list[REQ_ASYNC]));
 
-		ad->batch_data_dir = WRITE;
+		ad->batch_data_dir = REQ_ASYNC;
 		arq = ad->next_arq[ad->batch_data_dir];
 		ad->current_batch_expires = jiffies +
 			ad->batch_expire[ad->batch_data_dir];
@@ -1231,7 +1236,14 @@ static struct request *as_next_request(r
  */
 static void as_add_request(struct as_data *ad, struct as_rq *arq)
 {
-	const int data_dir = rq_data_dir(arq->request);
+	int data_dir;
+
+	if (rq_data_dir(arq->request) == READ
+			|| current->flags&PF_SYNCWRITE)
+		arq->is_sync = 1;
+	else
+		arq->is_sync = 0;
+	data_dir = arq->is_sync;
 
 	arq->as_io_context = get_as_io_context();
 
@@ -1262,11 +1274,11 @@ as_insert_request(request_queue_t *q, st
 		AS_INVALIDATE_HASH(ad);
 		q->last_merge = NULL;
 
-		while (ad->next_arq[READ])
-			as_move_to_dispatch(ad, ad->next_arq[READ]);
+		while (ad->next_arq[REQ_SYNC])
+			as_move_to_dispatch(ad, ad->next_arq[REQ_SYNC]);
 
-		while (ad->next_arq[WRITE])
-			as_move_to_dispatch(ad, ad->next_arq[WRITE]);
+		while (ad->next_arq[REQ_ASYNC])
+			as_move_to_dispatch(ad, ad->next_arq[REQ_ASYNC]);
 
 		list_add_tail(&rq->queuelist, ad->dispatch);
 
@@ -1314,8 +1326,8 @@ static int as_queue_empty(request_queue_
 {
 	struct as_data *ad = q->elevator.elevator_data;
 
-	if (!list_empty(&ad->fifo_list[WRITE])
-		|| !list_empty(&ad->fifo_list[READ])
+	if (!list_empty(&ad->fifo_list[REQ_ASYNC])
+		|| !list_empty(&ad->fifo_list[REQ_SYNC])
 		|| !list_empty(ad->dispatch))
 			return 0;
 
@@ -1535,8 +1547,8 @@ static void as_exit(request_queue_t *q, 
 	del_timer_sync(&ad->antic_timer);
 	kblockd_flush();
 
-	BUG_ON(!list_empty(&ad->fifo_list[READ]));
-	BUG_ON(!list_empty(&ad->fifo_list[WRITE]));
+	BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC]));
+	BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC]));
 
 	mempool_destroy(ad->arq_pool);
 	put_as_io_context(&ad->as_io_context);
@@ -1585,20 +1597,20 @@ static int as_init(request_queue_t *q, e
 	for (i = 0; i < AS_HASH_ENTRIES; i++)
 		INIT_LIST_HEAD(&ad->hash[i]);
 
-	INIT_LIST_HEAD(&ad->fifo_list[READ]);
-	INIT_LIST_HEAD(&ad->fifo_list[WRITE]);
-	ad->sort_list[READ] = RB_ROOT;
-	ad->sort_list[WRITE] = RB_ROOT;
+	INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]);
+	INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]);
+	ad->sort_list[REQ_SYNC] = RB_ROOT;
+	ad->sort_list[REQ_ASYNC] = RB_ROOT;
 	ad->dispatch = &q->queue_head;
-	ad->fifo_expire[READ] = read_expire;
-	ad->fifo_expire[WRITE] = write_expire;
+	ad->fifo_expire[REQ_SYNC] = read_expire;
+	ad->fifo_expire[REQ_ASYNC] = write_expire;
 	ad->hash_valid_count = 1;
 	ad->antic_expire = antic_expire;
-	ad->batch_expire[READ] = read_batch_expire;
-	ad->batch_expire[WRITE] = write_batch_expire;
+	ad->batch_expire[REQ_SYNC] = read_batch_expire;
+	ad->batch_expire[REQ_ASYNC] = write_batch_expire;
 	e->elevator_data = ad;
 
-	ad->current_batch_expires = jiffies + ad->batch_expire[READ];
+	ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC];
 	return 0;
 }
 
@@ -1631,11 +1643,11 @@ static ssize_t __FUNC(struct as_data *ad
 {									\
 	return as_var_show(__VAR, (page));				\
 }
-SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[READ]);
-SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[WRITE]);
+SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[REQ_SYNC]);
+SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[REQ_ASYNC]);
 SHOW_FUNCTION(as_anticexpire_show, ad->antic_expire);
-SHOW_FUNCTION(as_read_batchexpire_show, ad->batch_expire[READ]);
-SHOW_FUNCTION(as_write_batchexpire_show, ad->batch_expire[WRITE]);
+SHOW_FUNCTION(as_read_batchexpire_show, ad->batch_expire[REQ_SYNC]);
+SHOW_FUNCTION(as_write_batchexpire_show, ad->batch_expire[REQ_ASYNC]);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)				\
@@ -1648,13 +1660,13 @@ static ssize_t __FUNC(struct as_data *ad
 		*(__PTR) = (MAX);					\
 	return ret;							\
 }
-STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[READ], 0, INT_MAX);
-STORE_FUNCTION(as_writeexpire_store, &ad->fifo_expire[WRITE], 0, INT_MAX);
+STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX);
+STORE_FUNCTION(as_writeexpire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX);
 STORE_FUNCTION(as_anticexpire_store, &ad->antic_expire, 0, INT_MAX);
 STORE_FUNCTION(as_read_batchexpire_store,
-			&ad->batch_expire[READ], 0, INT_MAX);
+			&ad->batch_expire[REQ_SYNC], 0, INT_MAX);
 STORE_FUNCTION(as_write_batchexpire_store,
-			&ad->batch_expire[WRITE], 0, INT_MAX);
+			&ad->batch_expire[REQ_ASYNC], 0, INT_MAX);
 #undef STORE_FUNCTION
 
 static struct as_fs_entry as_readexpire_entry = {
diff -puN fs/buffer.c~as-sync-async fs/buffer.c
--- 25/fs/buffer.c~as-sync-async	2003-06-02 21:10:26.000000000 -0700
+++ 25-akpm/fs/buffer.c	2003-06-02 21:10:26.000000000 -0700
@@ -318,6 +318,7 @@ asmlinkage long sys_fsync(unsigned int f
 
 	/* We need to protect against concurrent writers.. */
 	down(&inode->i_sem);
+	current->flags |= PF_SYNCWRITE;
 	ret = filemap_fdatawrite(inode->i_mapping);
 	err = file->f_op->fsync(file, dentry, 0);
 	if (!ret)
@@ -325,6 +326,7 @@ asmlinkage long sys_fsync(unsigned int f
 	err = filemap_fdatawait(inode->i_mapping);
 	if (!ret)
 		ret = err;
+	current->flags &= ~PF_SYNCWRITE;
 	up(&inode->i_sem);
 
 out_putf:
@@ -353,6 +355,7 @@ asmlinkage long sys_fdatasync(unsigned i
 		goto out_putf;
 
 	down(&inode->i_sem);
+	current->flags |= PF_SYNCWRITE;
 	ret = filemap_fdatawrite(inode->i_mapping);
 	err = file->f_op->fsync(file, dentry, 1);
 	if (!ret)
@@ -360,6 +363,7 @@ asmlinkage long sys_fdatasync(unsigned i
 	err = filemap_fdatawait(inode->i_mapping);
 	if (!ret)
 		ret = err;
+	current->flags &= ~PF_SYNCWRITE;
 	up(&inode->i_sem);
 
 out_putf:
diff -puN fs/fs-writeback.c~as-sync-async fs/fs-writeback.c
--- 25/fs/fs-writeback.c~as-sync-async	2003-06-02 21:10:26.000000000 -0700
+++ 25-akpm/fs/fs-writeback.c	2003-06-02 21:10:26.000000000 -0700
@@ -498,6 +498,7 @@ int generic_osync_inode(struct inode *in
 	int need_write_inode_now = 0;
 	int err2;
 
+	current->flags |= PF_SYNCWRITE;
 	if (what & OSYNC_DATA)
 		err = filemap_fdatawrite(inode->i_mapping);
 	if (what & (OSYNC_METADATA|OSYNC_DATA)) {
@@ -510,6 +511,7 @@ int generic_osync_inode(struct inode *in
 		if (!err)
 			err = err2;
 	}
+	current->flags &= ~PF_SYNCWRITE;
 
 	spin_lock(&inode_lock);
 	if ((inode->i_state & I_DIRTY) &&
diff -puN include/linux/sched.h~as-sync-async include/linux/sched.h
--- 25/include/linux/sched.h~as-sync-async	2003-06-02 21:10:26.000000000 -0700
+++ 25-akpm/include/linux/sched.h	2003-06-02 21:10:26.000000000 -0700
@@ -484,6 +484,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_FSTRANS	0x00020000	/* inside a filesystem transaction */
 #define PF_KSWAPD	0x00040000	/* I am kswapd */
 #define PF_SWAPOFF	0x00080000	/* I am in swapoff */
+#define PF_SYNCWRITE	0x00100000	/* I am doing a sync write */
 
 #ifdef CONFIG_SMP
 extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
_
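For anyone wanting to reproduce the Bench 7 behaviour above, a rough
user-space approximation is one thread streaming reads from a large file while
another does small write+fsync to a second file.  The file names, sizes and
iteration counts below are made up for illustration; this is not the actual
benchmark code:

/*
 * Rough user-space sketch of "Bench 7": one streaming reader plus one
 * thread doing small write+fsync.  Parameters are hypothetical.
 */
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define READ_FILE	"bigfile"	/* pre-created large file to stream */
#define WRITE_FILE	"syncfile"
#define BUFSZ		(64 * 1024)

static void *reader(void *arg)
{
	char *buf = malloc(BUFSZ);
	int fd = open(READ_FILE, O_RDONLY);

	(void)arg;
	if (fd < 0 || !buf)
		return NULL;
	while (read(fd, buf, BUFSZ) > 0)
		;			/* sequential streaming read */
	close(fd);
	free(buf);
	return NULL;
}

static void *sync_writer(void *arg)
{
	char buf[4096];
	int fd = open(WRITE_FILE, O_WRONLY | O_CREAT | O_TRUNC, 0644);
	int i;

	(void)arg;
	if (fd < 0)
		return NULL;
	memset(buf, 0, sizeof(buf));
	for (i = 0; i < 1000; i++) {
		if (write(fd, buf, sizeof(buf)) != sizeof(buf))
			break;
		fsync(fd);		/* each write is forced to disk */
	}
	close(fd);
	return NULL;
}

int main(void)
{
	pthread_t r, w;

	pthread_create(&r, NULL, reader, NULL);
	pthread_create(&w, NULL, sync_writer, NULL);
	pthread_join(r, NULL);
	pthread_join(w, NULL);
	return 0;
}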