diff -urNp x-ref/drivers/block/elevator.c x/drivers/block/elevator.c
--- x-ref/drivers/block/elevator.c	2003-01-30 22:39:12.000000000 +0100
+++ x/drivers/block/elevator.c	2003-01-30 22:39:32.000000000 +0100
@@ -80,30 +80,41 @@ int elevator_linus_merge(request_queue_t
 			 struct buffer_head *bh, int rw,
 			 int max_sectors)
 {
-	struct list_head *entry = &q->queue_head;
+	struct list_head *entry, *real_head;
 	unsigned int count = bh->b_size >> 9, ret = ELEVATOR_NO_MERGE;
 	struct request *__rq;
 	int backmerge_only = 0;
 
+	if (!bh_elv_seq(bh))
+		entry = &q->queue_head;
+	else
+		entry = &q->atomic_head;
+	real_head = entry;
+
 	while (!backmerge_only && (entry = entry->prev) != head) {
 		__rq = blkdev_entry_to_request(entry);
 
 		/*
 		 * we can't insert beyond a zero sequence point
 		 */
-		if (__rq->elevator_sequence <= 0)
+		if (__rq->elevator_sequence <= 0 && !bh_elv_seq(bh))
 			backmerge_only = 1;
 
 		if (__rq->waiting)
 			continue;
 		if (__rq->rq_dev != bh->b_rdev)
 			continue;
-		if (!*req && bh_rq_in_between(bh, __rq, &q->queue_head) && !backmerge_only)
+		if (!*req && bh_rq_in_between(bh, __rq, real_head) && !backmerge_only)
 			*req = __rq;
 		if (__rq->cmd != rw)
 			continue;
 		if (__rq->nr_sectors + count > max_sectors)
 			continue;
+		/*
+		 * possibly move this inside the merge path and make it a break
+		 */
+		if (bh_elv_seq(bh) != bh_elv_seq(__rq->bh))
+			continue;
 		if (__rq->sector + __rq->nr_sectors == bh->b_rsector) {
 			ret = ELEVATOR_BACK_MERGE;
 			*req = __rq;
@@ -124,7 +135,7 @@ int elevator_linus_merge(request_queue_t
 		int scan_cost = ret ? 1 : ELV_LINUS_SEEK_COST;
 		struct list_head *entry = &(*req)->queue;
 
-		while ((entry = entry->next) != &q->queue_head) {
+		while ((entry = entry->next) != real_head) {
 			__rq = blkdev_entry_to_request(entry);
 			__rq->elevator_sequence -= scan_cost;
 		}
@@ -147,13 +158,18 @@ int elevator_noop_merge(request_queue_t
 			struct buffer_head *bh, int rw,
 			int max_sectors)
 {
-	struct list_head *entry;
+	struct list_head *entry, *real_head;
 	unsigned int count = bh->b_size >> 9;
 
-	if (list_empty(&q->queue_head))
+	if (!bh_elv_seq(bh))
+		entry = &q->queue_head;
+	else
+		entry = &q->atomic_head;
+	real_head = entry;
+
+	if (list_empty(real_head))
 		return ELEVATOR_NO_MERGE;
 
-	entry = &q->queue_head;
 	while ((entry = entry->prev) != head) {
 		struct request *__rq = blkdev_entry_to_request(entry);
 
@@ -165,6 +181,11 @@ int elevator_noop_merge(request_queue_t
 			continue;
 		if (__rq->waiting)
 			continue;
+		/*
+		 * possibly move this inside the merge path and make it a break
+		 */
+		if (bh_elv_seq(bh) != bh_elv_seq(__rq->bh))
+			continue;
 		if (__rq->sector + __rq->nr_sectors == bh->b_rsector) {
 			*req = __rq;
 			return ELEVATOR_BACK_MERGE;
@@ -174,7 +195,7 @@ int elevator_noop_merge(request_queue_t
 		}
 	}
 
-	*req = blkdev_entry_to_request(q->queue_head.prev);
+	*req = blkdev_entry_to_request(real_head->prev);
 	return ELEVATOR_NO_MERGE;
 }
 
diff -urNp x-ref/drivers/block/ll_rw_blk.c x/drivers/block/ll_rw_blk.c
--- x-ref/drivers/block/ll_rw_blk.c	2003-01-30 22:39:30.000000000 +0100
+++ x/drivers/block/ll_rw_blk.c	2003-01-30 22:39:32.000000000 +0100
@@ -51,6 +51,8 @@ static kmem_cache_t *request_cachep;
  */
 DECLARE_TASK_QUEUE(tq_disk);
 
+LIST_HEAD(blk_atomic_head);
+
 /*
  * Protect the request list against multiple users..
  *
@@ -125,9 +127,63 @@ int * max_sectors[MAX_BLKDEV];
  */
 char * blkdev_varyio[MAX_BLKDEV];
 
+/*
+ * only allow merging of buffer_heads with identical sequence, for transparent
+ * support for writing atomic blocks larger than what a single bh can hold
+ */
+static unsigned int blk_atomic_seq;
+static spinlock_cacheline_t blk_atomic_lock_cacheline = {SPIN_LOCK_UNLOCKED};
+static spinlock_cacheline_t blk_atomic_queue_lock_cacheline = {SPIN_LOCK_UNLOCKED};
+
+#ifdef CONFIG_SMP
+struct blk_atomic_cpu {
+	unsigned int seq;
+	unsigned int left;
+} ____cacheline_aligned_in_smp;
+
+struct blk_atomic_cpu __cacheline_aligned_in_smp blk_atomic_cpu[NR_CPUS];
+
+#define BLK_ATOMIC_SEQ_GRAB	1024
+#endif
+
 unsigned long blk_max_low_pfn, blk_max_pfn;
 int blk_nohighio = 0;
 
+unsigned int blk_get_atomic_seq(void)
+{
+	unsigned int ret;
+
+#ifdef CONFIG_SMP
+	{
+		struct blk_atomic_cpu *bcpu = &blk_atomic_cpu[smp_processor_id()];
+
+restart:
+		if (unlikely(!bcpu->left)) {
+			spin_lock_irq(&blk_atomic_lock);
+			bcpu->seq = blk_atomic_seq;
+			blk_atomic_seq += BLK_ATOMIC_SEQ_GRAB;
+			spin_unlock_irq(&blk_atomic_lock);
+			bcpu->left = BLK_ATOMIC_SEQ_GRAB;
+		}
+		bcpu->seq++;
+		bcpu->left--;
+		if (unlikely(!bcpu->seq))
+			goto restart;
+
+		ret = bcpu->seq;
+	}
+#else
+	spin_lock_irq(&blk_atomic_lock);
+	ret = ++blk_atomic_seq;
+	if (unlikely(!ret)) {
+		ret = 1;
+		++blk_atomic_seq;
+	}
+	spin_unlock_irq(&blk_atomic_lock);
+#endif
+	return ret;
+}
+
 static inline int get_max_sectors(kdev_t dev)
 {
 	if (!max_sectors[MAJOR(dev)])
@@ -383,6 +439,91 @@ void generic_unplug_device(void *data)
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
+static void blk_atomic_add(request_queue_t *q)
+{
+	spin_lock_irq(&blk_atomic_queue_lock);
+	/* it's empty only when it's out of the blk_atomic_head queue */
+	if (list_empty(&q->atomic_entry))
+		list_add_tail(&q->atomic_entry, &blk_atomic_head);
+	spin_unlock_irq(&blk_atomic_queue_lock);
+}
+
+static struct list_head *blk_find_insert_point(request_queue_t *q,
+					       struct request *rq)
+{
+	struct list_head *head = &q->queue_head, *insert = q->queue_head.prev;
+	struct buffer_head *bh;
+	int elv_seq;
+	struct request *dummy;
+
+	if (list_empty(head))
+		goto done;
+	else if (q->head_active && !q->plugged)
+		head = head->next;
+
+	dummy = NULL;
+	bh = rq->bh;
+
+	elv_seq = bh_elv_seq(bh);
+	bh_elv_seq(bh) = 0;
+
+	q->elevator.elevator_merge_fn(q, &dummy, head, bh,
+				      -1 /* non cmd -> no merge */,
+				      0 /* too small max_sectors -> no merge */);
+
+	bh_elv_seq(bh) = elv_seq;
+
+	if (dummy)
+		insert = &dummy->queue;
+
+done:
+	return insert;
+}
+
+void blk_refile_atomic_queue(int sequence)
+{
+	request_queue_t *q;
+	struct request * rq;
+	unsigned long flags;
+	struct list_head * q_entry, * rq_entry;
+	int __sequence;
+
+	spin_lock_irqsave(&blk_atomic_queue_lock, flags);
+
+	q_entry = blk_atomic_head.next;
+	while (q_entry != &blk_atomic_head) {
+		q = list_entry(q_entry, request_queue_t, atomic_entry);
+		q_entry = q_entry->next;
+
+		spin_lock(q->queue_lock);
+		rq_entry = q->atomic_head.next;
+		while (rq_entry != &q->atomic_head) {
+			rq = list_entry(rq_entry, struct request, queue);
+			rq_entry = rq_entry->next;
+
+			BUG_ON(!rq->q);
+			BUG_ON(!rq->bh);
+			__sequence = bh_elv_seq(rq->bh);
+			BUG_ON(!__sequence);
+			if (__sequence == sequence) {
+				struct list_head *ipoint;
+
+				list_del(&rq->queue);
+				if (list_empty(&q->queue_head))
+					q->plug_device_fn(q, rq->bh->b_rdev);
+
+				ipoint = blk_find_insert_point(q, rq);
+				list_add(&rq->queue, ipoint);
+			}
+		}
+		if (list_empty(&q->atomic_head))
+			list_del_init(&q->atomic_entry);
+		spin_unlock(q->queue_lock);
+	}
+
+	spin_unlock_irqrestore(&blk_atomic_queue_lock, flags);
+}
+
 /** blk_grow_request_list
  *	@q: The &request_queue_t
  *	@nr_requests: how many requests are desired
@@ -492,6 +633,8 @@ static int __make_request(request_queue_
 void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
 {
 	INIT_LIST_HEAD(&q->queue_head);
+	INIT_LIST_HEAD(&q->atomic_head);
+	INIT_LIST_HEAD(&q->atomic_entry);
 	elevator_init(&q->elevator, ELEVATOR_LINUS);
 	q->queue_lock = &io_request_lock;
 	blk_init_free_list(q);
@@ -837,11 +980,6 @@ static inline void add_request(request_q
 {
 	drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1);
 
-	if (!q->plugged && q->head_active && insert_here == &q->queue_head) {
-		spin_unlock_irq(q->queue_lock);
-		BUG();
-	}
-
 	/*
 	 * elevator indicated where it wants this request to be
 	 * inserted at elevator_merge time
@@ -891,6 +1029,8 @@ static void attempt_merge(request_queue_
 	    || req->nr_sectors + next->nr_sectors > max_sectors
 	    || next->waiting)
 		return;
+	if (bh_elv_seq(req->bh) != bh_elv_seq(next->bh))
+		return;
 	/*
 	 * If we are not allowed to merge these requests, then
 	 * return. If we are allowed to merge, then the count
@@ -914,11 +1054,12 @@ static void attempt_merge(request_queue_
 }
 
 static inline void attempt_back_merge(request_queue_t * q,
+				      struct list_head * head,
 				      struct request *req,
 				      int max_sectors,
 				      int max_segments)
 {
-	if (&req->queue == q->queue_head.prev)
+	if (&req->queue == head->prev)
 		return;
 	attempt_merge(q, req, max_sectors, max_segments);
 }
@@ -944,9 +1085,10 @@ static int __make_request(request_queue_
 	int max_segments = MAX_SEGMENTS;
 	struct request * req, *freereq = NULL;
 	int rw_ahead, max_sectors, el_ret;
-	struct list_head *head, *insert_here;
+	struct list_head *head, *real_head, *insert_here;
 	int latency;
 	elevator_t *elevator = &q->elevator;
+	int atomic = bh_elv_seq(bh), atomic_add = 0;
 
 	count = bh->b_size >> 9;
 	sector = bh->b_rsector;
@@ -988,7 +1130,7 @@ static int __make_request(request_queue_
 	max_sectors = get_max_sectors(bh->b_rdev);
 
 	req = NULL;
-	head = &q->queue_head;
+	real_head = head = !atomic ? &q->queue_head : &q->atomic_head;
 	/*
 	 * Now we acquire the request spinlock, we have to be mega careful
 	 * not to schedule or do something nonatomic
@@ -997,11 +1139,14 @@ static int __make_request(request_queue_
 again:
 	insert_here = head->prev;
 
-	if (list_empty(head)) {
-		q->plug_device_fn(q, bh->b_rdev); /* is atomic */
+	if (!atomic) {
+		if (list_empty(head)) {
+			q->plug_device_fn(q, bh->b_rdev); /* is atomic */
+			goto get_rq;
+		} else if (q->head_active && !q->plugged)
+			head = head->next;
+	} else if (list_empty(head))
 		goto get_rq;
-	} else if (q->head_active && !q->plugged)
-		head = head->next;
 
 	el_ret = elevator->elevator_merge_fn(q, &req, head, bh, rw,max_sectors);
 	switch (el_ret) {
@@ -1017,7 +1162,7 @@ again:
 			blk_started_io(req, count);
 			drive_stat_acct(req->rq_dev, req->cmd, count, 0);
 			req_new_io(req, 1, count);
-			attempt_back_merge(q, req, max_sectors, max_segments);
+			attempt_back_merge(q, real_head, req, max_sectors, max_segments);
 			goto out;
 
 		case ELEVATOR_FRONT_MERGE:
@@ -1080,8 +1225,10 @@ get_rq:
 		req = get_request(q, rw);
 		if (req == NULL) {
 			spin_unlock_irq(q->queue_lock);
+			if (atomic)
+				blk_refile_atomic_queue(atomic);
 			freereq = __get_request_wait(q, rw);
-			head = &q->queue_head;
+			head = real_head;
 			spin_lock_irq(q->queue_lock);
 			get_request_wait_wakeup(q, rw);
 			goto again;
@@ -1107,10 +1254,13 @@ get_rq:
 	req_new_io(req, 0, count);
 	blk_started_io(req, count);
 	add_request(q, req, insert_here);
+	atomic_add = atomic;
 out:
 	if (freereq)
 		blkdev_release_request(freereq);
 	spin_unlock_irq(q->queue_lock);
+	if (atomic_add)
+		blk_atomic_add(q);
 	return 0;
 end_io:
 	bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
@@ -1448,6 +1598,10 @@ int __init blk_dev_init(void)
 	memset(max_readahead, 0, sizeof(max_readahead));
 	memset(max_sectors, 0, sizeof(max_sectors));
 
+#ifdef CONFIG_SMP
+	memset(blk_atomic_cpu, 0, sizeof(blk_atomic_cpu));
+#endif
+
 	blk_max_low_pfn = max_low_pfn - 1;
 	blk_max_pfn = max_pfn - 1;
 
@@ -1567,3 +1721,5 @@ EXPORT_SYMBOL(blk_max_low_pfn);
 EXPORT_SYMBOL(blk_max_pfn);
 EXPORT_SYMBOL(blk_seg_merge_ok);
 EXPORT_SYMBOL(blk_nohighio);
+EXPORT_SYMBOL(blk_get_atomic_seq);
+EXPORT_SYMBOL(blk_refile_atomic_queue);
diff -urNp x-ref/drivers/md/md.c x/drivers/md/md.c
--- x-ref/drivers/md/md.c	2003-01-30 22:39:14.000000000 +0100
+++ x/drivers/md/md.c	2003-01-30 22:40:36.000000000 +0100
@@ -494,6 +494,7 @@ static int sync_page_io(kdev_t dev, unsi
 	bh.b_page = page;
 	bh.b_reqnext = NULL;
 	bh.b_data = page_address(page);
+	bh.b_elv_sequence = 0;
 	generic_make_request(rw, &bh);
 
 	run_task_queue(&tq_disk);
diff -urNp x-ref/drivers/md/raid1.c x/drivers/md/raid1.c
--- x-ref/drivers/md/raid1.c	2002-11-29 02:23:05.000000000 +0100
+++ x/drivers/md/raid1.c	2003-01-30 22:40:36.000000000 +0100
@@ -686,6 +686,7 @@ static int raid1_make_request (mddev_t *
 		mbh->b_list       = BUF_LOCKED;
 		mbh->b_end_io     = raid1_end_request;
 		mbh->b_private    = r1_bh;
+		mbh->b_elv_sequence = bh->b_elv_sequence;
 
 		mbh->b_next = r1_bh->mirror_bh_list;
 		r1_bh->mirror_bh_list = mbh;
@@ -1456,6 +1457,7 @@ static int raid1_sync_request (mddev_t *
 	bh->b_private = r1_bh;
 	bh->b_blocknr = sector_nr;
 	bh->b_rsector = sector_nr;
+	bh->b_elv_sequence = 0;
 	init_waitqueue_head(&bh->b_wait);
 
 	generic_make_request(READ, bh);
diff -urNp x-ref/drivers/md/raid5.c x/drivers/md/raid5.c
--- x-ref/drivers/md/raid5.c	2002-02-25 22:05:07.000000000 +0100
+++ x/drivers/md/raid5.c	2003-01-30 22:43:15.000000000 +0100
@@ -151,7 +151,7 @@ static void shrink_buffers(struct stripe
 			return;
 		sh->bh_cache[i] = NULL;
 		free_page((unsigned long) bh->b_data);
-		kfree(bh);
+		kmem_cache_free(bh_cachep, bh);
 	}
 }
 
@@ -162,7 +162,7 @@ static int grow_buffers(struct stripe_he
 		if (page)
 			bh->b_data = page_address(page);
 		else {
-			kfree(bh);
+			kmem_cache_free(bh_cachep, bh);
 			return 1;
 		}
 		atomic_set(&bh->b_count, 0);
@@ -474,6 +474,7 @@ static struct buffer_head *raid5_build_b
 	bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
 	bh->b_size = sh->size;
 	bh->b_list = BUF_LOCKED;
+	bh->b_elv_sequence = 0;
 
 	return bh;
 }
diff -urNp x-ref/fs/buffer.c x/fs/buffer.c
--- x-ref/fs/buffer.c	2003-01-30 22:39:30.000000000 +0100
+++ x/fs/buffer.c	2003-01-30 22:39:32.000000000 +0100
@@ -130,6 +130,7 @@ static inline void write_buffer(struct b
 
 void unlock_buffer(struct buffer_head *bh)
 {
+	bh_elv_seq(bh) = 0;
 	clear_bit(BH_Wait_IO, &bh->b_state);
 	clear_bit(BH_Launder, &bh->b_state);
 	/*
@@ -2278,6 +2279,7 @@ int brw_kiovec(int rw, int nr, struct ki
 	struct page * map;
 	struct buffer_head *tmp, **bhs = NULL;
 	int iosize = size;
+	unsigned int atomic_seq;
 
 	if (!nr)
 		return 0;
@@ -2294,6 +2296,8 @@ int brw_kiovec(int rw, int nr, struct ki
 			panic("brw_kiovec: iobuf not initialised");
 	}
 
+	atomic_seq = blk_get_atomic_seq();
+
 	/*
 	 * OK to walk down the iovec doing page IO on each page we find.
 	 */
@@ -2351,6 +2355,7 @@ int brw_kiovec(int rw, int nr, struct ki
 			tmp->b_dev = dev;
 			tmp->b_blocknr = blocknr;
 			tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
+			bh_elv_seq(tmp) = atomic_seq;
 
 			if (rw == WRITE) {
 				set_bit(BH_Uptodate, &tmp->b_state);
@@ -2368,12 +2373,14 @@ int brw_kiovec(int rw, int nr, struct ki
 			 * Wait for IO if we have got too much
 			 */
 			if (bhind >= KIO_MAX_SECTORS) {
+				blk_refile_atomic_queue(atomic_seq);
 				kiobuf_wait_for_io(iobuf); /* wake-one */
 				err = wait_kio(rw, bhind, bhs, size);
 				if (err >= 0)
 					transferred += err;
 				else
 					goto finished;
+				atomic_seq = blk_get_atomic_seq();
 				bhind = 0;
 			}
 
@@ -2392,12 +2399,11 @@ int brw_kiovec(int rw, int nr, struct ki
 
 	/* Is there any IO still left to submit? */
 	if (bhind) {
+		blk_refile_atomic_queue(atomic_seq);
 		kiobuf_wait_for_io(iobuf); /* wake-one */
 		err = wait_kio(rw, bhind, bhs, size);
 		if (err >= 0)
 			transferred += err;
-		else
-			goto finished;
 	}
 
 finished:
diff -urNp x-ref/include/linux/blkdev.h x/include/linux/blkdev.h
--- x-ref/include/linux/blkdev.h	2003-01-30 22:39:30.000000000 +0100
+++ x/include/linux/blkdev.h	2003-01-30 22:39:32.000000000 +0100
@@ -111,6 +111,7 @@ struct request_queue
 	 * Together with queue_head for cacheline sharing
 	 */
 	struct list_head	queue_head;
+	struct list_head	atomic_head;
 	elevator_t		elevator;
 
 	request_fn_proc		* request_fn;
@@ -129,6 +130,7 @@ struct request_queue
 	 * This is used to remove the plug when tq_disk runs.
 	 */
 	struct tq_struct	plug_tq;
+	struct list_head	atomic_entry;
 
 	/*
 	 * Boolean that indicates whether this queue is plugged or not.
@@ -176,6 +178,14 @@ extern unsigned long blk_max_low_pfn, bl
 #define BLK_BOUNCE_HIGH		(blk_max_low_pfn << PAGE_SHIFT)
 #define BLK_BOUNCE_ANY		(blk_max_pfn << PAGE_SHIFT)
 
+/*
+ * max guaranteed atomic I/O size while dealing with bounce buffers.
+ * highmemio capable devices (pci64 in particular) can go well beyond
+ * this limit. Must be a multiple of 512bytes obviously.
+ */
+#define BLK_ATOMIC_BOUNCE_SIZE		32768
+#define BLK_ATOMIC_BOUNCE_ENTRIES	(BLK_ATOMIC_BOUNCE_SIZE >> 9)
+
 extern void blk_queue_bounce_limit(request_queue_t *, u64);
 
 #ifdef CONFIG_HIGHMEM
@@ -233,6 +243,13 @@ extern void generic_make_request(int rw,
 extern inline request_queue_t *blk_get_queue(kdev_t dev);
 extern void blkdev_release_request(struct request *);
 
+extern spinlock_cacheline_t blk_atomic_lock_cacheline;
+#define blk_atomic_lock (blk_atomic_lock_cacheline.lock)
+extern unsigned int blk_get_atomic_seq(void);
+extern spinlock_cacheline_t blk_atomic_queue_lock_cacheline;
+#define blk_atomic_queue_lock (blk_atomic_queue_lock_cacheline.lock)
+extern void FASTCALL(blk_refile_atomic_queue(int sequence));
+
 /*
  * Access functions for manipulating queue properties
  */
diff -urNp x-ref/include/linux/fs.h x/include/linux/fs.h
--- x-ref/include/linux/fs.h	2003-01-30 22:39:30.000000000 +0100
+++ x/include/linux/fs.h	2003-01-30 22:39:32.000000000 +0100
@@ -270,6 +270,7 @@ struct buffer_head {
 	void *b_private;		/* reserved for b_end_io */
 
 	unsigned long b_rsector;	/* Real buffer location on disk */
+	int b_elv_sequence;		/* for atomic blocks */
 	wait_queue_head_t b_wait;
 
 	struct list_head b_inode_buffers;	/* doubly linked list of inode dirty buffers */
@@ -296,6 +297,7 @@ extern void set_bh_page(struct buffer_he
 
 #define touch_buffer(bh)	mark_page_accessed(bh->b_page)
 
+#define bh_elv_seq(bh)		(bh)->b_elv_sequence
 
 #include <linux/pipe_fs_i.h>
 #include <linux/minix_fs_i.h>
diff -urNp x-ref/mm/highmem.c x/mm/highmem.c
--- x-ref/mm/highmem.c	2003-01-30 22:39:20.000000000 +0100
+++ x/mm/highmem.c	2003-01-30 22:39:32.000000000 +0100
@@ -22,6 +22,7 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/blkdev.h>
 #include <linux/swap.h>
 
 /*
@@ -205,6 +206,14 @@ static LIST_HEAD(emergency_pages);
 int nr_emergency_bhs;
 static LIST_HEAD(emergency_bhs);
 
+int nr_atomic_emergency_pages;
+static LIST_HEAD(atomic_emergency_pages);
+
+int nr_atomic_emergency_bhs;
+static LIST_HEAD(atomic_emergency_bhs);
+
+int atomic_emergency_owner;
+
 /*
  * Simple bounce buffer support for highmem pages.
  * This will be moved to the block layer in 2.5.
@@ -244,35 +253,66 @@ static inline void bounce_end_io (struct
 	struct page *page;
 	struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private);
 	unsigned long flags;
+	int atomic = bh_elv_seq(bh);
 
 	bh_orig->b_end_io(bh_orig, uptodate);
 
 	page = bh->b_page;
 
 	spin_lock_irqsave(&emergency_lock, flags);
-	if (nr_emergency_pages >= POOL_SIZE)
-		__free_page(page);
-	else {
-		/*
-		 * We are abusing page->list to manage
-		 * the highmem emergency pool:
-		 */
-		list_add(&page->list, &emergency_pages);
-		nr_emergency_pages++;
-	}
-
-	if (nr_emergency_bhs >= POOL_SIZE) {
+	if (!atomic) {
+		if (nr_emergency_pages >= POOL_SIZE)
+			__free_page(page);
+		else {
+			/*
+			 * We are abusing page->list to manage
+			 * the highmem emergency pool:
+			 */
+			list_add(&page->list, &emergency_pages);
+			nr_emergency_pages++;
+		}
+
+		if (nr_emergency_bhs >= POOL_SIZE) {
 #ifdef HIGHMEM_DEBUG
-		/* Don't clobber the constructed slab cache */
-		init_waitqueue_head(&bh->b_wait);
+			/* Don't clobber the constructed slab cache */
+			init_waitqueue_head(&bh->b_wait);
 #endif
-		kmem_cache_free(bh_cachep, bh);
+			kmem_cache_free(bh_cachep, bh);
+		} else {
+			/*
+			 * Ditto in the bh case, here we abuse b_inode_buffers:
+			 */
+			list_add(&bh->b_inode_buffers, &emergency_bhs);
+			nr_emergency_bhs++;
+		}
 	} else {
-		/*
-		 * Ditto in the bh case, here we abuse b_inode_buffers:
-		 */
-		list_add(&bh->b_inode_buffers, &emergency_bhs);
-		nr_emergency_bhs++;
+		if (nr_atomic_emergency_pages >= BLK_ATOMIC_BOUNCE_ENTRIES)
+			__free_page(page);
+		else {
+			/*
+			 * We are abusing page->list to manage
+			 * the highmem emergency pool:
+			 */
+			list_add(&page->list, &atomic_emergency_pages);
+			nr_atomic_emergency_pages++;
+		}
+
+		if (nr_atomic_emergency_bhs >= BLK_ATOMIC_BOUNCE_ENTRIES) {
+#ifdef HIGHMEM_DEBUG
+			/* Don't clobber the constructed slab cache */
+			init_waitqueue_head(&bh->b_wait);
+#endif
+			kmem_cache_free(bh_cachep, bh);
+		} else {
+			/*
+			 * Ditto in the bh case, here we abuse b_inode_buffers:
+			 */
+			list_add(&bh->b_inode_buffers, &atomic_emergency_bhs);
+			nr_atomic_emergency_bhs++;
+		}
+		BUG_ON(nr_atomic_emergency_pages != nr_atomic_emergency_bhs);
+		if (nr_atomic_emergency_pages >= BLK_ATOMIC_BOUNCE_ENTRIES)
+			atomic_emergency_owner = 0;
 	}
 	spin_unlock_irqrestore(&emergency_lock, flags);
 }
@@ -305,6 +345,24 @@ static __init int init_emergency_pool(vo
 		list_add(&bh->b_inode_buffers, &emergency_bhs);
 		nr_emergency_bhs++;
 	}
+	while (nr_atomic_emergency_pages < BLK_ATOMIC_BOUNCE_ENTRIES) {
+		struct page * page = alloc_page(GFP_ATOMIC);
+		if (!page) {
+			printk("couldn't refill highmem emergency pages");
+			break;
+		}
+		list_add(&page->list, &atomic_emergency_pages);
+		nr_atomic_emergency_pages++;
+	}
+	while (nr_atomic_emergency_bhs < BLK_ATOMIC_BOUNCE_ENTRIES) {
+		struct buffer_head * bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
+		if (!bh) {
+			printk("couldn't refill highmem emergency bhs");
+			break;
+		}
+		list_add(&bh->b_inode_buffers, &atomic_emergency_bhs);
+		nr_atomic_emergency_bhs++;
+	}
 	spin_unlock_irq(&emergency_lock);
 	printk("allocated %d pages and %d bhs reserved for the highmem bounces\n",
 	       nr_emergency_pages, nr_emergency_bhs);
@@ -328,7 +386,7 @@ static void bounce_end_io_read (struct b
 	bounce_end_io(bh, uptodate);
 }
 
-struct page *alloc_bounce_page (void)
+struct page *alloc_bounce_page (int atomic)
 {
 	struct list_head *tmp;
 	struct page *page;
@@ -346,17 +404,30 @@ repeat_alloc:
 	/*
 	 * Try to allocate from the emergency pool.
 	 */
-	tmp = &emergency_pages;
 	spin_lock_irq(&emergency_lock);
-	if (!list_empty(tmp)) {
-		page = list_entry(tmp->next, struct page, list);
-		list_del(tmp->next);
-		nr_emergency_pages--;
+	if (!atomic) {
+		tmp = &emergency_pages;
+		if (!list_empty(tmp)) {
+			page = list_entry(tmp->next, struct page, list);
+			list_del(tmp->next);
+			nr_emergency_pages--;
+		}
+	} else {
+		tmp = &atomic_emergency_pages;
+		if ((!atomic_emergency_owner || atomic_emergency_owner == atomic) &&
+		    !list_empty(tmp)) {
+			page = list_entry(tmp->next, struct page, list);
+			list_del(tmp->next);
+			nr_atomic_emergency_pages--;
+			atomic_emergency_owner = atomic;
+		}
 	}
 	spin_unlock_irq(&emergency_lock);
 
 	if (page)
 		return page;
+	if (atomic)
+		blk_refile_atomic_queue(atomic);
 
 	/* we need to wait I/O completion */
 	run_task_queue(&tq_disk);
@@ -364,7 +435,7 @@ repeat_alloc:
 	goto repeat_alloc;
 }
 
-struct buffer_head *alloc_bounce_bh (void)
+struct buffer_head *alloc_bounce_bh (int atomic)
 {
 	struct list_head *tmp;
 	struct buffer_head *bh;
@@ -382,17 +453,31 @@ repeat_alloc:
 	/*
 	 * Try to allocate from the emergency pool.
 	 */
-	tmp = &emergency_bhs;
 	spin_lock_irq(&emergency_lock);
-	if (!list_empty(tmp)) {
-		bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers);
-		list_del(tmp->next);
-		nr_emergency_bhs--;
+	if (!atomic) {
+		tmp = &emergency_bhs;
+		if (!list_empty(tmp)) {
+			bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers);
+			list_del(tmp->next);
+			nr_emergency_bhs--;
+		}
+	} else {
+		tmp = &atomic_emergency_bhs;
+		if ((!atomic_emergency_owner || atomic_emergency_owner == atomic) &&
+		    !list_empty(tmp)) {
+			bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers);
+			list_del(tmp->next);
+			nr_atomic_emergency_bhs--;
+			atomic_emergency_owner = atomic;
+		}
 	}
 	spin_unlock_irq(&emergency_lock);
 
 	if (bh)
 		return bh;
+	if (atomic)
+		blk_refile_atomic_queue(atomic);
 
 	/* we need to wait I/O completion */
 	run_task_queue(&tq_disk);
@@ -408,14 +493,14 @@ struct buffer_head * create_bounce(int r
 	if (!PageHighMem(bh_orig->b_page))
 		return bh_orig;
 
-	bh = alloc_bounce_bh();
+	bh = alloc_bounce_bh(bh_elv_seq(bh_orig));
 	/*
 	 * This is wasteful for 1k buffers, but this is a stopgap measure
 	 * and we are being ineffective anyway. This approach simplifies
 	 * things immensly. On boxes with more than 4GB RAM this should
 	 * not be an issue anyway.
 	 */
-	page = alloc_bounce_page();
+	page = alloc_bounce_page(bh_elv_seq(bh_orig));
 
 	set_bh_page(bh, page, 0);
 
@@ -443,6 +528,7 @@ struct buffer_head * create_bounce(int r
 		bh->b_end_io = bounce_end_io_read;
 	bh->b_private = (void *)bh_orig;
 	bh->b_rsector = bh_orig->b_rsector;
+	bh_elv_seq(bh) = bh_elv_seq(bh_orig);
#ifdef HIGHMEM_DEBUG
 	memset(&bh->b_wait, -1, sizeof(bh->b_wait));
 #endif
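
For reference, a minimal usage sketch (not part of the patch) of the interface added above, modelled on the brw_kiovec() changes: the submitter tags every buffer_head of an atomic block with one sequence from blk_get_atomic_seq(), submits them all, and only then calls blk_refile_atomic_queue() so the whole batch moves from q->atomic_head onto the real request queue in one go. The function name and calling convention below are hypothetical; the buffer_heads are assumed to be already mapped and locked as submit_bh() requires.

/*
 * Hypothetical example, not part of the patch: write one atomic block
 * that spans several already-mapped, already-locked buffer_heads.
 */
static void example_write_atomic_block(struct buffer_head **bhs, int nr)
{
	unsigned int seq = blk_get_atomic_seq();	/* never zero */
	int i;

	/* only requests carrying the same tag are allowed to merge */
	for (i = 0; i < nr; i++) {
		bh_elv_seq(bhs[i]) = seq;
		submit_bh(WRITE, bhs[i]);	/* parked on q->atomic_head */
	}

	/* move the complete batch onto the real request queue in one go */
	blk_refile_atomic_queue(seq);

	for (i = 0; i < nr; i++)
		wait_on_buffer(bhs[i]);		/* unlock_buffer() clears the tag */
}

Until the refile runs, the tagged requests sit on q->atomic_head, where the elevator merges only requests with the same sequence, so the device never starts on a partial atomic block.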