diff -urN 2.4.4/fs/buffer.c o_direct/fs/buffer.c
--- 2.4.4/fs/buffer.c	Sat Apr 28 05:24:43 2001
+++ o_direct/fs/buffer.c	Mon Apr 30 17:36:47 2001
@@ -582,6 +582,16 @@
 	spin_unlock(&lru_list_lock);
 }
 
+void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
+{
+	spin_lock(&lru_list_lock);
+	if (bh->b_inode)
+		list_del(&bh->b_inode_buffers);
+	bh->b_inode = inode;
+	list_add(&bh->b_inode_buffers, &inode->i_dirty_data_buffers);
+	spin_unlock(&lru_list_lock);
+}
+
 /* The caller must have the lru_list lock before calling the
    remove_inode_queue functions.  */
 static void __remove_inode_queue(struct buffer_head *bh)
@@ -601,7 +611,7 @@
 	int ret;
 
 	spin_lock(&lru_list_lock);
-	ret = !list_empty(&inode->i_dirty_buffers);
+	ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
 	spin_unlock(&lru_list_lock);
 
 	return ret;
@@ -836,6 +846,113 @@
 	bh->b_end_io = end_buffer_io_async ;
 }
 
+int osync_inode_data_buffers(struct inode *inode)
+{
+	struct buffer_head *bh;
+	struct list_head *list;
+	int err = 0;
+
+	spin_lock(&lru_list_lock);
+
+ repeat:
+
+	for (list = inode->i_dirty_data_buffers.prev;
+	     bh = BH_ENTRY(list), list != &inode->i_dirty_data_buffers;
+	     list = bh->b_inode_buffers.prev) {
+		if (buffer_locked(bh)) {
+			atomic_inc(&bh->b_count);
+			spin_unlock(&lru_list_lock);
+			wait_on_buffer(bh);
+			if (!buffer_uptodate(bh))
+				err = -EIO;
+			brelse(bh);
+			spin_lock(&lru_list_lock);
+			goto repeat;
+		}
+	}
+
+	spin_unlock(&lru_list_lock);
+	return err;
+}
+
+/*
+ * osync is designed to support O_SYNC io.  It waits synchronously for
+ * all already-submitted IO to complete, but does not queue any new
+ * writes to the disk.
+ *
+ * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
+ * you dirty the buffers, and then use osync_inode_buffers to wait for
+ * completion.  Any other dirty buffers which are not yet queued for
+ * write will not be flushed to disk by the osync.
+ */
+
+int osync_inode_buffers(struct inode *inode)
+{
+	struct buffer_head *bh;
+	struct list_head *list;
+	int err = 0;
+
+	spin_lock(&lru_list_lock);
+
+ repeat:
+
+	for (list = inode->i_dirty_buffers.prev;
+	     bh = BH_ENTRY(list), list != &inode->i_dirty_buffers;
+	     list = bh->b_inode_buffers.prev) {
+		if (buffer_locked(bh)) {
+			atomic_inc(&bh->b_count);
+			spin_unlock(&lru_list_lock);
+			wait_on_buffer(bh);
+			if (!buffer_uptodate(bh))
+				err = -EIO;
+			brelse(bh);
+			spin_lock(&lru_list_lock);
+			goto repeat;
+		}
+	}
+
+	spin_unlock(&lru_list_lock);
+	return err;
+}
+
+int fsync_inode_data_buffers(struct inode *inode)
+{
+	struct buffer_head *bh;
+	struct inode tmp;
+	int err = 0;
+
+	INIT_LIST_HEAD(&tmp.i_dirty_data_buffers);
+
+	spin_lock(&lru_list_lock);
+
+	while (!list_empty(&inode->i_dirty_data_buffers)) {
+		bh = BH_ENTRY(inode->i_dirty_data_buffers.next);
+		list_del(&bh->b_inode_buffers);
+		if (!buffer_dirty(bh) && !buffer_locked(bh))
+			bh->b_inode = NULL;
+		else {
+			bh->b_inode = &tmp;
+			list_add(&bh->b_inode_buffers, &tmp.i_dirty_data_buffers);
+			if (buffer_dirty(bh)) {
+				atomic_inc(&bh->b_count);
+				spin_unlock(&lru_list_lock);
+				ll_rw_block(WRITE, 1, &bh);
+				brelse(bh);
+				spin_lock(&lru_list_lock);
+			}
+		}
+	}
+
+	while (!list_empty(&tmp.i_dirty_data_buffers)) {
+		bh = BH_ENTRY(tmp.i_dirty_data_buffers.prev);
+		remove_inode_queue(bh);
+		atomic_inc(&bh->b_count);
+		spin_unlock(&lru_list_lock);
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh))
+			err = -EIO;
+		brelse(bh);
+		spin_lock(&lru_list_lock);
+	}
+	spin_unlock(&lru_list_lock);
+
+	return err;
+}
+
 /*
  * Synchronise all the inode's dirty buffers to the disk.
  *
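
Illustration, not part of the patch: the O_SYNC protocol that the osync comment above describes boils down to queueing the write yourself and then waiting. A minimal sketch against the primitives added here; the name example_sync_write is hypothetical, and the buffer is assumed mapped, up to date and already marked dirty:

	/* Sketch: write one dirty data buffer synchronously. */
	static int example_sync_write(struct inode *inode, struct buffer_head *bh)
	{
		buffer_insert_inode_data_queue(bh, inode); /* track it per inode */
		ll_rw_block(WRITE, 1, &bh);                /* queue the IO now */
		return osync_inode_data_buffers(inode);    /* wait, queue nothing new */
	}
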
@@ -860,7 +977,7 @@
 {
 	struct buffer_head *bh;
 	struct inode tmp;
-	int err = 0, err2;
+	int err = 0;
 
 	INIT_LIST_HEAD(&tmp.i_dirty_buffers);
 
@@ -895,58 +1012,11 @@
 		brelse(bh);
 		spin_lock(&lru_list_lock);
 	}
-	spin_unlock(&lru_list_lock);
-	err2 = osync_inode_buffers(inode);
-
-	if (err)
-		return err;
-	else
-		return err2;
-}
-
-/*
- * osync is designed to support O_SYNC io.  It waits synchronously for
- * all already-submitted IO to complete, but does not queue any new
- * writes to the disk.
- *
- * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
- * you dirty the buffers, and then use osync_inode_buffers to wait for
- * completion.  Any other dirty buffers which are not yet queued for
- * write will not be flushed to disk by the osync.
- */
-
-int osync_inode_buffers(struct inode *inode)
-{
-	struct buffer_head *bh;
-	struct list_head *list;
-	int err = 0;
-
-	spin_lock(&lru_list_lock);
-
- repeat:
-
-	for (list = inode->i_dirty_buffers.prev;
-	     bh = BH_ENTRY(list), list != &inode->i_dirty_buffers;
-	     list = bh->b_inode_buffers.prev) {
-		if (buffer_locked(bh)) {
-			atomic_inc(&bh->b_count);
-			spin_unlock(&lru_list_lock);
-			wait_on_buffer(bh);
-			if (!buffer_uptodate(bh))
-				err = -EIO;
-			brelse(bh);
-			spin_lock(&lru_list_lock);
-			goto repeat;
-		}
-	}
-
 	spin_unlock(&lru_list_lock);
 	return err;
 }
 
-
 /*
  * Invalidate any and all dirty buffers on a given inode. We are
  * probably unmounting the fs, but that doesn't mean we have already
@@ -954,15 +1024,13 @@
  */
 void invalidate_inode_buffers(struct inode *inode)
 {
-	struct list_head *list, *next;
+	struct list_head * entry;
 
 	spin_lock(&lru_list_lock);
-	list = inode->i_dirty_buffers.next;
-	while (list != &inode->i_dirty_buffers) {
-		next = list->next;
-		remove_inode_queue(BH_ENTRY(list));
-		list = next;
-	}
+	while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
+		remove_inode_queue(BH_ENTRY(entry));
+	while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
+		remove_inode_queue(BH_ENTRY(entry));
 	spin_unlock(&lru_list_lock);
 }
 
@@ -1156,8 +1224,8 @@
 	if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf) || buffer_protected(buf))
 		goto in_use;
 	__hash_unlink(buf);
-	remove_inode_queue(buf);
 	write_unlock(&hash_table_lock);
+	remove_inode_queue(buf);
 	__remove_from_lru_list(buf, buf->b_list);
 	spin_unlock(&lru_list_lock);
 	put_last_free(buf);
@@ -1381,7 +1449,7 @@
  * we have truncated the file and are going to free the
  * blocks on-disk..
  */
-int block_flushpage(struct page *page, unsigned long offset)
+int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
 {
 	struct buffer_head *head, *bh, *next;
 	unsigned int curr_off = 0;
@@ -1418,7 +1486,8 @@
 	 */
 	if (!offset) {
 		if (!try_to_free_buffers(page, 0)) {
-			atomic_inc(&buffermem_pages);
+			if (drop_pagecache)
+				atomic_inc(&buffermem_pages);
 			return 0;
 		}
 	}
@@ -1648,7 +1717,7 @@
 			set_bit(BH_Uptodate, &bh->b_state);
 			if (!atomic_set_buffer_dirty(bh)) {
 				__mark_dirty(bh);
-				buffer_insert_inode_queue(bh, inode);
+				buffer_insert_inode_data_queue(bh, inode);
 				need_balance_dirty = 1;
 			}
 		}
@@ -1977,6 +2046,47 @@
 	return tmp.b_blocknr;
 }
 
+int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
+{
+	int i, nr_blocks, retval;
+	unsigned long * blocks = iobuf->blocks;
+
+	nr_blocks = iobuf->length / blocksize;
+	/* build the blocklist */
+	for (i = 0; i < nr_blocks; i++, blocknr++) {
+		struct buffer_head bh;
+
+		bh.b_state = 0;
+		bh.b_dev = inode->i_dev;
+		bh.b_size = blocksize;
+
+		retval = get_block(inode, blocknr, &bh, rw == READ ? 0 : 1);
+		if (retval)
+			goto out;
+
+		if (rw == READ) {
+			if (buffer_new(&bh))
+				BUG();
+			if (!buffer_mapped(&bh)) {
+				/* there was a hole in the filesystem */
+				blocks[i] = -1UL;
+				continue;
+			}
+		} else {
+			if (buffer_new(&bh))
+				unmap_underlying_metadata(&bh);
+			if (!buffer_mapped(&bh))
+				BUG();
+		}
+		blocks[i] = bh.b_blocknr;
+	}
+
+	retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
+
+ out:
+	return retval;
+}
+
 /*
  * IO completion routine for a buffer_head being used for kiobuf IO: we
  * can't dispatch the kiobuf callback until io_count reaches 0.
@@ -2092,6 +2202,18 @@
 
 	while (length > 0) {
 		blocknr = b[bufind++];
+		if (blocknr == -1UL) {
+			if (rw == READ) {
+				/* there was a hole in the filesystem */
+				memset(kmap(map) + offset, 0, size);
+				flush_dcache_page(map);
+				kunmap(map);
+
+				transferred += size;
+				goto skip_block;
+			} else
+				BUG();
+		}
 		tmp = bhs[bhind++];
 
 		tmp->b_dev = B_FREE;
@@ -2110,9 +2232,6 @@
 		} else
 			set_bit(BH_Uptodate, &tmp->b_state);
 
-		length -= size;
-		offset += size;
-
 		atomic_inc(&iobuf->io_count);
 
 		submit_bh(rw, tmp);
@@ -2128,7 +2247,11 @@
 				goto finished;
 			bhind = 0;
 		}
-
+
+	skip_block:
+		length -= size;
+		offset += size;
+
 		if (offset >= PAGE_SIZE) {
 			offset = 0;
 			break;
diff -urN 2.4.4/fs/ext2/fsync.c o_direct/fs/ext2/fsync.c
--- 2.4.4/fs/ext2/fsync.c	Thu Dec 14 22:34:11 2000
+++ o_direct/fs/ext2/fsync.c	Mon Apr 30 17:36:47 2001
@@ -44,6 +44,7 @@
 	int err;
 
 	err  = fsync_inode_buffers(inode);
+	err |= fsync_inode_data_buffers(inode);
 	if (!(inode->i_state & I_DIRTY))
 		return err;
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
diff -urN 2.4.4/fs/ext2/inode.c o_direct/fs/ext2/inode.c
--- 2.4.4/fs/ext2/inode.c	Sat Apr 28 05:24:43 2001
+++ o_direct/fs/ext2/inode.c	Mon Apr 30 17:36:47 2001
@@ -666,13 +666,18 @@
 {
 	return generic_block_bmap(mapping,block,ext2_get_block);
 }
+static int ext2_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize)
+{
+	return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, ext2_get_block);
+}
 struct address_space_operations ext2_aops = {
 	readpage: ext2_readpage,
 	writepage: ext2_writepage,
 	sync_page: block_sync_page,
 	prepare_write: ext2_prepare_write,
 	commit_write: generic_commit_write,
-	bmap: ext2_bmap
+	bmap: ext2_bmap,
+	direct_IO: ext2_direct_IO,
 };
 
 /*
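
Illustration, not part of the patch: any other get_block-based filesystem can be wired up to O_DIRECT exactly like ext2 above. A sketch for a hypothetical "myfs" with its own myfs_get_block:

	static int myfs_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf,
				  unsigned long blocknr, int blocksize)
	{
		/* generic_direct_IO() does all the work given a get_block callback */
		return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize,
					 myfs_get_block);
	}

plus a "direct_IO: myfs_direct_IO," initializer in its address_space_operations.
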
diff -urN 2.4.4/fs/fcntl.c o_direct/fs/fcntl.c
--- 2.4.4/fs/fcntl.c	Tue Nov 28 18:40:01 2000
+++ o_direct/fs/fcntl.c	Mon Apr 30 17:37:03 2001
@@ -10,6 +10,7 @@
 #include <linux/mm.h>
 #include <linux/file.h>
 #include <linux/smp_lock.h>
+#include <linux/iobuf.h>
 
 #include <asm/poll.h>
 #include <asm/siginfo.h>
@@ -194,7 +195,7 @@
 	return ret;
 }
 
-#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC)
+#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT)
 
 static int setfl(int fd, struct file * filp, unsigned long arg)
 {
@@ -215,6 +216,25 @@
 			if (error < 0)
 				return error;
 		}
+	}
+
+	if (arg & O_DIRECT) {
+		/*
+		 * alloc_kiovec() can sleep and we are only serialized by
+		 * the big kernel lock here, so abuse the i_sem to serialize
+		 * this case too.  Of course we wouldn't need to go all the
+		 * way down to the inode layer, we could stay at the file
+		 * layer, but we don't want to pay for the memory of a
+		 * semaphore in each file structure as well, so we reuse
+		 * the inode semaphore that we pay for anyway.
+		 */
+		error = 0;
+		down(&inode->i_sem);
+		if (!filp->f_iobuf)
+			error = alloc_kiovec(1, &filp->f_iobuf);
+		up(&inode->i_sem);
+		if (error < 0)
+			return error;
 	}
 
 	/* required for strict SunOS emulation */
diff -urN 2.4.4/fs/file_table.c o_direct/fs/file_table.c
--- 2.4.4/fs/file_table.c	Sat Apr 28 05:24:43 2001
+++ o_direct/fs/file_table.c	Mon Apr 30 17:36:47 2001
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/smp_lock.h>
+#include <linux/iobuf.h>
 
 /* sysctl tunables... */
 struct files_stat_struct files_stat = {0, 0, NR_FILE};
@@ -104,6 +105,10 @@
 
 	if (atomic_dec_and_test(&file->f_count)) {
 		locks_remove_flock(file);
+
+		if (file->f_iobuf)
+			free_kiovec(1, &file->f_iobuf);
+
 		if (file->f_op && file->f_op->release)
 			file->f_op->release(inode, file);
 		fops_put(file->f_op);
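
Illustration, not part of the patch: with O_DIRECT now in SETFL_MASK, userspace can switch an already-open descriptor to direct IO, which triggers the kiobuf preallocation above (and the kiobuf is freed on last close by the file_table.c hunk). A sketch; glibc needs _GNU_SOURCE to expose O_DIRECT:

	#define _GNU_SOURCE
	#include <fcntl.h>

	int set_odirect(int fd)
	{
		int flags = fcntl(fd, F_GETFL);
		if (flags < 0)
			return -1;
		/* setfl() allocates filp->f_iobuf on this transition */
		return fcntl(fd, F_SETFL, flags | O_DIRECT);
	}
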
diff -urN 2.4.4/fs/inode.c o_direct/fs/inode.c
--- 2.4.4/fs/inode.c	Sat Apr 28 05:24:43 2001
+++ o_direct/fs/inode.c	Mon Apr 30 17:36:47 2001
@@ -77,7 +77,7 @@
 	((struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL))
 static void destroy_inode(struct inode *inode)
 {
-	if (!list_empty(&inode->i_dirty_buffers))
+	if (inode_has_buffers(inode))
 		BUG();
 	kmem_cache_free(inode_cachep, (inode));
 }
@@ -103,6 +103,7 @@
 		INIT_LIST_HEAD(&inode->i_data.locked_pages);
 		INIT_LIST_HEAD(&inode->i_dentry);
 		INIT_LIST_HEAD(&inode->i_dirty_buffers);
+		INIT_LIST_HEAD(&inode->i_dirty_data_buffers);
 		sema_init(&inode->i_sem, 1);
 		sema_init(&inode->i_zombie, 1);
 		spin_lock_init(&inode->i_data.i_shared_lock);
@@ -409,6 +410,8 @@
 		while (inode->i_state & I_DIRTY)
 			sync_one(inode, sync);
 		spin_unlock(&inode_lock);
+		if (sync)
+			wait_on_inode(inode);
 	}
 	else
 		printk("write_inode_now: no super block\n");
@@ -423,9 +426,9 @@
  *	O_SYNC flag set, to flush dirty writes to disk.
  */
 
-int generic_osync_inode(struct inode *inode, int datasync)
+int generic_osync_inode(struct inode *inode, int what)
 {
-	int err;
+	int err = 0, err2 = 0, need_write_inode_now = 0;
 
 	/*
 	 * WARNING
@@ -448,23 +451,24 @@
 	 * every O_SYNC write, not just the synchronous I/Os.  --sct
 	 */
 
-#ifdef WRITERS_QUEUE_IO
-	err = osync_inode_buffers(inode);
-#else
-	err = fsync_inode_buffers(inode);
-#endif
+	if (what & OSYNC_METADATA)
+		err = fsync_inode_buffers(inode);
+	if (what & OSYNC_DATA)
+		err2 = fsync_inode_data_buffers(inode);
+	if (!err)
+		err = err2;
 
 	spin_lock(&inode_lock);
-	if (!(inode->i_state & I_DIRTY))
-		goto out;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		goto out;
+	if ((inode->i_state & I_DIRTY) &&
+	    ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
+		need_write_inode_now = 1;
 	spin_unlock(&inode_lock);
 
-	write_inode_now(inode, 1);
-	return err;
-
- out:
-	spin_unlock(&inode_lock);
+	if (need_write_inode_now)
+		write_inode_now(inode, 1);
+	else
+		wait_on_inode(inode);
+
 	return err;
 }
 
@@ -479,8 +483,7 @@
 
 void clear_inode(struct inode *inode)
 {
-	if (!list_empty(&inode->i_dirty_buffers))
-		invalidate_inode_buffers(inode);
+	invalidate_inode_buffers(inode);
 
 	if (inode->i_data.nrpages)
 		BUG();
diff -urN 2.4.4/fs/open.c o_direct/fs/open.c
--- 2.4.4/fs/open.c	Thu Feb 22 03:45:10 2001
+++ o_direct/fs/open.c	Mon Apr 30 17:36:47 2001
@@ -14,6 +14,7 @@
 #include <linux/quotaops.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/iobuf.h>
 
 #include <asm/uaccess.h>
 
@@ -662,6 +663,15 @@
 			goto cleanup_all;
 	}
 	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+
+	/* preallocate kiobuf for O_DIRECT */
+	f->f_iobuf = NULL;
+	f->f_iobuf_lock = 0;
+	if (f->f_flags & O_DIRECT) {
+		error = alloc_kiovec(1, &f->f_iobuf);
+		if (error)
+			goto cleanup_all;
+	}
 
 	return f;
diff -urN 2.4.4/fs/reiserfs/file.c o_direct/fs/reiserfs/file.c
--- 2.4.4/fs/reiserfs/file.c	Sat Apr 28 05:24:44 2001
+++ o_direct/fs/reiserfs/file.c	Mon Apr 30 17:36:47 2001
@@ -84,7 +84,7 @@
 ) {
   struct inode * p_s_inode = p_s_dentry->d_inode;
   struct reiserfs_transaction_handle th ;
-  int n_err = 0;
+  int n_err;
   int windex ;
   int jbegin_count = 1 ;
 
@@ -94,6 +94,7 @@
     BUG ();
 
   n_err = fsync_inode_buffers(p_s_inode) ;
+  n_err |= fsync_inode_data_buffers(p_s_inode);
   /* commit the current transaction to flush any metadata
   ** changes.  sys_fsync takes care of flushing the dirty pages for us
   */
diff -urN 2.4.4/include/asm-i386/fcntl.h o_direct/include/asm-i386/fcntl.h
--- 2.4.4/include/asm-i386/fcntl.h	Thu Nov 16 15:37:33 2000
+++ o_direct/include/asm-i386/fcntl.h	Mon Apr 30 17:36:47 2001
@@ -16,7 +16,7 @@
 #define O_NDELAY	O_NONBLOCK
 #define O_SYNC		010000
 #define FASYNC		020000	/* fcntl, for BSD compatibility */
-#define O_DIRECT	040000	/* direct disk access hint - currently ignored */
+#define O_DIRECT	040000	/* direct disk access hint */
 #define O_LARGEFILE	0100000
 #define O_DIRECTORY	0200000	/* must be a directory */
 #define O_NOFOLLOW	0400000 /* don't follow links */
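
Illustration, not part of the patch: the new what argument lets each caller of generic_osync_inode() say which classes of dirty state must reach disk. A hypothetical sketch of fsync-like versus fdatasync-like use:

	/* Sketch: fdatasync-like callers drop OSYNC_INODE, so an inode that
	 * is only timestamp-dirty (no I_DIRTY_DATASYNC) is not forced out. */
	static int example_osync(struct inode *inode, int datasync)
	{
		int what = OSYNC_DATA | OSYNC_METADATA;

		if (!datasync)
			what |= OSYNC_INODE;
		return generic_osync_inode(inode, what);
	}
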
diff -urN 2.4.4/include/linux/fs.h o_direct/include/linux/fs.h
--- 2.4.4/include/linux/fs.h	Sat Apr 28 05:24:47 2001
+++ o_direct/include/linux/fs.h	Mon Apr 30 17:36:47 2001
@@ -354,6 +354,7 @@
  */
 struct page;
 struct address_space;
+struct kiobuf;
 
 struct address_space_operations {
 	int (*writepage)(struct page *);
@@ -363,6 +364,7 @@
 	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	int (*bmap)(struct address_space *, long);
+	int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
 };
 
 struct address_space {
@@ -394,6 +396,7 @@
 
 	struct list_head	i_dentry;
 	struct list_head	i_dirty_buffers;
+	struct list_head	i_dirty_data_buffers;
 
 	unsigned long		i_ino;
 	atomic_t		i_count;
@@ -489,6 +492,10 @@
 
 	/* needed for tty driver, and maybe others */
 	void			*private_data;
+
+	/* preallocated helper kiobuf to speedup O_DIRECT */
+	struct kiobuf		*f_iobuf;
+	long			f_iobuf_lock;
 };
 extern spinlock_t files_lock;
 #define file_list_lock() spin_lock(&files_lock);
@@ -1087,6 +1094,7 @@
 extern int check_disk_change(kdev_t);
 extern int invalidate_inodes(struct super_block *);
 extern void invalidate_inode_pages(struct inode *);
+extern void invalidate_inode_pages2(struct address_space *);
 extern void invalidate_inode_buffers(struct inode *);
 #define invalidate_buffers(dev)	__invalidate_buffers((dev), 0)
 #define destroy_buffers(dev)	__invalidate_buffers((dev), 1)
@@ -1098,8 +1106,10 @@
 extern int fsync_dev(kdev_t);
 extern int fsync_super(struct super_block *);
 extern void sync_inodes_sb(struct super_block *);
-extern int fsync_inode_buffers(struct inode *);
 extern int osync_inode_buffers(struct inode *);
+extern int osync_inode_data_buffers(struct inode *);
+extern int fsync_inode_buffers(struct inode *);
+extern int fsync_inode_data_buffers(struct inode *);
 extern int inode_has_buffers(struct inode *);
 extern void filemap_fdatasync(struct address_space *);
 extern void filemap_fdatawait(struct address_space *);
@@ -1259,7 +1269,9 @@
 typedef int (get_block_t)(struct inode*,long,struct buffer_head*,int);
 
 /* Generic buffer handling for block filesystems.. */
-extern int block_flushpage(struct page *, unsigned long);
+extern int discard_bh_page(struct page *, unsigned long, int);
+#define block_flushpage(page, offset) discard_bh_page(page, offset, 1)
+#define block_invalidate_page(page) discard_bh_page(page, 0, 0)
 extern int block_symlink(struct inode *, const char *, int);
 extern int block_write_full_page(struct page*, get_block_t*);
 extern int block_read_full_page(struct page*, get_block_t*);
@@ -1271,6 +1283,7 @@
 int generic_block_bmap(struct address_space *, long, get_block_t *);
 int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
 int block_truncate_page(struct address_space *, loff_t, get_block_t *);
+extern int generic_direct_IO(int, struct inode *, struct kiobuf *, unsigned long, int, get_block_t *);
 
 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
@@ -1318,6 +1331,9 @@
 extern int file_fsync(struct file *, struct dentry *, int);
 extern int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx);
 extern int generic_osync_inode(struct inode *, int);
+#define OSYNC_METADATA	(1<<0)
+#define OSYNC_DATA	(1<<1)
+#define OSYNC_INODE	(1<<2)
 
 extern int inode_change_ok(struct inode *, struct iattr *);
 extern void inode_setattr(struct inode *, struct iattr *);
diff -urN 2.4.4/kernel/ksyms.c o_direct/kernel/ksyms.c
--- 2.4.4/kernel/ksyms.c	Sat Apr 28 05:24:48 2001
+++ o_direct/kernel/ksyms.c	Mon Apr 30 17:36:47 2001
@@ -205,6 +205,7 @@
 EXPORT_SYMBOL(generic_file_read);
 EXPORT_SYMBOL(do_generic_file_read);
 EXPORT_SYMBOL(generic_file_write);
+EXPORT_SYMBOL(generic_direct_IO);
 EXPORT_SYMBOL(generic_file_mmap);
 EXPORT_SYMBOL(generic_ro_fops);
 EXPORT_SYMBOL(generic_buffer_fdatasync);
@@ -480,6 +481,7 @@
 EXPORT_SYMBOL(__wait_on_super);
 EXPORT_SYMBOL(file_fsync);
 EXPORT_SYMBOL(fsync_inode_buffers);
+EXPORT_SYMBOL(fsync_inode_data_buffers);
 EXPORT_SYMBOL(clear_inode);
 EXPORT_SYMBOL(nr_async_pages);
 EXPORT_SYMBOL(___strtok);
diff -urN 2.4.4/mm/filemap.c o_direct/mm/filemap.c
--- 2.4.4/mm/filemap.c	Sat Apr 28 05:24:48 2001
+++ o_direct/mm/filemap.c	Mon Apr 30 17:36:47 2001
@@ -21,6 +21,7 @@
 #include <linux/swapctl.h>
 #include <linux/init.h>
 #include <linux/mm.h>
+#include <linux/iobuf.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -199,7 +200,7 @@
 }
 
-static inline void truncate_complete_page(struct page *page)
+static void truncate_complete_page(struct page *page)
 {
 	/* Leave it on the LRU if it gets converted into anonymous buffers */
 	if (!page->buffers || block_flushpage(page, 0))
@@ -234,15 +235,14 @@
 		/* Is one of the pages to truncate? */
 		if ((offset >= start) || (*partial && (offset + 1) == start)) {
-			if (TryLockPage(page)) {
-				page_cache_get(page);
-				spin_unlock(&pagecache_lock);
-				wait_on_page(page);
-				page_cache_release(page);
-				return 1;
-			}
+			int failed;
+			failed = TryLockPage(page);
 			page_cache_get(page);
 			spin_unlock(&pagecache_lock);
+			if (failed) {
+				wait_on_page(page);
+				goto again;
+			}
 
 			if (*partial && (offset + 1) == start) {
 				truncate_partial_page(page, *partial);
@@ -251,7 +251,12 @@
 				truncate_complete_page(page);
 
 			UnlockPage(page);
+		again:
 			page_cache_release(page);
+			if (current->need_resched) {
+				__set_current_state(TASK_RUNNING);
+				schedule();
+			}
 			return 1;
 		}
 	}
@@ -284,6 +289,82 @@
 	spin_unlock(&pagecache_lock);
 }
 
+static inline int invalidate_this_page2(struct page * page)
+{
+	int loop = 0;
+
+	if (page_count(page) == 1 + !!page->buffers) {
+		page_cache_get(page);
+		spin_unlock(&pagecache_lock);
+		truncate_complete_page(page);
+	} else {
+		if (page->buffers) {
+			page_cache_get(page);
+			spin_unlock(&pagecache_lock);
+			block_invalidate_page(page);
+		} else
+			loop = 1;
+
+		ClearPageDirty(page);
+		ClearPageUptodate(page);
+	}
+
+	return loop;
+}
+
+static int FASTCALL(invalidate_list_pages2(struct list_head *));
+static int invalidate_list_pages2(struct list_head *head)
+{
+	struct list_head *curr;
+	struct page * page;
+
+	curr = head->next;
+	while (curr != head) {
+		int loop;
+
+		page = list_entry(curr, struct page, list);
+		curr = curr->next;
+
+		if (TryLockPage(page)) {
+			page_cache_get(page);
+			spin_unlock(&pagecache_lock);
+			wait_on_page(page);
+			goto again;
+		}
+
+		loop = invalidate_this_page2(page);
+		UnlockPage(page);
+		if (loop)
+			continue;
+
+	again:
+		page_cache_release(page);
+		if (current->need_resched) {
+			__set_current_state(TASK_RUNNING);
+			schedule();
+		}
+
+		return 1;
+	}
+	return 0;
+}
+
+/**
+ * invalidate_inode_pages2 - drop all the pages of an address_space; for
+ * any page that cannot be freed (e.g. because it is mapped), just clear
+ * the dirty and uptodate bits instead.
+ * @mapping: the address_space whose pages we want to invalidate
+ */
+void invalidate_inode_pages2(struct address_space * mapping)
+{
+repeat:
+	spin_lock(&pagecache_lock);
+	if (invalidate_list_pages2(&mapping->clean_pages))
+		goto repeat;
+	if (invalidate_list_pages2(&mapping->dirty_pages))
+		goto repeat;
+	if (invalidate_list_pages2(&mapping->locked_pages))
+		goto repeat;
+	spin_unlock(&pagecache_lock);
+}
+
 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
 {
 	goto inside;
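
Illustration, not part of the patch: invalidate_inode_pages2() is what keeps the page cache coherent after a direct write; stale cached pages are either dropped outright or, if they cannot be freed, stripped of their dirty and uptodate bits so the next buffered read refetches from disk. A userspace sketch of the behaviour this buys; the file name and the 4096-byte size are arbitrary:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <malloc.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		char *dbuf = memalign(4096, 4096);
		char cbuf[4096];
		int dfd = open("testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);
		int cfd = open("testfile", O_RDONLY);

		if (!dbuf || dfd < 0 || cfd < 0)
			return 1;
		memset(dbuf, 'x', 4096);
		write(dfd, dbuf, 4096);	/* DMA, bypasses the page cache */
		read(cfd, cbuf, 4096);	/* a buffered read still sees the new data */
		printf("%c\n", cbuf[0]);	/* prints 'x' */
		close(dfd);
		close(cfd);
		free(dbuf);
		return 0;
	}
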
@@ -1236,6 +1317,87 @@
 	UPDATE_ATIME(inode);
 }
 
+static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
+{
+	ssize_t retval;
+	int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress;
+	struct kiobuf * iobuf;
+	struct inode * inode = filp->f_dentry->d_inode;
+	struct address_space * mapping = inode->i_mapping;
+
+	new_iobuf = 0;
+	iobuf = filp->f_iobuf;
+	if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
+		/*
+		 * A parallel read/write is using the preallocated iobuf
+		 * so just run slow and allocate a new one.
+		 */
+		retval = alloc_kiovec(1, &iobuf);
+		if (retval)
+			goto out;
+		new_iobuf = 1;
+	}
+
+	blocksize = inode->i_sb->s_blocksize;
+	blocksize_mask = blocksize - 1;
+	blocksize_bits = inode->i_sb->s_blocksize_bits;
+	chunk_size = KIO_MAX_ATOMIC_IO << 10;
+
+	retval = -EINVAL;
+	if ((offset & blocksize_mask) || (count & blocksize_mask))
+		goto out_free;
+	if (!mapping->a_ops->direct_IO)
+		goto out_free;
+
+	/*
+	 * Flush to disk exclusively the _data_; metadata must remain
+	 * completely asynchronous or performance will go to /dev/null.
+	 */
+	filemap_fdatasync(mapping);
+	retval = fsync_inode_data_buffers(inode);
+	filemap_fdatawait(mapping);
+	if (retval < 0)
+		goto out_free;
+
+	progress = retval = 0;
+	while (count > 0) {
+		iosize = count;
+		if (iosize > chunk_size)
+			iosize = chunk_size;
+
+		retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
+		if (retval)
+			break;
+
+		retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize);
+
+		if (rw == READ && retval > 0)
+			mark_dirty_kiobuf(iobuf, retval);
+
+		if (retval >= 0) {
+			count -= retval;
+			buf += retval;
+			progress += retval;
+		}
+
+		unmap_kiobuf(iobuf);
+
+		if (retval != iosize)
+			break;
+	}
+
+	if (progress)
+		retval = progress;
+
+ out_free:
+	if (!new_iobuf)
+		clear_bit(0, &filp->f_iobuf_lock);
+	else
+		free_kiovec(1, &iobuf);
+ out:
+	return retval;
+}
+
 int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
 {
 	char *kaddr;
@@ -1266,6 +1428,9 @@
 {
 	ssize_t retval;
 
+	if (filp->f_flags & O_DIRECT)
+		goto o_direct;
+
 	retval = -EFAULT;
 	if (access_ok(VERIFY_WRITE, buf, count)) {
 		retval = 0;
@@ -1284,7 +1449,22 @@
 			retval = desc.error;
 		}
 	}
+ out:
 	return retval;
+
+ o_direct:
+	{
+		loff_t pos = *ppos;
+		struct inode * inode = filp->f_dentry->d_inode;
+
+		if (pos + count > inode->i_size)
+			count = inode->i_size - pos;
+		retval = generic_file_direct_IO(READ, filp, buf, count, pos);
+		if (retval > 0)
+			*ppos = pos + retval;
+		UPDATE_ATIME(filp->f_dentry->d_inode);
+		goto out;
+	}
 }
 
 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
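
Illustration, not part of the patch: generic_file_direct_IO() above returns -EINVAL unless both the file offset and the transfer length are multiples of the filesystem block size; the user buffer itself is simply mapped page by page with map_user_kiobuf(), though aligning it as well is good practice. A userspace sketch of a conforming read; 4096 is a multiple of every legal 2.4 blocksize:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <malloc.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		char *buf = memalign(4096, 4096);
		int fd;
		ssize_t n;

		if (argc < 2 || !buf)
			return 1;
		fd = open(argv[1], O_RDONLY | O_DIRECT);
		if (fd < 0)
			return 1;
		n = read(fd, buf, 4096);	/* blocksize-multiple, or -EINVAL */
		printf("read %zd bytes\n", n);
		close(fd);
		free(buf);
		return 0;
	}
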
@@ -2474,7 +2654,7 @@
  *							okir@monad.swb.de
  */
 ssize_t
-generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
+generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
 {
 	struct inode	*inode = file->f_dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
@@ -2569,6 +2749,9 @@
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 	mark_inode_dirty_sync(inode);
 
+	if (file->f_flags & O_DIRECT)
+		goto o_direct;
+
 	while (count) {
 		unsigned long index, offset;
 		char *kaddr;
@@ -2643,7 +2826,7 @@
 	/* For now, when the user asks for O_SYNC, we'll actually
 	 * provide O_DSYNC. */
 	if ((status >= 0) && (file->f_flags & O_SYNC))
-		status = generic_osync_inode(inode, 1); /* 1 means datasync */
+		status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
 
 	err = written ? written : status;
 out:
@@ -2655,6 +2838,25 @@
 	ClearPageUptodate(page);
 	kunmap(page);
 	goto unlock;
+
+o_direct:
+	err = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
+	if (err > 0) {
+		loff_t end = pos + err;
+		if (end > inode->i_size) {
+			inode->i_size = end;
+			mark_inode_dirty(inode);
+		}
+		*ppos = end;
+		invalidate_inode_pages2(mapping);
+		/*
+		 * Sync the fs metadata but not the minor inode changes and
+		 * of course not the data as we did direct DMA for the IO.
+		 */
+		if (file->f_flags & O_SYNC)
+			err = generic_osync_inode(inode, OSYNC_METADATA);
+	}
+	goto out;
 }
 
 void __init page_cache_init(unsigned long mempages)
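
Illustration, not part of the patch: the O_SYNC handling on the new write path above is deliberately asymmetric; the data already hit the disk via DMA, so O_SYNC only adds an OSYNC_METADATA pass for the block allocations backing the write. A userspace sketch; the file name is arbitrary:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <malloc.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		char *buf = memalign(4096, 4096);
		int fd = open("logfile", O_WRONLY | O_CREAT | O_DIRECT | O_SYNC, 0644);

		if (!buf || fd < 0)
			return 1;
		memset(buf, 0, 4096);
		/* on return, the data and the metadata needed to reach it are on disk */
		if (write(fd, buf, 4096) != 4096)
			return 1;
		close(fd);
		free(buf);
		return 0;
	}
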