diff -urN 2.4.10pre3/fs/buffer.c o_direct/fs/buffer.c --- 2.4.10pre3/fs/buffer.c Sat Sep 1 22:03:13 2001 +++ o_direct/fs/buffer.c Sun Sep 2 05:17:30 2001 @@ -623,6 +623,16 @@ spin_unlock(&lru_list_lock); } +void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode) +{ + spin_lock(&lru_list_lock); + if (bh->b_inode) + list_del(&bh->b_inode_buffers); + bh->b_inode = inode; + list_add(&bh->b_inode_buffers, &inode->i_dirty_data_buffers); + spin_unlock(&lru_list_lock); +} + /* The caller must have the lru_list lock before calling the remove_inode_queue functions. */ static void __remove_inode_queue(struct buffer_head *bh) @@ -642,7 +652,7 @@ int ret; spin_lock(&lru_list_lock); - ret = !list_empty(&inode->i_dirty_buffers); + ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers); spin_unlock(&lru_list_lock); return ret; @@ -960,6 +970,54 @@ return err2; } +int fsync_inode_data_buffers(struct inode *inode) +{ + struct buffer_head *bh; + struct inode tmp; + int err = 0, err2; + + INIT_LIST_HEAD(&tmp.i_dirty_data_buffers); + + spin_lock(&lru_list_lock); + + while (!list_empty(&inode->i_dirty_data_buffers)) { + bh = BH_ENTRY(inode->i_dirty_data_buffers.next); + list_del(&bh->b_inode_buffers); + if (!buffer_dirty(bh) && !buffer_locked(bh)) + bh->b_inode = NULL; + else { + bh->b_inode = &tmp; + list_add(&bh->b_inode_buffers, &tmp.i_dirty_data_buffers); + if (buffer_dirty(bh)) { + get_bh(bh); + spin_unlock(&lru_list_lock); + ll_rw_block(WRITE, 1, &bh); + brelse(bh); + spin_lock(&lru_list_lock); + } + } + } + + while (!list_empty(&tmp.i_dirty_data_buffers)) { + bh = BH_ENTRY(tmp.i_dirty_data_buffers.prev); + remove_inode_queue(bh); + get_bh(bh); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(&lru_list_lock); + } + + spin_unlock(&lru_list_lock); + err2 = osync_inode_data_buffers(inode); + + if (err) + return err; + else + return err2; +} /* * osync is 
designed to support O_SYNC io. It waits synchronously for @@ -1001,6 +1059,35 @@ return err; } +int osync_inode_data_buffers(struct inode *inode) +{ + struct buffer_head *bh; + struct list_head *list; + int err = 0; + + spin_lock(&lru_list_lock); + + repeat: + + for (list = inode->i_dirty_data_buffers.prev; + bh = BH_ENTRY(list), list != &inode->i_dirty_data_buffers; + list = bh->b_inode_buffers.prev) { + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(&lru_list_lock); + goto repeat; + } + } + + spin_unlock(&lru_list_lock); + return err; +} + /* * Invalidate any and all dirty buffers on a given inode. We are @@ -1009,15 +1096,13 @@ */ void invalidate_inode_buffers(struct inode *inode) { - struct list_head *list, *next; + struct list_head * entry; spin_lock(&lru_list_lock); - list = inode->i_dirty_buffers.next; - while (list != &inode->i_dirty_buffers) { - next = list->next; - remove_inode_queue(BH_ENTRY(list)); - list = next; - } + while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers) + remove_inode_queue(BH_ENTRY(entry)); + while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers) + remove_inode_queue(BH_ENTRY(entry)); spin_unlock(&lru_list_lock); } @@ -1214,8 +1299,8 @@ if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf) || buffer_protected(buf)) goto in_use; __hash_unlink(buf); - remove_inode_queue(buf); write_unlock(&hash_table_lock); + remove_inode_queue(buf); __remove_from_lru_list(buf, buf->b_list); spin_unlock(&lru_list_lock); put_last_free(buf); @@ -1436,7 +1521,7 @@ * we have truncated the file and are going to free the * blocks on-disk.. 
*/ -int block_flushpage(struct page *page, unsigned long offset) +int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache) { struct buffer_head *head, *bh, *next; unsigned int curr_off = 0; @@ -1473,7 +1558,8 @@ */ if (!offset) { if (!try_to_free_buffers(page, 0)) { - atomic_inc(&buffermem_pages); + if (drop_pagecache) + atomic_inc(&buffermem_pages); return 0; } } @@ -1705,7 +1791,7 @@ set_bit(BH_Uptodate, &bh->b_state); if (!atomic_set_buffer_dirty(bh)) { __mark_dirty(bh); - buffer_insert_inode_queue(bh, inode); + buffer_insert_inode_data_queue(bh, inode); need_balance_dirty = 1; } } @@ -2034,6 +2120,47 @@ return tmp.b_blocknr; } +int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block) +{ + int i, nr_blocks, retval; + unsigned long * blocks = iobuf->blocks; + + nr_blocks = iobuf->length / blocksize; + /* build the blocklist */ + for (i = 0; i < nr_blocks; i++, blocknr++) { + struct buffer_head bh; + + bh.b_state = 0; + bh.b_dev = inode->i_dev; + bh.b_size = blocksize; + + retval = get_block(inode, blocknr, &bh, rw == READ ? 0 : 1); + if (retval) + goto out; + + if (rw == READ) { + if (buffer_new(&bh)) + BUG(); + if (!buffer_mapped(&bh)) { + /* there was a hole in the filesystem */ + blocks[i] = -1UL; + continue; + } + } else { + if (buffer_new(&bh)) + unmap_underlying_metadata(&bh); + if (!buffer_mapped(&bh)) + BUG(); + } + blocks[i] = bh.b_blocknr; + } + + retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize); + + out: + return retval; +} + /* * IO completion routine for a buffer_head being used for kiobuf IO: we * can't dispatch the kiobuf callback until io_count reaches 0. 
@@ -2149,6 +2276,18 @@ while (length > 0) { blocknr = b[bufind++]; + if (blocknr == -1UL) { + if (rw == READ) { + /* there was a hole in the filesystem */ + memset(kmap(map) + offset, 0, size); + flush_dcache_page(map); + kunmap(map); + + transferred += size; + goto skip_block; + } else + BUG(); + } tmp = bhs[bhind++]; tmp->b_dev = B_FREE; @@ -2167,9 +2306,6 @@ } else set_bit(BH_Uptodate, &tmp->b_state); - length -= size; - offset += size; - atomic_inc(&iobuf->io_count); submit_bh(rw, tmp); /* @@ -2184,7 +2320,11 @@ goto finished; bhind = 0; } - + + skip_block: + length -= size; + offset += size; + if (offset >= PAGE_SIZE) { offset = 0; break; diff -urN 2.4.10pre3/fs/ext2/fsync.c o_direct/fs/ext2/fsync.c --- 2.4.10pre3/fs/ext2/fsync.c Thu Dec 14 22:34:11 2000 +++ o_direct/fs/ext2/fsync.c Sun Sep 2 05:17:30 2001 @@ -44,6 +44,7 @@ int err; err = fsync_inode_buffers(inode); + err |= fsync_inode_data_buffers(inode); if (!(inode->i_state & I_DIRTY)) return err; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) diff -urN 2.4.10pre3/fs/ext2/inode.c o_direct/fs/ext2/inode.c --- 2.4.10pre3/fs/ext2/inode.c Sat Jul 21 00:04:27 2001 +++ o_direct/fs/ext2/inode.c Sun Sep 2 05:17:30 2001 @@ -586,13 +586,18 @@ { return generic_block_bmap(mapping,block,ext2_get_block); } +static int ext2_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) +{ + return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, ext2_get_block); +} struct address_space_operations ext2_aops = { readpage: ext2_readpage, writepage: ext2_writepage, sync_page: block_sync_page, prepare_write: ext2_prepare_write, commit_write: generic_commit_write, - bmap: ext2_bmap + bmap: ext2_bmap, + direct_IO: ext2_direct_IO, }; /* diff -urN 2.4.10pre3/fs/fcntl.c o_direct/fs/fcntl.c --- 2.4.10pre3/fs/fcntl.c Sat May 26 04:03:46 2001 +++ o_direct/fs/fcntl.c Sun Sep 2 05:17:30 2001 @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -194,7 +195,7 @@ 
return ret; } -#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC) +#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT) static int setfl(int fd, struct file * filp, unsigned long arg) { @@ -215,6 +216,25 @@ if (error < 0) return error; } + } + + if (arg & O_DIRECT) { + /* + * alloc_kiovec() can sleep and we are only serialized by + * the big kernel lock here, so abuse the i_sem to serialize + * this case too. We of course wouldn't need to go deep down + * to the inode layer, we could stay at the file layer, but + * we don't want to pay for the memory of a semaphore in each + * file structure too and we use the inode semaphore that we just + * pay for anyways. + */ + error = 0; + down(&inode->i_sem); + if (!filp->f_iobuf) + error = alloc_kiovec(1, &filp->f_iobuf); + up(&inode->i_sem); + if (error < 0) + return error; } /* required for strict SunOS emulation */ diff -urN 2.4.10pre3/fs/file_table.c o_direct/fs/file_table.c --- 2.4.10pre3/fs/file_table.c Tue May 1 19:35:29 2001 +++ o_direct/fs/file_table.c Sun Sep 2 05:17:30 2001 @@ -11,6 +11,7 @@ #include #include #include +#include /* sysctl tunables... 
*/ struct files_stat_struct files_stat = {0, 0, NR_FILE}; @@ -104,6 +105,10 @@ if (atomic_dec_and_test(&file->f_count)) { locks_remove_flock(file); + + if (file->f_iobuf) + free_kiovec(1, &file->f_iobuf); + if (file->f_op && file->f_op->release) file->f_op->release(inode, file); fops_put(file->f_op); diff -urN 2.4.10pre3/fs/inode.c o_direct/fs/inode.c --- 2.4.10pre3/fs/inode.c Sat Sep 1 22:03:13 2001 +++ o_direct/fs/inode.c Sun Sep 2 05:17:30 2001 @@ -77,7 +77,7 @@ ((struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL)) static void destroy_inode(struct inode *inode) { - if (!list_empty(&inode->i_dirty_buffers)) + if (inode_has_buffers(inode)) BUG(); kmem_cache_free(inode_cachep, (inode)); } @@ -103,6 +103,7 @@ INIT_LIST_HEAD(&inode->i_data.locked_pages); INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_dirty_buffers); + INIT_LIST_HEAD(&inode->i_dirty_data_buffers); sema_init(&inode->i_sem, 1); sema_init(&inode->i_zombie, 1); spin_lock_init(&inode->i_data.i_shared_lock); @@ -433,6 +434,8 @@ while (inode->i_state & I_DIRTY) sync_one(inode, sync); spin_unlock(&inode_lock); + if (sync) + wait_on_inode(inode); } else printk("write_inode_now: no super block\n"); @@ -447,9 +450,9 @@ * O_SYNC flag set, to flush dirty writes to disk. */ -int generic_osync_inode(struct inode *inode, int datasync) +int generic_osync_inode(struct inode *inode, int what) { - int err; + int err = 0, err2 = 0, need_write_inode_now = 0; /* * WARNING @@ -472,23 +475,24 @@ * every O_SYNC write, not just the synchronous I/Os. 
--sct */ -#ifdef WRITERS_QUEUE_IO - err = osync_inode_buffers(inode); -#else - err = fsync_inode_buffers(inode); -#endif + if (what & OSYNC_METADATA) + err = fsync_inode_buffers(inode); + if (what & OSYNC_DATA) + err2 = fsync_inode_data_buffers(inode); + if (!err) + err = err2; spin_lock(&inode_lock); - if (!(inode->i_state & I_DIRTY)) - goto out; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto out; + if ((inode->i_state & I_DIRTY) && + ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC))) + need_write_inode_now = 1; spin_unlock(&inode_lock); - write_inode_now(inode, 1); - return err; - out: - spin_unlock(&inode_lock); + if (need_write_inode_now) + write_inode_now(inode, 1); + else + wait_on_inode(inode); + return err; } @@ -503,8 +507,7 @@ void clear_inode(struct inode *inode) { - if (!list_empty(&inode->i_dirty_buffers)) - invalidate_inode_buffers(inode); + invalidate_inode_buffers(inode); if (inode->i_data.nrpages) BUG(); diff -urN 2.4.10pre3/fs/open.c o_direct/fs/open.c --- 2.4.10pre3/fs/open.c Thu Aug 16 22:03:38 2001 +++ o_direct/fs/open.c Sun Sep 2 05:17:32 2001 @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -656,6 +657,16 @@ f->f_op = fops_get(inode->i_fop); if (inode->i_sb) file_move(f, &inode->i_sb->s_files); + + /* preallocate kiobuf for O_DIRECT */ + f->f_iobuf = NULL; + f->f_iobuf_lock = 0; + if (f->f_flags & O_DIRECT) { + error = alloc_kiovec(1, &f->f_iobuf); + if (error) + goto cleanup_all; + } + if (f->f_op && f->f_op->open) { error = f->f_op->open(inode,f); if (error) @@ -666,6 +677,8 @@ return f; cleanup_all: + if (f->f_iobuf) + free_kiovec(1, &f->f_iobuf); fops_put(f->f_op); if (f->f_mode & FMODE_WRITE) put_write_access(inode); diff -urN 2.4.10pre3/fs/reiserfs/file.c o_direct/fs/reiserfs/file.c --- 2.4.10pre3/fs/reiserfs/file.c Tue May 1 19:35:29 2001 +++ o_direct/fs/reiserfs/file.c Sun Sep 2 05:17:30 2001 @@ -84,7 +84,7 @@ ) { struct inode * p_s_inode = p_s_dentry->d_inode; struct 
reiserfs_transaction_handle th ; - int n_err = 0; + int n_err; int windex ; int jbegin_count = 1 ; @@ -94,6 +94,7 @@ BUG (); n_err = fsync_inode_buffers(p_s_inode) ; + n_err |= fsync_inode_data_buffers(p_s_inode); /* commit the current transaction to flush any metadata ** changes. sys_fsync takes care of flushing the dirty pages for us */ diff -urN 2.4.10pre3/include/asm-alpha/fcntl.h o_direct/include/asm-alpha/fcntl.h --- 2.4.10pre3/include/asm-alpha/fcntl.h Thu Nov 16 15:37:42 2000 +++ o_direct/include/asm-alpha/fcntl.h Sun Sep 2 05:17:30 2001 @@ -17,10 +17,10 @@ #define O_NDELAY O_NONBLOCK #define O_SYNC 040000 #define FASYNC 020000 /* fcntl, for BSD compatibility */ -#define O_DIRECT 040000 /* direct disk access - should check with OSF/1 */ #define O_DIRECTORY 0100000 /* must be a directory */ #define O_NOFOLLOW 0200000 /* don't follow links */ #define O_LARGEFILE 0400000 /* will be set by the kernel on every open */ +#define O_DIRECT 02000000 /* direct disk access - should check with OSF/1 */ #define F_DUPFD 0 /* dup */ #define F_GETFD 1 /* get close_on_exec */ diff -urN 2.4.10pre3/include/asm-i386/fcntl.h o_direct/include/asm-i386/fcntl.h --- 2.4.10pre3/include/asm-i386/fcntl.h Thu Nov 16 15:37:33 2000 +++ o_direct/include/asm-i386/fcntl.h Sun Sep 2 05:17:30 2001 @@ -16,7 +16,7 @@ #define O_NDELAY O_NONBLOCK #define O_SYNC 010000 #define FASYNC 020000 /* fcntl, for BSD compatibility */ -#define O_DIRECT 040000 /* direct disk access hint - currently ignored */ +#define O_DIRECT 040000 /* direct disk access hint */ #define O_LARGEFILE 0100000 #define O_DIRECTORY 0200000 /* must be a directory */ #define O_NOFOLLOW 0400000 /* don't follow links */ diff -urN 2.4.10pre3/include/asm-sparc/fcntl.h o_direct/include/asm-sparc/fcntl.h --- 2.4.10pre3/include/asm-sparc/fcntl.h Thu Nov 16 15:37:42 2000 +++ o_direct/include/asm-sparc/fcntl.h Sun Sep 2 05:17:30 2001 @@ -20,6 +20,7 @@ #define O_DIRECTORY 0x10000 /* must be a directory */ #define O_NOFOLLOW 0x20000 /* don't 
follow links */ #define O_LARGEFILE 0x40000 +#define O_DIRECT 0x100000 /* direct disk access hint */ #define F_DUPFD 0 /* dup */ #define F_GETFD 1 /* get close_on_exec */ diff -urN 2.4.10pre3/include/asm-sparc64/fcntl.h o_direct/include/asm-sparc64/fcntl.h --- 2.4.10pre3/include/asm-sparc64/fcntl.h Thu Nov 16 15:37:42 2000 +++ o_direct/include/asm-sparc64/fcntl.h Sun Sep 2 05:17:30 2001 @@ -20,6 +20,8 @@ #define O_DIRECTORY 0x10000 /* must be a directory */ #define O_NOFOLLOW 0x20000 /* don't follow links */ #define O_LARGEFILE 0x40000 +#define O_DIRECT 0x100000 /* direct disk access hint */ + #define F_DUPFD 0 /* dup */ #define F_GETFD 1 /* get close_on_exec */ diff -urN 2.4.10pre3/include/linux/fs.h o_direct/include/linux/fs.h --- 2.4.10pre3/include/linux/fs.h Sat Sep 1 22:03:16 2001 +++ o_direct/include/linux/fs.h Sun Sep 2 05:17:30 2001 @@ -363,6 +363,7 @@ */ struct page; struct address_space; +struct kiobuf; struct address_space_operations { int (*writepage)(struct page *); @@ -372,6 +373,8 @@ int (*commit_write)(struct file *, struct page *, unsigned, unsigned); /* Unfortunately this kludge is needed for FIBMAP. 
Don't use it */ int (*bmap)(struct address_space *, long); +#define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */ + int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int); }; struct address_space { @@ -411,6 +414,7 @@ struct list_head i_dentry; struct list_head i_dirty_buffers; + struct list_head i_dirty_data_buffers; unsigned long i_ino; atomic_t i_count; @@ -508,6 +512,10 @@ /* needed for tty driver, and maybe others */ void *private_data; + + /* preallocated helper kiobuf to speedup O_DIRECT */ + struct kiobuf *f_iobuf; + long f_iobuf_lock; }; extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); @@ -1151,6 +1159,7 @@ extern int invalidate_inodes(struct super_block *); extern int invalidate_device(kdev_t, int); extern void invalidate_inode_pages(struct inode *); +extern void invalidate_inode_pages2(struct address_space *); extern void invalidate_inode_buffers(struct inode *); #define invalidate_buffers(dev) __invalidate_buffers((dev), 0) #define destroy_buffers(dev) __invalidate_buffers((dev), 1) @@ -1163,8 +1172,10 @@ extern int fsync_super(struct super_block *); extern int fsync_no_super(kdev_t); extern void sync_inodes_sb(struct super_block *); -extern int fsync_inode_buffers(struct inode *); extern int osync_inode_buffers(struct inode *); +extern int osync_inode_data_buffers(struct inode *); +extern int fsync_inode_buffers(struct inode *); +extern int fsync_inode_data_buffers(struct inode *); extern int inode_has_buffers(struct inode *); extern void filemap_fdatasync(struct address_space *); extern void filemap_fdatawait(struct address_space *); @@ -1324,7 +1335,9 @@ typedef int (get_block_t)(struct inode*,long,struct buffer_head*,int); /* Generic buffer handling for block filesystems.. 
*/ -extern int block_flushpage(struct page *, unsigned long); +extern int discard_bh_page(struct page *, unsigned long, int); +#define block_flushpage(page, offset) discard_bh_page(page, offset, 1) +#define block_invalidate_page(page) discard_bh_page(page, 0, 0) extern int block_symlink(struct inode *, const char *, int); extern int block_write_full_page(struct page*, get_block_t*); extern int block_read_full_page(struct page*, get_block_t*); @@ -1336,6 +1349,7 @@ int generic_block_bmap(struct address_space *, long, get_block_t *); int generic_commit_write(struct file *, struct page *, unsigned, unsigned); int block_truncate_page(struct address_space *, loff_t, get_block_t *); +extern int generic_direct_IO(int, struct inode *, struct kiobuf *, unsigned long, int, get_block_t *); extern int waitfor_one_page(struct page*); extern int generic_file_mmap(struct file *, struct vm_area_struct *); @@ -1395,6 +1409,9 @@ extern int file_fsync(struct file *, struct dentry *, int); extern int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx); extern int generic_osync_inode(struct inode *, int); +#define OSYNC_METADATA (1<<0) +#define OSYNC_DATA (1<<1) +#define OSYNC_INODE (1<<2) extern int inode_change_ok(struct inode *, struct iattr *); extern void inode_setattr(struct inode *, struct iattr *); diff -urN 2.4.10pre3/kernel/ksyms.c o_direct/kernel/ksyms.c --- 2.4.10pre3/kernel/ksyms.c Sat Sep 1 22:03:16 2001 +++ o_direct/kernel/ksyms.c Sun Sep 2 05:17:30 2001 @@ -210,6 +210,7 @@ EXPORT_SYMBOL(generic_file_read); EXPORT_SYMBOL(do_generic_file_read); EXPORT_SYMBOL(generic_file_write); +EXPORT_SYMBOL(generic_direct_IO); EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_ro_fops); EXPORT_SYMBOL(generic_buffer_fdatasync); @@ -496,6 +497,7 @@ EXPORT_SYMBOL(sys_tz); EXPORT_SYMBOL(file_fsync); EXPORT_SYMBOL(fsync_inode_buffers); +EXPORT_SYMBOL(fsync_inode_data_buffers); EXPORT_SYMBOL(clear_inode); EXPORT_SYMBOL(nr_async_pages); 
EXPORT_SYMBOL(___strtok); diff -urN 2.4.10pre3/mm/filemap.c o_direct/mm/filemap.c --- 2.4.10pre3/mm/filemap.c Sat Sep 1 22:03:16 2001 +++ o_direct/mm/filemap.c Sun Sep 2 05:17:30 2001 @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -200,7 +201,7 @@ } -static inline void truncate_complete_page(struct page *page) +static void truncate_complete_page(struct page *page) { /* Leave it on the LRU if it gets converted into anonymous buffers */ if (!page->buffers || block_flushpage(page, 0)) @@ -224,8 +225,10 @@ { struct list_head *curr; struct page * page; + int unlocked = 0; - curr = head->next; + restart: + curr = head->prev; while (curr != head) { unsigned long offset; @@ -234,33 +237,46 @@ /* Is one of the pages to truncate? */ if ((offset >= start) || (*partial && (offset + 1) == start)) { - list_del(head); - list_add(head, curr); - if (TryLockPage(page)) { - page_cache_get(page); - spin_unlock(&pagecache_lock); - wait_on_page(page); - goto out_restart; - } + int failed; + page_cache_get(page); + failed = TryLockPage(page); + + list_del(head); + if (!failed) + /* Restart after this page */ + list_add_tail(head, curr); + else + /* Restart on this page */ + list_add(head, curr); + spin_unlock(&pagecache_lock); + unlocked = 1; - if (*partial && (offset + 1) == start) { - truncate_partial_page(page, *partial); - *partial = 0; - } else - truncate_complete_page(page); + if (!failed) { + if (*partial && (offset + 1) == start) { + truncate_partial_page(page, *partial); + *partial = 0; + } else + truncate_complete_page(page); + + UnlockPage(page); + } else + wait_on_page(page); - UnlockPage(page); - goto out_restart; + page_cache_release(page); + + if (current->need_resched) { + __set_current_state(TASK_RUNNING); + schedule(); + } + + spin_lock(&pagecache_lock); + goto restart; } - curr = curr->next; + curr = curr->prev; } - return 0; -out_restart: - page_cache_release(page); - spin_lock(&pagecache_lock); - return 1; + return unlocked; } @@ -277,22 
+293,118 @@ { unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); - int complete; + int unlocked; spin_lock(&pagecache_lock); do { - complete = 1; - while (truncate_list_pages(&mapping->clean_pages, start, &partial)) - complete = 0; - while (truncate_list_pages(&mapping->dirty_pages, start, &partial)) - complete = 0; - while (truncate_list_pages(&mapping->locked_pages, start, &partial)) - complete = 0; - } while (!complete); + unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial); + unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial); + unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial); + } while (unlocked); /* Traversed all three lists without dropping the lock */ spin_unlock(&pagecache_lock); } +static inline int invalidate_this_page2(struct page * page, + struct list_head * curr, + struct list_head * head) +{ + int unlocked = 1; + + /* + * The page is locked and we hold the pagecache_lock as well + * so both page_count(page) and page->buffers stay constant here. 
+ */ + if (page_count(page) == 1 + !!page->buffers) { + /* Restart after this page */ + list_del(head); + list_add_tail(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + truncate_complete_page(page); + } else { + if (page->buffers) { + /* Restart after this page */ + list_del(head); + list_add_tail(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + block_invalidate_page(page); + } else + unlocked = 0; + + ClearPageDirty(page); + ClearPageUptodate(page); + } + + return unlocked; +} + +static int FASTCALL(invalidate_list_pages2(struct list_head *)); +static int invalidate_list_pages2(struct list_head *head) +{ + struct list_head *curr; + struct page * page; + int unlocked = 0; + + restart: + curr = head->prev; + while (curr != head) { + page = list_entry(curr, struct page, list); + + if (!TryLockPage(page)) { + int __unlocked; + + __unlocked = invalidate_this_page2(page, curr, head); + UnlockPage(page); + unlocked |= __unlocked; + if (!__unlocked) { + curr = curr->prev; + continue; + } + } else { + /* Restart on this page */ + list_del(head); + list_add(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + unlocked = 1; + wait_on_page(page); + } + + page_cache_release(page); + if (current->need_resched) { + __set_current_state(TASK_RUNNING); + schedule(); + } + + spin_lock(&pagecache_lock); + goto restart; + } + return unlocked; +} + +/** + * invalidate_inode_pages2 - Clear all the dirty bits around if it can't + * free the pages because they're mapped. 
+ * @mapping: the address_space whose pages we want to invalidate + */ +void invalidate_inode_pages2(struct address_space * mapping) +{ + int unlocked; + + spin_lock(&pagecache_lock); + do { + unlocked = invalidate_list_pages2(&mapping->clean_pages); + unlocked |= invalidate_list_pages2(&mapping->dirty_pages); + unlocked |= invalidate_list_pages2(&mapping->locked_pages); + } while (unlocked); + spin_unlock(&pagecache_lock); +} + static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page) { goto inside; @@ -1200,6 +1312,87 @@ UPDATE_ATIME(inode); } +static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset) +{ + ssize_t retval; + int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress; + struct kiobuf * iobuf; + struct inode * inode = filp->f_dentry->d_inode; + struct address_space * mapping = inode->i_mapping; + + new_iobuf = 0; + iobuf = filp->f_iobuf; + if (test_and_set_bit(0, &filp->f_iobuf_lock)) { + /* + * A parallel read/write is using the preallocated iobuf + * so just run slow and allocate a new one. + */ + retval = alloc_kiovec(1, &iobuf); + if (retval) + goto out; + new_iobuf = 1; + } + + blocksize = inode->i_sb->s_blocksize; + blocksize_mask = blocksize - 1; + blocksize_bits = inode->i_sb->s_blocksize_bits; + chunk_size = KIO_MAX_ATOMIC_IO << 10; + + retval = -EINVAL; + if ((offset & blocksize_mask) || (count & blocksize_mask)) + goto out_free; + if (!mapping->a_ops->direct_IO) + goto out_free; + + /* + * Flush to disk exclusively the _data_, metadata must remain + * completely asynchronous or performance will go to /dev/null. 
+ */ + filemap_fdatasync(mapping); + retval = fsync_inode_data_buffers(inode); + filemap_fdatawait(mapping); + if (retval < 0) + goto out_free; + + progress = retval = 0; + while (count > 0) { + iosize = count; + if (iosize > chunk_size) + iosize = chunk_size; + + retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize); + if (retval) + break; + + retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize); + + if (rw == READ && retval > 0) + mark_dirty_kiobuf(iobuf, retval); + + if (retval >= 0) { + count -= retval; + buf += retval; + progress += retval; + } + + unmap_kiobuf(iobuf); + + if (retval != iosize) + break; + } + + if (progress) + retval = progress; + + out_free: + if (!new_iobuf) + clear_bit(0, &filp->f_iobuf_lock); + else + free_kiovec(1, &iobuf); + out: + return retval; +} + int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) { char *kaddr; @@ -1230,6 +1423,12 @@ { ssize_t retval; + if ((ssize_t) count < 0) + return -EINVAL; + + if (filp->f_flags & O_DIRECT) + goto o_direct; + retval = -EFAULT; if (access_ok(VERIFY_WRITE, buf, count)) { retval = 0; @@ -1248,7 +1447,27 @@ retval = desc.error; } } + out: return retval; + + o_direct: + { + loff_t pos = *ppos; + struct inode * inode = filp->f_dentry->d_inode; + + retval = 0; + if (!count) + goto out; /* skip atime */ + if (pos < inode->i_size) { + if (pos + count > inode->i_size) + count = inode->i_size - pos; + retval = generic_file_direct_IO(READ, filp, buf, count, pos); + if (retval > 0) + *ppos = pos + retval; + } + UPDATE_ATIME(filp->f_dentry->d_inode); + goto out; + } } static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size) @@ -2441,7 +2660,7 @@ * okir@monad.swb.de */ ssize_t -generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos) +generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos) { 
struct inode *inode = file->f_dentry->d_inode; struct address_space *mapping = inode->i_mapping; @@ -2453,11 +2672,14 @@ int err; unsigned bytes; - cached_page = NULL; + if ((ssize_t) count < 0) + return -EINVAL; if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; - + + cached_page = NULL; + down(&inode->i_sem); pos = *ppos; @@ -2539,6 +2761,9 @@ inode->i_ctime = inode->i_mtime = CURRENT_TIME; mark_inode_dirty_sync(inode); + if (file->f_flags & O_DIRECT) + goto o_direct; + while (count) { unsigned long index, offset; long page_fault; @@ -2611,8 +2836,9 @@ /* For now, when the user asks for O_SYNC, we'll actually * provide O_DSYNC. */ if ((status >= 0) && (file->f_flags & O_SYNC)) - status = generic_osync_inode(inode, 1); /* 1 means datasync */ + status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA); +out_status: err = written ? written : status; out: @@ -2621,6 +2847,25 @@ fail_write: status = -EFAULT; goto unlock; + +o_direct: + written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos); + if (written > 0) { + loff_t end = pos + written; + if (end > inode->i_size) { + inode->i_size = end; + mark_inode_dirty(inode); + } + *ppos = end; + invalidate_inode_pages2(mapping); + } + /* + * Sync the fs metadata but not the minor inode changes and + * of course not the data as we did direct DMA for the IO. + */ + if (written >= 0 && file->f_flags & O_SYNC) + status = generic_osync_inode(inode, OSYNC_METADATA); + goto out_status; } void __init page_cache_init(unsigned long mempages)