fs/nfs/direct.c |  493 +++++++++++++++++++++++++++++++++++++++-----------------
 1 files changed, 358 insertions(+), 135 deletions(-)

diff -puN fs/nfs/direct.c~25-odirect fs/nfs/direct.c
--- 25/fs/nfs/direct.c~25-odirect	2003-10-02 00:48:14.000000000 -0700
+++ 25-akpm/fs/nfs/direct.c	2003-10-02 00:48:14.000000000 -0700
@@ -1,7 +1,7 @@
 /*
  * linux/fs/nfs/direct.c
  *
- * Copyright (C) 2001 by Chuck Lever
+ * Copyright (C) 2003 by Chuck Lever
  *
  * High-performance uncached I/O for the Linux NFS client
  *
@@ -26,19 +26,23 @@
  * also supports uncaching whole NFS partitions with "-o forcedirectio,"
  * an undocumented mount option.
  *
- * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust.
+ * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
+ * help from Andrew Morton.
  *
  * 18 Dec 2001	Initial implementation for 2.4  --cel
  * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
- * 24 Sep 2002	Rewrite to use asynchronous RPCs, port to 2.5  --cel
+ * 08 Jun 2003	Port to 2.5 APIs  --cel
  *
  */
 
 #include
+#include
 #include
 #include
+#include
 #include
-#include
+#include
+
 #include
 #include
 #include
@@ -46,35 +50,59 @@
 #include
 #include
 
-#define NFSDBG_FACILITY		(NFSDBG_PAGECACHE | NFSDBG_VFS)
+#define NFSDBG_FACILITY		NFSDBG_VFS
 #define VERF_SIZE		(2 * sizeof(__u32))
+#define MAX_DIRECTIO_SIZE	(4096UL << PAGE_SHIFT)
 
 
 /**
- * nfs_get_user_pages - find and set up page representing user buffer
- * addr: user-space address of target buffer
- * size: total size in bytes of target buffer
- * @pages: returned array of page struct pointers underlying target buffer
- * write: whether or not buffer is target of a write operation
+ * nfs_get_user_pages - find and set up pages underlying user's buffer
+ * rw: direction (read or write)
+ * user_addr: starting address of this segment of user's buffer
+ * size: size of this segment
+ * @pages: returned array of page struct pointers underlying user's buffer
+ *
+ * Returns the number of pages pinned, or a negative errno.  On every
+ * error return *pages is either NULL or an array that holds no page
+ * references, so the caller only has to free the array itself.
  */
 static inline int
-nfs_get_user_pages(unsigned long addr, size_t size,
-		struct page ***pages, int rw)
+nfs_get_user_pages(int rw, unsigned long user_addr, size_t size,
+		struct page ***pages)
 {
 	int result = -ENOMEM;
-	unsigned page_count = (unsigned) size >> PAGE_SHIFT;
-	unsigned array_size = (page_count * sizeof(struct page *)) + 2U;
+	unsigned long page_count;
+	size_t array_size;
+
+	/* callers free *pages on error, so never leave it unset */
+	*pages = NULL;
+
+	/* set an arbitrary limit to prevent arithmetic overflow */
+	if (size > MAX_DIRECTIO_SIZE)
+		return -EFBIG;
 
-	*pages = (struct page **) kmalloc(array_size, GFP_KERNEL);
+	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	page_count -= user_addr >> PAGE_SHIFT;
+
+	array_size = (page_count * sizeof(struct page *));
+	*pages = kmalloc(array_size, GFP_KERNEL);
 	if (*pages) {
 		down_read(&current->mm->mmap_sem);
-		result = get_user_pages(current, current->mm, addr,
-					page_count, (rw == WRITE), 0,
+		result = get_user_pages(current, current->mm, user_addr,
+					page_count, (rw == READ), 0,
 					*pages, NULL);
 		up_read(&current->mm->mmap_sem);
-		if (result < 0)
-			printk(KERN_ERR "%s: get_user_pages result %d\n",
-					__FUNCTION__, result);
+
+		/*
+		 * A short count means the buffer runs off the end of a
+		 * mapping.  Drop the pages we did pin and fail, so the RPC
+		 * code never touches array slots that were not filled in.
+		 */
+		if (result >= 0 && (unsigned long) result < page_count) {
+			while (result > 0)
+				page_cache_release((*pages)[--result]);
+			result = -EFAULT;
+		}
 	}
 	return result;
 }
@@ -84,174 +112,369 @@ nfs_get_user_pages(unsigned long addr, s
  * @pages: array of page struct pointers underlying target buffer
+ * npages: number of pinned pages held in the array
+ * do_dirty: non-zero if the pages were stored into and must be dirtied
  */
 static inline void
-nfs_free_user_pages(struct page **pages, unsigned count)
+nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
 {
-	unsigned page = 0;
+	int i;
+
+	/* get_user_pages took a reference on every page; give each back */
+	for (i = 0; i < npages; i++) {
+		if (do_dirty)
+			set_page_dirty_lock(pages[i]);
+		page_cache_release(pages[i]);
+	}
+	kfree(pages);
+}
 
-	while (count--)
-		page_cache_release(pages[page++]);
+/**
+ * nfs_direct_read_seg - Read in one iov segment.  Generate separate
+ *                        read RPCs for each "rsize" bytes.
+ * @inode: target inode
+ * @cred: user's credential
+ * user_addr: starting address of this segment of user's buffer
+ * count: size of this segment
+ * file_offset: offset in file to begin the operation
+ * @pages: array of addresses of page structs defining user's buffer
+ * nr_pages: size of pages array
+ */
+static int
+nfs_direct_read_seg(struct inode *inode, struct rpc_cred *cred,
+		unsigned long user_addr, size_t count, loff_t file_offset,
+		struct page **pages, int nr_pages)
+{
+	const unsigned int rsize = NFS_SERVER(inode)->rsize;
+	int tot_bytes = 0;
+	int curpage = 0;
+	struct nfs_read_data	rdata = {
+		.flags		= 0,
+		.cred		= cred,
+		.inode		= inode,
+		.args		= {
+			.fh		= NFS_FH(inode),
+		},
+		.res		= {
+			.fattr		= &rdata.fattr,
+		},
+	};
+
+	do {
+		int request, result;
+
+		request = count;
+		if (count > rsize)
+			request = rsize;
+		rdata.args.count = request;
+		rdata.args.pgbase = user_addr & ~PAGE_MASK;
+		rdata.args.offset = file_offset;
+		rdata.args.pages = &pages[curpage];
+
+		dprintk("NFS: direct read: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
+			rdata.args.count, (long long) rdata.args.offset,
+			user_addr, rdata.args.pgbase, curpage);
+
+		lock_kernel();
+		result = NFS_PROTO(inode)->read(&rdata);
+		unlock_kernel();
+
+		if (result < 0) {
+			if (result == -EISDIR)
+				result = -EINVAL;
+			return result;
+		}
 
-	kfree(pages);
+		tot_bytes += result;
+		count -= result;
+		file_offset += result;
+		user_addr += result;
+
+		if (rdata.res.eof)
+			break;
+
+		curpage += (rdata.args.pgbase + result) >> PAGE_SHIFT;
+	} while (count);
+
+	/* XXX: should we zero the rest of the user's buffer if we
+	 *      hit eof? */
+
+	return tot_bytes;
 }
 
 /**
- * nfs_iov2pagelist - convert an array of iovecs to a list of page requests
- * @inode: inode of target file
- * @cred: credentials of user who requested I/O
+ * nfs_direct_read - For each iov segment, map the user's buffer
+ *                   then generate read RPCs.
+ * @inode: target inode
+ * @cred: user's credential
  * @iov: array of vectors that define I/O buffer
- * offset: where in file to begin the read
+ * file_offset: offset in file to begin the operation
  * nr_segs: size of iovec array
- * @requests: append new page requests to this list head
+ *
+ * generic_file_direct_IO has already pushed out any non-direct
+ * writes so that this read will see them when we read from the
+ * server.
  */
 static int
-nfs_iov2pagelist(int rw, const struct inode *inode,
-		const struct rpc_cred *cred,
-		const struct iovec *iov, loff_t offset,
-		unsigned long nr_segs, struct list_head *requests)
+nfs_direct_read(struct inode *inode, struct rpc_cred *cred,
+		const struct iovec *iov, loff_t file_offset,
+		unsigned long nr_segs)
 {
-	unsigned seg;
 	int tot_bytes = 0;
-	struct page **pages;
+	unsigned long seg = 0;
 
-	/* for each iovec in the array... */
-	for (seg = 0; seg < nr_segs; seg++) {
-		const unsigned long user_addr =
-					(unsigned long) iov[seg].iov_base;
-		size_t bytes = iov[seg].iov_len;
-		unsigned int pg_offset = (user_addr & ~PAGE_MASK);
-		int page_count, page = 0;
-
-		page_count = nfs_get_user_pages(user_addr, bytes, &pages, rw);
-		if (page_count < 0) {
-			return page_count;
+	while ((seg < nr_segs) && (tot_bytes >= 0)) {
+		int result, page_count;
+		struct page **pages;
+		const struct iovec *vec = &iov[seg++];
+		unsigned long user_addr = (unsigned long) vec->iov_base;
+		size_t size = vec->iov_len;
+
+		page_count = nfs_get_user_pages(READ, user_addr, size, &pages);
+		if (page_count < 0) {
+			nfs_free_user_pages(pages, 0, 0);
+			return page_count;
+		}
+
+		result = nfs_direct_read_seg(inode, cred, user_addr, size,
+				file_offset, pages, page_count);
+		if (result < 0)
+			tot_bytes = result;
+		else {
+			tot_bytes += result;
+			file_offset += result;
 		}
 
-		/* ...build as many page requests as required */
-		while (bytes > 0) {
-			struct nfs_page *new;
-			const unsigned int pg_bytes = (bytes > PAGE_SIZE) ?
-						PAGE_SIZE : bytes;
-
-			new = nfs_create_request((struct rpc_cred *) cred,
-						 (struct inode *) inode,
-						 pages[page],
-						 pg_offset, pg_bytes);
-			if (IS_ERR(new)) {
-				nfs_free_user_pages(pages, page_count);
-				nfs_release_list(requests);
-				return PTR_ERR(new);
-			}
-			new->wb_index = offset;
-			nfs_list_add_request(new, requests);
-
-			/* after the first page */
-			pg_offset = 0;
-			offset += PAGE_SIZE;
-			tot_bytes += pg_bytes;
-			bytes -= pg_bytes;
-			page++;
+		/* the server's data landed in these pages: dirty, then unpin */
+		nfs_free_user_pages(pages, page_count, 1);
+	}
+
+	return tot_bytes;
+}
+
+/**
+ * nfs_direct_write_seg - Write out one iov segment.  Generate separate
+ *                         write RPCs for each "wsize" bytes, then commit.
+ * @inode: target inode
+ * @cred: user's credential
+ * user_addr: starting address of this segment of user's buffer
+ * count: size of this segment
+ * file_offset: offset in file to begin the operation
+ * @pages: array of addresses of page structs defining user's buffer
+ * nr_pages: size of pages array
+ */
+static int
+nfs_direct_write_seg(struct inode *inode, struct rpc_cred *cred,
+		unsigned long user_addr, size_t count, loff_t file_offset,
+		struct page **pages, int nr_pages)
+{
+	const unsigned int wsize = NFS_SERVER(inode)->wsize;
+	const unsigned long save_user_addr = user_addr;
+	loff_t save_offset = file_offset;
+	size_t save_count = count;
+	int need_commit = 0;
+	int tot_bytes = 0;
+	int curpage = 0;
+	struct nfs_writeverf first_verf;
+	struct nfs_write_data	wdata = {
+		.cred		= cred,
+		.inode		= inode,
+		.args		= {
+			.fh		= NFS_FH(inode),
+		},
+		.res		= {
+			.fattr		= &wdata.fattr,
+			.verf		= &wdata.verf,
+		},
+	};
+
+	wdata.args.stable = NFS_UNSTABLE;
+	if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize)
+		wdata.args.stable = NFS_FILE_SYNC;
+
+retry:
+	do {
+		int request, result;
+
+		request = count;
+		if (count > wsize)
+			request = wsize;
+		wdata.args.count = request;
+		wdata.args.pgbase = user_addr & ~PAGE_MASK;
+		wdata.args.offset = file_offset;
+		wdata.args.pages = &pages[curpage];
+
+		dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
+			wdata.args.count, (long long) wdata.args.offset,
+			user_addr, wdata.args.pgbase, curpage);
+
+		lock_kernel();
+		result = NFS_PROTO(inode)->write(&wdata);
+		unlock_kernel();
+
+		if (result < 0)
+			return result;
+
+		if (!tot_bytes)
+			memcpy(&first_verf.verifier, &wdata.verf.verifier,
+								VERF_SIZE);
+		if (wdata.verf.committed != NFS_FILE_SYNC) {
+			need_commit = 1;
+			if (memcmp(&first_verf.verifier,
+					&wdata.verf.verifier, VERF_SIZE))
+				goto sync_retry;
 		}
 
-		/* don't release pages here -- I/O completion will do that */
-		nfs_free_user_pages(pages, 0);
+		tot_bytes += result;
+		count -= result;
+		file_offset += result;
+		user_addr += result;
+
+		curpage += (wdata.args.pgbase + result) >> PAGE_SHIFT;
+	} while (count);
+
+	/*
+	 * Commit data written so far, even in the event of an error
+	 */
+	if (need_commit) {
+		int result;
+
+		wdata.args.count = tot_bytes;
+		wdata.args.offset = save_offset;
+
+		lock_kernel();
+		result = NFS_PROTO(inode)->commit(&wdata);
+		unlock_kernel();
+
+		if (result < 0)
+			goto sync_retry;
+		if (memcmp(&first_verf.verifier, &wdata.verf.verifier,
+								VERF_SIZE))
+			goto sync_retry;
 	}
 
 	return tot_bytes;
+
+sync_retry:
+	/*
+	 * Redo the whole segment with stable writes: rewind every cursor
+	 * the loop advanced and drop any pending-commit state.
+	 */
+	wdata.args.stable = NFS_FILE_SYNC;
+	user_addr = save_user_addr;
+	file_offset = save_offset;
+	count = save_count;
+	curpage = 0;
+	tot_bytes = 0;
+	need_commit = 0;
+	goto retry;
 }
 
 /**
- * do_nfs_direct_IO - Read or write data without caching
- * @inode: inode of target file
- * @cred: credentials of user who requested I/O
+ * nfs_direct_write - For each iov segment, map the user's buffer
+ *                    then generate write and commit RPCs.
+ * @inode: target inode
+ * @cred: user's credential
  * @iov: array of vectors that define I/O buffer
- * offset: where in file to begin the read
+ * file_offset: offset in file to begin the operation
  * nr_segs: size of iovec array
  *
- * Break the passed-in iovec into a series of page-sized or smaller
- * requests, where each page is mapped for direct user-land I/O.
- *
- * For each of these pages, create an NFS page request and
- * append it to an automatic list of page requests.
- *
- * When all page requests have been queued, start the I/O on the
- * whole list.  The underlying routines coalesce the pages on the
- * list into a bunch of asynchronous "r/wsize" network requests.
- *
- * I/O completion automatically unmaps and releases the pages.
+ * Upon return, generic_file_direct_IO invalidates any cached pages
+ * that non-direct readers might access, so they will pick up these
+ * writes immediately.
  */
 static int
-do_nfs_direct_IO(int rw, const struct inode *inode,
-		const struct rpc_cred *cred, const struct iovec *iov,
-		loff_t offset, unsigned long nr_segs)
+nfs_direct_write(struct inode *inode, struct rpc_cred *cred,
+		const struct iovec *iov, loff_t file_offset,
+		unsigned long nr_segs)
 {
-	LIST_HEAD(requests);
-	int result, tot_bytes;
+	int tot_bytes = 0;
+	unsigned long seg = 0;
 
-	result = nfs_iov2pagelist(rw, inode, cred, iov, offset, nr_segs,
-								&requests);
-	if (result < 0)
-		return result;
-	tot_bytes = result;
+	while ((seg < nr_segs) && (tot_bytes >= 0)) {
+		int result, page_count;
+		struct page **pages;
+		const struct iovec *vec = &iov[seg++];
+		unsigned long user_addr = (unsigned long) vec->iov_base;
+		size_t size = vec->iov_len;
+
+		page_count = nfs_get_user_pages(WRITE, user_addr, size, &pages);
+		if (page_count < 0) {
+			nfs_free_user_pages(pages, 0, 0);
+			return page_count;
+		}
 
-	switch (rw) {
-	case READ:
-		if (IS_SYNC(inode) || (NFS_SERVER(inode)->rsize < PAGE_SIZE)) {
-			result = nfs_direct_read_sync(inode, cred, iov, offset, nr_segs);
-			break;
+		result = nfs_direct_write_seg(inode, cred, user_addr, size,
+				file_offset, pages, page_count);
+		if (result < 0)
+			tot_bytes = result;
+		else {
+			tot_bytes += result;
+			file_offset += result;
 		}
-		result = nfs_pagein_list(&requests, NFS_SERVER(inode)->rpages);
-		break;
-	case WRITE:
-		if (IS_SYNC(inode) || (NFS_SERVER(inode)->wsize < PAGE_SIZE))
-			result = nfs_direct_write_sync(inode, cred, iov, offset, nr_segs);
-		else
-			result = nfs_flush_list(&requests,
-						NFS_SERVER(inode)->wpages, FLUSH_WAIT);
 
-		/* invalidate cache so non-direct readers pick up changes */
-		invalidate_inode_pages((struct inode *) inode);
-		break;
-	default:
-		result = -EINVAL;
-		break;
+		nfs_free_user_pages(pages, page_count, 0);
 	}
 
-	if (result < 0)
-		return result;
 	return tot_bytes;
 }
 
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
  * rw: direction (read or write)
- * @file: file struct of target file
+ * @iocb: target I/O control block
  * @iov: array of vectors that define I/O buffer
- * offset: offset in file to begin the operation
+ * file_offset: offset in file to begin the operation
  * nr_segs: size of iovec array
  *
+ * Usually a file system implements direct I/O by calling out to
+ * blockdev_direct_IO.  The NFS client doesn't have a backing block
+ * device, so we do everything by hand instead.
+ *
  * The inode's i_sem is no longer held by the VFS layer before it calls
  * this function to do a write.
  */
 int
 nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-		loff_t offset, unsigned long nr_segs)
+		loff_t file_offset, unsigned long nr_segs)
 {
-	/* None of this works yet, so prevent it from compiling. */
-#if 0
-	int result;
+	int result = -EINVAL;
+	struct file *file = iocb->ki_filp;
 	struct dentry *dentry = file->f_dentry;
-	const struct inode *inode = dentry->d_inode->i_mapping->host;
-	const struct rpc_cred *cred = nfs_file_cred(file);
-#endif
-
-	dfprintk(VFS, "NFS: direct_IO(%s) (%s/%s) off/no(%Lu/%lu)\n",
-				((rw == READ) ? "READ" : "WRITE"),
-				dentry->d_parent->d_name.name,
-					dentry->d_name.name, offset, nr_segs);
+	struct inode *inode = dentry->d_inode;
+	struct rpc_cred *cred;
+
+	/*
+	 * No support for async yet
+	 */
+	if (!is_sync_kiocb(iocb))
+		goto out;
+
+	cred = get_rpccred(nfs_file_cred(file));
+	if (!cred)
+		cred = get_rpccred(NFS_I(inode)->mm_cred);
+
+	switch (rw) {
+	case READ:
+		dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n",
+				dentry->d_name.name, file_offset, nr_segs);
+
+		result = nfs_direct_read(inode, cred, iov,
+						file_offset, nr_segs);
+		break;
+	case WRITE:
+		dprintk("NFS: direct_IO(write) (%s) off/no(%Lu/%lu)\n",
+				dentry->d_name.name, file_offset, nr_segs);
 
-	result = do_nfs_direct_IO(rw, inode, cred, iov, offset, nr_segs);
+		result = nfs_direct_write(inode, cred, iov,
+						file_offset, nr_segs);
+		break;
+	default:
+		break;
+	}
 
-	dfprintk(VFS, "NFS: direct_IO result = %d\n", result);
+	if (cred)
+		put_rpccred(cred);
+out:
+	dprintk("NFS: direct_IO result=%d\n", result);
 	return result;
 }
 
_