From: Chuck Lever <cel@citi.umich.edu>

O_DIRECT|O_APPEND cannot possibly work on NFS, so NFS needs some way of
preventing the user from setting this combination.  We felt that the best
way of implementing this restriction is to allow the filesytem to implement
its own fcntl() handler.

This patch does, that, and provide the appropriate handler for NFS.

Additional details from Chuck:

Forgetting O_DIRECT for a moment, O_APPEND writes on NFS don't work in any
case when multiple clients are writing to a file, since an NFS client can
never guarantee it knows where the true end of file is 100% of the time. 
it works as expected iff only one client writes to an O_APPEND file at a
time.

Multi-client O_APPEND writing doesn't seem to be a problem for any
application I'm aware of.  Since it can be made to behave in the
multi-client case with careful application logic or by using file locking,
I don't think we should disallow it.

I want to drop the inode semaphore when doing NFS direct I/O because it is
synchronous; holding the i_sem means we reduce direct I/O concurrency to
one I/O per file at a time.  the important thing sct was worried about was
the case where a single client is writing with O_APPEND and O_DIRECT, and
we don't hold the i_sem during the write.

We must at least hold the i_sem when determining where the end of file is
to do the O_APPEND write.  In 2.6, I believe that is handled correctly in
the VFS layer, so this is not an issue for 2.6, right?


---

 25-akpm/fs/fcntl.c         |  136 +++++++++++++++++++++++----------------------
 25-akpm/fs/nfs/file.c      |   28 +++++++++
 25-akpm/include/linux/fs.h |    5 +
 3 files changed, 105 insertions(+), 64 deletions(-)

diff -puN fs/fcntl.c~file-operations-fcntl fs/fcntl.c
--- 25/fs/fcntl.c~file-operations-fcntl	2004-04-03 02:59:58.747421976 -0800
+++ 25-akpm/fs/fcntl.c	2004-04-03 02:59:58.754420912 -0800
@@ -282,80 +282,88 @@ void f_delown(struct file *filp)
 
 EXPORT_SYMBOL(f_delown);
 
-static long do_fcntl(unsigned int fd, unsigned int cmd,
-		     unsigned long arg, struct file * filp)
+long generic_file_fcntl(int fd, unsigned int cmd,
+			unsigned long arg, struct file *filp)
 {
 	long err = -EINVAL;
 
 	switch (cmd) {
-		case F_DUPFD:
-			get_file(filp);
-			err = dupfd(filp, arg);
-			break;
-		case F_GETFD:
-			err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
-			break;
-		case F_SETFD:
-			err = 0;
-			set_close_on_exec(fd, arg & FD_CLOEXEC);
-			break;
-		case F_GETFL:
-			err = filp->f_flags;
-			break;
-		case F_SETFL:
-			err = setfl(fd, filp, arg);
-			break;
-		case F_GETLK:
-			err = fcntl_getlk(filp, (struct flock __user *) arg);
-			break;
-		case F_SETLK:
-		case F_SETLKW:
-			err = fcntl_setlk(filp, cmd, (struct flock __user *) arg);
-			break;
-		case F_GETOWN:
-			/*
-			 * XXX If f_owner is a process group, the
-			 * negative return value will get converted
-			 * into an error.  Oops.  If we keep the
-			 * current syscall conventions, the only way
-			 * to fix this will be in libc.
-			 */
-			err = filp->f_owner.pid;
-			force_successful_syscall_return();
-			break;
-		case F_SETOWN:
-			err = f_setown(filp, arg, 1);
-			break;
-		case F_GETSIG:
-			err = filp->f_owner.signum;
-			break;
-		case F_SETSIG:
-			/* arg == 0 restores default behaviour. */
-			if (arg < 0 || arg > _NSIG) {
-				break;
-			}
-			err = 0;
-			filp->f_owner.signum = arg;
-			break;
-		case F_GETLEASE:
-			err = fcntl_getlease(filp);
-			break;
-		case F_SETLEASE:
-			err = fcntl_setlease(fd, filp, arg);
-			break;
-		case F_NOTIFY:
-			err = fcntl_dirnotify(fd, filp, arg);
-			break;
-		default:
+	case F_DUPFD:
+		get_file(filp);
+		err = dupfd(filp, arg);
+		break;
+	case F_GETFD:
+		err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
+		break;
+	case F_SETFD:
+		err = 0;
+		set_close_on_exec(fd, arg & FD_CLOEXEC);
+		break;
+	case F_GETFL:
+		err = filp->f_flags;
+		break;
+	case F_SETFL:
+		err = setfl(fd, filp, arg);
+		break;
+	case F_GETLK:
+		err = fcntl_getlk(filp, (struct flock __user *) arg);
+		break;
+	case F_SETLK:
+	case F_SETLKW:
+		err = fcntl_setlk(filp, cmd, (struct flock __user *) arg);
+		break;
+	case F_GETOWN:
+		/*
+		 * XXX If f_owner is a process group, the
+		 * negative return value will get converted
+		 * into an error.  Oops.  If we keep the
+		 * current syscall conventions, the only way
+		 * to fix this will be in libc.
+		 */
+		err = filp->f_owner.pid;
+		force_successful_syscall_return();
+		break;
+	case F_SETOWN:
+		err = f_setown(filp, arg, 1);
+		break;
+	case F_GETSIG:
+		err = filp->f_owner.signum;
+		break;
+	case F_SETSIG:
+		/* arg == 0 restores default behaviour. */
+		if (arg < 0 || arg > _NSIG) {
 			break;
+		}
+		err = 0;
+		filp->f_owner.signum = arg;
+		break;
+	case F_GETLEASE:
+		err = fcntl_getlease(filp);
+		break;
+	case F_SETLEASE:
+		err = fcntl_setlease(fd, filp, arg);
+		break;
+	case F_NOTIFY:
+		err = fcntl_dirnotify(fd, filp, arg);
+		break;
+	default:
+		break;
 	}
-
 	return err;
 }
+EXPORT_SYMBOL(generic_file_fcntl);
 
-asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg)
+static long do_fcntl(int fd, unsigned int cmd,
+			unsigned long arg, struct file *filp)
+{
+	if (filp->f_op && filp->f_op->fcntl)
+		return filp->f_op->fcntl(fd, cmd, arg, filp);
+	return generic_file_fcntl(fd, cmd, arg, filp);
+}
+
+asmlinkage long sys_fcntl(int fd, unsigned int cmd, unsigned long arg)
 {	
-	struct file * filp;
+	struct file *filp;
 	long err = -EBADF;
 
 	filp = fget(fd);
diff -puN fs/nfs/file.c~file-operations-fcntl fs/nfs/file.c
--- 25/fs/nfs/file.c~file-operations-fcntl	2004-04-03 02:59:58.749421672 -0800
+++ 25-akpm/fs/nfs/file.c	2004-04-03 02:59:58.755420760 -0800
@@ -33,6 +33,8 @@
 
 #define NFSDBG_FACILITY		NFSDBG_FILE
 
+static long nfs_file_fcntl(int fd, unsigned int cmd,
+			unsigned long arg, struct file *filp);
 static int nfs_file_open(struct inode *, struct file *);
 static int nfs_file_release(struct inode *, struct file *);
 static int  nfs_file_mmap(struct file *, struct vm_area_struct *);
@@ -55,6 +57,7 @@ struct file_operations nfs_file_operatio
 	.fsync		= nfs_fsync,
 	.lock		= nfs_lock,
 	.sendfile	= nfs_file_sendfile,
+	.fcntl		= nfs_file_fcntl,
 };
 
 struct inode_operations nfs_file_inode_operations = {
@@ -68,6 +71,28 @@ struct inode_operations nfs_file_inode_o
 # define IS_SWAPFILE(inode)	(0)
 #endif
 
+#define nfs_invalid_flags	(O_APPEND | O_DIRECT)
+
+/*
+ * Check for special cases that NFS doesn't support, and
+ * pass the rest to the generic fcntl function.
+ */
+static long
+nfs_file_fcntl(int fd, unsigned int cmd,
+		unsigned long arg, struct file *filp)
+{
+	switch (cmd) {
+	case F_SETFL:
+		if ((filp->f_flags & nfs_invalid_flags) == nfs_invalid_flags)
+			return -EINVAL;
+		break;
+	default:
+		break;
+	}
+
+	return generic_file_fcntl(fd, cmd, arg, filp);
+}
+
 /*
  * Open file
  */
@@ -78,6 +103,9 @@ nfs_file_open(struct inode *inode, struc
 	int (*open)(struct inode *, struct file *);
 	int res = 0;
 
+	if ((filp->f_flags & nfs_invalid_flags) == nfs_invalid_flags)
+		return -EINVAL;
+
 	lock_kernel();
 	/* Do NFSv4 open() call */
 	if ((open = server->rpc_ops->file_open) != NULL)
diff -puN include/linux/fs.h~file-operations-fcntl include/linux/fs.h
--- 25/include/linux/fs.h~file-operations-fcntl	2004-04-03 02:59:58.750421520 -0800
+++ 25-akpm/include/linux/fs.h	2004-04-03 02:59:58.757420456 -0800
@@ -621,6 +621,9 @@ extern struct list_head file_lock_list;
 
 #include <linux/fcntl.h>
 
+extern long generic_file_fcntl(int fd, unsigned int cmd,
+				unsigned long arg, struct file *filp);
+
 extern int fcntl_getlk(struct file *, struct flock __user *);
 extern int fcntl_setlk(struct file *, unsigned int, struct flock __user *);
 
@@ -830,6 +833,8 @@ struct file_operations {
 	ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void __user *);
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+	long (*fcntl)(int fd, unsigned int cmd,
+			unsigned long arg, struct file *filp);
 };
 
 struct inode_operations {

_