From: "Michael S. Tsirkin" The patch introduces two new methods (ioctl_native and ioctl_compat): ioctl_native is called on native ioctl syscall, without BKL being taken, and is, in that respect, equivalent to Ingo's unlocked_ioctl (which is why it conflicts). ioctl_compat is called on compat (i.e. 32 bit app on 64 bit OS) ioctl, again without BKL being taken. If a new call is not defined, default to the old behaviour. (It should be possible for me to build a patch that applies on top of Ingo's unlocked_ioctl, if its really needed let me know and I'll look at it the next week.) There was an additional motivation for my patch: As noted by Juergen Kreileder, the compat hash does not work for ioctls that encode additional information in the command, like this: #define EVIOCGBIT(ev,len) _IOC(_IOC_READ, 'E', 0x20 + ev, len) Signed-off-by: Michael Tsirkin Signed-off-by: Andrew Morton --- 25-akpm/Documentation/filesystems/Locking | 9 +++ 25-akpm/fs/compat.c | 24 +++++++--- 25-akpm/fs/ioctl.c | 68 +++++++++++++++++++++--------- 25-akpm/include/linux/fs.h | 24 ++++++++++ 25-akpm/include/linux/ioctl.h | 8 +++ 5 files changed, 105 insertions(+), 28 deletions(-) diff -puN Documentation/filesystems/Locking~ioctl-rework Documentation/filesystems/Locking --- 25/Documentation/filesystems/Locking~ioctl-rework Thu Dec 16 15:48:31 2004 +++ 25-akpm/Documentation/filesystems/Locking Thu Dec 16 15:48:31 2004 @@ -350,6 +350,10 @@ prototypes: unsigned int (*poll) (struct file *, struct poll_table_struct *); int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); + long (*ioctl_native) (struct inode *, struct file *, unsigned int, + unsigned long); + long (*ioctl_compat) (struct inode *, struct file *, unsigned int, + unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *); @@ -383,6 +387,8 @@ aio_write: no readdir: no poll: no ioctl: yes (see below) +ioctl_native: no (see below) +ioctl_compat: no (see below) mmap: no open: maybe (see below) flush: no @@ -428,6 +434,9 @@ move ->readdir() to inode_operations and anything that resembles union-mount we won't have a struct file for all components. And there are other reasons why the current interface is a mess... +->ioctl() on regular files is superceded by the ->ioctl_native() and +->ioctl_compat() pair. The lock is not taken for these new calls. + ->read on directories probably must go away - we should just enforce -EISDIR in sys_read() and friends. diff -puN fs/compat.c~ioctl-rework fs/compat.c --- 25/fs/compat.c~ioctl-rework Thu Dec 16 15:48:31 2004 +++ 25-akpm/fs/compat.c Thu Dec 16 15:48:31 2004 @@ -401,16 +401,21 @@ asmlinkage long compat_sys_ioctl(unsigne unsigned long arg) { struct file * filp; - int error = -EBADF; + long error = -EBADF; struct ioctl_trans *t; + int fput_needed; - filp = fget(fd); - if(!filp) + filp = fget_light(fd, &fput_needed); + if (!filp) goto out2; - if (!filp->f_op || !filp->f_op->ioctl) { - error = sys_ioctl (fd, cmd, arg); + if (!std_sys_ioctl(fd, cmd, arg, filp, &error)) goto out; + else if (filp->f_op && filp->f_op->ioctl_compat) { + error = filp->f_op->ioctl_compat(filp->f_dentry->d_inode, + filp, cmd, arg); + if (error != -ENOIOCTLCMD) + goto out; } down_read(&ioctl32_sem); @@ -425,9 +430,12 @@ asmlinkage long compat_sys_ioctl(unsigne error = t->handler(fd, cmd, arg, filp); unlock_kernel(); up_read(&ioctl32_sem); - } else { + } else if (filp->f_op && filp->f_op->ioctl) { up_read(&ioctl32_sem); - error = sys_ioctl(fd, cmd, arg); + lock_kernel(); + error = filp->f_op->ioctl(filp->f_dentry->d_inode, + filp, cmd, arg); + unlock_kernel(); } } else { up_read(&ioctl32_sem); @@ -466,7 +474,7 @@ asmlinkage long compat_sys_ioctl(unsigne } } out: - fput(filp); + fput_light(filp, fput_needed); out2: return error; } diff -puN fs/ioctl.c~ioctl-rework fs/ioctl.c --- 25/fs/ioctl.c~ioctl-rework Thu Dec 16 15:48:31 2004 +++ 25-akpm/fs/ioctl.c Thu Dec 16 15:48:31 2004 @@ -36,7 +36,9 @@ static int file_ioctl(struct file *filp, if ((error = get_user(block, p)) != 0) return error; + lock_kernel(); res = mapping->a_ops->bmap(mapping, block); + unlock_kernel(); return put_user(res, p); } case FIGETBSZ: @@ -46,29 +48,21 @@ static int file_ioctl(struct file *filp, case FIONREAD: return put_user(i_size_read(inode) - filp->f_pos, p); } - if (filp->f_op && filp->f_op->ioctl) - return filp->f_op->ioctl(inode, filp, cmd, arg); return -ENOTTY; } -asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct file * filp; +EXPORT_SYMBOL(std_sys_ioctl); +int std_sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, + struct file *filp, long *status) +{ unsigned int flag; - int on, error = -EBADF; - - filp = fget(fd); - if (!filp) - goto out; + int on, error, unknown = 0; error = security_file_ioctl(filp, cmd, arg); - if (error) { - fput(filp); + if (error) goto out; - } - lock_kernel(); switch (cmd) { case FIOCLEX: set_close_on_exec(fd, 1); @@ -100,8 +94,11 @@ asmlinkage long sys_ioctl(unsigned int f /* Did FASYNC state change ? */ if ((flag ^ filp->f_flags) & FASYNC) { - if (filp->f_op && filp->f_op->fasync) + if (filp->f_op && filp->f_op->fasync) { + lock_kernel(); error = filp->f_op->fasync(fd, filp, on); + unlock_kernel(); + } else error = -ENOTTY; } if (error != 0) @@ -125,15 +122,46 @@ asmlinkage long sys_ioctl(unsigned int f break; default: error = -ENOTTY; - if (S_ISREG(filp->f_dentry->d_inode->i_mode)) + if (S_ISREG(filp->f_dentry->d_inode->i_mode)) { error = file_ioctl(filp, cmd, arg); - else if (filp->f_op && filp->f_op->ioctl) - error = filp->f_op->ioctl(filp->f_dentry->d_inode, filp, cmd, arg); + } + if (error == -ENOTTY) { + unknown = 1; + goto out; + } + break; + } +out: + *status = error; + return unknown; +} + +asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) +{ + struct file *filp; + long error = -EBADF; + int fput_needed; + + filp = fget_light(fd, &fput_needed); + if (!filp) + goto out2; + + if (!std_sys_ioctl(fd, cmd, arg, filp, &error)) + goto out; + + if (filp->f_op && filp->f_op->ioctl_native) + error = filp->f_op->ioctl_native(filp->f_dentry->d_inode, + filp, cmd, arg); + else if (filp->f_op && filp->f_op->ioctl) { + lock_kernel(); + error = filp->f_op->ioctl(filp->f_dentry->d_inode, + filp, cmd, arg); + unlock_kernel(); } - unlock_kernel(); - fput(filp); out: + fput_light(filp, fput_needed); +out2: return error; } diff -puN include/linux/fs.h~ioctl-rework include/linux/fs.h --- 25/include/linux/fs.h~ioctl-rework Thu Dec 16 15:48:31 2004 +++ 25-akpm/include/linux/fs.h Thu Dec 16 15:48:31 2004 @@ -907,6 +907,12 @@ typedef struct { typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long); +/* These macros are for out of kernel modules to test that + * the kernel supports the ioctl_native and ioctl_compat + * fields in struct file_operations. */ +#define HAVE_IOCTL_COMPAT 1 +#define HAVE_IOCTL_NATIVE 1 + /* * NOTE: * read, write, poll, fsync, readv, writev can be called @@ -922,6 +928,24 @@ struct file_operations { int (*readdir) (struct file *, void *, filldir_t); unsigned int (*poll) (struct file *, struct poll_table_struct *); int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); + /* The two calls ioctl_native and ioctl_compat described below + * can be used as a replacement for the ioctl call above. They + * take precedence over ioctl: thus if they are set, ioctl is + * not used. Unlike ioctl, BKL is not taken: drivers manage + * their own locking. */ + + /* If ioctl_native is set, it is used instead of ioctl for + * native ioctl syscalls. + * Note that the standard glibc ioctl trims the return code to + * type int, so dont try to put a 64 bit value there. + */ + long (*ioctl_native) (struct inode *, struct file *, unsigned int, unsigned long); + /* If ioctl_compat is set, it is used for a 32 bit compatible + * ioctl (i.e. a 32 bit binary running on a 64 bit OS). + * Return -ENOIOCTLCMD if you dont handle it. + * Note that only the low 32 bit of the return code are passed + * to the user-space application. */ + long (*ioctl_compat) (struct inode *, struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *); diff -puN include/linux/ioctl.h~ioctl-rework include/linux/ioctl.h --- 25/include/linux/ioctl.h~ioctl-rework Thu Dec 16 15:48:31 2004 +++ 25-akpm/include/linux/ioctl.h Thu Dec 16 15:48:31 2004 @@ -3,5 +3,13 @@ #include +/* Handles standard ioctl commands, and returns the result in status. + Does nothing and returns non-zero if cmd is not one of the standard commands. +*/ + +struct file; +int std_sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, + struct file *filp, long *status); + #endif /* _LINUX_IOCTL_H */ _