aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-12-12 19:03:10 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2022-12-12 19:03:10 -0800
commitcf619f891971bfac659ac64968f8c35db605c884 (patch)
tree838e0d623064b8ae813b57a0a7842b4770a40e32
parent6a518afcc2066732e6c5c24281ce017bbbd85506 (diff)
parent8d84e39d76bd83474b26cb44f4b338635676e7e8 (diff)
downloadlinux-cf619f891971bfac659ac64968f8c35db605c884.tar.gz
Merge tag 'fs.ovl.setgid.v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/idmapping
Pull setgid inheritance updates from Christian Brauner: "This contains the work to make setgid inheritance consistent between modifying a file and when changing ownership or mode as this has been a repeated source of very subtle bugs. The gist is that we perform the same permission checks in the write path as we do in the ownership and mode changing paths after this series where we're currently doing different things. We've already made setgid inheritance a lot more consistent and reliable in the last releases by moving setgid stripping from the individual filesystems up into the vfs. This aims to make the logic even more consistent and easier to understand and also to fix long-standing overlayfs setgid inheritance bugs. Miklos was nice enough to just let me carry the trivial overlayfs patches from Amir too. Below is a more detailed explanation how the current difference in setgid handling lead to very subtle bugs exemplified via overlayfs which is a victim of the current rules. I hope this explains why I think taking the regression risk here is worth it. A long while ago I found a few setgid inheritance bugs in overlayfs in the write path in certain conditions. Amir recently picked this back up in [1] and I jumped on board to fix this more generally. On the surface all that overlayfs would need to fix setgid inheritance would be to call file_remove_privs() or file_modified() but actually that isn't enough because the setgid inheritance api is wildly inconsistent in that area. Before this pr setgid stripping in file_remove_privs()'s old should_remove_suid() helper was inconsistent with other parts of the vfs. Specifically, it only raises ATTR_KILL_SGID if the inode is S_ISGID and S_IXGRP but not if the inode isn't in the caller's groups and the caller isn't privileged over the inode although we require this already in setattr_prepare() and setattr_copy() and so all filesystem implement this requirement implicitly because they have to use setattr_{prepare,copy}() anyway. But the inconsistency shows up in setgid stripping bugs for overlayfs in xfstests (e.g., generic/673, generic/683, generic/685, generic/686, generic/687). For example, we test whether suid and setgid stripping works correctly when performing various write-like operations as an unprivileged user (fallocate, reflink, write, etc.): echo "Test 1 - qa_user, non-exec file $verb" setup_testfile chmod a+rws $junk_file commit_and_check "$qa_user" "$verb" 64k 64k The test basically creates a file with 6666 permissions. While the file has the S_ISUID and S_ISGID bits set it does not have the S_IXGRP set. On a regular filesystem like xfs what will happen is: sys_fallocate() -> vfs_fallocate() -> xfs_file_fallocate() -> file_modified() -> __file_remove_privs() -> dentry_needs_remove_privs() -> should_remove_suid() -> __remove_privs() newattrs.ia_valid = ATTR_FORCE | kill; -> notify_change() -> setattr_copy() In should_remove_suid() we can see that ATTR_KILL_SUID is raised unconditionally because the file in the test has S_ISUID set. But we also see that ATTR_KILL_SGID won't be set because while the file is S_ISGID it is not S_IXGRP (see above) which is a condition for ATTR_KILL_SGID being raised. So by the time we call notify_change() we have attr->ia_valid set to ATTR_KILL_SUID | ATTR_FORCE. Now notify_change() sees that ATTR_KILL_SUID is set and does: ia_valid = attr->ia_valid |= ATTR_MODE attr->ia_mode = (inode->i_mode & ~S_ISUID); which means that when we call setattr_copy() later we will definitely update inode->i_mode. Note that attr->ia_mode still contains S_ISGID. Now we call into the filesystem's ->setattr() inode operation which will end up calling setattr_copy(). Since ATTR_MODE is set we will hit: if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); if (!vfsgid_in_group_p(vfsgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; } and since the caller in the test is neither capable nor in the group of the inode the S_ISGID bit is stripped. But assume the file isn't suid then ATTR_KILL_SUID won't be raised which has the consequence that neither the setgid nor the suid bits are stripped even though it should be stripped because the inode isn't in the caller's groups and the caller isn't privileged over the inode. If overlayfs is in the mix things become a bit more complicated and the bug shows up more clearly. When e.g., ovl_setattr() is hit from ovl_fallocate()'s call to file_remove_privs() then ATTR_KILL_SUID and ATTR_KILL_SGID might be raised but because the check in notify_change() is questioning the ATTR_KILL_SGID flag again by requiring S_IXGRP for it to be stripped the S_ISGID bit isn't removed even though it should be stripped: sys_fallocate() -> vfs_fallocate() -> ovl_fallocate() -> file_remove_privs() -> dentry_needs_remove_privs() -> should_remove_suid() -> __remove_privs() newattrs.ia_valid = ATTR_FORCE | kill; -> notify_change() -> ovl_setattr() /* TAKE ON MOUNTER'S CREDS */ -> ovl_do_notify_change() -> notify_change() /* GIVE UP MOUNTER'S CREDS */ /* TAKE ON MOUNTER'S CREDS */ -> vfs_fallocate() -> xfs_file_fallocate() -> file_modified() -> __file_remove_privs() -> dentry_needs_remove_privs() -> should_remove_suid() -> __remove_privs() newattrs.ia_valid = attr_force | kill; -> notify_change() The fix for all of this is to make file_remove_privs()'s should_remove_suid() helper perform the same checks as we already require in setattr_prepare() and setattr_copy() and have notify_change() not pointlessly requiring S_IXGRP again. It doesn't make any sense in the first place because the caller must calculate the flags via should_remove_suid() anyway which would raise ATTR_KILL_SGID Note that some xfstests will now fail as these patches will cause the setgid bit to be lost in certain conditions for unprivileged users modifying a setgid file when they would've been kept otherwise. I think this risk is worth taking and I explained and mentioned this multiple times on the list [2]. Enforcing the rules consistently across write operations and chmod/chown will lead to losing the setgid bit in cases were it might've been retained before. While I've mentioned this a few times but it's worth repeating just to make sure that this is understood. For the sake of maintainability, consistency, and security this is a risk worth taking. If we really see regressions for workloads the fix is to have special setgid handling in the write path again with different semantics from chmod/chown and possibly additional duct tape for overlayfs. I'll update the relevant xfstests with if you should decide to merge this second setgid cleanup. Before that people should be aware that there might be failures for fstests where unprivileged users modify a setgid file" Link: https://lore.kernel.org/linux-fsdevel/20221003123040.900827-1-amir73il@gmail.com [1] Link: https://lore.kernel.org/linux-fsdevel/20221122142010.zchf2jz2oymx55qi@wittgenstein [2] * tag 'fs.ovl.setgid.v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/idmapping: fs: use consistent setgid checks in is_sxid() ovl: remove privs in ovl_fallocate() ovl: remove privs in ovl_copyfile() attr: use consistent sgid stripping checks attr: add setattr_should_drop_sgid() fs: move should_remove_suid() attr: add in_group_or_capable()
-rw-r--r--Documentation/trace/ftrace.rst2
-rw-r--r--fs/attr.c74
-rw-r--r--fs/fuse/file.c2
-rw-r--r--fs/inode.c64
-rw-r--r--fs/internal.h10
-rw-r--r--fs/ocfs2/file.c4
-rw-r--r--fs/open.c8
-rw-r--r--fs/overlayfs/file.c28
-rw-r--r--include/linux/fs.h4
9 files changed, 140 insertions, 56 deletions
diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index 60bceb018d6a93..21f01d32c95985 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -2940,7 +2940,7 @@ Produces::
bash-1994 [000] .... 4342.324898: ima_get_action <-process_measurement
bash-1994 [000] .... 4342.324898: ima_match_policy <-ima_get_action
bash-1994 [000] .... 4342.324899: do_truncate <-do_last
- bash-1994 [000] .... 4342.324899: should_remove_suid <-do_truncate
+ bash-1994 [000] .... 4342.324899: setattr_should_drop_suidgid <-do_truncate
bash-1994 [000] .... 4342.324899: notify_change <-do_truncate
bash-1994 [000] .... 4342.324900: current_fs_time <-notify_change
bash-1994 [000] .... 4342.324900: current_kernel_time <-current_fs_time
diff --git a/fs/attr.c b/fs/attr.c
index 1552a5f23d6b35..b45f30e516fad3 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,6 +18,70 @@
#include <linux/evm.h>
#include <linux/ima.h>
+#include "internal.h"
+
+/**
+ * setattr_should_drop_sgid - determine whether the setgid bit needs to be
+ * removed
+ * @mnt_userns: user namespace of the mount @inode was found from
+ * @inode: inode to check
+ *
+ * This function determines whether the setgid bit needs to be removed.
+ * We retain backwards compatibility and require setgid bit to be removed
+ * unconditionally if S_IXGRP is set. Otherwise we have the exact same
+ * requirements as setattr_prepare() and setattr_copy().
+ *
+ * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise.
+ */
+int setattr_should_drop_sgid(struct user_namespace *mnt_userns,
+ const struct inode *inode)
+{
+ umode_t mode = inode->i_mode;
+
+ if (!(mode & S_ISGID))
+ return 0;
+ if (mode & S_IXGRP)
+ return ATTR_KILL_SGID;
+ if (!in_group_or_capable(mnt_userns, inode,
+ i_gid_into_vfsgid(mnt_userns, inode)))
+ return ATTR_KILL_SGID;
+ return 0;
+}
+
+/**
+ * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to
+ * be dropped
+ * @mnt_userns: user namespace of the mount @inode was found from
+ * @inode: inode to check
+ *
+ * This function determines whether the set{g,u}id bits need to be removed.
+ * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the
+ * setgid bit needs to be removed ATTR_KILL_SGID is returned. If both
+ * set{g,u}id bits need to be removed the corresponding mask of both flags is
+ * returned.
+ *
+ * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits
+ * to remove, 0 otherwise.
+ */
+int setattr_should_drop_suidgid(struct user_namespace *mnt_userns,
+ struct inode *inode)
+{
+ umode_t mode = inode->i_mode;
+ int kill = 0;
+
+ /* suid always must be killed */
+ if (unlikely(mode & S_ISUID))
+ kill = ATTR_KILL_SUID;
+
+ kill |= setattr_should_drop_sgid(mnt_userns, inode);
+
+ if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
+ return kill;
+
+ return 0;
+}
+EXPORT_SYMBOL(setattr_should_drop_suidgid);
+
/**
* chown_ok - verify permissions to chown inode
* @mnt_userns: user namespace of the mount @inode was found from
@@ -140,8 +204,7 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,
vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
/* Also check the setgid bit! */
- if (!vfsgid_in_group_p(vfsgid) &&
- !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ if (!in_group_or_capable(mnt_userns, inode, vfsgid))
attr->ia_mode &= ~S_ISGID;
}
@@ -251,9 +314,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
inode->i_ctime = attr->ia_ctime;
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
- vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
- if (!vfsgid_in_group_p(vfsgid) &&
- !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ if (!in_group_or_capable(mnt_userns, inode,
+ i_gid_into_vfsgid(mnt_userns, inode)))
mode &= ~S_ISGID;
inode->i_mode = mode;
}
@@ -375,7 +437,7 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
}
}
if (ia_valid & ATTR_KILL_SGID) {
- if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+ if (mode & S_ISGID) {
if (!(ia_valid & ATTR_MODE)) {
ia_valid = attr->ia_valid |= ATTR_MODE;
attr->ia_mode = inode->i_mode;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 89f4741728ba34..c996c0ef8c632e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1313,7 +1313,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
return err;
if (fc->handle_killpriv_v2 &&
- should_remove_suid(file_dentry(file))) {
+ setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) {
goto writethrough;
}
diff --git a/fs/inode.c b/fs/inode.c
index 762feb8d0c6c3d..5ccc61fe8a1faf 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1949,40 +1949,12 @@ skip_update:
EXPORT_SYMBOL(touch_atime);
/*
- * The logic we want is
- *
- * if suid or (sgid and xgrp)
- * remove privs
- */
-int should_remove_suid(struct dentry *dentry)
-{
- umode_t mode = d_inode(dentry)->i_mode;
- int kill = 0;
-
- /* suid always must be killed */
- if (unlikely(mode & S_ISUID))
- kill = ATTR_KILL_SUID;
-
- /*
- * sgid without any exec bits is just a mandatory locking mark; leave
- * it alone. If some exec bits are set, it's a real sgid; kill it.
- */
- if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
- kill |= ATTR_KILL_SGID;
-
- if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
- return kill;
-
- return 0;
-}
-EXPORT_SYMBOL(should_remove_suid);
-
-/*
* Return mask of changes for notify_change() that need to be done as a
* response to write or truncate. Return 0 if nothing has to be changed.
* Negative value on error (change should be denied).
*/
-int dentry_needs_remove_privs(struct dentry *dentry)
+int dentry_needs_remove_privs(struct user_namespace *mnt_userns,
+ struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
int mask = 0;
@@ -1991,7 +1963,7 @@ int dentry_needs_remove_privs(struct dentry *dentry)
if (IS_NOSEC(inode))
return 0;
- mask = should_remove_suid(dentry);
+ mask = setattr_should_drop_suidgid(mnt_userns, inode);
ret = security_inode_need_killpriv(dentry);
if (ret < 0)
return ret;
@@ -2023,7 +1995,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags)
if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
return 0;
- kill = dentry_needs_remove_privs(dentry);
+ kill = dentry_needs_remove_privs(file_mnt_user_ns(file), dentry);
if (kill < 0)
return kill;
@@ -2485,6 +2457,28 @@ struct timespec64 current_time(struct inode *inode)
EXPORT_SYMBOL(current_time);
/**
+ * in_group_or_capable - check whether caller is CAP_FSETID privileged
+ * @mnt_userns: user namespace of the mount @inode was found from
+ * @inode: inode to check
+ * @vfsgid: the new/current vfsgid of @inode
+ *
+ * Check wether @vfsgid is in the caller's group list or if the caller is
+ * privileged with CAP_FSETID over @inode. This can be used to determine
+ * whether the setgid bit can be kept or must be dropped.
+ *
+ * Return: true if the caller is sufficiently privileged, false if not.
+ */
+bool in_group_or_capable(struct user_namespace *mnt_userns,
+ const struct inode *inode, vfsgid_t vfsgid)
+{
+ if (vfsgid_in_group_p(vfsgid))
+ return true;
+ if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ return true;
+ return false;
+}
+
+/**
* mode_strip_sgid - handle the sgid bit for non-directories
* @mnt_userns: User namespace of the mount the inode was created from
* @dir: parent directory inode
@@ -2505,11 +2499,9 @@ umode_t mode_strip_sgid(struct user_namespace *mnt_userns,
return mode;
if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID))
return mode;
- if (in_group_p(i_gid_into_mnt(mnt_userns, dir)))
- return mode;
- if (capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
+ if (in_group_or_capable(mnt_userns, dir,
+ i_gid_into_vfsgid(mnt_userns, dir)))
return mode;
-
return mode & ~S_ISGID;
}
EXPORT_SYMBOL(mode_strip_sgid);
diff --git a/fs/internal.h b/fs/internal.h
index e377eb7bbe7f2f..0c8812fe7ca465 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -150,7 +150,9 @@ extern int vfs_open(const struct path *, struct file *);
* inode.c
*/
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
-extern int dentry_needs_remove_privs(struct dentry *dentry);
+int dentry_needs_remove_privs(struct user_namespace *, struct dentry *dentry);
+bool in_group_or_capable(struct user_namespace *mnt_userns,
+ const struct inode *inode, vfsgid_t vfsgid);
/*
* fs-writeback.c
@@ -255,3 +257,9 @@ static inline ssize_t do_get_acl(struct user_namespace *mnt_userns,
#endif
ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos);
+
+/*
+ * fs/attr.c
+ */
+int setattr_should_drop_sgid(struct user_namespace *mnt_userns,
+ const struct inode *inode);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index af900aaa92752b..5c60b6bc85bf85 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1991,7 +1991,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
}
}
- if (file && should_remove_suid(file->f_path.dentry)) {
+ if (file && setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) {
ret = __ocfs2_write_remove_suid(inode, di_bh);
if (ret) {
mlog_errno(ret);
@@ -2279,7 +2279,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
* inode. There's also the dinode i_size state which
* can be lost via setattr during extending writes (we
* set inode->i_size at the end of a write. */
- if (should_remove_suid(dentry)) {
+ if (setattr_should_drop_suidgid(&init_user_ns, inode)) {
if (meta_level == 0) {
ocfs2_inode_unlock_for_extent_tree(inode,
&di_bh,
diff --git a/fs/open.c b/fs/open.c
index a81319b6177f69..9d0197db15e7b3 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -54,7 +54,7 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry,
}
/* Remove suid, sgid, and file capabilities on truncate too */
- ret = dentry_needs_remove_privs(dentry);
+ ret = dentry_needs_remove_privs(mnt_userns, dentry);
if (ret < 0)
return ret;
if (ret)
@@ -723,10 +723,10 @@ retry_deleg:
return -EINVAL;
if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
return -EINVAL;
- if (!S_ISDIR(inode->i_mode))
- newattrs.ia_valid |=
- ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
inode_lock(inode);
+ if (!S_ISDIR(inode->i_mode))
+ newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
+ setattr_should_drop_sgid(mnt_userns, inode);
/* Continue to send actual fs values, not the mount values. */
error = security_path_chown(
path,
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index a1a22f58ba183f..d066be3b9226ec 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -517,9 +517,16 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
const struct cred *old_cred;
int ret;
+ inode_lock(inode);
+ /* Update mode */
+ ovl_copyattr(inode);
+ ret = file_remove_privs(file);
+ if (ret)
+ goto out_unlock;
+
ret = ovl_real_fdget(file, &real);
if (ret)
- return ret;
+ goto out_unlock;
old_cred = ovl_override_creds(file_inode(file)->i_sb);
ret = vfs_fallocate(real.file, mode, offset, len);
@@ -530,6 +537,9 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
fdput(real);
+out_unlock:
+ inode_unlock(inode);
+
return ret;
}
@@ -567,14 +577,23 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
const struct cred *old_cred;
loff_t ret;
+ inode_lock(inode_out);
+ if (op != OVL_DEDUPE) {
+ /* Update mode */
+ ovl_copyattr(inode_out);
+ ret = file_remove_privs(file_out);
+ if (ret)
+ goto out_unlock;
+ }
+
ret = ovl_real_fdget(file_out, &real_out);
if (ret)
- return ret;
+ goto out_unlock;
ret = ovl_real_fdget(file_in, &real_in);
if (ret) {
fdput(real_out);
- return ret;
+ goto out_unlock;
}
old_cred = ovl_override_creds(file_inode(file_out)->i_sb);
@@ -603,6 +622,9 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
fdput(real_in);
fdput(real_out);
+out_unlock:
+ inode_unlock(inode_out);
+
return ret;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d1347311010c93..effa4a2e5098ae 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3133,7 +3133,7 @@ extern void __destroy_inode(struct inode *);
extern struct inode *new_inode_pseudo(struct super_block *sb);
extern struct inode *new_inode(struct super_block *sb);
extern void free_inode_nonrcu(struct inode *inode);
-extern int should_remove_suid(struct dentry *);
+extern int setattr_should_drop_suidgid(struct user_namespace *, struct inode *);
extern int file_remove_privs(struct file *);
/*
@@ -3564,7 +3564,7 @@ int __init list_bdev_fs_names(char *buf, size_t size);
static inline bool is_sxid(umode_t mode)
{
- return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP));
+ return mode & (S_ISUID | S_ISGID);
}
static inline int check_sticky(struct user_namespace *mnt_userns,