Merge tag 'io_uring-6.6-2023-09-08' of git://git.kernel.dk/linux

Pull io_uring fixes from Jens Axboe: "A few fixes that should go into the 6.6-rc merge window: - Fix for a regression this merge window caused by the SQPOLL affinity patch, where we can race with SQPOLL thread shutdown and cause an oops when trying to set affinity (Gabriel) - Fix for a regression this merge window where fdinfo reading with for a ring setup with IORING_SETUP_NO_SQARRAY will attempt to deference the non-existing SQ ring array (me) - Add the patch that allows more finegrained control over who can use io_uring (Matteo) - Locking fix for a regression added this merge window for IOPOLL overflow (Pavel) - IOPOLL fix for stable, breaking our loop if helper threads are exiting (Pavel) Also had a fix for unreaped iopoll requests from io-wq from Ming, but we found an issue with that and hence it got reverted. Will get this sorted for a future rc" * tag 'io_uring-6.6-2023-09-08' of git://git.kernel.dk/linux: Revert "io_uring: fix IO hang in io_wq_put_and_exit from do_exit()" io_uring: fix unprotected iopoll overflow io_uring: break out of iowq iopoll on teardown io_uring: add a sysctl to disable io_uring system-wide io_uring/fdinfo: only print ->sq_array[] if it's there io_uring: fix IO hang in io_wq_put_and_exit from do_exit() io_uring: Don't set affinity on a dying sqpoll thread
author: Linus Torvalds <torvalds@linux-foundation.org> 2023-09-08 21:32:28 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2023-09-08 21:32:28 -0700
commit: 7ccc3ebf0c575728bff2d3cb4719ccd84aa186ab (patch)
tree: 43eeb987f3aa5af1bc061801f40de383f464e813
parent: 32bf43e4efdb87e0f7e90ba3883e07b8522322ad (diff)
parent: 023464fe33a53d7e3fa0a1967a2adcb17e5e40e3 (diff)
download: linux-7ccc3ebf0c575728bff2d3cb4719ccd84aa186ab.tar.gz
6 files changed, 99 insertions, 3 deletions
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 8019103aac1099..cf33de56da27dc 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -450,6 +450,35 @@ this allows system administrators to override the
 ``IA64_THREAD_UAC_NOPRINT`` ``prctl`` and avoid logs being flooded.
 
 
+io_uring_disabled
+=================
+
+Prevents all processes from creating new io_uring instances. Enabling this
+shrinks the kernel's attack surface.
+
+= ======================================================================
+0 All processes can create io_uring instances as normal. This is the
+  default setting.
+1 io_uring creation is disabled (io_uring_setup() will fail with
+  -EPERM) for unprivileged processes not in the io_uring_group group.
+  Existing io_uring instances can still be used.  See the
+  documentation for io_uring_group for more information.
+2 io_uring creation is disabled for all processes. io_uring_setup()
+  always fails with -EPERM. Existing io_uring instances can still be
+  used.
+= ======================================================================
+
+
+io_uring_group
+==============
+
+When io_uring_disabled is set to 1, a process must either be
+privileged (CAP_SYS_ADMIN) or be in the io_uring_group group in order
+to create an io_uring instance.  If io_uring_group is set to -1 (the
+default), only processes with the CAP_SYS_ADMIN capability may create
+io_uring instances.
+
+
 kexec_load_disabled
 ===================
 
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 300455b4bc122b..c5367887541642 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -93,6 +93,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
 		struct io_uring_sqe *sqe;
 		unsigned int sq_idx;
 
+		if (ctx->flags & IORING_SETUP_NO_SQARRAY)
+			break;
 		sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
 		if (sq_idx > sq_mask)
 			continue;
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 62f345587df547..1ecc8c7487682c 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -174,6 +174,16 @@ static void io_worker_ref_put(struct io_wq *wq)
 		complete(&wq->worker_done);
 }
 
+bool io_wq_worker_stopped(void)
+{
+	struct io_worker *worker = current->worker_private;
+
+	if (WARN_ON_ONCE(!io_wq_current_is_worker()))
+		return true;
+
+	return test_bit(IO_WQ_BIT_EXIT, &worker->wq->state);
+}
+
 static void io_worker_cancel_cb(struct io_worker *worker)
 {
 	struct io_wq_acct *acct = io_wq_get_acct(worker);
diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
index 06d9ca90c5771d..2b2a6406dd8ee8 100644
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@@ -52,6 +52,7 @@ void io_wq_hash_work(struct io_wq_work *work, void *val);
 
 int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask);
 int io_wq_max_workers(struct io_wq *wq, int *new_count);
+bool io_wq_worker_stopped(void);
 
 static inline bool io_wq_is_hashed(struct io_wq_work *work)
 {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e7675355048dbf..783ed0fff71b58 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -150,6 +150,31 @@ static void io_queue_sqe(struct io_kiocb *req);
 
 struct kmem_cache *req_cachep;
 
+static int __read_mostly sysctl_io_uring_disabled;
+static int __read_mostly sysctl_io_uring_group = -1;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kernel_io_uring_disabled_table[] = {
+	{
+		.procname	= "io_uring_disabled",
+		.data		= &sysctl_io_uring_disabled,
+		.maxlen		= sizeof(sysctl_io_uring_disabled),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_TWO,
+	},
+	{
+		.procname	= "io_uring_group",
+		.data		= &sysctl_io_uring_group,
+		.maxlen		= sizeof(gid_t),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{},
+};
+#endif
+
 struct sock *io_uring_get_socket(struct file *file)
 {
 #if defined(CONFIG_UNIX)
@@ -883,7 +908,7 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
 		struct io_uring_cqe *cqe = &ctx->completion_cqes[i];
 
 		if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
-			if (ctx->task_complete) {
+			if (ctx->lockless_cq) {
 				spin_lock(&ctx->completion_lock);
 				io_cqring_event_overflow(ctx, cqe->user_data,
 							cqe->res, cqe->flags, 0, 0);
@@ -1541,7 +1566,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 
 		if (!(req->flags & REQ_F_CQE_SKIP) &&
 		    unlikely(!io_fill_cqe_req(ctx, req))) {
-			if (ctx->task_complete) {
+			if (ctx->lockless_cq) {
 				spin_lock(&ctx->completion_lock);
 				io_req_cqe_overflow(req);
 				spin_unlock(&ctx->completion_lock);
@@ -1950,6 +1975,8 @@ fail:
 		if (!needs_poll) {
 			if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
 				break;
+			if (io_wq_worker_stopped())
+				break;
 			cond_resched();
 			continue;
 		}
@@ -4038,9 +4065,30 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 	return io_uring_create(entries, &p, params);
 }
 
+static inline bool io_uring_allowed(void)
+{
+	int disabled = READ_ONCE(sysctl_io_uring_disabled);
+	kgid_t io_uring_group;
+
+	if (disabled == 2)
+		return false;
+
+	if (disabled == 0 || capable(CAP_SYS_ADMIN))
+		return true;
+
+	io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group);
+	if (!gid_valid(io_uring_group))
+		return false;
+
+	return in_group_p(io_uring_group);
+}
+
 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
 		struct io_uring_params __user *, params)
 {
+	if (!io_uring_allowed())
+		return -EPERM;
+
 	return io_uring_setup(entries, params);
 }
 
@@ -4634,6 +4682,10 @@ static int __init io_uring_init(void)
 				offsetof(struct io_kiocb, cmd.data),
 				sizeof_field(struct io_kiocb, cmd.data), NULL);
 
+#ifdef CONFIG_SYSCTL
+	register_sysctl_init("kernel", kernel_io_uring_disabled_table);
+#endif
+
 	return 0;
 };
 __initcall(io_uring_init);
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index ee2d2c687fdaa9..bd6c2c7959a5bf 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -430,7 +430,9 @@ __cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx,
 
 	if (sqd) {
 		io_sq_thread_park(sqd);
-		ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
+		/* Don't set affinity for a dying thread */
+		if (sqd->thread)
+			ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
 		io_sq_thread_unpark(sqd);
 	}
author	Linus Torvalds <torvalds@linux-foundation.org>	2023-09-08 21:32:28 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2023-09-08 21:32:28 -0700
commit	7ccc3ebf0c575728bff2d3cb4719ccd84aa186ab (patch)
tree	43eeb987f3aa5af1bc061801f40de383f464e813
parent	32bf43e4efdb87e0f7e90ba3883e07b8522322ad (diff)
parent	023464fe33a53d7e3fa0a1967a2adcb17e5e40e3 (diff)
download	linux-7ccc3ebf0c575728bff2d3cb4719ccd84aa186ab.tar.gz