aboutsummaryrefslogtreecommitdiffstats
path: root/io_uring
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-11-01 11:09:19 -1000
committerLinus Torvalds <torvalds@linux-foundation.org>2023-11-01 11:09:19 -1000
commitffa059b262ba72571e7fefe7fa2b4ebb6776b277 (patch)
treed837908365664bc1a7192f89609546a60cff8e37 /io_uring
parentca995ce438cc641c47d4b8e4abeb1878a3d07c5f (diff)
parent6ce4a93dbb5bd93bc2bdf14da63f9360a4dcd6a1 (diff)
downloadlinux-ffa059b262ba72571e7fefe7fa2b4ebb6776b277.tar.gz
Merge tag 'for-6.7/io_uring-2023-10-30' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe: "This contains the core io_uring updates, of which there are not many, and adds support for using WAITID through io_uring and hence not needing to block on these kinds of events. Outside of that, tweaks to the legacy provided buffer handling and some cleanups related to cancelations for uring_cmd support" * tag 'for-6.7/io_uring-2023-10-30' of git://git.kernel.dk/linux: io_uring/poll: use IOU_F_TWQ_LAZY_WAKE for wakeups io_uring/kbuf: Use slab for struct io_buffer objects io_uring/kbuf: Allow the full buffer id space for provided buffers io_uring/kbuf: Fix check of BID wrapping in provided buffers io_uring/rsrc: cleanup io_pin_pages() io_uring: cancelable uring_cmd io_uring: retain top 8bits of uring_cmd flags for kernel internal use io_uring: add IORING_OP_WAITID support exit: add internal include file with helpers exit: add kernel_waitid_prepare() helper exit: move core of do_wait() into helper exit: abstract out should_wake helper for child_wait_callback() io_uring/rw: add support for IORING_OP_READ_MULTISHOT io_uring/rw: mark readv/writev as vectored in the opcode definition io_uring/rw: split io_read() into a helper
Diffstat (limited to 'io_uring')
-rw-r--r--io_uring/Makefile3
-rw-r--r--io_uring/cancel.c5
-rw-r--r--io_uring/io_uring.c43
-rw-r--r--io_uring/io_uring.h1
-rw-r--r--io_uring/kbuf.c58
-rw-r--r--io_uring/opdef.c24
-rw-r--r--io_uring/opdef.h2
-rw-r--r--io_uring/poll.c2
-rw-r--r--io_uring/rsrc.c37
-rw-r--r--io_uring/rw.c92
-rw-r--r--io_uring/rw.h2
-rw-r--r--io_uring/uring_cmd.c49
-rw-r--r--io_uring/waitid.c372
-rw-r--r--io_uring/waitid.h15
14 files changed, 649 insertions, 56 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 8cc8e5387a75e5..7bd64e44256736 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -7,5 +7,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
openclose.o uring_cmd.o epoll.o \
statx.o net.o msg_ring.o timeout.o \
sqpoll.o fdinfo.o tctx.o poll.o \
- cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
+ cancel.o kbuf.o rsrc.o rw.o opdef.o \
+ notif.o waitid.o
obj-$(CONFIG_IO_WQ) += io-wq.o
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 7b23607cf4afd9..eb77a51c5a79d0 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -15,6 +15,7 @@
#include "tctx.h"
#include "poll.h"
#include "timeout.h"
+#include "waitid.h"
#include "cancel.h"
struct io_cancel {
@@ -119,6 +120,10 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
if (ret != -ENOENT)
return ret;
+ ret = io_waitid_cancel(ctx, cd, issue_flags);
+ if (ret != -ENOENT)
+ return ret;
+
spin_lock(&ctx->completion_lock);
if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
ret = io_timeout_cancel(ctx, cd);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 8d1bc6cdfe712e..36ae5ac2b070bf 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -92,6 +92,7 @@
#include "cancel.h"
#include "net.h"
#include "notif.h"
+#include "waitid.h"
#include "timeout.h"
#include "poll.h"
@@ -338,7 +339,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
INIT_WQ_LIST(&ctx->iopoll_list);
- INIT_LIST_HEAD(&ctx->io_buffers_pages);
INIT_LIST_HEAD(&ctx->io_buffers_comp);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
@@ -348,8 +348,10 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->tctx_list);
ctx->submit_state.free_list.next = NULL;
INIT_WQ_LIST(&ctx->locked_free_list);
+ INIT_HLIST_HEAD(&ctx->waitid_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
+ INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
return ctx;
err:
kfree(ctx->cancel_table.hbs);
@@ -3276,6 +3278,37 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
return ret;
}
+static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
+ struct task_struct *task, bool cancel_all)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+ bool ret = false;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd,
+ hash_node) {
+ struct io_uring_cmd *cmd = io_kiocb_to_cmd(req,
+ struct io_uring_cmd);
+ struct file *file = req->file;
+
+ if (!cancel_all && req->task != task)
+ continue;
+
+ if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
+ /* ->sqe isn't available if no async data */
+ if (!req_has_async_data(req))
+ cmd->sqe = NULL;
+ file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL);
+ ret = true;
+ }
+ }
+ io_submit_flush_completions(ctx);
+
+ return ret;
+}
+
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all)
@@ -3323,6 +3356,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
ret |= io_cancel_defer_files(ctx, task, cancel_all);
mutex_lock(&ctx->uring_lock);
ret |= io_poll_remove_all(ctx, task, cancel_all);
+ ret |= io_waitid_remove_all(ctx, task, cancel_all);
+ ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
mutex_unlock(&ctx->uring_lock);
ret |= io_kill_timeouts(ctx, task, cancel_all);
if (task)
@@ -4686,6 +4721,9 @@ static int __init io_uring_init(void)
BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
+ /* top 8bits are for internal use */
+ BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
+
io_uring_optable_init();
/*
@@ -4701,6 +4739,9 @@ static int __init io_uring_init(void)
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
offsetof(struct io_kiocb, cmd.data),
sizeof_field(struct io_kiocb, cmd.data), NULL);
+ io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+ NULL);
#ifdef CONFIG_SYSCTL
register_sysctl_init("kernel", kernel_io_uring_disabled_table);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 0bc145614a6e66..dc6d779b452b9b 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -343,6 +343,7 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
}
extern struct kmem_cache *req_cachep;
+extern struct kmem_cache *io_buf_cachep;
static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx)
{
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 9123138aa9f48b..fea06810b43dbb 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -19,12 +19,17 @@
#define BGID_ARRAY 64
+/* BIDs are addressed by a 16-bit field in a CQE */
+#define MAX_BIDS_PER_BGID (1 << 16)
+
+struct kmem_cache *io_buf_cachep;
+
struct io_provide_buf {
struct file *file;
__u64 addr;
__u32 len;
__u32 bgid;
- __u16 nbufs;
+ __u32 nbufs;
__u16 bid;
};
@@ -255,6 +260,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
void io_destroy_buffers(struct io_ring_ctx *ctx)
{
struct io_buffer_list *bl;
+ struct list_head *item, *tmp;
+ struct io_buffer *buf;
unsigned long index;
int i;
@@ -270,12 +277,9 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
kfree(bl);
}
- while (!list_empty(&ctx->io_buffers_pages)) {
- struct page *page;
-
- page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
- list_del_init(&page->lru);
- __free_page(page);
+ list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
+ buf = list_entry(item, struct io_buffer, list);
+ kmem_cache_free(io_buf_cachep, buf);
}
}
@@ -289,7 +293,7 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
- if (!tmp || tmp > USHRT_MAX)
+ if (!tmp || tmp > MAX_BIDS_PER_BGID)
return -EINVAL;
memset(p, 0, sizeof(*p));
@@ -332,7 +336,7 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
- if (!tmp || tmp > USHRT_MAX)
+ if (!tmp || tmp > MAX_BIDS_PER_BGID)
return -E2BIG;
p->nbufs = tmp;
p->addr = READ_ONCE(sqe->addr);
@@ -352,17 +356,18 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
tmp = READ_ONCE(sqe->off);
if (tmp > USHRT_MAX)
return -E2BIG;
- if (tmp + p->nbufs >= USHRT_MAX)
+ if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
return -EINVAL;
p->bid = tmp;
return 0;
}
+#define IO_BUFFER_ALLOC_BATCH 64
+
static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
- struct io_buffer *buf;
- struct page *page;
- int bufs_in_page;
+ struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
+ int allocated;
/*
* Completions that don't happen inline (eg not under uring_lock) will
@@ -382,22 +387,25 @@ static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
/*
* No free buffers and no completion entries either. Allocate a new
- * page worth of buffer entries and add those to our freelist.
+ * batch of buffer entries and add those to our freelist.
*/
- page = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!page)
- return -ENOMEM;
-
- list_add(&page->lru, &ctx->io_buffers_pages);
- buf = page_address(page);
- bufs_in_page = PAGE_SIZE / sizeof(*buf);
- while (bufs_in_page) {
- list_add_tail(&buf->list, &ctx->io_buffers_cache);
- buf++;
- bufs_in_page--;
+ allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
+ ARRAY_SIZE(bufs), (void **) bufs);
+ if (unlikely(!allocated)) {
+ /*
+ * Bulk alloc is all-or-nothing. If we fail to get a batch,
+ * retry single alloc to be on the safe side.
+ */
+ bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
+ if (!bufs[0])
+ return -ENOMEM;
+ allocated = 1;
}
+ while (allocated)
+ list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);
+
return 0;
}
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 3b9c6489b8b6d2..aadcbf7136b000 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -33,6 +33,7 @@
#include "poll.h"
#include "cancel.h"
#include "rw.h"
+#include "waitid.h"
static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
{
@@ -63,6 +64,7 @@ const struct io_issue_def io_issue_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
+ .vectored = 1,
.prep = io_prep_rw,
.issue = io_read,
},
@@ -76,6 +78,7 @@ const struct io_issue_def io_issue_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
+ .vectored = 1,
.prep = io_prep_rw,
.issue = io_write,
},
@@ -428,9 +431,21 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_eopnotsupp_prep,
#endif
},
+ [IORING_OP_READ_MULTISHOT] = {
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ .pollin = 1,
+ .buffer_select = 1,
+ .audit_skip = 1,
+ .prep = io_read_mshot_prep,
+ .issue = io_read_mshot,
+ },
+ [IORING_OP_WAITID] = {
+ .prep = io_waitid_prep,
+ .issue = io_waitid,
+ },
};
-
const struct io_cold_def io_cold_defs[] = {
[IORING_OP_NOP] = {
.name = "NOP",
@@ -648,6 +663,13 @@ const struct io_cold_def io_cold_defs[] = {
.fail = io_sendrecv_fail,
#endif
},
+ [IORING_OP_READ_MULTISHOT] = {
+ .name = "READ_MULTISHOT",
+ },
+ [IORING_OP_WAITID] = {
+ .name = "WAITID",
+ .async_size = sizeof(struct io_waitid_async),
+ },
};
const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index c22c8696e749ba..9e5435ec27d00f 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -29,6 +29,8 @@ struct io_issue_def {
unsigned iopoll_queue : 1;
/* opcode specific path will handle ->async_data allocation if needed */
unsigned manual_alloc : 1;
+ /* vectored opcode, set if 1) vectored, and 2) handler needs to know */
+ unsigned vectored : 1;
int (*issue)(struct io_kiocb *, unsigned int);
int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 4c360ba8793a50..d38d05edb4fa26 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -370,7 +370,7 @@ static void __io_poll_execute(struct io_kiocb *req, int mask)
req->io_task_work.func = io_poll_task_func;
trace_io_uring_task_add(req, mask);
- io_req_task_work_add(req);
+ __io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
}
static inline void io_poll_execute(struct io_kiocb *req, int res)
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index d9c853d1058780..7034be555334d2 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1037,39 +1037,36 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
unsigned long start, end, nr_pages;
struct page **pages = NULL;
- int pret, ret = -ENOMEM;
+ int ret;
end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
start = ubuf >> PAGE_SHIFT;
nr_pages = end - start;
+ WARN_ON(!nr_pages);
pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
- goto done;
+ return ERR_PTR(-ENOMEM);
- ret = 0;
mmap_read_lock(current->mm);
- pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
- pages);
- if (pret == nr_pages)
+ ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages);
+ mmap_read_unlock(current->mm);
+
+ /* success, mapped all pages */
+ if (ret == nr_pages) {
*npages = nr_pages;
- else
- ret = pret < 0 ? pret : -EFAULT;
+ return pages;
+ }
- mmap_read_unlock(current->mm);
- if (ret) {
+ /* partial map, or didn't map anything */
+ if (ret >= 0) {
/* if we did partial map, release any pages we did get */
- if (pret > 0)
- unpin_user_pages(pages, pret);
- goto done;
- }
- ret = 0;
-done:
- if (ret < 0) {
- kvfree(pages);
- pages = ERR_PTR(ret);
+ if (ret)
+ unpin_user_pages(pages, ret);
+ ret = -EFAULT;
}
- return pages;
+ kvfree(pages);
+ return ERR_PTR(ret);
}
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 8f68d5ad4564fe..3398e1d944c261 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -123,6 +123,22 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
+/*
+ * Multishot read is prepared just like a normal read/write request, only
+ * difference is that we set the MULTISHOT flag.
+ */
+int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ int ret;
+
+ ret = io_prep_rw(req, sqe);
+ if (unlikely(ret))
+ return ret;
+
+ req->flags |= REQ_F_APOLL_MULTISHOT;
+ return 0;
+}
+
void io_readv_writev_cleanup(struct io_kiocb *req)
{
struct io_async_rw *io = req->async_data;
@@ -388,8 +404,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
buf = u64_to_user_ptr(rw->addr);
sqe_len = rw->len;
- if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE ||
- (req->flags & REQ_F_BUFFER_SELECT)) {
+ if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) {
if (io_do_buffer_select(req)) {
buf = io_buffer_select(req, &sqe_len, issue_flags);
if (!buf)
@@ -708,7 +723,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
return 0;
}
-int io_read(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
struct io_rw_state __s, *s = &__s;
@@ -776,8 +791,11 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
- /* if we can poll, just do that */
- if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
+ /*
+ * If we can poll, just do that. For a vectored read, we'll
+ * need to copy state first.
+ */
+ if (file_can_poll(req->file) && !io_issue_defs[req->opcode].vectored)
return -EAGAIN;
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -853,7 +871,69 @@ done:
/* it's faster to check here then delegate to kfree */
if (iovec)
kfree(iovec);
- return kiocb_done(req, ret, issue_flags);
+ return ret;
+}
+
+int io_read(struct io_kiocb *req, unsigned int issue_flags)
+{
+ int ret;
+
+ ret = __io_read(req, issue_flags);
+ if (ret >= 0)
+ return kiocb_done(req, ret, issue_flags);
+
+ return ret;
+}
+
+int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
+{
+ unsigned int cflags = 0;
+ int ret;
+
+ /*
+ * Multishot MUST be used on a pollable file
+ */
+ if (!file_can_poll(req->file))
+ return -EBADFD;
+
+ ret = __io_read(req, issue_flags);
+
+ /*
+ * If we get -EAGAIN, recycle our buffer and just let normal poll
+ * handling arm it.
+ */
+ if (ret == -EAGAIN) {
+ io_kbuf_recycle(req, issue_flags);
+ return -EAGAIN;
+ }
+
+ /*
+ * Any successful return value will keep the multishot read armed.
+ */
+ if (ret > 0) {
+ /*
+ * Put our buffer and post a CQE. If we fail to post a CQE, then
+ * jump to the termination path. This request is then done.
+ */
+ cflags = io_put_kbuf(req, issue_flags);
+
+ if (io_fill_cqe_req_aux(req,
+ issue_flags & IO_URING_F_COMPLETE_DEFER,
+ ret, cflags | IORING_CQE_F_MORE)) {
+ if (issue_flags & IO_URING_F_MULTISHOT)
+ return IOU_ISSUE_SKIP_COMPLETE;
+ return -EAGAIN;
+ }
+ }
+
+ /*
+ * Either an error, or we've hit overflow posting the CQE. For any
+ * multishot request, hitting overflow will terminate it.
+ */
+ io_req_set_res(req, ret, cflags);
+ if (issue_flags & IO_URING_F_MULTISHOT)
+ return IOU_STOP_MULTISHOT;
+ return IOU_OK;
}
int io_write(struct io_kiocb *req, unsigned int issue_flags)
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 4b89f9659366a0..c5aed03d42a4d1 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -23,3 +23,5 @@ int io_writev_prep_async(struct io_kiocb *req);
void io_readv_writev_cleanup(struct io_kiocb *req);
void io_rw_fail(struct io_kiocb *req);
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts);
+int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 537795fddc87d9..00a5e5621a288a 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -13,6 +13,51 @@
#include "rsrc.h"
#include "uring_cmd.h"
+static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(cmd);
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!(cmd->flags & IORING_URING_CMD_CANCELABLE))
+ return;
+
+ cmd->flags &= ~IORING_URING_CMD_CANCELABLE;
+ io_ring_submit_lock(ctx, issue_flags);
+ hlist_del(&req->hash_node);
+ io_ring_submit_unlock(ctx, issue_flags);
+}
+
+/*
+ * Mark this command as concelable, then io_uring_try_cancel_uring_cmd()
+ * will try to cancel this issued command by sending ->uring_cmd() with
+ * issue_flags of IO_URING_F_CANCEL.
+ *
+ * The command is guaranteed to not be done when calling ->uring_cmd()
+ * with IO_URING_F_CANCEL, but it is driver's responsibility to deal
+ * with race between io_uring canceling and normal completion.
+ */
+void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(cmd);
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) {
+ cmd->flags |= IORING_URING_CMD_CANCELABLE;
+ io_ring_submit_lock(ctx, issue_flags);
+ hlist_add_head(&req->hash_node, &ctx->cancelable_uring_cmd);
+ io_ring_submit_unlock(ctx, issue_flags);
+ }
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
+
+struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
+{
+ return cmd_to_io_kiocb(cmd)->task;
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_get_task);
+
static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
@@ -56,6 +101,8 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2,
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+ io_uring_cmd_del_cancelable(ioucmd, issue_flags);
+
if (ret < 0)
req_set_fail(req);
@@ -91,7 +138,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags);
- if (ioucmd->flags & ~IORING_URING_CMD_FIXED)
+ if (ioucmd->flags & ~IORING_URING_CMD_MASK)
return -EINVAL;
if (ioucmd->flags & IORING_URING_CMD_FIXED) {
diff --git a/io_uring/waitid.c b/io_uring/waitid.c
new file mode 100644
index 00000000000000..6f851978606d98
--- /dev/null
+++ b/io_uring/waitid.c
@@ -0,0 +1,372 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Support for async notification of waitid
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/compat.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "cancel.h"
+#include "waitid.h"
+#include "../kernel/exit.h"
+
+static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts);
+
+#define IO_WAITID_CANCEL_FLAG BIT(31)
+#define IO_WAITID_REF_MASK GENMASK(30, 0)
+
+struct io_waitid {
+ struct file *file;
+ int which;
+ pid_t upid;
+ int options;
+ atomic_t refs;
+ struct wait_queue_head *head;
+ struct siginfo __user *infop;
+ struct waitid_info info;
+};
+
+static void io_waitid_free(struct io_kiocb *req)
+{
+ struct io_waitid_async *iwa = req->async_data;
+
+ put_pid(iwa->wo.wo_pid);
+ kfree(req->async_data);
+ req->async_data = NULL;
+ req->flags &= ~REQ_F_ASYNC_DATA;
+}
+
+#ifdef CONFIG_COMPAT
+static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo)
+{
+ struct compat_siginfo __user *infop;
+ bool ret;
+
+ infop = (struct compat_siginfo __user *) iw->infop;
+
+ if (!user_write_access_begin(infop, sizeof(*infop)))
+ return false;
+
+ unsafe_put_user(signo, &infop->si_signo, Efault);
+ unsafe_put_user(0, &infop->si_errno, Efault);
+ unsafe_put_user(iw->info.cause, &infop->si_code, Efault);
+ unsafe_put_user(iw->info.pid, &infop->si_pid, Efault);
+ unsafe_put_user(iw->info.uid, &infop->si_uid, Efault);
+ unsafe_put_user(iw->info.status, &infop->si_status, Efault);
+ ret = true;
+done:
+ user_write_access_end();
+ return ret;
+Efault:
+ ret = false;
+ goto done;
+}
+#endif
+
+static bool io_waitid_copy_si(struct io_kiocb *req, int signo)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ bool ret;
+
+ if (!iw->infop)
+ return true;
+
+#ifdef CONFIG_COMPAT
+ if (req->ctx->compat)
+ return io_waitid_compat_copy_si(iw, signo);
+#endif
+
+ if (!user_write_access_begin(iw->infop, sizeof(*iw->infop)))
+ return false;
+
+ unsafe_put_user(signo, &iw->infop->si_signo, Efault);
+ unsafe_put_user(0, &iw->infop->si_errno, Efault);
+ unsafe_put_user(iw->info.cause, &iw->infop->si_code, Efault);
+ unsafe_put_user(iw->info.pid, &iw->infop->si_pid, Efault);
+ unsafe_put_user(iw->info.uid, &iw->infop->si_uid, Efault);
+ unsafe_put_user(iw->info.status, &iw->infop->si_status, Efault);
+ ret = true;
+done:
+ user_write_access_end();
+ return ret;
+Efault:
+ ret = false;
+ goto done;
+}
+
+static int io_waitid_finish(struct io_kiocb *req, int ret)
+{
+ int signo = 0;
+
+ if (ret > 0) {
+ signo = SIGCHLD;
+ ret = 0;
+ }
+
+ if (!io_waitid_copy_si(req, signo))
+ ret = -EFAULT;
+ io_waitid_free(req);
+ return ret;
+}
+
+static void io_waitid_complete(struct io_kiocb *req, int ret)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct io_tw_state ts = { .locked = true };
+
+ /* anyone completing better be holding a reference */
+ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK));
+
+ lockdep_assert_held(&req->ctx->uring_lock);
+
+ /*
+ * Did cancel find it meanwhile?
+ */
+ if (hlist_unhashed(&req->hash_node))
+ return;
+
+ hlist_del_init(&req->hash_node);
+
+ ret = io_waitid_finish(req, ret);
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ io_req_task_complete(req, &ts);
+}
+
+static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct io_waitid_async *iwa = req->async_data;
+
+ /*
+ * Mark us canceled regardless of ownership. This will prevent a
+ * potential retry from a spurious wakeup.
+ */
+ atomic_or(IO_WAITID_CANCEL_FLAG, &iw->refs);
+
+ /* claim ownership */
+ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK)
+ return false;
+
+ spin_lock_irq(&iw->head->lock);
+ list_del_init(&iwa->wo.child_wait.entry);
+ spin_unlock_irq(&iw->head->lock);
+ io_waitid_complete(req, -ECANCELED);
+ return true;
+}
+
+int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
+ unsigned int issue_flags)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+ int nr = 0;
+
+ if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED))
+ return -ENOENT;
+
+ io_ring_submit_lock(ctx, issue_flags);
+ hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) {
+ if (req->cqe.user_data != cd->data &&
+ !(cd->flags & IORING_ASYNC_CANCEL_ANY))
+ continue;
+ if (__io_waitid_cancel(ctx, req))
+ nr++;
+ if (!(cd->flags & IORING_ASYNC_CANCEL_ALL))
+ break;
+ }
+ io_ring_submit_unlock(ctx, issue_flags);
+
+ if (nr)
+ return nr;
+
+ return -ENOENT;
+}
+
+bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+ bool cancel_all)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+ bool found = false;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) {
+ if (!io_match_task_safe(req, task, cancel_all))
+ continue;
+ __io_waitid_cancel(ctx, req);
+ found = true;
+ }
+
+ return found;
+}
+
+static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct io_waitid_async *iwa = req->async_data;
+
+ if (!atomic_sub_return(1, &iw->refs))
+ return false;
+
+ /*
+ * Wakeup triggered, racing with us. It was prevented from
+ * completing because of that, queue up the tw to do that.
+ */
+ req->io_task_work.func = io_waitid_cb;
+ io_req_task_work_add(req);
+ remove_wait_queue(iw->head, &iwa->wo.child_wait);
+ return true;
+}
+
+static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts)
+{
+ struct io_waitid_async *iwa = req->async_data;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret;
+
+ io_tw_lock(ctx, ts);
+
+ ret = __do_wait(&iwa->wo);
+
+ /*
+ * If we get -ERESTARTSYS here, we need to re-arm and check again
+ * to ensure we get another callback. If the retry works, then we can
+ * just remove ourselves from the waitqueue again and finish the
+ * request.
+ */
+ if (unlikely(ret == -ERESTARTSYS)) {
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+
+ /* Don't retry if cancel found it meanwhile */
+ ret = -ECANCELED;
+ if (!(atomic_read(&iw->refs) & IO_WAITID_CANCEL_FLAG)) {
+ iw->head = &current->signal->wait_chldexit;
+ add_wait_queue(iw->head, &iwa->wo.child_wait);
+ ret = __do_wait(&iwa->wo);
+ if (ret == -ERESTARTSYS) {
+ /* retry armed, drop our ref */
+ io_waitid_drop_issue_ref(req);
+ return;
+ }
+
+ remove_wait_queue(iw->head, &iwa->wo.child_wait);
+ }
+ }
+
+ io_waitid_complete(req, ret);
+}
+
+static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode,
+ int sync, void *key)
+{
+ struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait);
+ struct io_waitid_async *iwa = container_of(wo, struct io_waitid_async, wo);
+ struct io_kiocb *req = iwa->req;
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct task_struct *p = key;
+
+ if (!pid_child_should_wake(wo, p))
+ return 0;
+
+ /* cancel is in progress */
+ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK)
+ return 1;
+
+ req->io_task_work.func = io_waitid_cb;
+ io_req_task_work_add(req);
+ list_del_init(&wait->entry);
+ return 1;
+}
+
+int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+
+ if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags)
+ return -EINVAL;
+
+ iw->which = READ_ONCE(sqe->len);
+ iw->upid = READ_ONCE(sqe->fd);
+ iw->options = READ_ONCE(sqe->file_index);
+ iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ return 0;
+}
+
+int io_waitid(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_waitid_async *iwa;
+ int ret;
+
+ if (io_alloc_async_data(req))
+ return -ENOMEM;
+
+ iwa = req->async_data;
+ iwa->req = req;
+
+ ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info,
+ iw->options, NULL);
+ if (ret)
+ goto done;
+
+ /*
+ * Mark the request as busy upfront, in case we're racing with the
+ * wakeup. If we are, then we'll notice when we drop this initial
+ * reference again after arming.
+ */
+ atomic_set(&iw->refs, 1);
+
+ /*
+ * Cancel must hold the ctx lock, so there's no risk of cancelation
+ * finding us until a) we remain on the list, and b) the lock is
+ * dropped. We only need to worry about racing with the wakeup
+ * callback.
+ */
+ io_ring_submit_lock(ctx, issue_flags);
+ hlist_add_head(&req->hash_node, &ctx->waitid_list);
+
+ init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait);
+ iwa->wo.child_wait.private = req->task;
+ iw->head = &current->signal->wait_chldexit;
+ add_wait_queue(iw->head, &iwa->wo.child_wait);
+
+ ret = __do_wait(&iwa->wo);
+ if (ret == -ERESTARTSYS) {
+ /*
+ * Nobody else grabbed a reference, it'll complete when we get
+ * a waitqueue callback, or if someone cancels it.
+ */
+ if (!io_waitid_drop_issue_ref(req)) {
+ io_ring_submit_unlock(ctx, issue_flags);
+ return IOU_ISSUE_SKIP_COMPLETE;
+ }
+
+ /*
+ * Wakeup triggered, racing with us. It was prevented from
+ * completing because of that, queue up the tw to do that.
+ */
+ io_ring_submit_unlock(ctx, issue_flags);
+ return IOU_ISSUE_SKIP_COMPLETE;
+ }
+
+ hlist_del_init(&req->hash_node);
+ remove_wait_queue(iw->head, &iwa->wo.child_wait);
+ ret = io_waitid_finish(req, ret);
+
+ io_ring_submit_unlock(ctx, issue_flags);
+done:
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ return IOU_OK;
+}
diff --git a/io_uring/waitid.h b/io_uring/waitid.h
new file mode 100644
index 00000000000000..956a8adafe8c03
--- /dev/null
+++ b/io_uring/waitid.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "../kernel/exit.h"
+
+struct io_waitid_async {
+ struct io_kiocb *req;
+ struct wait_opts wo;
+};
+
+int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_waitid(struct io_kiocb *req, unsigned int issue_flags);
+int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
+ unsigned int issue_flags);
+bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+ bool cancel_all);