author     Jens Axboe <axboe@kernel.dk>  2022-06-16 11:20:12 -0600
committer  Jens Axboe <axboe@kernel.dk>  2022-06-17 08:26:05 -0600
commit     9adb5df42261fb7c7058755d288b3bc794769a14 (patch)
tree       cadc11db2225f7ea1d3fa678ac16e6dc8b19e384
parent     389bdfc03121c9a075da31d090d7249d032ba508 (diff)
download   linux-block-for-5.20/io_uring-tw.tar.gz

io_uring: switch to per-cpu task_work (branch: for-5.20/io_uring-tw)
We see contention on the task_work locking and list management for networked workloads, where it's not uncommon to have task_work arriving from multiple CPUs in the system. The task_work handling ends up with the original task, but to save on the overhead of repeatedly re-adding that (which is an expensive cmpxchg), it's wrapped in a per-tctx task_list which belongs to the original submitter. Having many networked requests inflight can mean that there's a lot of addition activity on the structure.

Move from a single per-tctx target list to a per-cpu one instead. This allows multiple completers to add task_work without having to synchronize on the same lock and list.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
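For illustration only, here is a minimal userspace sketch of the same idea, not the kernel code: one lock and list per CPU bucket, so adders running on different CPUs rarely contend on the same lock, while the consumer drains every bucket. The names tw_bucket, tw_item, tw_add and tw_drain are invented for this sketch.

	#define _GNU_SOURCE
	#include <pthread.h>
	#include <sched.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define NR_BUCKETS	64	/* stand-in for the number of possible CPUs */

	struct tw_item {
		struct tw_item *next;
		int payload;
	};

	struct tw_bucket {
		pthread_mutex_t lock;
		struct tw_item *head;
	} buckets[NR_BUCKETS];

	/* Producer side: lock only the bucket for the CPU we are running on. */
	static void tw_add(struct tw_item *item)
	{
		int cpu = sched_getcpu();
		struct tw_bucket *b = &buckets[(cpu < 0 ? 0 : cpu) % NR_BUCKETS];

		pthread_mutex_lock(&b->lock);
		item->next = b->head;
		b->head = item;
		pthread_mutex_unlock(&b->lock);
	}

	/* Consumer side: detach and run each per-bucket list in turn. */
	static void tw_drain(void)
	{
		for (int i = 0; i < NR_BUCKETS; i++) {
			pthread_mutex_lock(&buckets[i].lock);
			struct tw_item *node = buckets[i].head;
			buckets[i].head = NULL;
			pthread_mutex_unlock(&buckets[i].lock);

			while (node) {
				struct tw_item *next = node->next;
				printf("ran work %d\n", node->payload);
				free(node);
				node = next;
			}
		}
	}

	int main(void)
	{
		for (int i = 0; i < NR_BUCKETS; i++)
			pthread_mutex_init(&buckets[i].lock, NULL);

		for (int i = 0; i < 8; i++) {
			struct tw_item *it = malloc(sizeof(*it));
			it->payload = i;
			tw_add(it);
		}
		tw_drain();
		return 0;
	}

In the patch below the same split shows up as an alloc_percpu'd array of tctx_tw structures: io_req_task_work_add() locks only this CPU's entry, and tctx_task_work() drains the list it was queued against.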
-rw-r--r--  io_uring/io_uring.c  10
-rw-r--r--  io_uring/tctx.c      23
-rw-r--r--  io_uring/tctx.h       5
3 files changed, 29 insertions(+), 9 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 13d177ab9cd857..fd166fb249eb3c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1011,7 +1011,6 @@ void tctx_task_work(struct callback_head *cb)
bool uring_locked = false;
struct io_ring_ctx *ctx = NULL;
struct tctx_tw *tw = container_of(cb, struct tctx_tw, task_work);
- struct io_uring_task *tctx = container_of(tw, struct io_uring_task, tw);
while (1) {
struct io_wq_work_node *node;
@@ -1035,7 +1034,7 @@ void tctx_task_work(struct callback_head *cb)
ctx_flush_and_put(ctx, &uring_locked);
/* relaxed read is enough as only the task itself sets ->in_idle */
- if (unlikely(atomic_read(&tctx->in_idle)))
+ if (unlikely(atomic_read(&tw->tctx->in_idle)))
io_uring_drop_tctx_refs(current);
}
@@ -1043,12 +1042,15 @@ void io_req_task_work_add(struct io_kiocb *req)
{
struct io_uring_task *tctx = req->task->io_uring;
struct io_ring_ctx *ctx = req->ctx;
- struct tctx_tw *tw = &tctx->tw;
struct io_wq_work_node *node;
unsigned long flags;
+ struct tctx_tw *tw;
bool running;
- spin_lock_irqsave(&tw->task_lock, flags);
+ local_irq_save(flags);
+ tw = this_cpu_ptr(tctx->tw);
+
+ spin_lock(&tw->task_lock);
wq_list_add_tail(&req->io_task_work.node, &tw->task_list);
running = tw->task_running;
if (!running)
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 00a2fc8ed11036..7dc653b19e617d 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -53,6 +53,7 @@ void __io_uring_free(struct task_struct *tsk)
WARN_ON_ONCE(tctx->cached_refs);
percpu_counter_destroy(&tctx->inflight);
+ free_percpu(tctx->tw);
kfree(tctx);
tsk->io_uring = NULL;
}
@@ -61,7 +62,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx;
- int ret;
+ int ret, cpu;
tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
if (unlikely(!tctx))
@@ -73,22 +74,36 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
return ret;
}
+ tctx->tw = alloc_percpu(struct tctx_tw);
+ if (!tctx->tw) {
+ percpu_counter_destroy(&tctx->inflight);
+ kfree(tctx);
+ return -ENOMEM;
+ }
+
tctx->io_wq = io_init_wq_offload(ctx, task);
if (IS_ERR(tctx->io_wq)) {
ret = PTR_ERR(tctx->io_wq);
percpu_counter_destroy(&tctx->inflight);
+ free_percpu(tctx->tw);
kfree(tctx);
return ret;
}
+ for_each_possible_cpu(cpu) {
+ struct tctx_tw *tw = per_cpu_ptr(tctx->tw, cpu);
+
+ spin_lock_init(&tw->task_lock);
+ INIT_WQ_LIST(&tw->task_list);
+ init_task_work(&tw->task_work, tctx_task_work);
+ tw->tctx = tctx;
+ }
+
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
atomic_set(&tctx->in_idle, 0);
atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx;
- spin_lock_init(&tctx->tw.task_lock);
- INIT_WQ_LIST(&tctx->tw.task_list);
- init_task_work(&tctx->tw.task_work, tctx_task_work);
return 0;
}
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index b1cab2e84b16f7..c50432906dc853 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/percpu.h>
+
/*
* Arbitrary limit, can be raised if need be
*/
@@ -9,6 +11,7 @@ struct tctx_tw {
spinlock_t task_lock;
struct io_wq_work_list task_list;
struct callback_head task_work;
+ struct io_uring_task *tctx;
bool task_running;
};
@@ -23,7 +26,7 @@ struct io_uring_task {
atomic_t inflight_tracked;
atomic_t in_idle;
- struct tctx_tw tw;
+ struct tctx_tw __percpu *tw;
struct file *registered_rings[IO_RINGFD_REG_MAX];
};