author     Jakub Kicinski <kuba@kernel.org>   2024-03-20 21:05:44 -0700
committer  Jakub Kicinski <kuba@kernel.org>   2024-03-20 21:05:45 -0700
commit     3201de46a2013293abe28caa46aa0387864d7cf7 (patch)
tree       bb54c5de6214b985d103ec0d400e6b8d9527e10f
parent     f7bf0ec1e73d43a347489e958b42841b111d63d6 (diff)
parent     00bf63122459e87193ee7f1bc6161c83a525569f (diff)
download   linux-3201de46a2013293abe28caa46aa0387864d7cf7.tar.gz
Merge branch 'report-rcu-qs-for-busy-network-kthreads'
Yan Zhai says:

====================
Report RCU QS for busy network kthreads

This changeset fixes a common problem for busy networking kthreads.
These threads, e.g. NAPI threads, typically do the following:

* poll a batch of packets
* if there is more work, call cond_resched() to allow scheduling
* continue to poll more packets when the rx queue is not empty

We observed this being a problem in production, since it can block RCU
tasks from making progress under heavy load. Investigation indicates
that just calling cond_resched() is insufficient for RCU tasks to reach
quiescent states. It also has the side effect of frequently clearing
the TIF_NEED_RESCHED flag on voluntary preempt kernels. As a result,
schedule() will not be called in these circumstances, even though
schedule() does in fact provide the required quiescent states. This at
least affects NAPI threads, napi_busy_loop, and the cpumap kthread.

By reporting RCU QSes in these kthreads periodically before calling
cond_resched(), the blocked RCU waiters can make progress correctly.
Rather than reporting a QS only for RCU tasks, this code shares the
concern noted in commit d28139c4e967 ("rcu: Apply RCU-bh QSes to
RCU-sched and RCU-preempt when safe"), so a consolidated QS is reported
for safety.

It is worth noting that, although this problem is reproducible in
napi_busy_loop, it only shows up when the polling interval is set as
high as 2ms, which is far larger than the 50us-100us recommended in the
documentation. So napi_busy_loop is left untouched.

Lastly, this does not affect RT kernels, which do not enter the
scheduler through cond_resched(). Without the side effect mentioned
above, schedule() will be called from time to time, clearing the RCU
task holdouts.

V4: https://lore.kernel.org/bpf/cover.1710525524.git.yan@cloudflare.com/
V3: https://lore.kernel.org/lkml/20240314145459.7b3aedf1@kernel.org/t/
V2: https://lore.kernel.org/bpf/ZeFPz4D121TgvCje@debian.debian/
V1: https://lore.kernel.org/lkml/Zd4DXTyCf17lcTfq@debian.debian/#t
====================

Link: https://lore.kernel.org/r/cover.1710877680.git.yan@cloudflare.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
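To make the pattern above concrete, here is a minimal, hypothetical sketch of a busy-poll kthread using the new helper. poll_one_batch() and more_work() are invented placeholders for the driver-specific polling logic, not functions from this series, and wakeup/task-state handling is omitted for brevity:

	/*
	 * Hypothetical busy-poll kthread sketching the fix: report a
	 * consolidated RCU/RCU-Tasks QS periodically before yielding.
	 */
	static int busy_poll_kthread(void *data)
	{
		unsigned long last_qs = jiffies;

		while (!kthread_should_stop()) {
			poll_one_batch(data);		/* placeholder: process a batch of packets */

			if (!more_work(data)) {		/* placeholder: rx queue empty? */
				schedule();		/* going to sleep is itself a QS */
				last_qs = jiffies;
				continue;
			}

			/*
			 * cond_resched() alone does not provide an RCU-Tasks QS,
			 * so report a consolidated QS (at most every 100ms) first.
			 */
			rcu_softirq_qs_periodic(last_qs);
			cond_resched();
		}
		return 0;
	}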
-rw-r--r--  include/linux/rcupdate.h   31
-rw-r--r--  kernel/bpf/cpumap.c         3
-rw-r--r--  net/core/dev.c              3
3 files changed, 37 insertions(+), 0 deletions(-)
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 16f519914415e..17d7ed5f3ae6e 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -247,6 +247,37 @@ do { \
cond_resched(); \
} while (0)
+/**
+ * rcu_softirq_qs_periodic - Report RCU and RCU-Tasks quiescent states
+ * @old_ts: jiffies at start of processing.
+ *
+ * This helper is for long-running softirq handlers, such as NAPI threads in
+ * networking. The caller should initialize the variable passed in as @old_ts
+ * at the beginning of the softirq handler. When invoked frequently, this macro
+ * will invoke rcu_softirq_qs() every 100 milliseconds thereafter, which will
+ * provide both RCU and RCU-Tasks quiescent states. Note that this macro
+ * modifies its old_ts argument.
+ *
+ * Because regions of code that have disabled softirq act as RCU read-side
+ * critical sections, this macro should be invoked with softirq (and
+ * preemption) enabled.
+ *
+ * The macro is not needed when CONFIG_PREEMPT_RT is defined. RT kernels are
+ * more likely to call schedule() and thereby provide the necessary quiescent
+ * states. In contrast, calling cond_resched() alone does not achieve the same
+ * effect because cond_resched() does not provide RCU-Tasks quiescent states.
+ */
+#define rcu_softirq_qs_periodic(old_ts) \
+do { \
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && \
+ time_after(jiffies, (old_ts) + HZ / 10)) { \
+ preempt_disable(); \
+ rcu_softirq_qs(); \
+ preempt_enable(); \
+ (old_ts) = jiffies; \
+ } \
+} while (0)
+
/*
* Infrastructure to implement the synchronize_() primitives in
* TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
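A minimal usage sketch of the helper, following the kernel-doc above; do_long_running_work() is a hypothetical placeholder and the loop omits any exit condition:

	unsigned long last_qs = jiffies;	/* caller-initialized, as documented */

	for (;;) {
		do_long_running_work();		/* hypothetical workload */

		/*
		 * At most once every HZ / 10 jiffies (~100ms) this reports a
		 * consolidated RCU and RCU-Tasks quiescent state and refreshes
		 * last_qs. It must run with softirqs and preemption enabled.
		 */
		rcu_softirq_qs_periodic(last_qs);
		cond_resched();
	}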
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 9ee8da4774656..a8e34416e960f 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -263,6 +263,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
+ unsigned long last_qs = jiffies;
complete(&rcpu->kthread_running);
set_current_state(TASK_INTERRUPTIBLE);
@@ -288,10 +289,12 @@ static int cpu_map_kthread_run(void *data)
if (__ptr_ring_empty(rcpu->queue)) {
schedule();
sched = 1;
+ last_qs = jiffies;
} else {
__set_current_state(TASK_RUNNING);
}
} else {
+ rcu_softirq_qs_periodic(last_qs);
sched = cond_resched();
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 303a6ff46e4e1..9a67003e49db8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6743,6 +6743,8 @@ static int napi_threaded_poll(void *data)
void *have;
while (!napi_thread_wait(napi)) {
+ unsigned long last_qs = jiffies;
+
for (;;) {
bool repoll = false;
@@ -6767,6 +6769,7 @@ static int napi_threaded_poll(void *data)
if (!repoll)
break;
+ rcu_softirq_qs_periodic(last_qs);
cond_resched();
}
}