author	Geliang Tang <tanggeliang@kylinos.cn>	2024-04-20 09:25:18 +0000
committer	Matthieu Baerts (NGI0) <matttbe@kernel.org>	2024-04-20 09:25:18 +0000
commit	866bf6c4a4d8b5138816b5a0e62e5e10c983c801 (patch)
tree	002d779bc3785cd516ef5c39e4f6e72288bb8c94
parent	ccbf3b9a4f07829d1a358390677ca627c8363ae2 (diff)
download	mptcp_net-next-866bf6c4a4d8b5138816b5a0e62e5e10c983c801.tar.gz
selftests/bpf: Add bpf_burst scheduler & test
This patch implements the burst BPF MPTCP scheduler, named bpf_burst, which mirrors the logic of the default in-kernel scheduler in protocol.c: bpf_burst_get_send() uses the same logic as mptcp_subflow_get_send(), and bpf_burst_get_retrans() uses the same logic as mptcp_subflow_get_retrans().

A new test for this bpf_burst scheduler is added using the MPTCP_SCHED_TEST macro; the arguments "1 1" (WITH_DATA, WITH_DATA) mean that data has been sent on both net devices. The test is run via the RUN_MPTCP_TEST macro.

Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
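For reference, a minimal sketch of how the new scheduler could be loaded and attached from user space, assuming the generated skeleton mptcp_bpf_burst.skel.h and the struct_ops map named "burst" defined at the end of the new BPF program below (error paths trimmed). This is an illustration only; the actual selftest goes through the shared MPTCP_SCHED_TEST/RUN_MPTCP_TEST helpers in prog_tests/mptcp.c, which also select the scheduler for the test netns.

	/* Hypothetical usage sketch (not part of this patch): open/load the
	 * bpf_burst skeleton and attach its struct_ops map with libbpf.
	 */
	#include <bpf/libbpf.h>
	#include "mptcp_bpf_burst.skel.h"

	static int attach_burst_sched(void)
	{
		struct mptcp_bpf_burst *skel;
		struct bpf_link *link;

		skel = mptcp_bpf_burst__open_and_load();
		if (!skel)
			return -1;

		/* "burst" is the struct mptcp_sched_ops map in mptcp_bpf_burst.c */
		link = bpf_map__attach_struct_ops(skel->maps.burst);
		if (!link) {
			mptcp_bpf_burst__destroy(skel);
			return -1;
		}

		/* ... run the MPTCP transfer, then detach ... */
		bpf_link__destroy(link);
		mptcp_bpf_burst__destroy(skel);
		return 0;
	}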
-rw-r--r--	tools/testing/selftests/bpf/bpf_tcp_helpers.h		|   7
-rw-r--r--	tools/testing/selftests/bpf/prog_tests/mptcp.c		|   3
-rw-r--r--	tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c	| 199
3 files changed, 209 insertions, 0 deletions
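As a worked illustration (hypothetical numbers, not from the patch) of the weighted-average pacing-rate update performed in bpf_burst_get_send() below, where the previous average is weighted by the bytes already queued (wmem) and the subflow's current pacing rate is weighted by the new burst size:

	/* Illustration only: new avg = (old_avg * wmem + sk_pacing_rate * burst)
	 * / (burst + wmem), so the average is pulled toward the current rate in
	 * proportion to how much new data is being scheduled.
	 */
	#include <stdio.h>

	int main(void)
	{
		unsigned long long avg_pacing_rate = 1000000;	/* old average, bytes/s (hypothetical) */
		unsigned long long sk_pacing_rate = 2000000;	/* current TCP pacing rate (hypothetical) */
		unsigned int wmem = 30000;			/* bytes already queued on the subflow */
		unsigned int burst = 65428;			/* MPTCP_SEND_BURST_SIZE */

		avg_pacing_rate = (avg_pacing_rate * wmem + sk_pacing_rate * burst) /
				  (burst + wmem);
		printf("%llu\n", avg_pacing_rate);	/* ~1.69e6 B/s, between the two rates */
		return 0;
	}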
diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
index cd4b6e6f14f9da..2c71226b3631d4 100644
--- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
@@ -36,6 +36,8 @@ enum sk_pacing {
struct sock {
struct sock_common __sk_common;
#define sk_state __sk_common.skc_state
+ int sk_sndbuf;
+ int sk_wmem_queued;
unsigned long sk_pacing_rate;
__u32 sk_pacing_status; /* see enum sk_pacing */
} __attribute__((preserve_access_index));
@@ -85,6 +87,7 @@ struct tcp_sock {
__u32 lsndtime;
__u32 prior_cwnd;
__u64 tcp_mstamp; /* most recent packet received/sent */
+ __u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
bool is_mptcp;
} __attribute__((preserve_access_index));
@@ -234,7 +237,9 @@ extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym;
#define MPTCP_SUBFLOWS_MAX 8
struct mptcp_subflow_context {
+ unsigned long avg_pacing_rate;
__u32 backup : 1;
+ __u8 stale_count;
struct sock *tcp_sock; /* tcp sk backpointer */
} __attribute__((preserve_access_index));
@@ -257,6 +262,8 @@ struct mptcp_sched_ops {
struct mptcp_sock {
struct inet_connection_sock sk;
+ __u64 snd_nxt;
+ int snd_burst;
__u32 token;
struct sock *first;
char ca_name[TCP_CA_NAME_MAX];
diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c
index 3964dda7900f65..d3cc3541dd4ffe 100644
--- a/tools/testing/selftests/bpf/prog_tests/mptcp.c
+++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c
@@ -13,6 +13,7 @@
#include "mptcp_bpf_bkup.skel.h"
#include "mptcp_bpf_rr.skel.h"
#include "mptcp_bpf_red.skel.h"
+#include "mptcp_bpf_burst.skel.h"
#define NS_TEST "mptcp_ns"
#define WITH_DATA true
@@ -545,6 +546,7 @@ MPTCP_SCHED_TEST(first, WITH_DATA, WITHOUT_DATA);
MPTCP_SCHED_TEST(bkup, WITH_DATA, WITHOUT_DATA);
MPTCP_SCHED_TEST(rr, WITH_DATA, WITH_DATA);
MPTCP_SCHED_TEST(red, WITH_DATA, WITH_DATA);
+MPTCP_SCHED_TEST(burst, WITH_DATA, WITH_DATA);
#define RUN_MPTCP_TEST(suffix) \
do { \
@@ -561,4 +563,5 @@ void test_mptcp(void)
RUN_MPTCP_TEST(bkup);
RUN_MPTCP_TEST(rr);
RUN_MPTCP_TEST(red);
+ RUN_MPTCP_TEST(burst);
}
diff --git a/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c b/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
new file mode 100644
index 00000000000000..b3c8115648667f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023, SUSE. */
+
+#include <linux/bpf.h>
+#include <limits.h>
+#include "bpf_tcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+
+#define MPTCP_SEND_BURST_SIZE 65428
+
+struct subflow_send_info {
+ __u8 subflow_id;
+ __u64 linger_time;
+};
+
+extern bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) __ksym;
+extern void mptcp_set_timeout(struct sock *sk) __ksym;
+extern __u64 mptcp_wnd_end(const struct mptcp_sock *msk) __ksym;
+extern bool tcp_stream_memory_free(const struct sock *sk, int wake) __ksym;
+extern bool bpf_mptcp_subflow_queues_empty(struct sock *sk) __ksym;
+extern void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) __ksym;
+
+#define SSK_MODE_ACTIVE 0
+#define SSK_MODE_BACKUP 1
+#define SSK_MODE_MAX 2
+
+static __always_inline __u64 div_u64(__u64 dividend, __u32 divisor)
+{
+ return dividend / divisor;
+}
+
+static __always_inline bool tcp_write_queue_empty(struct sock *sk)
+{
+ const struct tcp_sock *tp = bpf_skc_to_tcp_sock(sk);
+
+ return tp ? tp->write_seq == tp->snd_nxt : true;
+}
+
+static __always_inline bool tcp_rtx_and_write_queues_empty(struct sock *sk)
+{
+ return bpf_mptcp_subflow_queues_empty(sk) && tcp_write_queue_empty(sk);
+}
+
+static __always_inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
+{
+ if (sk->sk_wmem_queued >= sk->sk_sndbuf)
+ return false;
+
+ return tcp_stream_memory_free(sk, wake);
+}
+
+static __always_inline bool sk_stream_memory_free(const struct sock *sk)
+{
+ return __sk_stream_memory_free(sk, 0);
+}
+
+SEC("struct_ops/mptcp_sched_burst_init")
+void BPF_PROG(mptcp_sched_burst_init, struct mptcp_sock *msk)
+{
+}
+
+SEC("struct_ops/mptcp_sched_burst_release")
+void BPF_PROG(mptcp_sched_burst_release, struct mptcp_sock *msk)
+{
+}
+
+static int bpf_burst_get_send(struct mptcp_sock *msk,
+ struct mptcp_sched_data *data)
+{
+ struct subflow_send_info send_info[SSK_MODE_MAX];
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ __u32 pace, burst, wmem;
+ __u64 linger_time;
+ struct sock *ssk;
+ int i;
+
+ /* pick the subflow with the lower wmem/wspace ratio */
+ for (i = 0; i < SSK_MODE_MAX; ++i) {
+ send_info[i].subflow_id = MPTCP_SUBFLOWS_MAX;
+ send_info[i].linger_time = -1;
+ }
+
+ for (i = 0; i < data->subflows && i < MPTCP_SUBFLOWS_MAX; i++) {
+ subflow = bpf_mptcp_subflow_ctx_by_pos(data, i);
+ if (!subflow)
+ break;
+
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ if (!mptcp_subflow_active(subflow))
+ continue;
+
+ pace = subflow->avg_pacing_rate;
+ if (!pace) {
+ /* init pacing rate from socket */
+ subflow->avg_pacing_rate = ssk->sk_pacing_rate;
+ pace = subflow->avg_pacing_rate;
+ if (!pace)
+ continue;
+ }
+
+ linger_time = div_u64((__u64)ssk->sk_wmem_queued << 32, pace);
+ if (linger_time < send_info[subflow->backup].linger_time) {
+ send_info[subflow->backup].subflow_id = i;
+ send_info[subflow->backup].linger_time = linger_time;
+ }
+ }
+ mptcp_set_timeout(sk);
+
+ /* pick the best backup if no other subflow is active */
+ if (send_info[SSK_MODE_ACTIVE].subflow_id == MPTCP_SUBFLOWS_MAX)
+ send_info[SSK_MODE_ACTIVE].subflow_id = send_info[SSK_MODE_BACKUP].subflow_id;
+
+ subflow = bpf_mptcp_subflow_ctx_by_pos(data, send_info[SSK_MODE_ACTIVE].subflow_id);
+ if (!subflow)
+ return -1;
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ if (!ssk || !sk_stream_memory_free(ssk))
+ return -1;
+
+ burst = min(MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
+ wmem = ssk->sk_wmem_queued;
+ if (!burst)
+ goto out;
+
+ subflow->avg_pacing_rate = div_u64((__u64)subflow->avg_pacing_rate * wmem +
+ ssk->sk_pacing_rate * burst,
+ burst + wmem);
+ msk->snd_burst = burst;
+
+out:
+ mptcp_subflow_set_scheduled(subflow, true);
+ return 0;
+}
+
+static int bpf_burst_get_retrans(struct mptcp_sock *msk,
+ struct mptcp_sched_data *data)
+{
+ int backup = MPTCP_SUBFLOWS_MAX, pick = MPTCP_SUBFLOWS_MAX, subflow_id;
+ struct mptcp_subflow_context *subflow;
+ int min_stale_count = INT_MAX;
+ struct sock *ssk;
+
+ for (int i = 0; i < data->subflows && i < MPTCP_SUBFLOWS_MAX; i++) {
+ subflow = bpf_mptcp_subflow_ctx_by_pos(data, i);
+ if (!subflow)
+ break;
+
+ if (!mptcp_subflow_active(subflow))
+ continue;
+
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ /* still data outstanding at TCP level? skip this */
+ if (!tcp_rtx_and_write_queues_empty(ssk)) {
+ mptcp_pm_subflow_chk_stale(msk, ssk);
+ min_stale_count = min(min_stale_count, subflow->stale_count);
+ continue;
+ }
+
+ if (subflow->backup) {
+ if (backup == MPTCP_SUBFLOWS_MAX)
+ backup = i;
+ continue;
+ }
+
+ if (pick == MPTCP_SUBFLOWS_MAX)
+ pick = i;
+ }
+
+ if (pick < MPTCP_SUBFLOWS_MAX) {
+ subflow_id = pick;
+ goto out;
+ }
+ subflow_id = min_stale_count > 1 ? backup : MPTCP_SUBFLOWS_MAX;
+
+out:
+ subflow = bpf_mptcp_subflow_ctx_by_pos(data, subflow_id);
+ if (!subflow)
+ return -1;
+ mptcp_subflow_set_scheduled(subflow, true);
+ return 0;
+}
+
+int BPF_STRUCT_OPS(bpf_burst_get_subflow, struct mptcp_sock *msk,
+ struct mptcp_sched_data *data)
+{
+ if (data->reinject)
+ return bpf_burst_get_retrans(msk, data);
+ return bpf_burst_get_send(msk, data);
+}
+
+SEC(".struct_ops")
+struct mptcp_sched_ops burst = {
+ .init = (void *)mptcp_sched_burst_init,
+ .release = (void *)mptcp_sched_burst_release,
+ .get_subflow = (void *)bpf_burst_get_subflow,
+ .name = "bpf_burst",
+};