diff options
author | Geliang Tang <tanggeliang@kylinos.cn> | 2024-04-20 09:25:18 +0000 |
---|---|---|
committer | Matthieu Baerts (NGI0) <matttbe@kernel.org> | 2024-04-20 09:25:18 +0000 |
commit | 866bf6c4a4d8b5138816b5a0e62e5e10c983c801 (patch) | |
tree | 002d779bc3785cd516ef5c39e4f6e72288bb8c94 | |
parent | ccbf3b9a4f07829d1a358390677ca627c8363ae2 (diff) | |
download | mptcp_net-next-866bf6c4a4d8b5138816b5a0e62e5e10c983c801.tar.gz |
selftests/bpf: Add bpf_burst scheduler & test
This patch implements the burst BPF MPTCP scheduler, named bpf_burst,
which mirrors the logic of the default in-kernel scheduler implemented
in protocol.c: bpf_burst_get_send() uses the same logic as
mptcp_subflow_get_send() and bpf_burst_get_retrans() uses the same
logic as mptcp_subflow_get_retrans().
Use the MPTCP_SCHED_TEST macro to add a new test for this bpf_burst
scheduler; the arguments "1 1" mean that data has been sent on both net
devices. Run this test with the RUN_MPTCP_TEST macro.
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
-rw-r--r-- | tools/testing/selftests/bpf/bpf_tcp_helpers.h | 7 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/prog_tests/mptcp.c | 3 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c | 199 |
3 files changed, 209 insertions, 0 deletions
diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h index cd4b6e6f14f9da..2c71226b3631d4 100644 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h @@ -36,6 +36,8 @@ enum sk_pacing { struct sock { struct sock_common __sk_common; #define sk_state __sk_common.skc_state + int sk_sndbuf; + int sk_wmem_queued; unsigned long sk_pacing_rate; __u32 sk_pacing_status; /* see enum sk_pacing */ } __attribute__((preserve_access_index)); @@ -85,6 +87,7 @@ struct tcp_sock { __u32 lsndtime; __u32 prior_cwnd; __u64 tcp_mstamp; /* most recent packet received/sent */ + __u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ bool is_mptcp; } __attribute__((preserve_access_index)); @@ -234,7 +237,9 @@ extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; #define MPTCP_SUBFLOWS_MAX 8 struct mptcp_subflow_context { + unsigned long avg_pacing_rate; __u32 backup : 1; + __u8 stale_count; struct sock *tcp_sock; /* tcp sk backpointer */ } __attribute__((preserve_access_index)); @@ -257,6 +262,8 @@ struct mptcp_sched_ops { struct mptcp_sock { struct inet_connection_sock sk; + __u64 snd_nxt; + int snd_burst; __u32 token; struct sock *first; char ca_name[TCP_CA_NAME_MAX]; diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c index 3964dda7900f65..d3cc3541dd4ffe 100644 --- a/tools/testing/selftests/bpf/prog_tests/mptcp.c +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c @@ -13,6 +13,7 @@ #include "mptcp_bpf_bkup.skel.h" #include "mptcp_bpf_rr.skel.h" #include "mptcp_bpf_red.skel.h" +#include "mptcp_bpf_burst.skel.h" #define NS_TEST "mptcp_ns" #define WITH_DATA true @@ -545,6 +546,7 @@ MPTCP_SCHED_TEST(first, WITH_DATA, WITHOUT_DATA); MPTCP_SCHED_TEST(bkup, WITH_DATA, WITHOUT_DATA); MPTCP_SCHED_TEST(rr, WITH_DATA, WITH_DATA); MPTCP_SCHED_TEST(red, WITH_DATA, WITH_DATA); +MPTCP_SCHED_TEST(burst, 
WITH_DATA, WITH_DATA); #define RUN_MPTCP_TEST(suffix) \ do { \ @@ -561,4 +563,5 @@ void test_mptcp(void) RUN_MPTCP_TEST(bkup); RUN_MPTCP_TEST(rr); RUN_MPTCP_TEST(red); + RUN_MPTCP_TEST(burst); } diff --git a/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c b/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c new file mode 100644 index 00000000000000..b3c8115648667f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023, SUSE. */ + +#include <linux/bpf.h> +#include <limits.h> +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "GPL"; + +#define MPTCP_SEND_BURST_SIZE 65428 + +struct subflow_send_info { + __u8 subflow_id; + __u64 linger_time; +}; + +extern bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) __ksym; +extern void mptcp_set_timeout(struct sock *sk) __ksym; +extern __u64 mptcp_wnd_end(const struct mptcp_sock *msk) __ksym; +extern bool tcp_stream_memory_free(const struct sock *sk, int wake) __ksym; +extern bool bpf_mptcp_subflow_queues_empty(struct sock *sk) __ksym; +extern void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) __ksym; + +#define SSK_MODE_ACTIVE 0 +#define SSK_MODE_BACKUP 1 +#define SSK_MODE_MAX 2 + +static __always_inline __u64 div_u64(__u64 dividend, __u32 divisor) +{ + return dividend / divisor; +} + +static __always_inline bool tcp_write_queue_empty(struct sock *sk) +{ + const struct tcp_sock *tp = bpf_skc_to_tcp_sock(sk); + + return tp ? 
tp->write_seq == tp->snd_nxt : true; +} + +static __always_inline bool tcp_rtx_and_write_queues_empty(struct sock *sk) +{ + return bpf_mptcp_subflow_queues_empty(sk) && tcp_write_queue_empty(sk); +} + +static __always_inline bool __sk_stream_memory_free(const struct sock *sk, int wake) +{ + if (sk->sk_wmem_queued >= sk->sk_sndbuf) + return false; + + return tcp_stream_memory_free(sk, wake); +} + +static __always_inline bool sk_stream_memory_free(const struct sock *sk) +{ + return __sk_stream_memory_free(sk, 0); +} + +SEC("struct_ops/mptcp_sched_burst_init") +void BPF_PROG(mptcp_sched_burst_init, struct mptcp_sock *msk) +{ +} + +SEC("struct_ops/mptcp_sched_burst_release") +void BPF_PROG(mptcp_sched_burst_release, struct mptcp_sock *msk) +{ +} + +static int bpf_burst_get_send(struct mptcp_sock *msk, + struct mptcp_sched_data *data) +{ + struct subflow_send_info send_info[SSK_MODE_MAX]; + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + __u32 pace, burst, wmem; + __u64 linger_time; + struct sock *ssk; + int i; + + /* pick the subflow with the lower wmem/wspace ratio */ + for (i = 0; i < SSK_MODE_MAX; ++i) { + send_info[i].subflow_id = MPTCP_SUBFLOWS_MAX; + send_info[i].linger_time = -1; + } + + for (i = 0; i < data->subflows && i < MPTCP_SUBFLOWS_MAX; i++) { + subflow = bpf_mptcp_subflow_ctx_by_pos(data, i); + if (!subflow) + break; + + ssk = mptcp_subflow_tcp_sock(subflow); + if (!mptcp_subflow_active(subflow)) + continue; + + pace = subflow->avg_pacing_rate; + if (!pace) { + /* init pacing rate from socket */ + subflow->avg_pacing_rate = ssk->sk_pacing_rate; + pace = subflow->avg_pacing_rate; + if (!pace) + continue; + } + + linger_time = div_u64((__u64)ssk->sk_wmem_queued << 32, pace); + if (linger_time < send_info[subflow->backup].linger_time) { + send_info[subflow->backup].subflow_id = i; + send_info[subflow->backup].linger_time = linger_time; + } + } + mptcp_set_timeout(sk); + + /* pick the best backup if no other subflow is 
active */ + if (send_info[SSK_MODE_ACTIVE].subflow_id == MPTCP_SUBFLOWS_MAX) + send_info[SSK_MODE_ACTIVE].subflow_id = send_info[SSK_MODE_BACKUP].subflow_id; + + subflow = bpf_mptcp_subflow_ctx_by_pos(data, send_info[SSK_MODE_ACTIVE].subflow_id); + if (!subflow) + return -1; + ssk = mptcp_subflow_tcp_sock(subflow); + if (!ssk || !sk_stream_memory_free(ssk)) + return -1; + + burst = min(MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); + wmem = ssk->sk_wmem_queued; + if (!burst) + goto out; + + subflow->avg_pacing_rate = div_u64((__u64)subflow->avg_pacing_rate * wmem + + ssk->sk_pacing_rate * burst, + burst + wmem); + msk->snd_burst = burst; + +out: + mptcp_subflow_set_scheduled(subflow, true); + return 0; +} + +static int bpf_burst_get_retrans(struct mptcp_sock *msk, + struct mptcp_sched_data *data) +{ + int backup = MPTCP_SUBFLOWS_MAX, pick = MPTCP_SUBFLOWS_MAX, subflow_id; + struct mptcp_subflow_context *subflow; + int min_stale_count = INT_MAX; + struct sock *ssk; + + for (int i = 0; i < data->subflows && i < MPTCP_SUBFLOWS_MAX; i++) { + subflow = bpf_mptcp_subflow_ctx_by_pos(data, i); + if (!subflow) + break; + + if (!mptcp_subflow_active(subflow)) + continue; + + ssk = mptcp_subflow_tcp_sock(subflow); + /* still data outstanding at TCP level? skip this */ + if (!tcp_rtx_and_write_queues_empty(ssk)) { + mptcp_pm_subflow_chk_stale(msk, ssk); + min_stale_count = min(min_stale_count, subflow->stale_count); + continue; + } + + if (subflow->backup) { + if (backup == MPTCP_SUBFLOWS_MAX) + backup = i; + continue; + } + + if (pick == MPTCP_SUBFLOWS_MAX) + pick = i; + } + + if (pick < MPTCP_SUBFLOWS_MAX) { + subflow_id = pick; + goto out; + } + subflow_id = min_stale_count > 1 ? 
backup : MPTCP_SUBFLOWS_MAX; + +out: + subflow = bpf_mptcp_subflow_ctx_by_pos(data, subflow_id); + if (!subflow) + return -1; + mptcp_subflow_set_scheduled(subflow, true); + return 0; +} + +int BPF_STRUCT_OPS(bpf_burst_get_subflow, struct mptcp_sock *msk, + struct mptcp_sched_data *data) +{ + if (data->reinject) + return bpf_burst_get_retrans(msk, data); + return bpf_burst_get_send(msk, data); +} + +SEC(".struct_ops") +struct mptcp_sched_ops burst = { + .init = (void *)mptcp_sched_burst_init, + .release = (void *)mptcp_sched_burst_release, + .get_subflow = (void *)bpf_burst_get_subflow, + .name = "bpf_burst", +}; |