diff options
author | David Ahern <dsahern@gmail.com> | 2020-02-28 22:42:49 +0000 |
---|---|---|
committer | David Ahern <dsahern@gmail.com> | 2020-02-28 22:42:49 +0000 |
commit | b6de0bf7dbb20f07e267e869d30c9df83c6f5e85 (patch) | |
tree | bde02b924320744530347666ceddbece4d54cc76 | |
parent | 5023df6a21c73560b514d7fde5381d140373afe9 (diff) | |
parent | b5a77cf70116f4c5c1767f0e0ab78f7ff2f58bca (diff) | |
download | iproute2-b6de0bf7dbb20f07e267e869d30c9df83c6f5e85.tar.gz |
Merge branch 'master' into next
Signed-off-by: David Ahern <dsahern@gmail.com>
-rw-r--r-- | bash-completion/tc | 12 | ||||
-rw-r--r-- | devlink/devlink.c | 6 | ||||
-rw-r--r-- | include/uapi/linux/bpf.h | 16 | ||||
-rw-r--r-- | include/uapi/linux/magic.h | 1 | ||||
-rw-r--r-- | ip/iplink_bridge.c | 2 | ||||
-rw-r--r-- | ip/iproute.c | 11 | ||||
-rw-r--r-- | ip/link_gre.c | 2 | ||||
-rw-r--r-- | ip/link_gre6.c | 2 | ||||
-rw-r--r-- | ip/xfrm_policy.c | 21 | ||||
-rw-r--r-- | ip/xfrm_state.c | 27 | ||||
-rw-r--r-- | man/man8/ip.8 | 7 | ||||
-rw-r--r-- | man/man8/rdma-statistic.8 | 16 | ||||
-rw-r--r-- | man/man8/rdma.8 | 6 | ||||
-rw-r--r-- | man/man8/tc-fq_pie.8 | 166 | ||||
-rw-r--r-- | man/man8/tc.8 | 8 | ||||
-rw-r--r-- | misc/nstat.c | 47 | ||||
-rw-r--r-- | rdma/stat.c | 1 | ||||
-rw-r--r-- | tc/Makefile | 1 | ||||
-rw-r--r-- | tc/q_fq_pie.c | 318 | ||||
-rwxr-xr-x | testsuite/tests/ss/ssfilter.t | 24 |
20 files changed, 604 insertions, 90 deletions
diff --git a/bash-completion/tc b/bash-completion/tc index fe0d51ec4..086cb7f67 100644 --- a/bash-completion/tc +++ b/bash-completion/tc @@ -3,8 +3,8 @@ # Copyright 2016 Quentin Monnet <quentin.monnet@6wind.com> QDISC_KIND=' choke codel bfifo pfifo pfifo_head_drop fq fq_codel gred hhf \ - mqprio multiq netem pfifo_fast pie red rr sfb sfq tbf atm cbq drr \ - dsmark hfsc htb prio qfq ' + mqprio multiq netem pfifo_fast pie fq_pie red rr sfb sfq tbf atm \ + cbq drr dsmark hfsc htb prio qfq ' FILTER_KIND=' basic bpf cgroup flow flower fw route rsvp tcindex u32 matchall ' ACTION_KIND=' gact mirred bpf sample ' @@ -326,6 +326,14 @@ _tc_qdisc_options() _tc_one_of_list 'dq_rate_estimator no_dq_rate_estimator' return 0 ;; + fq_pie) + _tc_once_attr 'limit flows target tupdate \ + alpha beta quantum memory_limit ecn_prob' + _tc_one_of_list 'ecn noecn' + _tc_one_of_list 'bytemode nobytemode' + _tc_one_of_list 'dq_rate_estimator no_dq_rate_estimator' + return 0 + ;; red) _tc_once_attr 'limit min max avpkt burst adaptive probability \ bandwidth ecn harddrop' diff --git a/devlink/devlink.c b/devlink/devlink.c index f48ff6c2d..6e2115b6c 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -3066,11 +3066,13 @@ static int cmd_dev_flash(struct dl *dl) /* In child, just execute the flash and pass returned * value through pipe once it is done. */ + int cc; + close(pipe_r); err = _mnlg_socket_send(dl->nlg, nlh); - write(pipe_w, &err, sizeof(err)); + cc = write(pipe_w, &err, sizeof(err)); close(pipe_w); - exit(0); + exit(cc != sizeof(err)); } close(pipe_w); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a8701bf7..657645801 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1045,9 +1045,9 @@ union bpf_attr { * supports redirection to the egress interface, and accepts no * flag at all. 
* - * The same effect can be attained with the more generic - * **bpf_redirect_map**\ (), which requires specific maps to be - * used but offers better performance. + * The same effect can also be attained with the more generic + * **bpf_redirect_map**\ (), which uses a BPF map to store the + * redirect target instead of providing it directly to the helper. * Return * For XDP, the helper returns **XDP_REDIRECT** on success or * **XDP_ABORTED** on error. For other program types, the values @@ -1611,13 +1611,11 @@ union bpf_attr { * the caller. Any higher bits in the *flags* argument must be * unset. * - * When used to redirect packets to net devices, this helper - * provides a high performance increase over **bpf_redirect**\ (). - * This is due to various implementation details of the underlying - * mechanisms, one of which is the fact that **bpf_redirect_map**\ - * () tries to send packet as a "bulk" to the device. + * See also bpf_redirect(), which only supports redirecting to an + * ifindex, but doesn't require a map to do so. * Return - * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. + * **XDP_REDIRECT** on success, or the value of the two lower bits + * of the *flags* argument on error. * * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 3ac436376..d78064007 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -87,6 +87,7 @@ #define NSFS_MAGIC 0x6e736673 #define BPF_FS_MAGIC 0xcafe4a11 #define AAFS_MAGIC 0x5a3c69f0 +#define ZONEFS_MAGIC 0x5a4f4653 /* Since UDF 2.01 is ISO 13346 based... 
*/ #define UDF_SUPER_MAGIC 0x15013346 diff --git a/ip/iplink_bridge.c b/ip/iplink_bridge.c index bbd6f3a88..3e81aa059 100644 --- a/ip/iplink_bridge.c +++ b/ip/iplink_bridge.c @@ -743,7 +743,7 @@ static void bridge_print_stats_attr(struct rtattr *attr, int ifindex) print_string(PRINT_FP, NULL, "%-16s ", ""); print_u64(PRINT_ANY, "tx_v1", "TX: v1 %llu ", mstats->igmp_v1reports[BR_MCAST_DIR_TX]); - print_u64(PRINT_ANY, "tx_v2", "v2 %llu", + print_u64(PRINT_ANY, "tx_v2", "v2 %llu ", mstats->igmp_v2reports[BR_MCAST_DIR_TX]); print_u64(PRINT_ANY, "tx_v3", "v3 %llu\n", mstats->igmp_v3reports[BR_MCAST_DIR_TX]); diff --git a/ip/iproute.c b/ip/iproute.c index 93b805c90..07c451694 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -933,9 +933,6 @@ int print_route(struct nlmsghdr *n, void *arg) if (tb[RTA_IIF] && filter.iifmask != -1) print_rta_if(fp, tb[RTA_IIF], "iif"); - if (tb[RTA_MULTIPATH]) - print_rta_multipath(fp, r, tb[RTA_MULTIPATH]); - if (tb[RTA_PREF]) print_rt_pref(fp, rta_getattr_u8(tb[RTA_PREF])); @@ -951,6 +948,14 @@ int print_route(struct nlmsghdr *n, void *arg) propagate ? 
"enabled" : "disabled"); } + if (tb[RTA_MULTIPATH]) + print_rta_multipath(fp, r, tb[RTA_MULTIPATH]); + + /* If you are adding new route RTA_XXXX then place it above + * the RTA_MULTIPATH else it will appear that the last nexthop + * in the ECMP has new attributes + */ + print_string(PRINT_FP, NULL, "\n", NULL); close_json_object(); fflush(fp); diff --git a/ip/link_gre.c b/ip/link_gre.c index 15beb7372..e42f21ae4 100644 --- a/ip/link_gre.c +++ b/ip/link_gre.c @@ -94,7 +94,7 @@ static int gre_parse_opt(struct link_util *lu, int argc, char **argv, __u8 metadata = 0; __u32 fwmark = 0; __u32 erspan_idx = 0; - __u8 erspan_ver = 0; + __u8 erspan_ver = 1; __u8 erspan_dir = 0; __u16 erspan_hwid = 0; diff --git a/ip/link_gre6.c b/ip/link_gre6.c index 9d1741bff..94a4ee700 100644 --- a/ip/link_gre6.c +++ b/ip/link_gre6.c @@ -106,7 +106,7 @@ static int gre_parse_opt(struct link_util *lu, int argc, char **argv, __u8 metadata = 0; __u32 fwmark = 0; __u32 erspan_idx = 0; - __u8 erspan_ver = 0; + __u8 erspan_ver = 1; __u8 erspan_dir = 0; __u16 erspan_hwid = 0; diff --git a/ip/xfrm_policy.c b/ip/xfrm_policy.c index 7c0233c18..d3c706d32 100644 --- a/ip/xfrm_policy.c +++ b/ip/xfrm_policy.c @@ -66,24 +66,9 @@ static void usage(void) "Usage: ip xfrm policy count\n" "Usage: ip xfrm policy set [ hthresh4 LBITS RBITS ] [ hthresh6 LBITS RBITS ]\n" "SELECTOR := [ src ADDR[/PLEN] ] [ dst ADDR[/PLEN] ] [ dev DEV ] [ UPSPEC ]\n" - "UPSPEC := proto { { "); - fprintf(stderr, "%s | %s | %s | %s } ", - strxf_proto(IPPROTO_TCP), - strxf_proto(IPPROTO_UDP), - strxf_proto(IPPROTO_SCTP), - strxf_proto(IPPROTO_DCCP)); - fprintf(stderr, - "[ sport PORT ] [ dport PORT ] |\n" - " { %s | %s | %s } ", - strxf_proto(IPPROTO_ICMP), - strxf_proto(IPPROTO_ICMPV6), - strxf_proto(IPPROTO_MH)); - fprintf(stderr, - "[ type NUMBER ] [ code NUMBER ] |\n" - " %s", - strxf_proto(IPPROTO_GRE)); - fprintf(stderr, - " [ key { DOTTED-QUAD | NUMBER } ] | PROTO }\n" + "UPSPEC := proto { { tcp | udp | sctp | dccp } [ sport 
PORT ] [ dport PORT ] |\n" + " { icmp | ipv6-icmp | mobility-header } [ type NUMBER ] [ code NUMBER ] |\n" + " gre [ key { DOTTED-QUAD | NUMBER } ] | PROTO }\n" "DIR := in | out | fwd\n" "PTYPE := main | sub\n" "ACTION := allow | block\n" diff --git a/ip/xfrm_state.c b/ip/xfrm_state.c index df2d50c38..d68f600ae 100644 --- a/ip/xfrm_state.c +++ b/ip/xfrm_state.c @@ -106,27 +106,9 @@ static void usage(void) "EXTRA-FLAG-LIST := [ EXTRA-FLAG-LIST ] EXTRA-FLAG\n" "EXTRA-FLAG := dont-encap-dscp\n" "SELECTOR := [ src ADDR[/PLEN] ] [ dst ADDR[/PLEN] ] [ dev DEV ] [ UPSPEC ]\n" - "UPSPEC := proto { { "); - fprintf(stderr, - "%s | %s | %s | %s", - strxf_proto(IPPROTO_TCP), - strxf_proto(IPPROTO_UDP), - strxf_proto(IPPROTO_SCTP), - strxf_proto(IPPROTO_DCCP)); - fprintf(stderr, - " } [ sport PORT ] [ dport PORT ] |\n" - " { "); - fprintf(stderr, - "%s | %s | %s", - strxf_proto(IPPROTO_ICMP), - strxf_proto(IPPROTO_ICMPV6), - strxf_proto(IPPROTO_MH)); - fprintf(stderr, - " } [ type NUMBER ] [ code NUMBER ] |\n"); - fprintf(stderr, - " %s", strxf_proto(IPPROTO_GRE)); - fprintf(stderr, - " [ key { DOTTED-QUAD | NUMBER } ] | PROTO }\n" + "UPSPEC := proto { { tcp | udp | sctp | dccp } [ sport PORT ] [ dport PORT ] |\n" + " { icmp | ipv6-icmp | mobility-header } [ type NUMBER ] [ code NUMBER ] |\n" + " gre [ key { DOTTED-QUAD | NUMBER } ] | PROTO }\n" "LIMIT-LIST := [ LIMIT-LIST ] limit LIMIT\n" "LIMIT := { time-soft | time-hard | time-use-soft | time-use-hard } SECONDS |\n" " { byte-soft | byte-hard } SIZE | { packet-soft | packet-hard } COUNT\n" @@ -1149,6 +1131,9 @@ static int xfrm_state_keep(struct nlmsghdr *n, void *arg) if (!xfrm_state_filter_match(xsinfo)) return 0; + if (xsinfo->id.proto == IPPROTO_IPIP) + return 0; + if (xb->offset > xb->size) { fprintf(stderr, "State buffer overflow\n"); return -1; diff --git a/man/man8/ip.8 b/man/man8/ip.8 index 1661aa678..1613f790a 100644 --- a/man/man8/ip.8 +++ b/man/man8/ip.8 @@ -22,7 +22,7 @@ ip \- show / manipulate routing, network 
devices, interfaces and tunnels .BR link " | " address " | " addrlabel " | " route " | " rule " | " neigh " | "\ ntable " | " tunnel " | " tuntap " | " maddress " | " mroute " | " mrule " | "\ monitor " | " xfrm " | " netns " | " l2tp " | " tcp_metrics " | " token " | "\ - macsec " }" + macsec " | " vrf " }" .sp .ti -8 @@ -313,6 +313,10 @@ readability. - manage TUN/TAP devices. .TP +.B vrf +- manage virtual routing and forwarding devices. + +.TP .B xfrm - manage IPSec policies. @@ -410,6 +414,7 @@ was written by Alexey N. Kuznetsov and added in Linux 2.2. .BR ip-tcp_metrics (8), .BR ip-token (8), .BR ip-tunnel (8), +.BR ip-vrf (8), .BR ip-xfrm (8) .br .RB "IP Command reference " ip-cref.ps diff --git a/man/man8/rdma-statistic.8 b/man/man8/rdma-statistic.8 index e3f4b51b1..7de495c91 100644 --- a/man/man8/rdma-statistic.8 +++ b/man/man8/rdma-statistic.8 @@ -9,7 +9,7 @@ rdma-statistic \- RDMA statistic counter configuration .B rdma .RI "[ " OPTIONS " ]" .B statistic -.RI " { " COMMAND " | " +.RI "{ " COMMAND " | " .BR help " }" .sp @@ -23,6 +23,7 @@ rdma-statistic \- RDMA statistic counter configuration .RI "[ " OBJECT " ]" .B show link .RI "[ " DEV/PORT_INDX " ]" +.RI "[ " FILTER_NAME " " FILTER_VALUE " ]" .ti -8 .B rdma statistic @@ -34,7 +35,7 @@ rdma-statistic \- RDMA statistic counter configuration .IR OBJECT .B set .IR COUNTER_SCOPE -.RI "[ " DEV/PORT_INDEX "]" +.RI "[ " DEV/PORT_INDEX " ]" .B auto .RI "{ " CRITERIA " | " .BR off " }" @@ -44,7 +45,7 @@ rdma-statistic \- RDMA statistic counter configuration .IR OBJECT .B bind .IR COUNTER_SCOPE -.RI "[ " DEV/PORT_INDEX "]" +.RI "[ " DEV/PORT_INDEX " ]" .RI "[ " OBJECT-ID " ]" .RI "[ " COUNTER-ID " ]" @@ -53,7 +54,7 @@ rdma-statistic \- RDMA statistic counter configuration .IR OBJECT .B unbind .IR COUNTER_SCOPE -.RI "[ " DEV/PORT_INDEX "]" +.RI "[ " DEV/PORT_INDEX " ]" .RI "[ " COUNTER-ID " ]" .RI "[ " OBJECT-ID " ]" @@ -69,6 +70,10 @@ rdma-statistic \- RDMA statistic counter configuration .IR CRITERIA " := " .RB 
"{ " type " }" +.ti -8 +.IR FILTER_NAME " := " +.RB "{ " cntn " | " lqpn " | " pid " }" + .SH "DESCRIPTION" .SS rdma statistic [object] show - Queries the specified RDMA device for RDMA and driver-specific statistics. Show the default hw counters if object is not specified @@ -79,6 +84,9 @@ rdma-statistic \- RDMA statistic counter configuration .I "PORT_INDEX" - specifies counters on this RDMA port to show. +.I "FILTER_NAME +- specifies a filter to show only the results matching it. + .SS rdma statistic <object> set - configure counter statistic auto-mode for a specific device/port In auto mode all objects belong to one category are bind automatically to a single counter set. Not applicable for MR's. diff --git a/man/man8/rdma.8 b/man/man8/rdma.8 index ef29b1c63..221bf3343 100644 --- a/man/man8/rdma.8 +++ b/man/man8/rdma.8 @@ -19,7 +19,7 @@ rdma \- RDMA tool .ti -8 .IR OBJECT " := { " -.BR dev " | " link " | " system " | " statistic " }" +.BR dev " | " link " | " resource " | " system " | " statistic " }" .sp .ti -8 @@ -71,6 +71,10 @@ Generate JSON output. - RDMA port related. .TP +.B resource +- RDMA resource configuration. + +.TP .B sys - RDMA subsystem related. diff --git a/man/man8/tc-fq_pie.8 b/man/man8/tc-fq_pie.8 new file mode 100644 index 000000000..457a56bb1 --- /dev/null +++ b/man/man8/tc-fq_pie.8 @@ -0,0 +1,166 @@ +.TH FQ-PIE 8 "23 January 2020" "iproute2" "Linux" + +.SH NAME + +FQ-PIE - Flow Queue Proportional Integral controller Enhanced + +.SH SYNOPSIS + +.B tc qdisc ... 
fq_pie +[ \fBlimit\fR PACKETS ] [ \fBflows\fR NUMBER ] +.br + \ +[ \fBtarget\fR TIME ] [ \fBtupdate\fR TIME ] +.br + \ +[ \fBalpha\fR NUMBER ] [ \fBbeta\fR NUMBER ] +.br + \ +[ \fBquantum\fR BYTES ] [ \fBmemory_limit\fR BYTES ] +.br + \ +[ \fBecn_prob\fR PERCENTAGE ] [ [\fBno\fR]\fBecn\fR ] +.br + \ +[ [\fBno\fR]\fBbytemode\fR ] [ [\fBno_\fR]\fBdq_rate_estimator\fR ] + +.SH DESCRIPTION +FQ-PIE (Flow Queuing with Proportional Integral controller Enhanced) is a +queuing discipline that combines Flow Queuing with the PIE AQM scheme. FQ-PIE +uses a Jenkins hash function to classify incoming packets into different flows +and is used to provide a fair share of the bandwidth to all the flows using the +qdisc. Each such flow is managed by the PIE algorithm. + +.SH ALGORITHM +The FQ-PIE algorithm consists of two logical parts: the scheduler which selects +which queue to dequeue a packet from, and the PIE AQM which works on each of the +queues. The major work of FQ-PIE is mostly in the scheduling part. The +interaction between the scheduler and the PIE algorithm is straightforward. + +During the enqueue stage, a hashing-based scheme is used, where flows are hashed +into a number of buckets with each bucket having its own queue. The number of +buckets is configurable, and presently defaults to 1024 in the implementation. +The flow hashing is performed on the 5-tuple of source and destination IP +addresses, port numbers and IP protocol number. Once the packet has been +successfully classified into a queue, it is handed over to the PIE algorithm +for enqueuing. It is then added to the tail of the selected queue, and the +queue's byte count is updated by the packet size. If the queue is not currently +active (i.e., if it is not in either the list of new or the list of old queues), +it is added to the end of the list of new queues, and its number of credits +is initiated to the configured quantum. Otherwise, the queue is left in its +current queue list. 
+ +During the dequeue stage, the scheduler first looks at the list of new queues; +for the queue at the head of that list, if that queue has a negative number of +credits (i.e., it has already dequeued at least a quantum of bytes), it is given +an additional quantum of credits, the queue is put onto the end of the list of +old queues, and the routine selects the next queue and starts again. Otherwise, +that queue is selected for dequeue again. If the list of new queues is empty, +the scheduler proceeds down the list of old queues in the same fashion +(checking the credits, and either selecting the queue for dequeuing, or adding +credits and putting the queue back at the end of the list). After having +selected a queue from which to dequeue a packet, the PIE algorithm is invoked +on that queue. + +Finally, if the PIE algorithm does not return a packet, then the queue must be +empty and the scheduler does one of two things: + +If the queue selected for dequeue came from the list of new queues, it is moved +to the end of the list of old queues. If instead it came from the list of old +queues, that queue is removed from the list, to be added back (as a new queue) +the next time a packet arrives that hashes to that queue. Then (since no packet +was available for dequeue), the whole dequeue process is restarted from the +beginning. + +If, instead, the scheduler did get a packet back from the PIE algorithm, it +subtracts the size of the packet from the byte credits for the selected queue +and returns the packet as the result of the dequeue operation. + +.SH PARAMETERS +.SS limit +It is the limit on the queue size in packets. Incoming packets are dropped when +the limit is reached. The default value is 10240 packets. + +.SS flows +It is the number of flows into which the incoming packets are classified. Due +to the stochastic nature of hashing, multiple flows may end up being hashed +into the same slot. Newer flows have priority over older ones. 
This +parameter can be set only at load time since memory has to be allocated for +the hash table. The default value is 1024. + +.SS target +It is the queue delay which the PIE algorithm tries to maintain. The default +target delay is 15ms. + +.SS tupdate +It is the time interval at which the system drop probability is calculated. +The default is 15ms. + +.SS alpha +.SS beta +alpha and beta are parameters chosen to control the drop probability. These +should be in the range between 0 and 32. + +.SS quantum +quantum signifies the number of bytes that may be dequeued from a queue before +switching to the next queue in the deficit round robin scheme. + +.SS memory_limit +It is the maximum total memory allowed for packets of all flows. The default is +32Mb. + +.SS ecn_prob +It is the drop probability threshold below which packets will be ECN marked +instead of getting dropped. The default is 10%. Setting this parameter requires +\fBecn\fR to be enabled. + +.SS \fR[\fBno\fR]\fBecn\fR +It has the same semantics as \fBpie\fR and can be used to mark packets +instead of dropping them. If \fBecn\fR has been enabled, \fBnoecn\fR can +be used to turn it off and vice versa. + +.SS \fR[\fBno\fR]\fBbytemode\fR +It is used to scale drop probability proportional to packet size: +\fBbytemode\fR to turn on bytemode, \fBnobytemode\fR to turn off +bytemode. By default, \fBbytemode\fR is turned off. + +.SS \fR[\fBno_\fR]\fBdq_rate_estimator\fR +\fBdq_rate_estimator\fR can be used to calculate queue delay using Little's +Law, \fBno_dq_rate_estimator\fR can be used to calculate queue delay +using timestamp. By default, \fBdq_rate_estimator\fR is turned off. 
+ +.SH EXAMPLES +# tc qdisc add dev eth0 root fq_pie +.br +# tc -s qdisc show dev eth0 +.br +qdisc fq_pie 8001: root refcnt 2 limit 10240p flows 1024 target 15.0ms tupdate +16.0ms alpha 2 beta 20 quantum 1514b memory_limit 32Mb ecn_prob 10 + Sent 159173586 bytes 105261 pkt (dropped 24, overlimits 0 requeues 0) + backlog 75700b 50p requeues 0 + pkts_in 105311 overlimit 0 overmemory 0 dropped 24 ecn_mark 0 + new_flow_count 7332 new_flows_len 0 old_flows_len 4 memory_used 108800 + +# tc qdisc add dev eth0 root fq_pie dq_rate_estimator +.br +# tc -s qdisc show dev eth0 +.br +qdisc fq_pie 8001: root refcnt 2 limit 10240p flows 1024 target 15.0ms tupdate +16.0ms alpha 2 beta 20 quantum 1514b memory_limit 32Mb ecn_prob 10 +dq_rate_estimator + Sent 8263620 bytes 5550 pkt (dropped 4, overlimits 0 requeues 0) + backlog 805448b 532p requeues 0 + pkts_in 6082 overlimit 0 overmemory 0 dropped 4 ecn_mark 0 + new_flow_count 94 new_flows_len 0 old_flows_len 8 memory_used 1157632 + +.SH SEE ALSO +.BR tc (8), +.BR tc-pie (8), +.BR tc-fq_codel (8) + +.SH SOURCES +RFC 8033: https://tools.ietf.org/html/rfc8033 + +.SH AUTHORS +FQ-PIE was implemented by Mohit P. Tahiliani. Please report corrections to the +Linux Networking mailing list <netdev@vger.kernel.org>. diff --git a/man/man8/tc.8 b/man/man8/tc.8 index 39976ad71..e8e0cd0fe 100644 --- a/man/man8/tc.8 +++ b/man/man8/tc.8 @@ -284,6 +284,13 @@ bandwidth to all the flows using the queue. Each such flow is managed by the CoDel queuing discipline. Reordering within a flow is avoided since Codel internally uses a FIFO queue. .TP +fq_pie +FQ-PIE (Flow Queuing with Proportional Integral controller Enhanced) is a +queuing discipline that combines Flow Queuing with the PIE AQM scheme. FQ-PIE +uses a Jenkins hash function to classify incoming packets into different flows +and is used to provide a fair share of the bandwidth to all the flows using the +qdisc. Each such flow is managed by the PIE algorithm. 
+.TP gred Generalized Random Early Detection combines multiple RED queues in order to achieve multiple drop priorities. This is required to realize Assured @@ -855,6 +862,7 @@ was written by Alexey N. Kuznetsov and added in Linux 2.2. .BR tc-flower (8), .BR tc-fq (8), .BR tc-fq_codel (8), +.BR tc-fq_pie (8), .BR tc-fw (8), .BR tc-hfsc (7), .BR tc-hfsc (8), diff --git a/misc/nstat.c b/misc/nstat.c index 23113b223..425e75ef4 100644 --- a/misc/nstat.c +++ b/misc/nstat.c @@ -142,14 +142,19 @@ static void load_good_table(FILE *fp) } /* idbuf is as big as buf, so this is safe */ nr = sscanf(buf, "%s%llu%lg", idbuf, &val, &rate); - if (nr < 2) - abort(); + if (nr < 2) { + fprintf(stderr, "%s:%d: error parsing history file\n", + __FILE__, __LINE__); + exit(-2); + } if (nr < 3) rate = 0; if (useless_number(idbuf)) continue; - if ((n = malloc(sizeof(*n))) == NULL) - abort(); + if ((n = malloc(sizeof(*n))) == NULL) { + perror("nstat: malloc"); + exit(-1); + } n->id = strdup(idbuf); n->val = val; n->rate = rate; @@ -190,8 +195,11 @@ static void load_ugly_table(FILE *fp) int count1, count2, skip = 0; p = strchr(buf, ':'); - if (!p) - abort(); + if (!p) { + fprintf(stderr, "%s:%d: error parsing history file\n", + __FILE__, __LINE__); + exit(-2); + } count1 = count_spaces(buf); *p = 0; idbuf[0] = 0; @@ -211,8 +219,10 @@ static void load_ugly_table(FILE *fp) strncat(idbuf, p, sizeof(idbuf) - off - 1); } n = malloc(sizeof(*n)); - if (!n) - abort(); + if (!n) { + perror("nstat: malloc"); + exit(-1); + } n->id = strdup(idbuf); n->rate = 0; n->next = db; @@ -221,18 +231,27 @@ static void load_ugly_table(FILE *fp) } n = db; nread = getline(&buf, &buflen, fp); - if (nread == -1) - abort(); + if (nread == -1) { + fprintf(stderr, "%s:%d: error parsing history file\n", + __FILE__, __LINE__); + exit(-2); + } count2 = count_spaces(buf); if (count2 > count1) skip = count2 - count1; do { p = strrchr(buf, ' '); - if (!p) - abort(); + if (!p) { + fprintf(stderr, "%s:%d: error parsing history 
file\n", + __FILE__, __LINE__); + exit(-2); + } *p = 0; - if (sscanf(p+1, "%llu", &n->val) != 1) - abort(); + if (sscanf(p+1, "%llu", &n->val) != 1) { + fprintf(stderr, "%s:%d: error parsing history file\n", + __FILE__, __LINE__); + exit(-2); + } /* Trick to skip "dummy" trailing ICMP MIB in 2.4 */ if (skip) skip--; diff --git a/rdma/stat.c b/rdma/stat.c index 2f5752870..8d4b7a116 100644 --- a/rdma/stat.c +++ b/rdma/stat.c @@ -23,6 +23,7 @@ static int stat_help(struct rd *rd) pr_out("where OBJECT: = { qp }\n"); pr_out(" CRITERIA : = { type }\n"); pr_out(" COUNTER_SCOPE: = { link | dev }\n"); + pr_out(" FILTER_NAME: = { cntn | lqpn | pid }\n"); pr_out("Examples:\n"); pr_out(" %s statistic qp show\n", rd->filename); pr_out(" %s statistic qp show link mlx5_2/1\n", rd->filename); diff --git a/tc/Makefile b/tc/Makefile index f06ba14b2..e31cbc12e 100644 --- a/tc/Makefile +++ b/tc/Makefile @@ -70,6 +70,7 @@ TCMODULES += q_codel.o TCMODULES += q_fq_codel.o TCMODULES += q_fq.o TCMODULES += q_pie.o +TCMODULES += q_fq_pie.o TCMODULES += q_cake.o TCMODULES += q_hhf.o TCMODULES += q_clsact.o diff --git a/tc/q_fq_pie.c b/tc/q_fq_pie.c new file mode 100644 index 000000000..c136cd1af --- /dev/null +++ b/tc/q_fq_pie.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Flow Queue PIE + * + * Copyright (C) 2019 Mohit P. Tahiliani <tahiliani@nitk.edu.in> + * Copyright (C) 2019 Sachin D. Patil <sdp.sachin@gmail.com> + * Copyright (C) 2019 V. Saicharan <vsaicharan1998@gmail.com> + * Copyright (C) 2019 Mohit Bhasi <mohitbhasi1998@gmail.com> + * Copyright (C) 2019 Leslie Monis <lesliemonis@gmail.com> + * Copyright (C) 2019 Gautam Ramakrishnan <gautamramk@gmail.com> + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr, + "Usage: ... 
fq_pie [ limit PACKETS ] [ flows NUMBER ]\n" + " [ target TIME ] [ tupdate TIME ]\n" + " [ alpha NUMBER ] [ beta NUMBER ]\n" + " [ quantum BYTES ] [ memory_limit BYTES ]\n" + " [ ecn_prob PERCENTAGE ] [ [no]ecn ]\n" + " [ [no]bytemode ] [ [no_]dq_rate_estimator ]\n"); +} + +#define ALPHA_MAX 32 +#define BETA_MAX 32 + +static int fq_pie_parse_opt(struct qdisc_util *qu, int argc, char **argv, + struct nlmsghdr *n, const char *dev) +{ + unsigned int limit = 0; + unsigned int flows = 0; + unsigned int target = 0; + unsigned int tupdate = 0; + unsigned int alpha = 0; + unsigned int beta = 0; + unsigned int quantum = 0; + unsigned int memory_limit = 0; + unsigned int ecn_prob = 0; + int ecn = -1; + int bytemode = -1; + int dq_rate_estimator = -1; + struct rtattr *tail; + + while (argc > 0) { + if (strcmp(*argv, "limit") == 0) { + NEXT_ARG(); + if (get_unsigned(&limit, *argv, 0)) { + fprintf(stderr, "Illegal \"limit\"\n"); + return -1; + } + } else if (strcmp(*argv, "flows") == 0) { + NEXT_ARG(); + if (get_unsigned(&flows, *argv, 0)) { + fprintf(stderr, "Illegal \"flows\"\n"); + return -1; + } + } else if (strcmp(*argv, "target") == 0) { + NEXT_ARG(); + if (get_time(&target, *argv)) { + fprintf(stderr, "Illegal \"target\"\n"); + return -1; + } + } else if (strcmp(*argv, "tupdate") == 0) { + NEXT_ARG(); + if (get_time(&tupdate, *argv)) { + fprintf(stderr, "Illegal \"tupdate\"\n"); + return -1; + } + } else if (strcmp(*argv, "alpha") == 0) { + NEXT_ARG(); + if (get_unsigned(&alpha, *argv, 0) || + alpha > ALPHA_MAX) { + fprintf(stderr, "Illegal \"alpha\"\n"); + return -1; + } + } else if (strcmp(*argv, "beta") == 0) { + NEXT_ARG(); + if (get_unsigned(&beta, *argv, 0) || + beta > BETA_MAX) { + fprintf(stderr, "Illegal \"beta\"\n"); + return -1; + } + } else if (strcmp(*argv, "quantum") == 0) { + NEXT_ARG(); + if (get_size(&quantum, *argv)) { + fprintf(stderr, "Illegal \"quantum\"\n"); + return -1; + } + } else if (strcmp(*argv, "memory_limit") == 0) { + NEXT_ARG(); + if 
(get_size(&memory_limit, *argv)) { + fprintf(stderr, "Illegal \"memory_limit\"\n"); + return -1; + } + } else if (strcmp(*argv, "ecn_prob") == 0) { + NEXT_ARG(); + if (get_unsigned(&ecn_prob, *argv, 0) || + ecn_prob >= 100) { + fprintf(stderr, "Illegal \"ecn_prob\"\n"); + return -1; + } + } else if (strcmp(*argv, "ecn") == 0) { + ecn = 1; + } else if (strcmp(*argv, "noecn") == 0) { + ecn = 0; + } else if (strcmp(*argv, "bytemode") == 0) { + bytemode = 1; + } else if (strcmp(*argv, "nobytemode") == 0) { + bytemode = 0; + } else if (strcmp(*argv, "dq_rate_estimator") == 0) { + dq_rate_estimator = 1; + } else if (strcmp(*argv, "no_dq_rate_estimator") == 0) { + dq_rate_estimator = 0; + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + + argc--; + argv++; + } + + tail = addattr_nest(n, 1024, TCA_OPTIONS | NLA_F_NESTED); + if (limit) + addattr_l(n, 1024, TCA_FQ_PIE_LIMIT, &limit, sizeof(limit)); + if (flows) + addattr_l(n, 1024, TCA_FQ_PIE_FLOWS, &flows, sizeof(flows)); + if (target) + addattr_l(n, 1024, TCA_FQ_PIE_TARGET, &target, sizeof(target)); + if (tupdate) + addattr_l(n, 1024, TCA_FQ_PIE_TUPDATE, &tupdate, + sizeof(tupdate)); + if (alpha) + addattr_l(n, 1024, TCA_FQ_PIE_ALPHA, &alpha, sizeof(alpha)); + if (beta) + addattr_l(n, 1024, TCA_FQ_PIE_BETA, &beta, sizeof(beta)); + if (quantum) + addattr_l(n, 1024, TCA_FQ_PIE_QUANTUM, &quantum, + sizeof(quantum)); + if (memory_limit) + addattr_l(n, 1024, TCA_FQ_PIE_MEMORY_LIMIT, &memory_limit, + sizeof(memory_limit)); + if (ecn_prob) + addattr_l(n, 1024, TCA_FQ_PIE_ECN_PROB, &ecn_prob, + sizeof(ecn_prob)); + if (ecn != -1) + addattr_l(n, 1024, TCA_FQ_PIE_ECN, &ecn, sizeof(ecn)); + if (bytemode != -1) + addattr_l(n, 1024, TCA_FQ_PIE_BYTEMODE, &bytemode, + sizeof(bytemode)); + if (dq_rate_estimator != -1) + addattr_l(n, 1024, TCA_FQ_PIE_DQ_RATE_ESTIMATOR, + &dq_rate_estimator, sizeof(dq_rate_estimator)); + 
addattr_nest_end(n, tail); + + return 0; +} + +static int fq_pie_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + struct rtattr *tb[TCA_FQ_PIE_MAX + 1]; + unsigned int limit = 0; + unsigned int flows = 0; + unsigned int target = 0; + unsigned int tupdate = 0; + unsigned int alpha = 0; + unsigned int beta = 0; + unsigned int quantum = 0; + unsigned int memory_limit = 0; + unsigned int ecn_prob = 0; + int ecn = -1; + int bytemode = -1; + int dq_rate_estimator = -1; + + SPRINT_BUF(b1); + + if (opt == NULL) + return 0; + + parse_rtattr_nested(tb, TCA_FQ_PIE_MAX, opt); + + if (tb[TCA_FQ_PIE_LIMIT] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_LIMIT]) >= sizeof(__u32)) { + limit = rta_getattr_u32(tb[TCA_FQ_PIE_LIMIT]); + print_uint(PRINT_ANY, "limit", "limit %up ", limit); + } + if (tb[TCA_FQ_PIE_FLOWS] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_FLOWS]) >= sizeof(__u32)) { + flows = rta_getattr_u32(tb[TCA_FQ_PIE_FLOWS]); + print_uint(PRINT_ANY, "flows", "flows %u ", flows); + } + if (tb[TCA_FQ_PIE_TARGET] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_TARGET]) >= sizeof(__u32)) { + target = rta_getattr_u32(tb[TCA_FQ_PIE_TARGET]); + print_uint(PRINT_JSON, "target", NULL, target); + print_string(PRINT_FP, NULL, "target %s ", + sprint_time(target, b1)); + } + if (tb[TCA_FQ_PIE_TUPDATE] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_TUPDATE]) >= sizeof(__u32)) { + tupdate = rta_getattr_u32(tb[TCA_FQ_PIE_TUPDATE]); + print_uint(PRINT_JSON, "tupdate", NULL, tupdate); + print_string(PRINT_FP, NULL, "tupdate %s ", + sprint_time(tupdate, b1)); + } + if (tb[TCA_FQ_PIE_ALPHA] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_ALPHA]) >= sizeof(__u32)) { + alpha = rta_getattr_u32(tb[TCA_FQ_PIE_ALPHA]); + print_uint(PRINT_ANY, "alpha", "alpha %u ", alpha); + } + if (tb[TCA_FQ_PIE_BETA] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_BETA]) >= sizeof(__u32)) { + beta = rta_getattr_u32(tb[TCA_FQ_PIE_BETA]); + print_uint(PRINT_ANY, "beta", "beta %u ", beta); + } + if (tb[TCA_FQ_PIE_QUANTUM] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_QUANTUM]) >= sizeof(__u32)) { + quantum = 
rta_getattr_u32(tb[TCA_FQ_PIE_QUANTUM]); + print_uint(PRINT_JSON, "quantum", NULL, quantum); + print_string(PRINT_FP, NULL, "quantum %s ", + sprint_size(quantum, b1)); + } + if (tb[TCA_FQ_PIE_MEMORY_LIMIT] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_MEMORY_LIMIT]) >= sizeof(__u32)) { + memory_limit = rta_getattr_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]); + print_uint(PRINT_JSON, "memory_limit", NULL, memory_limit); + print_string(PRINT_FP, NULL, "memory_limit %s ", + sprint_size(memory_limit, b1)); + } + if (tb[TCA_FQ_PIE_ECN_PROB] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_ECN_PROB]) >= sizeof(__u32)) { + ecn_prob = rta_getattr_u32(tb[TCA_FQ_PIE_ECN_PROB]); + print_uint(PRINT_ANY, "ecn_prob", "ecn_prob %u ", ecn_prob); + } + if (tb[TCA_FQ_PIE_ECN] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_ECN]) >= sizeof(__u32)) { + ecn = rta_getattr_u32(tb[TCA_FQ_PIE_ECN]); + if (ecn) + print_bool(PRINT_ANY, "ecn", "ecn ", true); + } + if (tb[TCA_FQ_PIE_BYTEMODE] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_BYTEMODE]) >= sizeof(__u32)) { + bytemode = rta_getattr_u32(tb[TCA_FQ_PIE_BYTEMODE]); + if (bytemode) + print_bool(PRINT_ANY, "bytemode", "bytemode ", true); + } + if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]) >= sizeof(__u32)) { + dq_rate_estimator = + rta_getattr_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]); + if (dq_rate_estimator) + print_bool(PRINT_ANY, "dq_rate_estimator", + "dq_rate_estimator ", true); + } + + return 0; +} + +static int fq_pie_print_xstats(struct qdisc_util *qu, FILE *f, + struct rtattr *xstats) +{ + struct tc_fq_pie_xstats _st = {}, *st; + + if (xstats == NULL) + return 0; + + st = RTA_DATA(xstats); + if (RTA_PAYLOAD(xstats) < sizeof(*st)) { + memcpy(&_st, st, RTA_PAYLOAD(xstats)); + st = &_st; + } + + print_uint(PRINT_ANY, "pkts_in", " pkts_in %u", + st->packets_in); + print_uint(PRINT_ANY, "overlimit", " overlimit %u", + st->overlimit); + print_uint(PRINT_ANY, "overmemory", " overmemory %u", + st->overmemory); + print_uint(PRINT_ANY, "dropped", " dropped %u", + 
st->dropped); + print_uint(PRINT_ANY, "ecn_mark", " ecn_mark %u", + st->ecn_mark); + print_nl(); + print_uint(PRINT_ANY, "new_flow_count", " new_flow_count %u", + st->new_flow_count); + print_uint(PRINT_ANY, "new_flows_len", " new_flows_len %u", + st->new_flows_len); + print_uint(PRINT_ANY, "old_flows_len", " old_flows_len %u", + st->old_flows_len); + print_uint(PRINT_ANY, "memory_used", " memory_used %u", + st->memory_usage); + + return 0; + +} + +struct qdisc_util fq_pie_qdisc_util = { + .id = "fq_pie", + .parse_qopt = fq_pie_parse_opt, + .print_qopt = fq_pie_print_opt, + .print_xstats = fq_pie_print_xstats, +}; diff --git a/testsuite/tests/ss/ssfilter.t b/testsuite/tests/ss/ssfilter.t index 3091054f2..4c2315ca7 100755 --- a/testsuite/tests/ss/ssfilter.t +++ b/testsuite/tests/ss/ssfilter.t @@ -12,37 +12,37 @@ export TCPDIAG_FILE="$(dirname $0)/ss1.dump" ts_log "[Testing ssfilter]" ts_ss "$0" "Match dport = 22" -Htna dport = 22 -test_on "ESTAB 0 0 10.0.0.1:36266 10.0.0.1:22" +test_on "ESTAB 0 0 10.0.0.1:36266 10.0.0.1:22" ts_ss "$0" "Match dport 22" -Htna dport 22 -test_on "ESTAB 0 0 10.0.0.1:36266 10.0.0.1:22" +test_on "ESTAB 0 0 10.0.0.1:36266 10.0.0.1:22" ts_ss "$0" "Match (dport)" -Htna '( dport = 22 )' -test_on "ESTAB 0 0 10.0.0.1:36266 10.0.0.1:22" +test_on "ESTAB 0 0 10.0.0.1:36266 10.0.0.1:22" ts_ss "$0" "Match src = 0.0.0.0" -Htna src = 0.0.0.0 -test_on "LISTEN 0 128 0.0.0.0:22 0.0.0.0:*" +test_on "LISTEN 0 128 0.0.0.0:22 0.0.0.0:\*" ts_ss "$0" "Match src 0.0.0.0" -Htna src 0.0.0.0 -test_on "LISTEN 0 128 0.0.0.0:22 0.0.0.0:*" +test_on "LISTEN 0 128 0.0.0.0:22 0.0.0.0:\*" ts_ss "$0" "Match src sport" -Htna src 0.0.0.0 sport = 22 -test_on "LISTEN 0 128 0.0.0.0:22 0.0.0.0:*" +test_on "LISTEN 0 128 0.0.0.0:22 0.0.0.0:\*" ts_ss "$0" "Match src and sport" -Htna src 0.0.0.0 and sport = 22 -test_on "LISTEN 0 128 0.0.0.0:22 0.0.0.0:*" +test_on "LISTEN 0 128 0.0.0.0:22 0.0.0.0:\*" ts_ss "$0" "Match src and sport and dport" -Htna src 10.0.0.1 and sport = 22 and 
dport = 50312 -test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" +test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" ts_ss "$0" "Match src and sport and (dport)" -Htna 'src 10.0.0.1 and sport = 22 and ( dport = 50312 )' -test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" +test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" ts_ss "$0" "Match src and (sport and dport)" -Htna 'src 10.0.0.1 and ( sport = 22 and dport = 50312 )' -test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" +test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" ts_ss "$0" "Match (src and sport) and dport" -Htna '( src 10.0.0.1 and sport = 22 ) and dport = 50312' -test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" +test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" ts_ss "$0" "Match (src or src) and dst" -Htna '( src 0.0.0.0 or src 10.0.0.1 ) and dst 10.0.0.2' -test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" +test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" |