diff options
-rw-r--r-- | Documentation/networking/snmp_counter.rst | 240 | ||||
-rw-r--r-- | drivers/isdn/capi/kcapi.c | 4 | ||||
-rw-r--r-- | drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 7 | ||||
-rw-r--r-- | drivers/net/ethernet/huawei/hinic/hinic_main.c | 6 | ||||
-rw-r--r-- | drivers/net/hamradio/6pack.c | 16 | ||||
-rw-r--r-- | drivers/net/tap.c | 3 | ||||
-rw-r--r-- | include/linux/ptr_ring.h | 2 | ||||
-rw-r--r-- | include/net/ip_tunnels.h | 20 | ||||
-rw-r--r-- | include/net/sock.h | 38 | ||||
-rw-r--r-- | net/compat.c | 15 | ||||
-rw-r--r-- | net/core/sock.c | 15 | ||||
-rw-r--r-- | net/ipv4/ip_gre.c | 9 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel.c | 9 | ||||
-rw-r--r-- | net/ipv4/ip_vti.c | 12 | ||||
-rw-r--r-- | net/ipv6/ip6_gre.c | 10 | ||||
-rw-r--r-- | net/ipv6/ip6_tunnel.c | 10 | ||||
-rw-r--r-- | net/ipv6/ip6_vti.c | 8 | ||||
-rw-r--r-- | net/ipv6/ip6mr.c | 17 | ||||
-rw-r--r-- | net/ipv6/route.c | 4 | ||||
-rw-r--r-- | net/ipv6/sit.c | 3 | ||||
-rw-r--r-- | net/rds/tcp.c | 2 | ||||
-rw-r--r-- | net/sunrpc/svcsock.c | 2 |
22 files changed, 383 insertions, 69 deletions
diff --git a/Documentation/networking/snmp_counter.rst b/Documentation/networking/snmp_counter.rst index f8eb77ddbd4403..b0dfdaaca512b3 100644 --- a/Documentation/networking/snmp_counter.rst +++ b/Documentation/networking/snmp_counter.rst @@ -571,7 +571,97 @@ duplicate packet is received. * TcpExtTCPDSACKOfoRecv The TCP stack receives a DSACK, which indicate an out of order -duplciate packet is received. +duplicate packet is received. + +TCP out of order +=============== +* TcpExtTCPOFOQueue +The TCP layer receives an out of order packet and has enough memory +to queue it. + +* TcpExtTCPOFODrop +The TCP layer receives an out of order packet but doesn't have enough +memory, so drops it. Such packets won't be counted into +TcpExtTCPOFOQueue. + +* TcpExtTCPOFOMerge +The received out of order packet has an overlay with the previous +packet. the overlay part will be dropped. All of TcpExtTCPOFOMerge +packets will also be counted into TcpExtTCPOFOQueue. + +TCP PAWS +======= +PAWS (Protection Against Wrapped Sequence numbers) is an algorithm +which is used to drop old packets. It depends on the TCP +timestamps. For detail information, please refer the `timestamp wiki`_ +and the `RFC of PAWS`_. + +.. _RFC of PAWS: https://tools.ietf.org/html/rfc1323#page-17 +.. _timestamp wiki: https://en.wikipedia.org/wiki/Transmission_Control_Protocol#TCP_timestamps + +* TcpExtPAWSActive +Packets are dropped by PAWS in Syn-Sent status. + +* TcpExtPAWSEstab +Packets are dropped by PAWS in any status other than Syn-Sent. + +TCP ACK skip +=========== +In some scenarios, kernel would avoid sending duplicate ACKs too +frequently. Please find more details in the tcp_invalid_ratelimit +section of the `sysctl document`_. When kernel decides to skip an ACK +due to tcp_invalid_ratelimit, kernel would update one of below +counters to indicate the ACK is skipped in which scenario. The ACK +would only be skipped if the received packet is either a SYN packet or +it has no data. + +.. 
_sysctl document: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt + +* TcpExtTCPACKSkippedSynRecv +The ACK is skipped in Syn-Recv status. The Syn-Recv status means the +TCP stack receives a SYN and replies SYN+ACK. Now the TCP stack is +waiting for an ACK. Generally, the TCP stack doesn't need to send ACK +in the Syn-Recv status. But in several scenarios, the TCP stack needs +to send an ACK. E.g., the TCP stack receives the same SYN packet +repeatedly, the received packet does not pass the PAWS check, or the +received packet sequence number is out of window. In these scenarios, +the TCP stack needs to send ACK. If the ACK sending frequency is higher than +tcp_invalid_ratelimit allows, the TCP stack will skip sending ACK and +increase TcpExtTCPACKSkippedSynRecv. + + +* TcpExtTCPACKSkippedPAWS +The ACK is skipped due to PAWS (Protection Against Wrapped Sequence +numbers) check fails. If the PAWS check fails in Syn-Recv, Fin-Wait-2 +or Time-Wait statuses, the skipped ACK would be counted to +TcpExtTCPACKSkippedSynRecv, TcpExtTCPACKSkippedFinWait2 or +TcpExtTCPACKSkippedTimeWait. In all other statuses, the skipped ACK +would be counted to TcpExtTCPACKSkippedPAWS. + +* TcpExtTCPACKSkippedSeq +The sequence number is out of window and the timestamp passes the PAWS +check and the TCP status is not Syn-Recv, Fin-Wait-2, and Time-Wait. + +* TcpExtTCPACKSkippedFinWait2 +The ACK is skipped in Fin-Wait-2 status, the reason would be either +PAWS check fails or the received sequence number is out of window. + +* TcpExtTCPACKSkippedTimeWait +The ACK is skipped in Time-Wait status, the reason would be either +PAWS check failed or the received sequence number is out of window. + +* TcpExtTCPACKSkippedChallenge +The ACK is skipped if the ACK is a challenge ACK. The RFC 5961 defines +3 kinds of challenge ACK, please refer to `RFC 5961 section 3.2`_, +`RFC 5961 section 4.2`_ and `RFC 5961 section 5.2`_.
Besides these +three scenarios, In some TCP status, the linux TCP stack would also +send challenge ACKs if the ACK number is before the first +unacknowledged number (more strict than `RFC 5961 section 5.2`_). + +.. _RFC 5961 section 3.2: https://tools.ietf.org/html/rfc5961#page-7 +.. _RFC 5961 section 4.2: https://tools.ietf.org/html/rfc5961#page-9 +.. _RFC 5961 section 5.2: https://tools.ietf.org/html/rfc5961#page-11 + examples ======= @@ -1188,3 +1278,151 @@ Run nstat on server B:: We have deleted the default route on server B. Server B couldn't find a route for the 8.8.8.8 IP address, so server B increased IpOutNoRoutes. + +TcpExtTCPACKSkippedSynRecv +------------------------ +In this test, we send 3 same SYN packets from client to server. The +first SYN will let server create a socket, set it to Syn-Recv status, +and reply a SYN/ACK. The second SYN will let server reply the SYN/ACK +again, and record the reply time (the duplicate ACK reply time). The +third SYN will let server check the previous duplicate ACK reply time, +and decide to skip the duplicate ACK, then increase the +TcpExtTCPACKSkippedSynRecv counter. + +Run tcpdump to capture a SYN packet:: + + nstatuser@nstat-a:~$ sudo tcpdump -c 1 -w /tmp/syn.pcap port 9000 + tcpdump: listening on ens3, link-type EN10MB (Ethernet), capture size 262144 bytes + +Open another terminal, run nc command:: + + nstatuser@nstat-a:~$ nc nstat-b 9000 + +As the nstat-b didn't listen on port 9000, it should reply a RST, and +the nc command exited immediately. It was enough for the tcpdump +command to capture a SYN packet. A linux server might use hardware +offload for the TCP checksum, so the checksum in the /tmp/syn.pcap +might be not correct. 
We call tcprewrite to fix it:: + + nstatuser@nstat-a:~$ tcprewrite --infile=/tmp/syn.pcap --outfile=/tmp/syn_fixcsum.pcap --fixcsum + +On nstat-b, we run nc to listen on port 9000:: + + nstatuser@nstat-b:~$ nc -lkv 9000 + Listening on [0.0.0.0] (family 0, port 9000) + +On nstat-a, we blocked the packet from port 9000, or nstat-a would send +RST to nstat-b:: + + nstatuser@nstat-a:~$ sudo iptables -A INPUT -p tcp --sport 9000 -j DROP + +Send 3 SYN repeatedly to nstat-b:: + + nstatuser@nstat-a:~$ for i in {1..3}; do sudo tcpreplay -i ens3 /tmp/syn_fixcsum.pcap; done + +Check snmp counter on nstat-b:: + + nstatuser@nstat-b:~$ nstat | grep -i skip + TcpExtTCPACKSkippedSynRecv 1 0.0 + +As we expected, TcpExtTCPACKSkippedSynRecv is 1. + +TcpExtTCPACKSkippedPAWS +---------------------- +To trigger PAWS, we could send an old SYN. + +On nstat-b, let nc listen on port 9000:: + + nstatuser@nstat-b:~$ nc -lkv 9000 + Listening on [0.0.0.0] (family 0, port 9000) + +On nstat-a, run tcpdump to capture a SYN:: + + nstatuser@nstat-a:~$ sudo tcpdump -w /tmp/paws_pre.pcap -c 1 port 9000 + tcpdump: listening on ens3, link-type EN10MB (Ethernet), capture size 262144 bytes + +On nstat-a, run nc as a client to connect nstat-b:: + + nstatuser@nstat-a:~$ nc -v nstat-b 9000 + Connection to nstat-b 9000 port [tcp/*] succeeded! + +Now the tcpdump has captured the SYN and exited. We should fix the +checksum:: + + nstatuser@nstat-a:~$ tcprewrite --infile /tmp/paws_pre.pcap --outfile /tmp/paws.pcap --fixcsum + +Send the SYN packet twice:: + + nstatuser@nstat-a:~$ for i in {1..2}; do sudo tcpreplay -i ens3 /tmp/paws.pcap; done + +On nstat-b, check the snmp counter:: + + nstatuser@nstat-b:~$ nstat | grep -i skip + TcpExtTCPACKSkippedPAWS 1 0.0 + +We sent two SYN via tcpreplay, both of them would let PAWS check +failed, the nstat-b replied an ACK for the first SYN, skipped the ACK +for the second SYN, and updated TcpExtTCPACKSkippedPAWS.
+ +TcpExtTCPACKSkippedSeq +-------------------- +To trigger TcpExtTCPACKSkippedSeq, we send packets which have valid +timestamp (to pass PAWS check) but the sequence number is out of +window. The linux TCP stack would avoid skipping the ACK if the packet has +data, so we need a pure ACK packet. To generate such a packet, we +could create two sockets: one on port 9000, another on port 9001. Then +we capture an ACK on port 9001, change the source/destination port +numbers to match the port 9000 socket. Then we could trigger +TcpExtTCPACKSkippedSeq via this packet. + +On nstat-b, open two terminals, run two nc commands to listen on both +port 9000 and port 9001:: + + nstatuser@nstat-b:~$ nc -lkv 9000 + Listening on [0.0.0.0] (family 0, port 9000) + + nstatuser@nstat-b:~$ nc -lkv 9001 + Listening on [0.0.0.0] (family 0, port 9001) + +On nstat-a, run two nc clients:: + + nstatuser@nstat-a:~$ nc -v nstat-b 9000 + Connection to nstat-b 9000 port [tcp/*] succeeded! + + nstatuser@nstat-a:~$ nc -v nstat-b 9001 + Connection to nstat-b 9001 port [tcp/*] succeeded! + +On nstat-a, run tcpdump to capture an ACK:: + + nstatuser@nstat-a:~$ sudo tcpdump -w /tmp/seq_pre.pcap -c 1 dst port 9001 + tcpdump: listening on ens3, link-type EN10MB (Ethernet), capture size 262144 bytes + +On nstat-b, send a packet via the port 9001 socket. E.g. we sent a +string 'foo' in our example:: + + nstatuser@nstat-b:~$ nc -lkv 9001 + Listening on [0.0.0.0] (family 0, port 9001) + Connection from nstat-a 42132 received! + foo + +On nstat-a, the tcpdump should have captured the ACK.
We should check +the source port numbers of the two nc clients:: + + nstatuser@nstat-a:~$ ss -ta '( dport = :9000 || dport = :9001 )' | tee + State Recv-Q Send-Q Local Address:Port Peer Address:Port + ESTAB 0 0 192.168.122.250:50208 192.168.122.251:9000 + ESTAB 0 0 192.168.122.250:42132 192.168.122.251:9001 + +Run tcprewrite, change port 9001 to port 9000, chagne port 42132 to +port 50208:: + + nstatuser@nstat-a:~$ tcprewrite --infile /tmp/seq_pre.pcap --outfile /tmp/seq.pcap -r 9001:9000 -r 42132:50208 --fixcsum + +Now the /tmp/seq.pcap is the packet we need. Send it to nstat-b:: + + nstatuser@nstat-a:~$ for i in {1..2}; do sudo tcpreplay -i ens3 /tmp/seq.pcap; done + +Check TcpExtTCPACKSkippedSeq on nstat-b:: + + nstatuser@nstat-b:~$ nstat | grep -i skip + TcpExtTCPACKSkippedSeq 1 0.0 diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index 0ff517d3c98f98..a4ceb61c5b6035 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -852,7 +852,7 @@ u16 capi20_get_manufacturer(u32 contr, u8 *buf) u16 ret; if (contr == 0) { - strlcpy(buf, capi_manufakturer, CAPI_MANUFACTURER_LEN); + strncpy(buf, capi_manufakturer, CAPI_MANUFACTURER_LEN); return CAPI_NOERROR; } @@ -860,7 +860,7 @@ u16 capi20_get_manufacturer(u32 contr, u8 *buf) ctr = get_capi_ctr_by_nr(contr); if (ctr && ctr->state == CAPI_CTR_RUNNING) { - strlcpy(buf, ctr->manu, CAPI_MANUFACTURER_LEN); + strncpy(buf, ctr->manu, CAPI_MANUFACTURER_LEN); ret = CAPI_NOERROR; } else ret = CAPI_REGNOTINSTALLED; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index d3b9aaf96c1c30..07cd58798083cf 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -3995,17 +3995,18 @@ static int hns3_reset_notify_up_enet(struct hnae3_handle *handle) struct hns3_nic_priv *priv = netdev_priv(kinfo->netdev); int ret = 0; + clear_bit(HNS3_NIC_STATE_RESETTING, &priv->state); + if 
(netif_running(kinfo->netdev)) { - ret = hns3_nic_net_up(kinfo->netdev); + ret = hns3_nic_net_open(kinfo->netdev); if (ret) { + set_bit(HNS3_NIC_STATE_RESETTING, &priv->state); netdev_err(kinfo->netdev, "hns net up fail, ret=%d!\n", ret); return ret; } } - clear_bit(HNS3_NIC_STATE_RESETTING, &priv->state); - return ret; } diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c index 6d48dc62a44b5d..da323b9e1f62fb 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_main.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c @@ -1106,6 +1106,11 @@ static void hinic_remove(struct pci_dev *pdev) dev_info(&pdev->dev, "HiNIC driver - removed\n"); } +static void hinic_shutdown(struct pci_dev *pdev) +{ + pci_disable_device(pdev); +} + static const struct pci_device_id hinic_pci_table[] = { { PCI_VDEVICE(HUAWEI, HINIC_DEV_ID_QUAD_PORT_25GE), 0}, { PCI_VDEVICE(HUAWEI, HINIC_DEV_ID_DUAL_PORT_25GE), 0}, @@ -1119,6 +1124,7 @@ static struct pci_driver hinic_driver = { .id_table = hinic_pci_table, .probe = hinic_probe, .remove = hinic_remove, + .shutdown = hinic_shutdown, }; module_pci_driver(hinic_driver); diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 28c74998035946..a19868cba48cb0 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -523,10 +523,7 @@ static void resync_tnc(struct timer_list *t) /* Start resync timer again -- the TNC might be still absent */ - - del_timer(&sp->resync_t); - sp->resync_t.expires = jiffies + SIXP_RESYNC_TIMEOUT; - add_timer(&sp->resync_t); + mod_timer(&sp->resync_t, jiffies + SIXP_RESYNC_TIMEOUT); } static inline int tnc_init(struct sixpack *sp) @@ -537,9 +534,7 @@ static inline int tnc_init(struct sixpack *sp) sp->tty->ops->write(sp->tty, &inbyte, 1); - del_timer(&sp->resync_t); - sp->resync_t.expires = jiffies + SIXP_RESYNC_TIMEOUT; - add_timer(&sp->resync_t); + mod_timer(&sp->resync_t, jiffies + SIXP_RESYNC_TIMEOUT); return 0; } @@ 
-897,11 +892,8 @@ static void decode_prio_command(struct sixpack *sp, unsigned char cmd) /* if the state byte has been received, the TNC is present, so the resync timer can be reset. */ - if (sp->tnc_state == TNC_IN_SYNC) { - del_timer(&sp->resync_t); - sp->resync_t.expires = jiffies + SIXP_INIT_RESYNC_TIMEOUT; - add_timer(&sp->resync_t); - } + if (sp->tnc_state == TNC_IN_SYNC) + mod_timer(&sp->resync_t, jiffies + SIXP_INIT_RESYNC_TIMEOUT); sp->status1 = cmd & SIXP_PRIO_DATA_MASK; } diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 443b2694130cda..c0b52e48f0e63c 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1177,8 +1177,6 @@ static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) goto err_kfree; } - skb_probe_transport_header(skb, ETH_HLEN); - /* Move network header to the right position for VLAN tagged packets */ if ((skb->protocol == htons(ETH_P_8021Q) || skb->protocol == htons(ETH_P_8021AD)) && @@ -1189,6 +1187,7 @@ static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) tap = rcu_dereference(q->tap); if (tap) { skb->dev = tap->dev; + skb_probe_transport_header(skb, ETH_HLEN); dev_queue_xmit(skb); } else { kfree_skb(skb); diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 6894976b54e376..186cd8e970c70f 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -573,6 +573,8 @@ static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, else if (destroy) destroy(ptr); + if (producer >= size) + producer = 0; __ptr_ring_set_size(r, size); r->producer = producer; r->consumer_head = 0; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index cbcf35ce1b1496..34f019650941bd 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -308,6 +308,26 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); +static inline bool pskb_inet_may_pull(struct 
sk_buff *skb) +{ + int nhlen; + + switch (skb->protocol) { +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + nhlen = sizeof(struct ipv6hdr); + break; +#endif + case htons(ETH_P_IP): + nhlen = sizeof(struct iphdr); + break; + default: + nhlen = 0; + } + + return pskb_network_may_pull(skb, nhlen); +} + static inline int ip_encap_hlen(struct ip_tunnel_encap *e) { const struct ip_tunnel_encap_ops *ops; diff --git a/include/net/sock.h b/include/net/sock.h index a6235c286ef996..2b229f7be8ebbc 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -298,6 +298,7 @@ struct sock_common { * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received + * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING socket options * @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_zckey: counter to order MSG_ZEROCOPY notifications @@ -474,6 +475,9 @@ struct sock { const struct cred *sk_peer_cred; long sk_rcvtimeo; ktime_t sk_stamp; +#if BITS_PER_LONG==32 + seqlock_t sk_stamp_seq; +#endif u16 sk_tsflags; u8 sk_shutdown; u32 sk_tskey; @@ -2297,6 +2301,34 @@ static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) atomic_add(segs, &sk->sk_drops); } +static inline ktime_t sock_read_timestamp(struct sock *sk) +{ +#if BITS_PER_LONG==32 + unsigned int seq; + ktime_t kt; + + do { + seq = read_seqbegin(&sk->sk_stamp_seq); + kt = sk->sk_stamp; + } while (read_seqretry(&sk->sk_stamp_seq, seq)); + + return kt; +#else + return sk->sk_stamp; +#endif +} + +static inline void sock_write_timestamp(struct sock *sk, ktime_t kt) +{ +#if BITS_PER_LONG==32 + write_seqlock(&sk->sk_stamp_seq); + sk->sk_stamp = kt; + write_sequnlock(&sk->sk_stamp_seq); +#else + sk->sk_stamp = kt; +#endif +} + void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, @@ -2321,7 
+2353,7 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); else - sk->sk_stamp = kt; + sock_write_timestamp(sk, kt); if (sock_flag(sk, SOCK_WIFI_STATUS) && skb->wifi_acked_valid) __sock_recv_wifi_status(msg, sk, skb); @@ -2342,9 +2374,9 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY) __sock_recv_ts_and_drops(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) - sk->sk_stamp = skb->tstamp; + sock_write_timestamp(sk, skb->tstamp); else if (unlikely(sk->sk_stamp == SK_DEFAULT_STAMP)) - sk->sk_stamp = 0; + sock_write_timestamp(sk, 0); } void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags); diff --git a/net/compat.c b/net/compat.c index 47a614b370cd3e..d1f3a8a0b3efe7 100644 --- a/net/compat.c +++ b/net/compat.c @@ -467,12 +467,14 @@ int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) ctv = (struct compat_timeval __user *) userstamp; err = -ENOENT; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sk->sk_stamp); + tv = ktime_to_timeval(sock_read_timestamp(sk)); + if (tv.tv_sec == -1) return err; if (tv.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - tv = ktime_to_timeval(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + tv = ktime_to_timeval(kt); } err = 0; if (put_user(tv.tv_sec, &ctv->tv_sec) || @@ -494,12 +496,13 @@ int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *usersta ctv = (struct compat_timespec __user *) userstamp; err = -ENOENT; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts = ktime_to_timespec(sk->sk_stamp); + ts = ktime_to_timespec(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return err; if (ts.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - ts = ktime_to_timespec(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + 
sock_write_timestamp(sk, kt); + ts = ktime_to_timespec(kt); } err = 0; if (put_user(ts.tv_sec, &ctv->tv_sec) || diff --git a/net/core/sock.c b/net/core/sock.c index f00902c532cc77..6aa2e7e0b4fbdb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2751,6 +2751,9 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; +#if BITS_PER_LONG==32 + seqlock_init(&sk->sk_stamp_seq); +#endif atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL @@ -2850,12 +2853,13 @@ int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) struct timeval tv; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sk->sk_stamp); + tv = ktime_to_timeval(sock_read_timestamp(sk)); if (tv.tv_sec == -1) return -ENOENT; if (tv.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - tv = ktime_to_timeval(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + tv = ktime_to_timeval(kt); } return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0; } @@ -2866,11 +2870,12 @@ int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) struct timespec ts; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts = ktime_to_timespec(sk->sk_stamp); + ts = ktime_to_timespec(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return -ENOENT; if (ts.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); ts = ktime_to_timespec(sk->sk_stamp); } return copy_to_user(userstamp, &ts, sizeof(ts)) ? 
-EFAULT : 0; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index c7a7bd58a23c58..d1d09f3e5f9ec9 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -676,6 +676,9 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tnl_params; + if (!pskb_inet_may_pull(skb)) + goto free_skb; + if (tunnel->collect_md) { gre_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; @@ -719,6 +722,9 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); bool truncate = false; + if (!pskb_inet_may_pull(skb)) + goto free_skb; + if (tunnel->collect_md) { erspan_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; @@ -762,6 +768,9 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, { struct ip_tunnel *tunnel = netdev_priv(dev); + if (!pskb_inet_may_pull(skb)) + goto free_skb; + if (tunnel->collect_md) { gre_fb_xmit(skb, dev, htons(ETH_P_TEB)); return NETDEV_TX_OK; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 284a22154b4e6c..c4f5602308edca 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -627,7 +627,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); - unsigned int inner_nhdr_len = 0; const struct iphdr *inner_iph; struct flowi4 fl4; u8 tos, ttl; @@ -637,14 +636,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, __be32 dst; bool connected; - /* ensure we can access the inner net header, for several users below */ - if (skb->protocol == htons(ETH_P_IP)) - inner_nhdr_len = sizeof(struct iphdr); - else if (skb->protocol == htons(ETH_P_IPV6)) - inner_nhdr_len = sizeof(struct ipv6hdr); - if (unlikely(!pskb_may_pull(skb, inner_nhdr_len))) - goto tx_error; - inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c 
index de31b302d69c6b..d7b43e700023a0 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -241,6 +241,9 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); struct flowi fl; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { @@ -253,15 +256,18 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); break; default: - dev->stats.tx_errors++; - dev_kfree_skb(skb); - return NETDEV_TX_OK; + goto tx_err; } /* override mark with tunnel output key */ fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key); return vti_xmit(skb, dev, &fl); + +tx_err: + dev->stats.tx_errors++; + kfree_skb(skb); + return NETDEV_TX_OK; } static int vti4_err(struct sk_buff *skb, u32 info) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 229e55c99021a8..09d0826742f89f 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -881,6 +881,9 @@ static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb, struct net_device_stats *stats = &t->dev->stats; int ret; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) goto tx_err; @@ -923,6 +926,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, int nhoff; int thoff; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) goto tx_err; @@ -995,8 +1001,6 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, goto tx_err; } } else { - struct ipv6hdr *ipv6h = ipv6_hdr(skb); - switch (skb->protocol) { case htons(ETH_P_IP): memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1004,7 +1008,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, &dsfield, &encap_limit); break; case htons(ETH_P_IPV6): - if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) + if (ipv6_addr_equal(&t->parms.raddr, 
&ipv6_hdr(skb)->saddr)) goto tx_err; if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 99179b9c83840b..0c6403cf8b5226 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1243,10 +1243,6 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; - /* ensure we can access the full inner ip header */ - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - return -1; - iph = ip_hdr(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1321,9 +1317,6 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; - if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) - return -1; - ipv6h = ipv6_hdr(skb); tproto = READ_ONCE(t->parms.proto); if ((tproto != IPPROTO_IPV6 && tproto != 0) || @@ -1405,6 +1398,9 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) struct net_device_stats *stats = &t->dev->stats; int ret; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + switch (skb->protocol) { case htons(ETH_P_IP): ret = ip4ip6_tnl_xmit(skb, dev); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 706fe42e492899..8b6eefff2f7eaf 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -522,18 +522,18 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net_device_stats *stats = &t->dev->stats; - struct ipv6hdr *ipv6h; struct flowi fl; int ret; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IPV6): - ipv6h = ipv6_hdr(skb); - if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || - vti6_addr_conflict(t, ipv6h)) + vti6_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; xfrm_decode_session(skb, &fl, AF_INET6); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 8276f1224f168d..30337b38274b29 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -51,6 +51,7 @@ #include 
<linux/export.h> #include <net/ip6_checksum.h> #include <linux/netconf.h> +#include <net/ip_tunnels.h> #include <linux/nospec.h> @@ -599,13 +600,12 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; - int err; - err = ip6mr_fib_lookup(net, &fl6, &mrt); - if (err < 0) { - kfree_skb(skb); - return err; - } + if (!pskb_inet_may_pull(skb)) + goto tx_err; + + if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) + goto tx_err; read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; @@ -614,6 +614,11 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, read_unlock(&mrt_lock); kfree_skb(skb); return NETDEV_TX_OK; + +tx_err: + dev->stats.tx_errors++; + kfree_skb(skb); + return NETDEV_TX_OK; } static int reg_vif_get_iflink(const struct net_device *dev) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a94e0b02a8ac9d..40b225f87d5e1d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -210,7 +210,9 @@ struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, n = __ipv6_neigh_lookup(dev, daddr); if (n) return n; - return neigh_create(&nd_tbl, daddr, dev); + + n = neigh_create(&nd_tbl, daddr, dev); + return IS_ERR(n) ? 
NULL : n; } static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 51c9f75f34b9bb..1e03305c054922 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1021,6 +1021,9 @@ tx_error: static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { + if (!pskb_inet_may_pull(skb)) + goto tx_err; + switch (skb->protocol) { case htons(ETH_P_IP): sit_tunnel_xmit__(skb, dev, IPPROTO_IPIP); diff --git a/net/rds/tcp.c b/net/rds/tcp.c index b9bbcf3d6c6397..c16f0a362c32c3 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -623,7 +623,7 @@ static void __net_exit rds_tcp_exit_net(struct net *net) if (rtn->rds_tcp_sysctl) unregister_net_sysctl_table(rtn->rds_tcp_sysctl); - if (net != &init_net && rtn->ctl_table) + if (net != &init_net) kfree(rtn->ctl_table); } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 986f3ed7d1a248..b7e67310ec379f 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -549,7 +549,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) /* Don't enable netstamp, sunrpc doesn't need that much accuracy */ } - svsk->sk_sk->sk_stamp = skb->tstamp; + sock_write_timestamp(svsk->sk_sk, skb->tstamp); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ len = skb->len; |