Index: 2.6.12-rc4-tcp3/include/linux/tcp.h =================================================================== --- 2.6.12-rc4-tcp3.orig/include/linux/tcp.h +++ 2.6.12-rc4-tcp3/include/linux/tcp.h @@ -280,13 +280,15 @@ struct tcp_sock { __u32 snd_wnd; /* The window we expect to receive */ __u32 max_window; /* Maximal window ever seen from peer */ __u32 pmtu_cookie; /* Last pmtu seen by socket */ - __u32 mss_cache; /* Cached effective mss, not including SACKS */ - __u16 mss_cache_std; /* Like mss_cache, but without TSO */ + __u16 mss_cache; /* Cached effective mss, not including SACKS */ + __u16 xmit_size_goal; /* Goal for segmenting output packets */ + __u32 xmit_size_cache;/* Cache for keeping xmit_size_goal uptodate */ __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ __u8 ca_state; /* State of fast-retransmit machine */ __u8 retransmits; /* Number of unrecovered RTO timeouts. */ __u16 advmss; /* Advertised MSS */ + __u16 deferred; /* Packets deferred for segmentation */ __u32 window_clamp; /* Maximal window to advertise */ __u32 rcv_ssthresh; /* Current window clamp */ Index: 2.6.12-rc4-tcp3/include/net/sock.h =================================================================== --- 2.6.12-rc4-tcp3.orig/include/net/sock.h +++ 2.6.12-rc4-tcp3/include/net/sock.h @@ -1130,13 +1130,16 @@ static inline void sk_stream_moderate_sn static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk, int size, int mem, int gfp) { - struct sk_buff *skb = alloc_skb(size + sk->sk_prot->max_header, gfp); + struct sk_buff *skb; + int hdr_len; + hdr_len = SKB_DATA_ALIGN(sk->sk_prot->max_header); + skb = alloc_skb(size + hdr_len, gfp); if (skb) { skb->truesize += mem; if (sk->sk_forward_alloc >= (int)skb->truesize || sk_stream_mem_schedule(sk, skb->truesize, 0)) { - skb_reserve(skb, sk->sk_prot->max_header); + skb_reserve(skb, hdr_len); return skb; } __kfree_skb(skb); Index: 2.6.12-rc4-tcp3/include/net/tcp.h =================================================================== --- 2.6.12-rc4-tcp3.orig/include/net/tcp.h +++ 2.6.12-rc4-tcp3/include/net/tcp.h @@ -817,11 +817,18 @@ static inline int tcp_ack_scheduled(stru return tp->ack.pending&TCP_ACK_SCHED; } -static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp) +static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp, unsigned int pkts) { - if (tp->ack.quick && --tp->ack.quick == 0) { - /* Leaving quickack mode we deflate ATO. */ - tp->ack.ato = TCP_ATO_MIN; + if (tp->ack.quick) { + if (pkts > tp->ack.quick) + tp->ack.quick = 0; + else + tp->ack.quick -= pkts; + + if (!tp->ack.quick) { + /* Leaving quickack mode we deflate ATO. 
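The sk_stream_alloc_pskb() change in the hunk above reserves SKB_DATA_ALIGN(sk->sk_prot->max_header) rather than the raw max_header, rounding the headroom up so the payload area behind the headers tends to start on a cache-line boundary. A minimal userspace sketch of that rounding, assuming a 64-byte line size and a made-up header budget (the kernel uses SMP_CACHE_BYTES and a MAX_TCP_HEADER-derived value):

#include <stdio.h>

#define CACHE_LINE_BYTES 64
#define DATA_ALIGN(x) (((x) + (CACHE_LINE_BYTES - 1)) & ~(CACHE_LINE_BYTES - 1))

int main(void)
{
        unsigned int max_header = 148;  /* hypothetical header budget */
        unsigned int hdr_len = DATA_ALIGN(max_header);

        printf("reserve %u bytes of headroom (rounded up from %u)\n",
               hdr_len, max_header);
        return 0;
}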
*/ + tp->ack.ato = TCP_ATO_MIN; + } } } @@ -939,7 +946,14 @@ extern __u32 cookie_v4_init_sequence(str /* tcp_output.c */ -extern int tcp_write_xmit(struct sock *, int nonagle); +extern void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb); +extern void __tcp_push_pending_frames(struct sock *sk, + struct tcp_sock *tp, + unsigned int cur_mss, + int nonagle); +extern int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp); +extern int tcp_tso_mince(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb); extern int tcp_retransmit_skb(struct sock *, struct sk_buff *); extern void tcp_xmit_retransmit_queue(struct sock *); extern void tcp_simple_retransmit(struct sock *); @@ -951,7 +965,7 @@ extern int tcp_write_wakeup(struct sock extern void tcp_send_fin(struct sock *sk); extern void tcp_send_active_reset(struct sock *sk, int priority); extern int tcp_send_synack(struct sock *); -extern void tcp_push_one(struct sock *, unsigned mss_now); +extern void tcp_push_one(struct sock *, unsigned int mss_now); extern void tcp_send_ack(struct sock *sk); extern void tcp_send_delayed_ack(struct sock *sk); @@ -1054,7 +1068,7 @@ static inline void tcp_reset_xmit_timer( static inline void tcp_initialize_rcv_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - unsigned int hint = min(tp->advmss, tp->mss_cache_std); + unsigned int hint = min(tp->advmss, tp->mss_cache); hint = min(hint, tp->rcv_wnd/2); hint = min(hint, TCP_MIN_RCVMSS); @@ -1353,23 +1367,23 @@ static inline void tcp_cwnd_validate(str } /* Set slow start threshould and cwnd not falling to slow start */ -static inline void __tcp_enter_cwr(struct tcp_sock *tp) +static inline void __tcp_enter_cwr(struct tcp_sock *tp, unsigned int pkts) { tp->undo_marker = 0; tp->snd_ssthresh = tcp_recalc_ssthresh(tp); tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp) + 1U); + tcp_packets_in_flight(tp) + pkts); tp->snd_cwnd_cnt = 0; tp->high_seq = tp->snd_nxt; tp->snd_cwnd_stamp = tcp_time_stamp; TCP_ECN_queue_cwr(tp); } -static inline void tcp_enter_cwr(struct tcp_sock *tp) +static inline void tcp_enter_cwr(struct tcp_sock *tp, unsigned int pkts) { tp->prior_ssthresh = 0; if (tp->ca_state < TCP_CA_CWR) { - __tcp_enter_cwr(tp); + __tcp_enter_cwr(tp, pkts); tcp_set_ca_state(tp, TCP_CA_CWR); } } @@ -1397,74 +1411,6 @@ static __inline__ void tcp_minshall_upda tp->snd_sml = TCP_SKB_CB(skb)->end_seq; } -/* Return 0, if packet can be sent now without violation Nagle's rules: - 1. It is full sized. - 2. Or it contains FIN. - 3. Or TCP_NODELAY was set. - 4. Or TCP_CORK is not set, and all sent packets are ACKed. - With Minshall's modification: all sent small packets are ACKed. - */ - -static __inline__ int -tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb, - unsigned mss_now, int nonagle) -{ - return (skb->len < mss_now && - !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && - ((nonagle&TCP_NAGLE_CORK) || - (!nonagle && - tp->packets_out && - tcp_minshall_check(tp)))); -} - -extern void tcp_set_skb_tso_segs(struct sock *, struct sk_buff *); - -/* This checks if the data bearing packet SKB (usually sk->sk_send_head) - * should be put on the wire right now. 
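With TSO, one transmitted frame can carry several segments, so the reworked tcp_dec_quickack_mode() above consumes the quick-ACK budget by the frame's packet count, clamping at zero, and only deflates the ATO once the budget is exhausted. A simplified userspace model of that bookkeeping (the types and the TCP_ATO_MIN stand-in are illustrative):

#include <stdio.h>

struct ack_state {
        unsigned int quick;     /* remaining quick-ACK credits */
        unsigned int ato;       /* delayed-ACK timeout */
};

#define ATO_MIN 40      /* assumed stand-in for TCP_ATO_MIN */

static void dec_quickack_mode(struct ack_state *ack, unsigned int pkts)
{
        if (!ack->quick)
                return;
        if (pkts > ack->quick)
                ack->quick = 0;
        else
                ack->quick -= pkts;
        if (!ack->quick)        /* leaving quickack mode deflates ATO */
                ack->ato = ATO_MIN;
}

int main(void)
{
        struct ack_state ack = { .quick = 3, .ato = 200 };

        dec_quickack_mode(&ack, 5);     /* a 5-segment TSO frame */
        printf("quick=%u ato=%u\n", ack.quick, ack.ato);
        return 0;
}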
- */ -static __inline__ int tcp_snd_test(struct sock *sk, - struct sk_buff *skb, - unsigned cur_mss, int nonagle) -{ - struct tcp_sock *tp = tcp_sk(sk); - int pkts = tcp_skb_pcount(skb); - - if (!pkts) { - tcp_set_skb_tso_segs(sk, skb); - pkts = tcp_skb_pcount(skb); - } - - /* RFC 1122 - section 4.2.3.4 - * - * We must queue if - * - * a) The right edge of this frame exceeds the window - * b) There are packets in flight and we have a small segment - * [SWS avoidance and Nagle algorithm] - * (part of SWS is done on packetization) - * Minshall version sounds: there are no _small_ - * segments in flight. (tcp_nagle_check) - * c) We have too many packets 'in flight' - * - * Don't use the nagle rule for urgent data (or - * for the final FIN -DaveM). - * - * Also, Nagle rule does not apply to frames, which - * sit in the middle of queue (they have no chances - * to get new data) and if room at tail of skb is - * not enough to save something seriously (<32 for now). - */ - - /* Don't be strict about the congestion window for the - * final FIN frame. -DaveM - */ - return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode - || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) && - (((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) || - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) && - !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd)); -} - static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp) { if (!tp->packets_out && !tp->pending) @@ -1477,42 +1423,12 @@ static __inline__ int tcp_skb_is_last(co return skb->next == (struct sk_buff *)&sk->sk_write_queue; } -/* Push out any pending frames which were held back due to - * TCP_CORK or attempt at coalescing tiny packets. - * The socket must be locked by the caller. - */ -static __inline__ void __tcp_push_pending_frames(struct sock *sk, - struct tcp_sock *tp, - unsigned cur_mss, - int nonagle) -{ - struct sk_buff *skb = sk->sk_send_head; - - if (skb) { - if (!tcp_skb_is_last(sk, skb)) - nonagle = TCP_NAGLE_PUSH; - if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || - tcp_write_xmit(sk, nonagle)) - tcp_check_probe_timer(sk, tp); - } - tcp_cwnd_validate(sk, tp); -} - static __inline__ void tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp) { __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle); } -static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) -{ - struct sk_buff *skb = sk->sk_send_head; - - return (skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), - tcp_skb_is_last(sk, skb) ? 
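The monolithic tcp_snd_test() removed above is reintroduced later in tcp_output.c, split into separate Nagle, congestion-window and send-window checks so the TSO path can ask each question on its own. A compressed userspace sketch of how such a decomposition combines, with all names, types and numbers as illustrative stand-ins:

#include <stdbool.h>
#include <stdio.h>

struct seg { unsigned int len; bool fin; };

static unsigned int cwnd_test(unsigned int cwnd, unsigned int in_flight,
                              const struct seg *s)
{
        if (s->fin)                     /* never starve the final FIN */
                return 1;
        return in_flight < cwnd ? cwnd - in_flight : 0;
}

static bool nagle_allows(const struct seg *s, unsigned int mss, bool nodelay,
                         bool small_unacked)
{
        if (nodelay || s->fin || s->len >= mss)
                return true;
        return !small_unacked;          /* Minshall: no small segment in flight */
}

static bool window_allows(unsigned int end_seq, unsigned int wnd_edge)
{
        return end_seq <= wnd_edge;     /* no sequence wrap handling here */
}

int main(void)
{
        struct seg s = { .len = 2920, .fin = false };
        unsigned int quota = cwnd_test(10, 4, &s);

        if (quota && nagle_allows(&s, 1460, false, false) &&
            window_allows(2920, 65535))
                printf("send up to %u segments now\n", quota);
        return 0;
}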
TCP_NAGLE_PUSH : tp->nonagle)); -} - static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq) { tp->snd_wl1 = seq; @@ -1986,7 +1902,7 @@ static inline void tcp_westwood_update_r static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp) { return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) / - (__u32) (tp->mss_cache_std), + (__u32) (tp->mss_cache), 2U); } Index: 2.6.12-rc4-tcp3/net/ipv4/tcp.c =================================================================== --- 2.6.12-rc4-tcp3.orig/net/ipv4/tcp.c +++ 2.6.12-rc4-tcp3/net/ipv4/tcp.c @@ -634,7 +634,7 @@ static ssize_t do_tcp_sendpages(struct s size_t psize, int flags) { struct tcp_sock *tp = tcp_sk(sk); - int mss_now; + int mss_now, size_goal; int err; ssize_t copied; long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); @@ -647,6 +647,7 @@ static ssize_t do_tcp_sendpages(struct s clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; copied = 0; err = -EPIPE; @@ -660,7 +661,7 @@ static ssize_t do_tcp_sendpages(struct s int offset = poffset % PAGE_SIZE; int size = min_t(size_t, psize, PAGE_SIZE - offset); - if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { + if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -671,7 +672,7 @@ new_segment: goto wait_for_memory; skb_entail(sk, tp, skb); - copy = mss_now; + copy = size_goal; } if (copy > size) @@ -712,7 +713,7 @@ new_segment: if (!(psize -= copy)) goto out; - if (skb->len != mss_now || (flags & MSG_OOB)) + if (skb->len != size_goal || (flags & MSG_OOB)) continue; if (forced_push(tp)) { @@ -732,6 +733,7 @@ wait_for_memory: goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; } out: @@ -773,15 +775,11 @@ ssize_t tcp_sendpage(struct socket *sock static inline int select_size(struct sock *sk, struct tcp_sock *tp) { - int tmp = tp->mss_cache_std; + int tmp = tp->mss_cache; - if (sk->sk_route_caps & NETIF_F_SG) { - int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); + if (sk->sk_route_caps & NETIF_F_SG) + tmp = 0; - if (tmp >= pgbreak && - tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) - tmp = pgbreak; - } return tmp; } @@ -792,7 +790,7 @@ int tcp_sendmsg(struct kiocb *iocb, stru struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int iovlen, flags; - int mss_now; + int mss_now, size_goal; int err, copied; long timeo; @@ -811,6 +809,7 @@ int tcp_sendmsg(struct kiocb *iocb, stru clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; /* Ok commence sending. */ iovlen = msg->msg_iovlen; @@ -833,7 +832,7 @@ int tcp_sendmsg(struct kiocb *iocb, stru skb = sk->sk_write_queue.prev; if (!sk->sk_send_head || - (copy = mss_now - skb->len) <= 0) { + (copy = size_goal - skb->len) <= 0) { new_segment: /* Allocate new segment. If the interface is SG, @@ -856,7 +855,7 @@ new_segment: skb->ip_summed = CHECKSUM_HW; skb_entail(sk, tp, skb); - copy = mss_now; + copy = size_goal; } /* Try to append data to the end of skb. 
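In the sendmsg()/sendpages() hunks above, each queued skb is now filled up to tp->xmit_size_goal (a multiple of the MSS when TSO is in use) instead of to a single MSS, leaving the actual segmentation to the TSO engine. A toy fill loop showing the effect, with made-up sizes:

#include <stdio.h>

int main(void)
{
        unsigned int mss_now = 1448;
        unsigned int size_goal = mss_now * 44;  /* roughly a 64KB super-packet */
        unsigned int skb_len = 0, queued = 0, total = 200000;

        while (queued < total) {
                unsigned int copy = size_goal - skb_len;

                if (copy == 0) {        /* skb reached its goal: start a new one */
                        skb_len = 0;
                        continue;
                }
                if (copy > total - queued)
                        copy = total - queued;
                skb_len += copy;
                queued += copy;
        }
        printf("queued %u bytes, last skb holds %u\n", queued, skb_len);
        return 0;
}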
*/ @@ -891,11 +890,6 @@ new_segment: tcp_mark_push(tp, skb); goto new_segment; } else if (page) { - /* If page is cached, align - * offset to L1 cache boundary - */ - off = (off + L1_CACHE_BYTES - 1) & - ~(L1_CACHE_BYTES - 1); if (off == PAGE_SIZE) { put_page(page); TCP_PAGE(sk) = page = NULL; @@ -956,7 +950,7 @@ new_segment: if ((seglen -= copy) == 0 && iovlen == 0) goto out; - if (skb->len != mss_now || (flags & MSG_OOB)) + if (skb->len != size_goal || (flags & MSG_OOB)) continue; if (forced_push(tp)) { @@ -976,6 +970,7 @@ wait_for_memory: goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; } } @@ -2135,7 +2130,7 @@ void tcp_get_info(struct sock *sk, struc info->tcpi_rto = jiffies_to_usecs(tp->rto); info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); - info->tcpi_snd_mss = tp->mss_cache_std; + info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = tp->ack.rcv_mss; info->tcpi_unacked = tp->packets_out; @@ -2185,7 +2180,7 @@ int tcp_getsockopt(struct sock *sk, int switch (optname) { case TCP_MAXSEG: - val = tp->mss_cache_std; + val = tp->mss_cache; if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) val = tp->rx_opt.user_mss; break; Index: 2.6.12-rc4-tcp3/net/ipv4/tcp_input.c =================================================================== --- 2.6.12-rc4-tcp3.orig/net/ipv4/tcp_input.c +++ 2.6.12-rc4-tcp3/net/ipv4/tcp_input.c @@ -805,10 +805,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) { - if (tp->mss_cache_std > 1460) + if (tp->mss_cache > 1460) cwnd = 2; else - cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; + cwnd = (tp->mss_cache > 1095) ? 3 : 4; } return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } @@ -974,14 +974,6 @@ tcp_sacktag_write_queue(struct sock *sk, int flag = 0; int i; - /* So, SACKs for already sent large segments will be lost. - * Not good, but alternative is to resegment the queue. */ - if (sk->sk_route_caps & NETIF_F_TSO) { - sk->sk_route_caps &= ~NETIF_F_TSO; - sock_set_flag(sk, SOCK_NO_LARGESEND); - tp->mss_cache = tp->mss_cache_std; - } - if (!tp->sacked_out) tp->fackets_out = 0; prior_fackets = tp->fackets_out; @@ -1038,6 +1030,15 @@ tcp_sacktag_write_queue(struct sock *sk, if(!before(TCP_SKB_CB(skb)->seq, end_seq)) break; + /* Even if mincing a TSO frame fails, we + * continue anyways. We will end up with + * more coarse SACK information, but it is + * better than ignoring all the SACK information + * altogether. + */ + if (tcp_skb_pcount(skb) > 1) + tcp_tso_mince(sk, tp, skb); + fack_count += tcp_skb_pcount(skb); in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && @@ -1142,7 +1143,7 @@ tcp_sacktag_write_queue(struct sock *sk, (IsFack(tp) || !before(lost_retrans, TCP_SKB_CB(skb)->ack_seq + tp->reordering * - tp->mss_cache_std))) { + tp->mss_cache))) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); @@ -1782,7 +1783,7 @@ static void tcp_try_to_open(struct sock tp->retrans_stamp = 0; if (flag&FLAG_ECE) - tcp_enter_cwr(tp); + tcp_enter_cwr(tp, 1); if (tp->ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; @@ -2170,7 +2171,7 @@ static void vegas_cong_avoid(struct tcp_ * is the cwnd during the previous RTT. 
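The tcp_init_cwnd() hunk above keeps the familiar MSS-graded initial window, now keyed off the unified mss_cache. The same table as a standalone function (the route hint and clamp arguments are stand-ins for dst_metric(RTAX_INITCWND) and snd_cwnd_clamp):

#include <stdio.h>

static unsigned int init_cwnd(unsigned int mss, unsigned int route_hint,
                              unsigned int clamp)
{
        unsigned int cwnd = route_hint;         /* per-route override, if any */

        if (!cwnd)
                cwnd = (mss > 1460) ? 2 : ((mss > 1095) ? 3 : 4);
        return cwnd < clamp ? cwnd : clamp;
}

int main(void)
{
        printf("mss 536  -> %u segments\n", init_cwnd(536, 0, ~0u));
        printf("mss 1460 -> %u segments\n", init_cwnd(1460, 0, ~0u));
        printf("mss 8960 -> %u segments\n", init_cwnd(8960, 0, ~0u));
        return 0;
}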
*/ old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) / - tp->mss_cache_std; + tp->mss_cache; old_snd_cwnd = tp->vegas.beg_snd_cwnd; /* Save the extent of the current window so we can use this @@ -2799,19 +2800,19 @@ static void westwood_dupack_update(struc { struct tcp_sock *tp = tcp_sk(sk); - tp->westwood.accounted += tp->mss_cache_std; - tp->westwood.cumul_ack = tp->mss_cache_std; + tp->westwood.accounted += tp->mss_cache; + tp->westwood.cumul_ack = tp->mss_cache; } static inline int westwood_may_change_cumul(struct tcp_sock *tp) { - return (tp->westwood.cumul_ack > tp->mss_cache_std); + return (tp->westwood.cumul_ack > tp->mss_cache); } static inline void westwood_partial_update(struct tcp_sock *tp) { tp->westwood.accounted -= tp->westwood.cumul_ack; - tp->westwood.cumul_ack = tp->mss_cache_std; + tp->westwood.cumul_ack = tp->mss_cache; } static inline void westwood_complete_update(struct tcp_sock *tp) @@ -2932,7 +2933,7 @@ static int tcp_ack(struct sock *sk, stru if (!prior_packets) goto no_queue; - prior_in_flight = tcp_packets_in_flight(tp); + prior_in_flight = tcp_packets_in_flight(tp) + tp->deferred; /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, &seq_rtt); @@ -3948,11 +3949,11 @@ static void tcp_new_space(struct sock *s { struct tcp_sock *tp = tcp_sk(sk); - if (tp->packets_out < tp->snd_cwnd && + if ((tp->packets_out + tp->deferred) < tp->snd_cwnd && !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && !tcp_memory_pressure && atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { - int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) + + int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), demanded = max_t(unsigned int, tp->snd_cwnd, tp->reordering + 1); @@ -3975,16 +3976,6 @@ static inline void tcp_check_space(struc } } -static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || - tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tp->nonagle)) - tcp_check_probe_timer(sk, tp); -} - static __inline__ void tcp_data_snd_check(struct sock *sk) { struct sk_buff *skb = sk->sk_send_head; Index: 2.6.12-rc4-tcp3/net/ipv4/tcp_ipv4.c =================================================================== --- 2.6.12-rc4-tcp3.orig/net/ipv4/tcp_ipv4.c +++ 2.6.12-rc4-tcp3/net/ipv4/tcp_ipv4.c @@ -2060,7 +2060,8 @@ static int tcp_v4_init_sock(struct sock */ tp->snd_ssthresh = 0x7fffffff; /* Infinity */ tp->snd_cwnd_clamp = ~0; - tp->mss_cache_std = tp->mss_cache = 536; + tp->mss_cache = 536; + tp->xmit_size_cache = ~0; tp->reordering = sysctl_tcp_reordering; Index: 2.6.12-rc4-tcp3/net/ipv4/tcp_output.c =================================================================== --- 2.6.12-rc4-tcp3.orig/net/ipv4/tcp_output.c +++ 2.6.12-rc4-tcp3/net/ipv4/tcp_output.c @@ -58,6 +58,7 @@ static inline void update_send_head(stru if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) sk->sk_send_head = NULL; tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tp->deferred = 0; tcp_packets_out_inc(sk, tp, skb); } @@ -141,11 +142,11 @@ static inline void tcp_event_data_sent(s tp->ack.pingpong = 1; } -static __inline__ void tcp_event_ack_sent(struct sock *sk) +static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { struct tcp_sock *tp = tcp_sk(sk); - tcp_dec_quickack_mode(tp); + tcp_dec_quickack_mode(tp, pkts); tcp_clear_xmit_timer(sk, 
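The hunks above charge segments that the TSO logic chose to hold back (tp->deferred) against the congestion window when ACKs are processed and when deciding whether the send buffer may grow, and update_send_head() drops the charge once something is actually transmitted. A simplified model of that accounting, with field names mirroring the patch and everything else made up:

#include <stdio.h>

struct conn {
        unsigned int packets_out;
        unsigned int retrans_out;
        unsigned int sacked_out;
        unsigned int lost_out;
        unsigned int deferred;
        unsigned int snd_cwnd;
};

static unsigned int packets_in_flight(const struct conn *c)
{
        return c->packets_out - (c->sacked_out + c->lost_out) + c->retrans_out;
}

static int room_for_more(const struct conn *c)
{
        return (c->packets_out + c->deferred) < c->snd_cwnd;
}

int main(void)
{
        struct conn c = { .packets_out = 6, .deferred = 4, .snd_cwnd = 10 };

        printf("in flight (for ACK processing): %u\n",
               packets_in_flight(&c) + c.deferred);
        printf("may grow send buffer: %s\n", room_for_more(&c) ? "yes" : "no");
        return 0;
}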
TCP_TIME_DACK); } @@ -361,7 +362,7 @@ static int tcp_transmit_skb(struct sock tp->af_specific->send_check(sk, th, skb->len, skb); if (tcb->flags & TCPCB_FLAG_ACK) - tcp_event_ack_sent(sk); + tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); if (skb->len != tcp_header_size) tcp_event_data_sent(tp, skb, sk); @@ -372,7 +373,7 @@ static int tcp_transmit_skb(struct sock if (err <= 0) return err; - tcp_enter_cwr(tp); + tcp_enter_cwr(tp, tcp_skb_pcount(skb)); /* NET_XMIT_CN is special. It does not guarantee, * that this packet is lost. It tells that device @@ -409,42 +410,11 @@ static void tcp_queue_skb(struct sock *s sk->sk_send_head = skb; } -static inline void tcp_tso_set_push(struct sk_buff *skb) -{ - /* Force push to be on for any TSO frames to workaround - * problems with busted implementations like Mac OS-X that - * hold off socket receive wakeups until push is seen. - */ - if (tcp_skb_pcount(skb) > 1) - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; -} - -/* Send _single_ skb sitting at the send head. This function requires - * true push pending frames to setup probe timer etc. - */ -void tcp_push_one(struct sock *sk, unsigned cur_mss) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = sk->sk_send_head; - - if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) { - /* Send it out now. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); - if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) { - sk->sk_send_head = NULL; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tcp_packets_out_inc(sk, tp, skb); - return; - } - } -} - void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - if (skb->len <= tp->mss_cache_std || + if (skb->len <= tp->mss_cache || !(sk->sk_route_caps & NETIF_F_TSO)) { /* Avoid the costly divide in the normal * non-TSO case. @@ -454,10 +424,10 @@ void tcp_set_skb_tso_segs(struct sock *s } else { unsigned int factor; - factor = skb->len + (tp->mss_cache_std - 1); - factor /= tp->mss_cache_std; + factor = skb->len + (tp->mss_cache - 1); + factor /= tp->mss_cache; skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = tp->mss_cache_std; + skb_shinfo(skb)->tso_size = tp->mss_cache; } } @@ -542,6 +512,7 @@ static int tcp_fragment(struct sock *sk, } /* Link BUFF into the send queue. */ + skb_header_release(buff); __skb_append(skb, buff); return 0; @@ -662,7 +633,7 @@ unsigned int tcp_sync_mss(struct sock *s /* And store cached results */ tp->pmtu_cookie = pmtu; - tp->mss_cache = tp->mss_cache_std = mss_now; + tp->mss_cache = mss_now; return mss_now; } @@ -674,117 +645,477 @@ unsigned int tcp_sync_mss(struct sock *s * cannot be large. However, taking into account rare use of URG, this * is not a big flaw. 
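tcp_set_skb_tso_segs() above now derives the segment count from the unified mss_cache: a frame no larger than one MSS (or with TSO off) counts as a single segment, otherwise the count is the rounded-up number of MSS-sized pieces and tso_size tells the NIC the per-segment length. The same arithmetic in plain C:

#include <stdio.h>

struct tso_info { unsigned int segs, size; };

static struct tso_info set_tso_segs(unsigned int len, unsigned int mss,
                                    int tso_enabled)
{
        struct tso_info ti;

        if (len <= mss || !tso_enabled) {
                ti.segs = 1;    /* avoid the divide in the common case */
                ti.size = 0;
        } else {
                ti.segs = (len + mss - 1) / mss;
                ti.size = mss;
        }
        return ti;
}

int main(void)
{
        struct tso_info ti = set_tso_segs(63712, 1448, 1);

        printf("segs=%u per-seg size=%u\n", ti.segs, ti.size);
        return 0;
}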
*/ +static inline u32 compute_xmit_cache(u32 mss, int sacks, int tso_enabled) +{ + u32 ret = (mss << 16) | sacks; -unsigned int tcp_current_mss(struct sock *sk, int large) + if (tso_enabled) + ret |= (1 << 8); + + return ret; +} + +unsigned int tcp_current_mss(struct sock *sk, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); - unsigned int do_large, mss_now; + u32 mss_now; + u32 xmit_cache; + int doing_tso = 0; + + mss_now = tp->mss_cache; + + if (large_allowed && + (sk->sk_route_caps & NETIF_F_TSO) && + !tp->urg_mode) + doing_tso = 1; - mss_now = tp->mss_cache_std; if (dst) { u32 mtu = dst_mtu(dst); if (mtu != tp->pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } - do_large = (large && - (sk->sk_route_caps & NETIF_F_TSO) && - !tp->urg_mode); - - if (do_large) { - unsigned int large_mss, factor, limit; - - large_mss = 65535 - tp->af_specific->net_header_len - - tp->ext_header_len - tp->tcp_header_len; - - if (tp->max_window && large_mss > (tp->max_window>>1)) - large_mss = max((tp->max_window>>1), - 68U - tp->tcp_header_len); - - factor = large_mss / mss_now; - - /* Always keep large mss multiple of real mss, but - * do not exceed 1/tso_win_divisor of the congestion window - * so we can keep the ACK clock ticking and minimize - * bursting. - */ - limit = tp->snd_cwnd; - if (sysctl_tcp_tso_win_divisor) - limit /= sysctl_tcp_tso_win_divisor; - limit = max(1U, limit); - if (factor > limit) - factor = limit; - - tp->mss_cache = mss_now * factor; - - mss_now = tp->mss_cache; - } + /* If the MSS, the TSO state, or the number of SACK blocks + * changes, we have to recompute tp->xmit_size_goal. + */ + xmit_cache = compute_xmit_cache(mss_now, tp->rx_opt.eff_sacks, + doing_tso); if (tp->rx_opt.eff_sacks) mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); + + if (tp->xmit_size_cache != xmit_cache) { + u16 xmit_size_goal = mss_now; + + if (doing_tso) { + xmit_size_goal = 65535 - + tp->af_specific->net_header_len - + tp->ext_header_len - tp->tcp_header_len; + + if (tp->rx_opt.eff_sacks) + xmit_size_goal -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * + TCPOLEN_SACK_PERBLOCK)); + + xmit_size_goal -= (xmit_size_goal % mss_now); + } + tp->xmit_size_goal = xmit_size_goal; + tp->xmit_size_cache = xmit_cache; + } + return mss_now; } +static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd) +{ + u32 window, cwnd_len; + + window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq); + cwnd_len = mss_now * cwnd; + return min(window, cwnd_len); +} + +/* Can at least one segment of SKB be sent right now, according + * to the congestion window rules? If so, return how many segments + * are allowed. + */ +static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 in_flight, cwnd; + + /* Don't be strict about the congestion window for the + * final FIN frame. -DaveM + */ + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + return 1; + + in_flight = tcp_packets_in_flight(tp); + cwnd = tp->snd_cwnd; + if (in_flight < cwnd) + return (cwnd - in_flight); + + return 0; +} + +/* This must be invoked the first time we consider transmitting + * SKB onto the wire. 
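tcp_current_mss() above packs the MSS, the number of SACK blocks and the TSO flag into xmit_size_cache and recomputes xmit_size_goal only when that key changes; the goal itself is the largest multiple of the SACK-reduced MSS that fits under the 64KB super-packet limit minus headers. A sketch of both computations, using illustrative header sizes rather than the kernel's:

#include <stdio.h>
#include <stdint.h>

static uint32_t cache_key(uint32_t mss, int sacks, int tso)
{
        return (mss << 16) | (tso ? (1 << 8) : 0) | (uint32_t)sacks;
}

static uint16_t size_goal(uint32_t mss, int sacks, int tso)
{
        uint32_t goal = mss;

        if (tso) {
                goal = 65535 - 20 /* IP */ - 32 /* TCP + timestamps, assumed */;
                if (sacks)
                        goal -= 4 + 8 * (uint32_t)sacks;
                goal -= goal % mss;     /* whole segments only */
        }
        return (uint16_t)goal;
}

int main(void)
{
        uint32_t key = cache_key(1448, 0, 1);

        printf("key=%#x goal=%u\n", (unsigned int)key,
               (unsigned int)size_goal(1448, 0, 1));
        return 0;
}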
+ */ +static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb) +{ + int tso_segs = tcp_skb_pcount(skb); + + if (!tso_segs) { + tcp_set_skb_tso_segs(sk, skb); + tso_segs = tcp_skb_pcount(skb); + } + return tso_segs; +} + +/* Return 0, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. + * 2. Or it contains FIN. + * 3. Or TCP_NODELAY was set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. + */ + +static inline int tcp_nagle_check(const struct tcp_sock *tp, + const struct sk_buff *skb, + unsigned mss_now, int nonagle) +{ + return (skb->len < mss_now && + !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + ((nonagle&TCP_NAGLE_CORK) || + (!nonagle && + tp->packets_out && + tcp_minshall_check(tp)))); +} + +/* Return non-zero if the Nagle test allows this packet to be + * sent now. + */ +static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss, int nonagle) +{ + /* Nagle rule does not apply to frames, which + * sit in the middle of queue (they have no chances + * to get new data). + * + * This is implemented in the callers, where they modify + * the 'nonagle' argument based upon the location of SKB + * in the send queue. + */ + if (nonagle & TCP_NAGLE_PUSH) + return 1; + + /* Don't use the nagle rule for urgent data (or + * for the final FIN -DaveM). + */ + if (tp->urg_mode || + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) + return 1; + + if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + return 1; + + return 0; +} + +/* Does at least the first segment of SKB fit into the congestion + * window? + */ +static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) +{ + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (skb->len > cur_mss) + end_seq = TCP_SKB_CB(skb)->seq + cur_mss; + + return !after(end_seq, tp->snd_una + tp->snd_wnd); +} + +/* This checks if the data bearing packet SKB (usually + * sk->sk_send_head) should be put on the wire right now. If so, it + * returns the number of packets allowed by the congestion window. + */ +static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, + unsigned cur_mss, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int cwnd_quota; + + if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) + return 0; + + cwnd_quota = tcp_cwnd_test(tp, skb); + if (cwnd_quota && + !tcp_snd_wnd_test(tp, skb, cur_mss)) + cwnd_quota = 0; + + return cwnd_quota; +} + +int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) +{ + struct sk_buff *skb = sk->sk_send_head; + + return (skb && + tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), + tcp_skb_is_last(sk, skb) ? TCP_NAGLE_PUSH : tp->nonagle)); +} + +/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet + * which is put after SKB on the list. It is very much like + * tcp_fragment() except that it may make several kinds of assumptions + * in order to speed up the splitting operation. In particular, we + * know that all the data is in scatter-gather pages, and that the + * packet has never been sent out before (and thus is not cloned). + */ +static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len) +{ + struct sk_buff *buff; + int nlen = skb->len - len; + u16 flags; + + /* All of a TSO frame must be composed of paged data. 
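tcp_snd_wnd_test() above deliberately judges only the first MSS worth of a super-sized frame: if at least one segment fits in the receiver's window the frame is eligible, and it will be trimmed to the window later. A simplified model without sequence-number wraparound handling:

#include <stdio.h>

static int snd_wnd_allows(unsigned int seq, unsigned int len,
                          unsigned int cur_mss,
                          unsigned int snd_una, unsigned int snd_wnd)
{
        unsigned int end_seq = seq + len;

        if (len > cur_mss)
                end_seq = seq + cur_mss;        /* judge just the first segment */
        return end_seq <= snd_una + snd_wnd;
}

int main(void)
{
        /* 40KB TSO frame, but only ~2KB of receiver window left. */
        printf("whole frame basis: %d\n",
               snd_wnd_allows(1000, 40960, 40960, 1000, 2048));
        printf("first-MSS basis:   %d\n",
               snd_wnd_allows(1000, 40960, 1448, 1000, 2048));
        return 0;
}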
*/ + BUG_ON(skb->len != skb->data_len); + + buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC); + if (unlikely(buff == NULL)) + return -ENOMEM; + sk_charge_skb(sk, buff); + + buff->truesize += nlen; + skb->truesize -= nlen; + + /* Correct the sequence numbers. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(buff)->flags = flags; + + /* This packet was never sent out yet, so no SACK bits. */ + TCP_SKB_CB(buff)->sacked = 0; + + buff->ip_summed = skb->ip_summed = CHECKSUM_HW; + skb_split(skb, buff, len); + + /* Fix up tso_factor for both original and new SKB. */ + tcp_set_skb_tso_segs(sk, skb); + tcp_set_skb_tso_segs(sk, buff); + + /* Link BUFF into the send queue. */ + skb_header_release(buff); + __skb_append(skb, buff); + + return 0; +} + +/* Split TSO frame SKB into one MSS sized packet, then the rest. + * This is called during SACK processing when some SACK information + * hits a TSO packet. + */ +int tcp_tso_mince(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +{ + unsigned int mss_now = tcp_current_mss(sk, 1); + + BUG_ON(tcp_skb_pcount(skb) <= 1); + + /* We cannot use tso_fragment() in this case, because + * the packet has been sent once already and thus may + * be cloned or have other non-trivial details to deal + * with. + */ + return tcp_fragment(sk, skb, mss_now); +} + +/* Try to defer sending, if possible, in order + * to minimize the amount of TSO splitting we do. + * View it as a kind of TSO Nagle test. + * + * This algorithm is from John Heffner. + * + * We know that "tcp_skb_pcount(skb) > 1" and also + * that "tp->snd_cwnd > tcp_packets_in_flight(tp)". + */ +static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 send_win, cong_win, limit, chunk, in_flight; + + in_flight = tcp_packets_in_flight(tp); + + BUG_ON(tcp_skb_pcount(skb) <= 1 || + (tp->snd_cwnd <= in_flight)); + + send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq; + + /* From in_flight test above, we know that + * cwnd > in_flight. + */ + cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; + + limit = min(send_win, cong_win); + + /* If sk_send_head can be sent fully now, just do it. */ + if (skb->len <= limit) + return 0; + + /* If at least some fraction of a window is available, + * just use it. + */ + chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); + chunk /= sysctl_tcp_tso_win_divisor; + if (limit >= chunk) + return 0; + + /* Ok, it looks like it is advisable to defer. */ + if (cong_win < send_win) + tp->deferred = tp->snd_cwnd - in_flight; + + return 1; +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote - * window for us. + * window for us. Caller makes sure sk_send_head is non-NULL. * * Returns 1, if no segments are in flight and we have queued segments, but * cannot send anything now because of SWS or another problem. */ -int tcp_write_xmit(struct sock *sk, int nonagle) +static int tcp_write_xmit(struct sock *sk, struct tcp_sock *tp, int nonagle) { - struct tcp_sock *tp = tcp_sk(sk); - unsigned int mss_now; + unsigned int mss_now, cwnd_quota, sent_pkts, tso_segs; + struct sk_buff *skb; /* If we are closed, the bytes will have to remain here. 
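tcp_tso_should_defer() above is the TSO counterpart of Nagle: send immediately if the whole frame fits inside min(send window, congestion window), or if at least 1/tcp_tso_win_divisor of the window is usable; otherwise hold the frame and wait for ACKs so it can go out with less splitting. A standalone rendering of the heuristic, slightly simplified in that the remaining window is used for both comparisons:

#include <stdio.h>

static int tso_should_defer(unsigned int skb_len,
                            unsigned int send_win,      /* bytes left in rwnd */
                            unsigned int cwnd, unsigned int in_flight,
                            unsigned int mss, unsigned int divisor)
{
        unsigned int cong_win = (cwnd - in_flight) * mss;  /* caller ensures cwnd > in_flight */
        unsigned int limit = send_win < cong_win ? send_win : cong_win;
        unsigned int chunk;

        if (skb_len <= limit)
                return 0;               /* it all fits: just send it */

        chunk = send_win < cwnd * mss ? send_win : cwnd * mss;
        chunk /= divisor;
        if (limit >= chunk)
                return 0;               /* a decent fraction is sendable */

        return 1;                       /* defer, hoping the window opens */
}

int main(void)
{
        /* 60KB frame, congestion window barely open: defer. */
        printf("%d\n", tso_should_defer(61440, 65535, 12, 10, 1448, 3));
        /* Same frame once the window has opened wide: send. */
        printf("%d\n", tso_should_defer(61440, 65535, 50, 2, 1448, 3));
        return 0;
}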
* In time closedown will finish, we empty the write queue and all * will be happy. */ - if (sk->sk_state != TCP_CLOSE) { - struct sk_buff *skb; - int sent_pkts = 0; + if (unlikely(sk->sk_state == TCP_CLOSE)) + return 0; - /* Account for SACKS, we may need to fragment due to this. - * It is just like the real MSS changing on us midstream. - * We also handle things correctly when the user adds some - * IP options mid-stream. Silly to do, but cover it. - */ - mss_now = tcp_current_mss(sk, 1); + /* Account for SACKS, we may need to fragment due to this. + * It is just like the real MSS changing on us midstream. + * We also handle things correctly when the user adds some + * IP options mid-stream. Silly to do, but cover it. + */ + mss_now = tcp_current_mss(sk, 1); + + skb = sk->sk_send_head; + tso_segs = tcp_init_tso_segs(sk, skb); + cwnd_quota = tcp_cwnd_test(tp, skb); + sent_pkts = 0; + + while (cwnd_quota) { + if (tso_segs > 1) { + if (tcp_tso_should_defer(sk, tp, skb)) + break; + } else if (!tcp_nagle_test(tp, skb, mss_now, + (tcp_skb_is_last(sk, skb) ? + nonagle : TCP_NAGLE_PUSH))) + break; + + if (!tcp_snd_wnd_test(tp, skb, mss_now)) + break; + + BUG_ON(!tso_segs); + + if (tso_segs > 1) { + u32 limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); - while ((skb = sk->sk_send_head) && - tcp_snd_test(sk, skb, mss_now, - tcp_skb_is_last(sk, skb) ? nonagle : - TCP_NAGLE_PUSH)) { - if (skb->len > mss_now) { - if (tcp_fragment(sk, skb, mss_now)) + if (skb->len > limit) { + if (tso_fragment(sk, skb, limit)) break; } - - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); - if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + } else if (skb->len > mss_now) { + if (tcp_fragment(sk, skb, mss_now)) break; + } - /* Advance the send_head. This one is sent out. - * This call will increment packets_out. - */ - update_send_head(sk, tp, skb); + TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + break; - tcp_minshall_update(tp, mss_now, skb); - sent_pkts = 1; - } + /* Advance the send_head. This one is sent out. + * This call will increment packets_out. + */ + update_send_head(sk, tp, skb); - if (sent_pkts) { - tcp_cwnd_validate(sk, tp); - return 0; + tcp_minshall_update(tp, mss_now, skb); + sent_pkts++; + + cwnd_quota -= tcp_skb_pcount(skb); + skb = sk->sk_send_head; + if (!skb) + break; + tso_segs = tcp_init_tso_segs(sk, skb); + } + + if (sent_pkts) { + tcp_cwnd_validate(sk, tp); + return 0; + } + + return !tp->packets_out && sk->sk_send_head; +} + +/* Push out any pending frames which were held back due to + * TCP_CORK or attempt at coalescing tiny packets. + * The socket must be locked by the caller. + */ +void __tcp_push_pending_frames(struct sock *sk, + struct tcp_sock *tp, + unsigned int cur_mss, + int nonagle) +{ + if (sk->sk_send_head) { + if (tcp_write_xmit(sk, tp, nonagle)) + tcp_check_probe_timer(sk, tp); + } +} + +/* As ACKs arrive and the send and congestion windows potentially + * open up, we call this to try and make write queue transmit + * progress. + * + * The caller has the socket locked, and has verified that + * sk->sk_send_head is not NULL. + */ +void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_write_xmit(sk, tp, tp->nonagle)) + tcp_check_probe_timer(sk, tp); +} + +/* Send _single_ skb sitting at the send head. This function requires + * true push pending frames to setup probe timer etc. Caller makes + * sure that sk->sk_send_head is non-NULL. 
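The rewritten tcp_write_xmit() above walks the queue against a quota expressed in segments: each frame is trimmed to min(receive-window room, quota * MSS) via tso_fragment()/tcp_fragment(), transmitted, and the quota shrinks by the number of segments the frame carried. A toy walk through that quota bookkeeping, with made-up queue contents (the real loop also applies the defer and Nagle gates shown earlier):

#include <stdio.h>

int main(void)
{
        unsigned int queue[] = { 63712, 63712, 8192 }; /* queued frame lengths */
        unsigned int mss = 1448, cwnd_quota = 50, wnd_room = 200000;
        unsigned int i;

        for (i = 0; i < 3 && cwnd_quota; i++) {
                unsigned int limit = cwnd_quota * mss;
                unsigned int len = queue[i], segs;

                if (limit > wnd_room)
                        limit = wnd_room;
                if (len > limit)
                        len = limit - (limit % mss);  /* simplification: whole segments only */
                if (!len)
                        break;
                segs = (len + mss - 1) / mss;
                printf("send %u bytes (%u segs), quota left %u\n",
                       len, segs, cwnd_quota - segs);
                cwnd_quota -= segs;
                wnd_room -= len;
        }
        return 0;
}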
+ */ +void tcp_push_one(struct sock *sk, unsigned int cur_mss) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = sk->sk_send_head; + unsigned int tso_segs, cwnd_quota; + + tso_segs = tcp_init_tso_segs(sk, skb); + cwnd_quota = tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH); + + if (cwnd_quota && tso_segs > 1) { + if (tcp_tso_should_defer(sk, tp, skb)) + return; + } + + if (cwnd_quota) { + BUG_ON(!tso_segs); + + if (tso_segs > 1) { + u32 limit = tcp_window_allows(tp, skb, + cur_mss, cwnd_quota); + + if (skb->len > limit) { + if (tso_fragment(sk, skb, limit)) + return; + } + } else if (skb->len > cur_mss) { + if (tcp_fragment(sk, skb, cur_mss)) + return; } - return !tp->packets_out && sk->sk_send_head; + /* Send it out now. */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) { + update_send_head(sk, tp, skb); + return; + } } - return 0; } /* This function returns the amount that we can raise the @@ -1041,12 +1372,6 @@ int tcp_retransmit_skb(struct sock *sk, if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) BUG(); - if (sk->sk_route_caps & NETIF_F_TSO) { - sk->sk_route_caps &= ~NETIF_F_TSO; - sock_set_flag(sk, SOCK_NO_LARGESEND); - tp->mss_cache = tp->mss_cache_std; - } - if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) return -ENOMEM; } @@ -1106,7 +1431,6 @@ int tcp_retransmit_skb(struct sock *sk, * is still in somebody's hands, else make a clone. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); err = tcp_transmit_skb(sk, (skb_cloned(skb) ? pskb_copy(skb, GFP_ATOMIC): @@ -1671,19 +1995,11 @@ int tcp_write_wakeup(struct sock *sk) TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; if (tcp_fragment(sk, skb, seg_size)) return -1; - /* SWS override triggered forced fragmentation. - * Disable TSO, the connection is too sick. */ - if (sk->sk_route_caps & NETIF_F_TSO) { - sock_set_flag(sk, SOCK_NO_LARGESEND); - sk->sk_route_caps &= ~NETIF_F_TSO; - tp->mss_cache = tp->mss_cache_std; - } } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb); TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); if (!err) { update_send_head(sk, tp, skb); Index: 2.6.12-rc4-tcp3/net/ipv6/tcp_ipv6.c =================================================================== --- 2.6.12-rc4-tcp3.orig/net/ipv6/tcp_ipv6.c +++ 2.6.12-rc4-tcp3/net/ipv6/tcp_ipv6.c @@ -2021,7 +2021,8 @@ static int tcp_v6_init_sock(struct sock */ tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_clamp = ~0; - tp->mss_cache_std = tp->mss_cache = 536; + tp->mss_cache = 536; + tp->xmit_size_cache = ~0; tp->reordering = sysctl_tcp_reordering; @@ -2035,6 +2036,7 @@ static int tcp_v6_init_sock(struct sock sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + tcp_init_congestion_control(tp); atomic_inc(&tcp_sockets_allocated); return 0;