*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_output.c,v 1.79 1998/03/28 00:55:33 davem Exp $
+ * Version: $Id: tcp_output.c,v 1.113 1999/09/07 02:31:39 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* David S. Miller : Charge memory using the right skb
* during syn/ack processing.
* David S. Miller : Output engine completely rewritten.
+ * Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
*
*/
#include <net/tcp.h>
+#include <linux/smp_lock.h>
+
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
+extern int sysctl_tcp_sack;
+
+/* People can turn this off for buggy TCP's found in printers etc. */
+int sysctl_tcp_retrans_collapse = 1;
/* Get rid of any delayed acks, we sent one already.. */
static __inline__ void clear_delayed_acks(struct sock * sk)
@@ -45,7 +52,7 @@ static __inline__ void clear_delayed_acks(struct sock * sk)
tp->delayed_acks = 0;
if(tcp_in_quickack_mode(tp))
- tp->ato = ((HZ/100)*2);
+ tcp_exit_quickack_mode(tp);
tcp_clear_xmit_timer(sk, TIME_DACK);
}
@@ -58,6 +65,50 @@ static __inline__ void update_send_head(struct sock *sk)
	tp->send_head = NULL;
}
+/* Calculate mss to advertise in SYN segment.
+ RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
+
+ 1. It is independent of path mtu.
+ 2. Ideally, it is maximal possible segment size i.e. 65535-40.
+ 3. For IPv4 it is reasonable to calculate it from maximal MTU of
+ attached devices, because some buggy hosts are confused by
+ large MSS.
+   4. We do not implement 3; we advertise the MSS calculated from the
+      first hop device mtu, but allow it to be raised to ip_rt_min_advmss.
+      This may be overridden via information stored in the routing table.
+ 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
+ probably even Jumbo".
+ */
+static __u16 tcp_advertise_mss(struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ int mss;
+
+ if (dst) {
+ mss = dst->advmss;
+ } else {
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ /* No dst. It is bad. Guess some reasonable value.
+ * Actually, this case should not be possible.
+ * SANITY.
+ */
+ BUG_TRAP(dst!=NULL);
+
+ mss = tp->mss_cache;
+ mss += (tp->tcp_header_len - sizeof(struct tcphdr)) +
+ tp->ext_header_len;
+
+		/* Minimal MSS to include the full set of TCP/IP options
+ plus 8 bytes of data. It corresponds to mtu 128.
+ */
+ if (mss < 88)
+ mss = 88;
+ }
+
+ return (__u16)mss;
+}
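+
+/* A rough worked example of the normal (dst != NULL) path above,
+ * assuming a plain Ethernet first hop and IPv4: dst->advmss is
+ * typically 1500 - 20 (IP header) - 20 (TCP header) = 1460, so the
+ * SYN carries an MSS option of 1460 regardless of the path mtu
+ * further along (point 1 above).
+ */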
+
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
@@ -76,15 +127,28 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
int tcp_header_size = tp->tcp_header_len;
struct tcphdr *th;
+ int sysctl_flags;
+#define SYSCTL_FLAG_TSTAMPS 0x1
+#define SYSCTL_FLAG_WSCALE 0x2
+#define SYSCTL_FLAG_SACK 0x4
+
+ sysctl_flags = 0;
if(tcb->flags & TCPCB_FLAG_SYN) {
tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
- if(sysctl_tcp_timestamps)
+ if(sysctl_tcp_timestamps) {
tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
- if(sysctl_tcp_window_scaling)
+ sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
+ }
+ if(sysctl_tcp_window_scaling) {
tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
- if(sysctl_tcp_sack && !sysctl_tcp_timestamps)
- tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
+ sysctl_flags |= SYSCTL_FLAG_WSCALE;
+ }
+ if(sysctl_tcp_sack) {
+ sysctl_flags |= SYSCTL_FLAG_SACK;
+ if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
+ tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
+ }
} else if(tp->sack_ok && tp->num_sacks) {
/* A SACK is 2 pad bytes, a 2 byte header, plus
* 2 32-bit sequence numbers for each SACK block.
@@ -104,19 +168,23 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
	th->doff = (tcp_header_size >> 2);
th->res1 = 0;
*(((__u8 *)th) + 13) = tcb->flags;
- if(!(tcb->flags & TCPCB_FLAG_SYN))
- th->window = htons(tcp_select_window(sk));
th->check = 0;
th->urg_ptr = ntohs(tcb->urg_ptr);
if(tcb->flags & TCPCB_FLAG_SYN) {
+ /* RFC1323: The window in SYN & SYN/ACK segments
+ * is never scaled.
+ */
th->window = htons(tp->rcv_wnd);
- tcp_syn_build_options((__u32 *)(th + 1), sk->mss,
- sysctl_tcp_timestamps,
- sysctl_tcp_sack,
- sysctl_tcp_window_scaling,
+ tcp_syn_build_options((__u32 *)(th + 1),
+ tcp_advertise_mss(sk),
+ (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
+ (sysctl_flags & SYSCTL_FLAG_SACK),
+ (sysctl_flags & SYSCTL_FLAG_WSCALE),
tp->rcv_wscale,
- TCP_SKB_CB(skb)->when);
+ TCP_SKB_CB(skb)->when,
+ tp->ts_recent);
} else {
+ th->window = htons(tcp_select_window(sk));
tcp_build_and_update_options((__u32 *)(th + 1),
tp, TCP_SKB_CB(skb)->when);
}
@@ -127,6 +195,9 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
	tcp_statistics.TcpOutSegs++;
tp->af_specific->queue_xmit(skb);
}
+#undef SYSCTL_FLAG_TSTAMPS
+#undef SYSCTL_FLAG_WSCALE
+#undef SYSCTL_FLAG_SACK
}
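
/* A rough sizing example for the SYN branch above, assuming the usual
 * option lengths (TCPOLEN_MSS = 4, TCPOLEN_TSTAMP_ALIGNED = 12,
 * TCPOLEN_WSCALE_ALIGNED = 4, TCPOLEN_SACKPERM_ALIGNED = 4): with
 * timestamps and window scaling enabled the header grows to
 * 20 + 4 + 12 + 4 = 40 bytes, and the SACK-permitted option is packed
 * into the aligned timestamp block for free; only when timestamps are
 * off does SACK cost a separate 4 bytes.
 */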
/* This is the main buffer sending routine. We queue the buffer
@@ -138,11 +209,11 @@ void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
/* Advance write_seq and place onto the write_queue. */
tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq);
- skb_queue_tail(&sk->write_queue, skb);
+ __skb_queue_tail(&sk->write_queue, skb);
if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
/* Send it out now. */
- TCP_SKB_CB(skb)->when = jiffies;
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
tp->packets_out++;
tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
@@ -159,10 +230,10 @@ void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
	}
}
-/* Function to create two new tcp segments. Shrinks the given segment
+/* Function to create two new TCP segments. Shrinks the given segment
* to the specified size and appends a new segment with the rest of the
- * packet to the list. This won't be called frenquently, I hope...
- * Remember, these are still header-less SKB's at this point.
+ * packet to the list. This won't be called frequently, I hope.
+ * Remember, these are still headerless SKBs at this point.
*/
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
@@ -208,18 +279,84 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
	buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize),
nsize, 0);
- TCP_SKB_CB(skb)->end_seq -= nsize;
- skb_trim(skb, skb->len - nsize);
+ /* This takes care of the FIN sequence number too. */
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+ skb_trim(skb, len);
/* Rechecksum original buffer. */
skb->csum = csum_partial(skb->data, skb->len, 0);
+	/* It looks odd, but our code really uses the "when" field of
+	 * skbs which it has never sent before. --ANK
+	 */
+ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
+
/* Link BUFF into the send queue. */
- skb_append(skb, buff);
+ __skb_append(skb, buff);
return 0;
}
+/* This function synchronizes snd mss to the current pmtu/exthdr set.
+
+   tp->user_mss is the mss set by the user via TCP_MAXSEG. It does NOT
+   account for TCP options; it includes only the bare TCP header.
+
+   tp->mss_clamp is the mss negotiated at connection setup.
+   It is the minimum of user_mss and the mss received with the SYN.
+   It also does not include TCP options.
+
+   tp->pmtu_cookie is the last pmtu seen by this function.
+
+ tp->mss_cache is current effective sending mss, including
+ all tcp options except for SACKs. It is evaluated,
+ taking into account current pmtu, but never exceeds
+ tp->mss_clamp.
+
+ NOTE1. rfc1122 clearly states that advertised MSS
+ DOES NOT include either tcp or ip options.
+
+ NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
+ this function. --ANK (980731)
+ */
+
+int tcp_sync_mss(struct sock *sk, u32 pmtu)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ int mss_now;
+
+ /* Calculate base mss without TCP options:
+ It is MMS_S - sizeof(tcphdr) of rfc1122
+ */
+
+ mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
+
+ /* Clamp it (mss_clamp does not include tcp options) */
+ if (mss_now > tp->mss_clamp)
+ mss_now = tp->mss_clamp;
+
+ /* Now subtract TCP options size, not including SACKs */
+ mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+ /* Now subtract optional transport overhead */
+ mss_now -= tp->ext_header_len;
+
+	/* If we got too small (or even negative) value,
+ clamp it by 8 from below. Why 8 ?
+ Well, it could be 1 with the same success,
+ but if IP accepted segment of length 1,
+ it would love 8 even more 8) --ANK (980731)
+ */
+ if (mss_now < 8)
+ mss_now = 8;
+
+ /* And store cached results */
+ tp->pmtu_cookie = pmtu;
+ tp->mss_cache = mss_now;
+ return mss_now;
+}
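+
+/* A worked example of the arithmetic above, assuming IPv4
+ * (net_header_len = 20), pmtu = 1500, timestamps in use
+ * (tcp_header_len = 20 + 12) and no extension headers:
+ *
+ *	mss_now = 1500 - 20 - 20 = 1460   (clamped by mss_clamp if smaller)
+ *	mss_now -= 12                     -> mss_cache = 1448
+ *
+ * so each full-sized segment carries 1448 bytes of user data.
+ */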
+
+
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
@@ -227,18 +364,14 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
void tcp_write_xmit(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- int mss_now = sk->mss;
+ unsigned int mss_now;
/* Account for SACKS, we may need to fragment due to this.
* It is just like the real MSS changing on us midstream.
* We also handle things correctly when the user adds some
* IP options mid-stream. Silly to do, but cover it.
*/
- if(tp->sack_ok && tp->num_sacks)
- mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
- (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
- if(sk->opt && sk->opt->optlen)
- mss_now -= sk->opt->optlen;
+ mss_now = tcp_current_mss(sk);
/* If we are zapped, the bytes will have to remain here.
* In time closedown will empty the write queue and all
@@ -253,7 +386,7 @@ void tcp_write_xmit(struct sock *sk)
 *
* a) following SWS avoidance [and Nagle algorithm]
* b) not exceeding our congestion window.
- * c) not retransmiting [Nagle]
+ * c) not retransmitting [Nagle]
*/
while((skb = tp->send_head) && tcp_snd_test(sk, skb)) {
if (skb->len > mss_now) {
@@ -263,7 +396,7 @@ void tcp_write_xmit(struct sock *sk)
/* Advance the send_head. This one is going out. */
update_send_head(sk);
- TCP_SKB_CB(skb)->when = jiffies;
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
tp->packets_out++;
tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
@@ -285,14 +418,14 @@ void tcp_write_xmit(struct sock *sk)
 * 2. We limit memory per socket
*
* RFC 1122:
- * "the suggested [SWS] avoidance algoritm for the receiver is to keep
+ * "the suggested [SWS] avoidance algorithm for the receiver is to keep
* RECV.NEXT + RCV.WIN fixed until:
* RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
*
* i.e. don't raise the right edge of the window until you can raise
* it at least MSS bytes.
*
- * Unfortunately, the recomended algorithm breaks header prediction,
+ * Unfortunately, the recommended algorithm breaks header prediction,
* since header prediction assumes th->window stays fixed.
*
* Strictly speaking, keeping th->window fixed violates the receiver
@@ -327,30 +460,32 @@ void tcp_write_xmit(struct sock *sk)
 * a multiple of the mss when it is feasible to do so.
*
* Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
+ * Regular options like TIMESTAMP are taken into account.
*/
u32 __tcp_select_window(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- unsigned int mss = sk->mss;
- unsigned int free_space;
- u32 window, cur_win;
-
- free_space = (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) / 2;
- if (tp->window_clamp) {
- free_space = min(tp->window_clamp, free_space);
- mss = min(tp->window_clamp, mss);
- } else {
- printk("tcp_select_window: tp->window_clamp == 0.\n");
- }
-
- if (mss < 1) {
- mss = 1;
- printk("tcp_select_window: sk->mss fell to 0.\n");
- }
+	/* MSS for the peer's data. Previous versions used mss_clamp
+	 * here. I don't know if the value based on our guesses
+	 * of the peer's MSS gives better performance. It's more correct
+	 * but may be worse for performance because of rcv_mss
+ * fluctuations. --SAW 1998/11/1
+ */
+ unsigned int mss = tp->rcv_mss;
+ int free_space;
+ u32 window;
+
+ /* Sometimes free_space can be < 0. */
+ free_space = tcp_space(sk);
+ if (free_space > ((int) tp->window_clamp))
+ free_space = tp->window_clamp;
+ if (tp->window_clamp < mss)
+ mss = tp->window_clamp;
- cur_win = tcp_receive_window(tp);
- if (free_space < sk->rcvbuf/4 && free_space < mss/2) {
+ if ((free_space < (tcp_full_space(sk) / 2)) &&
+ (free_space < ((int) (mss/2)))) {
window = 0;
+ tp->pred_flags = 0;
} else {
/* Get the largest window that is a nice multiple of mss.
* Window clamp already applied above.
@@ -361,8 +496,9 @@ u32 __tcp_select_window(struct sock *sk)
	 * is too small.
*/
window = tp->rcv_wnd;
- if ((window <= (free_space - mss)) || (window > free_space))
- window = (free_space/mss)*mss;
+ if ((((int) window) <= (free_space - ((int) mss))) ||
+ (((int) window) > free_space))
+ window = (((unsigned int) free_space)/mss)*mss;
}
return window;
}
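
/* A small worked example, assuming rcv_mss = 1460 and 10000 bytes of
 * free space within the clamp: the current rcv_wnd is kept only if it
 * lies in (free_space - mss, free_space]; otherwise the window is
 * rounded down to (10000/1460)*1460 = 8760, i.e. six full segments,
 * so the advertised right edge is advanced in MSS-sized steps.
 */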
@@ -396,7 +532,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
		return;
/* Ok. We will be able to collapse the packet. */
- skb_unlink(next_skb);
+ __skb_unlink(next_skb, next_skb->list);
if(skb->len % 4) {
/* Must copy and rechecksum all data. */
@@ -412,8 +548,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
	}
/* Update sequence range on original skb. */
- TCP_SKB_CB(skb)->end_seq +=
- TCP_SKB_CB(next_skb)->end_seq - TCP_SKB_CB(next_skb)->seq;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
/* Merge over control information. */
flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
@@ -432,25 +567,45 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
}
/* Do a simple retransmit without using the backoff mechanisms in
- * tcp_timer. This is used to speed up path mtu recovery. Note that
- * these simple retransmits aren't counted in the usual tcp retransmit
- * backoff counters.
+ * tcp_timer. This is used for path mtu discovery.
* The socket is already locked here.
*/
void tcp_simple_retransmit(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct sk_buff *skb, *old_next_skb;
+ unsigned int mss = tcp_current_mss(sk);
/* Don't muck with the congestion window here. */
tp->dup_acks = 0;
tp->high_seq = tp->snd_nxt;
- tp->retrans_head = NULL;
+ tp->retrans_head = NULL;
/* Input control flow will see that this was retransmitted
* and not use it for RTT calculation in the absence of
* the timestamp option.
*/
- tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));
+ for (old_next_skb = skb = skb_peek(&sk->write_queue);
+ ((skb != tp->send_head) &&
+ (skb != (struct sk_buff *)&sk->write_queue));
+ skb = skb->next) {
+ int resend_skb = 0;
+
+ /* Our goal is to push out the packets which we
+ * sent already, but are being chopped up now to
+ * account for the PMTU information we have.
+ *
+ * As we resend the queue, packets are fragmented
+ * into two pieces, and when we try to send the
+ * second piece it may be collapsed together with
+ * a subsequent packet, and so on. -DaveM
+ */
+ if (old_next_skb != skb || skb->len > mss)
+ resend_skb = 1;
+ old_next_skb = skb->next;
+ if (resend_skb != 0)
+ tcp_retransmit_skb(sk, skb);
+ }
}
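
/* Example of what this handles, assuming the path MTU of an IPv4
 * connection with timestamps drops from 1500 to 576: tcp_sync_mss()
 * lowers mss_cache to 576 - 40 - 12 = 524, and the loop above pushes
 * out every queued segment that is still longer than the new mss (or
 * that was just created by chopping up its predecessor), without
 * touching the congestion window.
 */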
static __inline__ void update_retrans_head(struct sock *sk)
@@ -459,28 +614,23 @@ static __inline__ void update_retrans_head(struct sock *sk)
tp->retrans_head = tp->retrans_head->next;
if((tp->retrans_head == tp->send_head) ||
- (tp->retrans_head == (struct sk_buff *) &sk->write_queue))
+ (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) {
tp->retrans_head = NULL;
+ tp->rexmt_done = 1;
+ }
}
/* This retransmits one SKB. Policy decisions and retransmit queue
* state updates are done by the caller. Returns non-zero if an
- * error occured which prevented the send.
+ * error occurred which prevented the send.
*/
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- int current_mss = sk->mss;
-
- /* Account for outgoing SACKS and IP options, if any. */
- if(tp->sack_ok && tp->num_sacks)
- current_mss -= (TCPOLEN_SACK_BASE_ALIGNED +
- (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
- if(sk->opt && sk->opt->optlen)
- current_mss -= sk->opt->optlen;
+ unsigned int cur_mss = tcp_current_mss(sk);
- if(skb->len > current_mss) {
- if(tcp_fragment(sk, skb, current_mss))
+ if(skb->len > cur_mss) {
+ if(tcp_fragment(sk, skb, cur_mss))
return 1; /* We'll try again later. */
/* New SKB created, account for it. */
@@ -489,26 +639,40 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
/* Collapse two adjacent packets if worthwhile and we can. */
if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
- (skb->len < (current_mss >> 1)) &&
+ (skb->len < (cur_mss >> 1)) &&
(skb->next != tp->send_head) &&
- (skb->next != (struct sk_buff *)&sk->write_queue))
- tcp_retrans_try_collapse(sk, skb, current_mss);
+ (skb->next != (struct sk_buff *)&sk->write_queue) &&
+ (sysctl_tcp_retrans_collapse != 0))
+ tcp_retrans_try_collapse(sk, skb, cur_mss);
if(tp->af_specific->rebuild_header(sk))
return 1; /* Routing failure or similar. */
+ /* Some Solaris stacks overoptimize and ignore the FIN on a
+ * retransmit when old data is attached. So strip it off
+ * since it is cheap to do so and saves bytes on the network.
+ */
+ if(skb->len > 0 &&
+ (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
+ tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
+ TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
+ skb_trim(skb, 0);
+ skb->csum = 0;
+ }
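+	/* For instance (hypothetical numbers): an skb covering
+	 * seq 1000..1101 (100 data bytes plus the FIN) with snd_una
+	 * already at 1100 has only its FIN unacked, so it is trimmed
+	 * above to a bare FIN with seq = end_seq - 1 = 1100.
+	 */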
+
/* Ok, we're gonna send it out, update state. */
TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS;
tp->retrans_out++;
/* Make a copy, if the first transmission SKB clone we made
- * is still in somebodies hands, else make a clone.
+ * is still in somebody's hands, else make a clone.
*/
- TCP_SKB_CB(skb)->when = jiffies;
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
if(skb_cloned(skb))
skb = skb_copy(skb, GFP_ATOMIC);
else
skb = skb_clone(skb, GFP_ATOMIC);
+
tcp_transmit_skb(sk, skb);
/* Update global TCP statistics and return success. */
@@ -531,11 +695,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff *skb;
- if (tp->retrans_head == NULL)
+ if (tp->retrans_head == NULL &&
+ tp->rexmt_done == 0)
tp->retrans_head = skb_peek(&sk->write_queue);
if (tp->retrans_head == tp->send_head)
tp->retrans_head = NULL;
+ /* Each time, advance the retrans_head if we got
+ * a packet out or we skipped one because it was
+ * SACK'd. -DaveM
+ */
while ((skb = tp->retrans_head) != NULL) {
/* If it has been ack'd by a SACK block, we don't
* retransmit it.
@@ -544,14 +713,17 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
			/* Send it out, punt if error occurred. */
if(tcp_retransmit_skb(sk, skb))
break;
+
+ update_retrans_head(sk);
/* Stop retransmitting if we've hit the congestion
* window limit.
*/
if (tp->retrans_out >= tp->snd_cwnd)
break;
+ } else {
+ update_retrans_head(sk);
}
- update_retrans_head(sk);
}
}
@@ -594,22 +766,37 @@ void tcp_send_fin(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
- int mss_now = sk->mss;
+ unsigned int mss_now;
/* Optimization, tack on the FIN if we have a queue of
* unsent frames. But be careful about outgoing SACKS
* and IP options.
*/
- if(tp->sack_ok && tp->num_sacks)
- mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
- (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
- if(sk->opt && sk->opt->optlen)
- mss_now -= sk->opt->optlen;
+ mss_now = tcp_current_mss(sk);
+
if((tp->send_head != NULL) && (skb->len < mss_now)) {
/* tcp_write_xmit() takes care of the rest. */
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
TCP_SKB_CB(skb)->end_seq++;
tp->write_seq++;
+
+ /* Special case to avoid Nagle bogosity. If this
+ * segment is the last segment, and it was queued
+ * due to Nagle/SWS-avoidance, send it out now.
+ */
+ if(tp->send_head == skb &&
+ !sk->nonagle &&
+ skb->len < (tp->rcv_mss >> 1) &&
+ tp->packets_out &&
+ !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
+ update_send_head(sk);
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ tp->packets_out++;
+ tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+ if(!tcp_timer_is_set(sk, TIME_RETRANS))
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ }
} else {
/* Socket is locked, keep trying until memory is available. */
do {
@@ -638,15 +825,15 @@ void tcp_send_fin(struct sock *sk)
 * was unread data in the receive queue. This behavior is recommended
* by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
*/
-void tcp_send_active_reset(struct sock *sk)
+void tcp_send_active_reset(struct sock *sk, int priority)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff *skb;
/* NOTE: No TCP options attached and we never retransmit this. */
- do {
- skb = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_KERNEL);
- } while(skb == NULL);
+ skb = alloc_skb(MAX_HEADER + sk->prot->max_header, priority);
+ if (!skb)
+ return;
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
@@ -658,7 +845,7 @@ void tcp_send_active_reset(struct sock *sk, int priority)
	/* Send it off. */
TCP_SKB_CB(skb)->seq = tp->write_seq;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
- TCP_SKB_CB(skb)->when = jiffies;
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
tcp_transmit_skb(sk, skb);
}
@@ -671,7 +858,7 @@ int tcp_send_synack(struct sock *sk)
{
struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff* skb;
-
+
skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
1, GFP_ATOMIC);
if (skb == NULL)
@@ -687,15 +874,18 @@ int tcp_send_synack(struct sock *sk)
	/* SYN eats a sequence byte. */
TCP_SKB_CB(skb)->seq = tp->snd_una;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
- skb_queue_tail(&sk->write_queue, skb);
- TCP_SKB_CB(skb)->when = jiffies;
+ __skb_queue_tail(&sk->write_queue, skb);
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
tp->packets_out++;
tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
return 0;
}
+/*
+ * Prepare a SYN-ACK.
+ */
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
- struct open_request *req, int mss)
+ struct open_request *req)
{
struct tcphdr *th;
int tcp_header_size;
@@ -710,24 +900,6 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
skb->dst = dst_clone(dst);
- if (sk->user_mss)
- mss = min(mss, sk->user_mss);
- if (req->tstamp_ok)
- mss -= TCPOLEN_TSTAMP_ALIGNED;
- else
- req->mss += TCPOLEN_TSTAMP_ALIGNED;
-
- /* Don't offer more than they did.
- * This way we don't have to memorize who said what.
- * FIXME: maybe this should be changed for better performance
- * with syncookies.
- */
- req->mss = min(mss, req->mss);
- if (req->mss < 1) {
- printk(KERN_DEBUG "initial req->mss below 1\n");
- req->mss = 1;
- }
-
tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
(req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
(req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
@@ -748,19 +920,24 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
		__u8 rcv_wscale;
/* Set this up on the first call only */
req->window_clamp = skb->dst->window;
- tcp_select_initial_window(sock_rspace(sk)/2,req->mss,
+ /* tcp_full_space because it is guaranteed to be the first packet */
+ tcp_select_initial_window(tcp_full_space(sk),
+ dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
&req->rcv_wnd,
&req->window_clamp,
req->wscale_ok,
&rcv_wscale);
req->rcv_wscale = rcv_wscale;
}
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
th->window = htons(req->rcv_wnd);
- TCP_SKB_CB(skb)->when = jiffies;
- tcp_syn_build_options((__u32 *)(th + 1), req->mss, req->tstamp_ok,
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok,
req->sack_ok, req->wscale_ok, req->rcv_wscale,
- TCP_SKB_CB(skb)->when);
+ TCP_SKB_CB(skb)->when,
+ req->ts_recent);
skb->csum = 0;
th->doff = (tcp_header_size >> 2);
@@ -768,95 +945,89 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
	return skb;
}
-void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss)
+int tcp_connect(struct sock *sk, struct sk_buff *buff)
{
- struct dst_entry *dst = sk->dst_cache;
+ struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
/* Reserve space for headers. */
skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
- if (sk->priority == 0)
- sk->priority = dst->priority;
-
- tp->snd_wnd = 0;
- tp->snd_wl1 = 0;
- tp->snd_wl2 = tp->write_seq;
- tp->snd_una = tp->write_seq;
- tp->rcv_nxt = 0;
-
- sk->err = 0;
-
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
*/
tp->tcp_header_len = sizeof(struct tcphdr) +
(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
- mss -= tp->tcp_header_len;
-
- if (sk->user_mss)
- mss = min(mss, sk->user_mss);
-
- if (mss < 1) {
- printk(KERN_DEBUG "intial sk->mss below 1\n");
- mss = 1; /* Sanity limit */
- }
-
- sk->mss = mss;
-
- TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
- TCP_SKB_CB(buff)->sacked = 0;
- TCP_SKB_CB(buff)->urg_ptr = 0;
- buff->csum = 0;
- TCP_SKB_CB(buff)->seq = tp->write_seq++;
- TCP_SKB_CB(buff)->end_seq = tp->write_seq;
- tp->snd_nxt = TCP_SKB_CB(buff)->end_seq;
+ /* If user gave his TCP_MAXSEG, record it to clamp */
+ if (tp->user_mss)
+ tp->mss_clamp = tp->user_mss;
+ tcp_sync_mss(sk, dst->pmtu);
tp->window_clamp = dst->window;
- tcp_select_initial_window(sock_rspace(sk)/2,sk->mss,
+
+ tcp_select_initial_window(tcp_full_space(sk),
+ dst->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)),
&tp->rcv_wnd,
&tp->window_clamp,
sysctl_tcp_window_scaling,
&tp->rcv_wscale);
- /* Ok, now lock the socket before we make it visible to
- * the incoming packet engine.
- */
- lock_sock(sk);
/* Socket identity change complete, no longer
* in TCP_CLOSE, so enter ourselves into the
* hash tables.
*/
tcp_set_state(sk,TCP_SYN_SENT);
- sk->prot->hash(sk);
+ if (tp->af_specific->hash_connecting(sk))
+ goto err_out;
+
+ sk->err = 0;
+ tp->snd_wnd = 0;
+ tp->snd_wl1 = 0;
+ tp->snd_wl2 = tp->write_seq;
+ tp->snd_una = tp->write_seq;
+ tp->rcv_nxt = 0;
+ tp->rcv_wup = 0;
+ tp->copied_seq = 0;
- tp->rto = dst->rtt;
+ tp->rto = TCP_TIMEOUT_INIT;
tcp_init_xmit_timers(sk);
tp->retransmits = 0;
tp->fackets_out = 0;
tp->retrans_out = 0;
+ TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
+ TCP_SKB_CB(buff)->sacked = 0;
+ TCP_SKB_CB(buff)->urg_ptr = 0;
+ buff->csum = 0;
+ TCP_SKB_CB(buff)->seq = tp->write_seq++;
+ TCP_SKB_CB(buff)->end_seq = tp->write_seq;
+ tp->snd_nxt = tp->write_seq;
+
/* Send it off. */
- skb_queue_tail(&sk->write_queue, buff);
- TCP_SKB_CB(buff)->when = jiffies;
+ TCP_SKB_CB(buff)->when = tcp_time_stamp;
+ __skb_queue_tail(&sk->write_queue, buff);
tp->packets_out++;
tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
tcp_statistics.TcpActiveOpens++;
/* Timer for repeating the SYN until an answer. */
tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ return 0;
- /* Now, it is safe to release the socket. */
- release_sock(sk);
+err_out:
+ tcp_set_state(sk,TCP_CLOSE);
+ kfree_skb(buff);
+ return -EADDRNOTAVAIL;
}
/* Send out a delayed ack, the caller does the policy checking
* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
* for details.
*/
-void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout)
+void tcp_send_delayed_ack(struct sock *sk, int max_timeout)
{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
unsigned long timeout;
/* Stay within the limit we were given */
@@ -866,13 +1037,16 @@ void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout)
	timeout += jiffies;
/* Use new timeout only if there wasn't a older one earlier. */
- if (!tp->delack_timer.prev) {
+ spin_lock_bh(&sk->timer_lock);
+ if (!tp->delack_timer.prev || !del_timer(&tp->delack_timer)) {
+ sock_hold(sk);
tp->delack_timer.expires = timeout;
- add_timer(&tp->delack_timer);
- } else {
- if (timeout < tp->delack_timer.expires)
- mod_timer(&tp->delack_timer, timeout);
+ } else {
+ if (time_before(timeout, tp->delack_timer.expires))
+ tp->delack_timer.expires = timeout;
}
+ add_timer(&tp->delack_timer);
+ spin_unlock_bh(&sk->timer_lock);
}
/* This routine sends an ack and also updates the window. */
@@ -893,8 +1067,14 @@ void tcp_send_ack(struct sock *sk)
	 * (ACK is unreliable) but it's much better use of
* bandwidth on slow links to send a spare ack than
* resend packets.
+ *
+ * This is the one possible way that we can delay an
+ * ACK and have tp->ato indicate that we are in
+ * quick ack mode, so clear it.
*/
- tcp_send_delayed_ack(tp, HZ/2);
+ if(tcp_in_quickack_mode(tp))
+ tcp_exit_quickack_mode(tp);
+ tcp_send_delayed_ack(sk, HZ/2);
return;
}
@@ -907,7 +1087,7 @@ void tcp_send_ack(struct sock *sk)
/* Send it off, this clears delayed acks for us. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tp->snd_nxt;
- TCP_SKB_CB(buff)->when = jiffies;
+ TCP_SKB_CB(buff)->when = tcp_time_stamp;
tcp_transmit_skb(sk, buff);
}
}
@@ -928,7 +1108,7 @@ void tcp_write_wakeup(struct sock *sk)
	 */
if ((1 << sk->state) &
~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
- TCPF_LAST_ACK|TCPF_CLOSING))
+ TCPF_FIN_WAIT2|TCPF_LAST_ACK|TCPF_CLOSING))
return;
if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) &&
@@ -945,7 +1125,7 @@ void tcp_write_wakeup(struct sock *sk)
			return; /* Let a retransmit get it. */
}
update_send_head(sk);
- TCP_SKB_CB(skb)->when = jiffies;
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
tp->packets_out++;
tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
@@ -971,7 +1151,7 @@ void tcp_write_wakeup(struct sock *sk)
	 */
TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
- TCP_SKB_CB(skb)->when = jiffies;
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
tcp_transmit_skb(sk, skb);
}
}