Import 2.3.18pre1
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e318207..77f8b98 100644
@@ -5,7 +5,7 @@
  *
  *             Implementation of the Transmission Control Protocol(TCP).
  *
- * Version:    $Id: tcp_output.c,v 1.90 1998/05/06 04:59:15 davem Exp $
+ * Version:    $Id: tcp_output.c,v 1.113 1999/09/07 02:31:39 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *             David S. Miller :       Charge memory using the right skb
  *                                     during syn/ack processing.
  *             David S. Miller :       Output engine completely rewritten.
+ *             Andrea Arcangeli:       SYNACK carries ts_recent in tsecr.
  *
  */
 
 #include <net/tcp.h>
 
+#include <linux/smp_lock.h>
+
 extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
@@ -49,7 +52,7 @@ static __inline__ void clear_delayed_acks(struct sock * sk)
 
        tp->delayed_acks = 0;
        if(tcp_in_quickack_mode(tp))
-               tp->ato = ((HZ/100)*2);
+               tcp_exit_quickack_mode(tp);
        tcp_clear_xmit_timer(sk, TIME_DACK);
 }
 
@@ -62,6 +65,50 @@ static __inline__ void update_send_head(struct sock *sk)
                tp->send_head = NULL;
 }
 
+/* Calculate mss to advertise in SYN segment.
+   RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
+
+   1. It is independent of path mtu.
+   2. Ideally, it is the maximal possible segment size, i.e. 65535-40.
+   3. For IPv4 it is reasonable to calculate it from maximal MTU of
+      attached devices, because some buggy hosts are confused by
+      large MSS.
+   4. We do not implement 3; we advertise the MSS calculated from the
+      first hop device mtu, but allow it to be raised to ip_rt_min_advmss.
+      This may be overridden via information stored in the routing table.
+   5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
+      probably even Jumbo".
+ */
+static __u16 tcp_advertise_mss(struct sock *sk)
+{
+       struct dst_entry *dst = __sk_dst_get(sk);
+       int mss;
+
+       if (dst) {
+               mss = dst->advmss;
+       } else {
+               struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+               /* No dst. It is bad. Guess some reasonable value.
+                * Actually, this case should not be possible.
+                * SANITY.
+                */
+               BUG_TRAP(dst!=NULL);
+
+               mss = tp->mss_cache;
+               mss += (tp->tcp_header_len - sizeof(struct tcphdr)) +
+                       tp->ext_header_len;
+
+               /* Minimal MSS to include full set of TCP/IP options
+                  plus 8 bytes of data. It corresponds to mtu 128.
+                */
+               if (mss < 88)
+                       mss = 88;
+       }
+
+       return (__u16)mss;
+}
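
For a concrete sense of the fallback arithmetic above, here is a minimal userspace sketch (the helper name and sample values are illustrative, not from the patch): mss_cache excludes option bytes, so the option length in use is added back before advertising, with 88 as the floor.

        #include <stdio.h>

        /* Mirrors tcp_advertise_mss()'s no-dst fallback.  A bare tcphdr is
         * 20 bytes, so tcp_header_len - 20 is the option bytes in use.
         */
        static unsigned short advertise_mss_fallback(int mss_cache,
                                                     int tcp_header_len,
                                                     int ext_header_len)
        {
                int mss = mss_cache + (tcp_header_len - 20) + ext_header_len;

                if (mss < 88)   /* full option set plus 8 data bytes: mtu 128 */
                        mss = 88;
                return (unsigned short)mss;
        }

        int main(void)
        {
                /* Ethernet with timestamps: mss_cache 1448, 12 option bytes. */
                printf("%u\n", (unsigned)advertise_mss_fallback(1448, 32, 0)); /* 1460 */
                return 0;
        }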
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
  * transmission and possible later retransmissions.
@@ -80,15 +127,28 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
                struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
                int tcp_header_size = tp->tcp_header_len;
                struct tcphdr *th;
+               int sysctl_flags;
+
+#define SYSCTL_FLAG_TSTAMPS    0x1
+#define SYSCTL_FLAG_WSCALE     0x2
+#define SYSCTL_FLAG_SACK       0x4
 
+               sysctl_flags = 0;
                if(tcb->flags & TCPCB_FLAG_SYN) {
                        tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
-                       if(sysctl_tcp_timestamps)
+                       if(sysctl_tcp_timestamps) {
                                tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
-                       if(sysctl_tcp_window_scaling)
+                               sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
+                       }
+                       if(sysctl_tcp_window_scaling) {
                                tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
-                       if(sysctl_tcp_sack && !sysctl_tcp_timestamps)
-                               tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
+                               sysctl_flags |= SYSCTL_FLAG_WSCALE;
+                       }
+                       if(sysctl_tcp_sack) {
+                               sysctl_flags |= SYSCTL_FLAG_SACK;
+                               if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
+                                       tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
+                       }
                } else if(tp->sack_ok && tp->num_sacks) {
                        /* A SACK is 2 pad bytes, a 2 byte header, plus
                         * 2 32-bit sequence numbers for each SACK block.
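
(For scale, with values not shown in this hunk: a SACK option with n blocks occupies 2 pad + 2 header + 8n bytes, so alongside the 12-byte timestamp option the 40 bytes of TCP option space hold at most 12 + 2 + 2 + 8*3 = 40, i.e. three SACK blocks.)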
@@ -108,8 +168,6 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
                th->doff                = (tcp_header_size >> 2);
                th->res1                = 0;
                *(((__u8 *)th) + 13)    = tcb->flags;
-               if(!(tcb->flags & TCPCB_FLAG_SYN))
-                       th->window      = htons(tcp_select_window(sk));
                th->check               = 0;
                th->urg_ptr             = ntohs(tcb->urg_ptr);
                if(tcb->flags & TCPCB_FLAG_SYN) {
@@ -117,13 +175,16 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
                         * is never scaled.
                         */
                        th->window      = htons(tp->rcv_wnd);
-                       tcp_syn_build_options((__u32 *)(th + 1), sk->mss,
-                                             sysctl_tcp_timestamps,
-                                             sysctl_tcp_sack,
-                                             sysctl_tcp_window_scaling,
+                       tcp_syn_build_options((__u32 *)(th + 1),
+                                             tcp_advertise_mss(sk),
+                                             (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
+                                             (sysctl_flags & SYSCTL_FLAG_SACK),
+                                             (sysctl_flags & SYSCTL_FLAG_WSCALE),
                                              tp->rcv_wscale,
-                                             TCP_SKB_CB(skb)->when);
+                                             TCP_SKB_CB(skb)->when,
+                                             tp->ts_recent);
                } else {
+                       th->window      = htons(tcp_select_window(sk));
                        tcp_build_and_update_options((__u32 *)(th + 1),
                                                     tp, TCP_SKB_CB(skb)->when);
                }
@@ -134,6 +195,9 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
                tcp_statistics.TcpOutSegs++;
                tp->af_specific->queue_xmit(skb);
        }
+#undef SYSCTL_FLAG_TSTAMPS
+#undef SYSCTL_FLAG_WSCALE
+#undef SYSCTL_FLAG_SACK
 }
 
 /* This is the main buffer sending routine. We queue the buffer
@@ -149,7 +213,7 @@ void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
 
        if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
                /* Send it out now. */
-               TCP_SKB_CB(skb)->when = jiffies;
+               TCP_SKB_CB(skb)->when = tcp_time_stamp;
                tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                tp->packets_out++;
                tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
@@ -166,10 +230,10 @@ void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
        }
 }
 
-/* Function to create two new tcp segments.  Shrinks the given segment
+/* Function to create two new TCP segments.  Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
- * packet to the list. This won't be called frenquently, I hope..
- * Remember, these are still header-less SKB's at this point.
+ * packet to the list.  This won't be called frequently, I hope.
+ * Remember, these are still headerless SKBs at this point.
  */
 static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 {
@@ -215,18 +279,84 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
        buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize),
                                       nsize, 0);
 
-       TCP_SKB_CB(skb)->end_seq -= nsize;
-       skb_trim(skb, skb->len - nsize);
+       /* This takes care of the FIN sequence number too. */
+       TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+       skb_trim(skb, len);
 
        /* Rechecksum original buffer. */
        skb->csum = csum_partial(skb->data, skb->len, 0);
 
+       /* Looks stupid, but our code really uses the "when" field of
+        * skbs which it has never sent before. --ANK
+        */
+       TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
+
        /* Link BUFF into the send queue. */
        __skb_append(skb, buff);
 
        return 0;
 }
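
As a worked example of the split (numbers assumed): fragmenting a 2000-byte skb at len = 1448 leaves skb covering [s, s+1448) and buff covering [s+1448, s+2000). Because skb's end_seq is reset to buff's seq rather than recomputed from byte counts, a FIN accounted in the original end_seq stays with buff, which is what the comment above the assignment refers to.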
 
+/* This function synchronizes snd mss to the current pmtu/exthdr set.
+
+   tp->user_mss is the mss set by the user via TCP_MAXSEG. It does NOT
+   account for TCP options, but covers only the bare TCP header.
+
+   tp->mss_clamp is the mss negotiated at connection setup.
+   It is the minimum of user_mss and the mss received with the SYN.
+   It also does not include TCP options.
+
+   tp->pmtu_cookie is last pmtu, seen by this function.
+
+   tp->mss_cache is the current effective sending mss, with all
+   tcp options except SACKs already accounted for. It is evaluated
+   taking the current pmtu into account, but never exceeds
+   tp->mss_clamp.
+
+   NOTE1. rfc1122 clearly states that advertised MSS
+   DOES NOT include either tcp or ip options.
+
+   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
+   this function.                      --ANK (980731)
+ */
+
+int tcp_sync_mss(struct sock *sk, u32 pmtu)
+{
+       struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+       int mss_now;
+
+       /* Calculate base mss without TCP options:
+          It is MMS_S - sizeof(tcphdr) of rfc1122
+        */
+
+       mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
+
+       /* Clamp it (mss_clamp does not include tcp options) */
+       if (mss_now > tp->mss_clamp)
+               mss_now = tp->mss_clamp;
+
+       /* Now subtract TCP options size, not including SACKs */
+       mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+       /* Now subtract optional transport overhead */
+       mss_now -= tp->ext_header_len;
+
+       /* If we got too small (or even a negative) value,
+          clamp it to 8 from below. Why 8?
+          Well, it could be 1 with the same success,
+          but if IP accepts a segment of length 1,
+          it would love 8 even more 8)         --ANK (980731)
+        */
+       if (mss_now < 8)
+               mss_now = 8;
+
+       /* And store cached results */
+       tp->pmtu_cookie = pmtu;
+       tp->mss_cache = mss_now;
+       return mss_now;
+}
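
A worked pass through the clamping order (assumed values: IPv4, pmtu 1500, timestamps on, no extension headers, mss_clamp 1460):

        mss_now = 1500 - 20 (net_header_len) - 20 (tcphdr)   = 1460
        clamp to mss_clamp                                    = 1460
        subtract options: tcp_header_len - 20 = 12            = 1448
        subtract ext_header_len (0)                           = 1448

so mss_cache becomes 1448, the familiar timestamp-enabled Ethernet mss.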
+
+
 /* This routine writes packets to the network.  It advances the
  * send_head.  This happens as incoming acks open up the remote
  * window for us.
@@ -256,7 +386,7 @@ void tcp_write_xmit(struct sock *sk)
                 *
                 * a) following SWS avoidance [and Nagle algorithm]
                 * b) not exceeding our congestion window.
-                * c) not retransmiting [Nagle]
+                * c) not retransmitting [Nagle]
                 */
                while((skb = tp->send_head) && tcp_snd_test(sk, skb)) {
                        if (skb->len > mss_now) {
@@ -266,7 +396,7 @@ void tcp_write_xmit(struct sock *sk)
 
                        /* Advance the send_head.  This one is going out. */
                        update_send_head(sk);
-                       TCP_SKB_CB(skb)->when = jiffies;
+                       TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                        tp->packets_out++;
                        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
@@ -288,14 +418,14 @@ void tcp_write_xmit(struct sock *sk)
  * 2. We limit memory per socket
  *
  * RFC 1122:
- * "the suggested [SWS] avoidance algoritm for the receiver is to keep
+ * "the suggested [SWS] avoidance algorithm for the receiver is to keep
  *  RECV.NEXT + RCV.WIN fixed until:
  *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
  *
  * i.e. don't raise the right edge of the window until you can raise
  * it at least MSS bytes.
  *
- * Unfortunately, the recomended algorithm breaks header prediction,
+ * Unfortunately, the recommended algorithm breaks header prediction,
  * since header prediction assumes th->window stays fixed.
  *
  * Strictly speaking, keeping th->window fixed violates the receiver
@@ -330,30 +460,32 @@ void tcp_write_xmit(struct sock *sk)
  * a multiple of the mss when it is feasible to do so.
  *
  * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
+ * Regular options like TIMESTAMP are taken into account.
  */
 u32 __tcp_select_window(struct sock *sk)
 {
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-       unsigned int mss = sk->mss;
-       unsigned int free_space;
-       u32 window, cur_win;
-
-       free_space = (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) / 2;
-       if (tp->window_clamp) {
-               free_space = min(tp->window_clamp, free_space);
-               mss = min(tp->window_clamp, mss);
-       } else {
-               printk("tcp_select_window: tp->window_clamp == 0.\n");
-       }
-
-       if (mss < 1) {
-               mss = 1;
-               printk("tcp_select_window: sk->mss fell to 0.\n");
-       }
+       /* MSS for the peer's data.  Previous versions used mss_clamp
+        * here.  I don't know if the value based on our guesses
+        * of the peer's MSS is better for performance.  It's more correct
+        * but may be worse for performance because of rcv_mss
+        * fluctuations.  --SAW  1998/11/1
+        */
+       unsigned int mss = tp->rcv_mss;
+       int free_space;
+       u32 window;
+
+       /* Sometimes free_space can be < 0. */
+       free_space = tcp_space(sk); 
+       if (free_space > ((int) tp->window_clamp))
+               free_space = tp->window_clamp;
+       if (tp->window_clamp < mss)
+               mss = tp->window_clamp; 
        
-       cur_win = tcp_receive_window(tp);
-       if (free_space < sk->rcvbuf/4 && free_space < mss/2) {
+       if ((free_space < (tcp_full_space(sk) / 2)) && 
+               (free_space < ((int) (mss/2)))) {
                window = 0;
+               tp->pred_flags = 0; 
        } else {
                /* Get the largest window that is a nice multiple of mss.
                 * Window clamp already applied above.
@@ -364,8 +496,9 @@ u32 __tcp_select_window(struct sock *sk)
                 * is too small.
                 */
                window = tp->rcv_wnd;
-               if ((window <= (free_space - mss)) || (window > free_space))
-                       window = (free_space/mss)*mss;
+               if ((((int) window) <= (free_space - ((int) mss))) ||
+                               (((int) window) > free_space))
+                       window = (((unsigned int) free_space)/mss)*mss;
        }
        return window;
 }
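
The mss-multiple rounding can be sanity-checked in isolation; a standalone sketch with assumed numbers, mirroring the branch above:

        #include <stdio.h>

        int main(void)
        {
                unsigned int mss = 1460, window = 11680;  /* current rcv_wnd */
                int free_space = 10000;

                if ((((int) window) <= (free_space - ((int) mss))) ||
                    (((int) window) > free_space))
                        window = (((unsigned int) free_space) / mss) * mss;

                printf("%u\n", window);  /* 8760 = 6 * 1460 */
                return 0;
        }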
@@ -440,24 +573,39 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
 void tcp_simple_retransmit(struct sock *sk)
 {
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-       struct sk_buff *skb;
-       unsigned int mss = tcp_current_mss(sk); 
+       struct sk_buff *skb, *old_next_skb;
+       unsigned int mss = tcp_current_mss(sk);
 
        /* Don't muck with the congestion window here. */
        tp->dup_acks = 0;
        tp->high_seq = tp->snd_nxt;
-       tp->retrans_head = NULL; 
+       tp->retrans_head = NULL;
 
        /* Input control flow will see that this was retransmitted
         * and not use it for RTT calculation in the absence of
         * the timestamp option.
         */
-       for (skb = skb_peek(&sk->write_queue);
+       for (old_next_skb = skb = skb_peek(&sk->write_queue);
             ((skb != tp->send_head) &&
              (skb != (struct sk_buff *)&sk->write_queue));
-            skb = skb->next) 
-               if (skb->len > mss)
-                       tcp_retransmit_skb(sk, skb); 
+            skb = skb->next) {
+               int resend_skb = 0;
+
+               /* Our goal is to push out the packets which we
+                * sent already, but are being chopped up now to
+                * account for the PMTU information we have.
+                *
+                * As we resend the queue, packets are fragmented
+                * into two pieces, and when we try to send the
+                * second piece it may be collapsed together with
+                * a subsequent packet, and so on.  -DaveM
+                */
+               if (old_next_skb != skb || skb->len > mss)
+                       resend_skb = 1;
+               old_next_skb = skb->next;
+               if (resend_skb != 0)
+                       tcp_retransmit_skb(sk, skb);
+       }
 }
 
 static __inline__ void update_retrans_head(struct sock *sk)
@@ -466,13 +614,15 @@ static __inline__ void update_retrans_head(struct sock *sk)
        
        tp->retrans_head = tp->retrans_head->next;
        if((tp->retrans_head == tp->send_head) ||
-          (tp->retrans_head == (struct sk_buff *) &sk->write_queue))
+          (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) {
                tp->retrans_head = NULL;
+               tp->rexmt_done = 1;
+       }
 }
 
 /* This retransmits one SKB.  Policy decisions and retransmit queue
  * state updates are done by the caller.  Returns non-zero if an
- * error occured which prevented the send.
+ * error occurred which prevented the send.
  */
 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 {
@@ -498,18 +648,31 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
        if(tp->af_specific->rebuild_header(sk))
                return 1; /* Routing failure or similar. */
 
+       /* Some Solaris stacks overoptimize and ignore the FIN on a
+        * retransmit when old data is attached.  So strip it off
+        * since it is cheap to do so and saves bytes on the network.
+        */
+       if(skb->len > 0 &&
+          (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
+          tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
+               TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
+               skb_trim(skb, 0);
+               skb->csum = 0;
+       }
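
Concretely (sequence numbers assumed): if the skb covers [1000, 1101), i.e. 100 data bytes plus the FIN, and snd_una has advanced to 1100, everything but the FIN is already acked; the skb is rewritten to cover [1100, 1101) with no payload, so only a bare FIN is retransmitted.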
+
        /* Ok, we're gonna send it out, update state. */
        TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS;
        tp->retrans_out++;
 
        /* Make a copy, if the first transmission SKB clone we made
-        * is still in somebodies hands, else make a clone.
+        * is still in somebody's hands, else make a clone.
         */
-       TCP_SKB_CB(skb)->when = jiffies;
+       TCP_SKB_CB(skb)->when = tcp_time_stamp;
        if(skb_cloned(skb))
                skb = skb_copy(skb, GFP_ATOMIC);
        else
                skb = skb_clone(skb, GFP_ATOMIC);
+
        tcp_transmit_skb(sk, skb);
 
        /* Update global TCP statistics and return success. */
@@ -532,7 +695,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;
 
-       if (tp->retrans_head == NULL)
+       if (tp->retrans_head == NULL &&
+           tp->rexmt_done == 0)
                tp->retrans_head = skb_peek(&sk->write_queue);
        if (tp->retrans_head == tp->send_head)
                tp->retrans_head = NULL;
@@ -555,7 +719,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                        /* Stop retransmitting if we've hit the congestion
                         * window limit.
                         */
-                       if (tp->retrans_out >= (tp->snd_cwnd >> TCP_CWND_SHIFT))
+                       if (tp->retrans_out >= tp->snd_cwnd)
                                break;
                } else {
                        update_retrans_head(sk);
@@ -585,7 +749,7 @@ void tcp_fack_retransmit(struct sock *sk)
                if(tcp_retransmit_skb(sk, skb))
                        break;
 
-               if(tcp_packets_in_flight(tp) >= (tp->snd_cwnd >> TCP_CWND_SHIFT))
+               if(tcp_packets_in_flight(tp) >= tp->snd_cwnd)
                        break;
 next_packet:
                packet_cnt++;
@@ -622,11 +786,11 @@ void tcp_send_fin(struct sock *sk)
                 */
                if(tp->send_head == skb &&
                   !sk->nonagle &&
-                  skb->len < (sk->mss >> 1) &&
+                  skb->len < (tp->rcv_mss >> 1) &&
                   tp->packets_out &&
                   !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
                        update_send_head(sk);
-                       TCP_SKB_CB(skb)->when = jiffies;
+                       TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                        tp->packets_out++;
                        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
@@ -661,15 +825,15 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk)
+void tcp_send_active_reset(struct sock *sk, int priority)
 {
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;
 
        /* NOTE: No TCP options attached and we never retransmit this. */
-       do {
-               skb = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_KERNEL);
-       } while(skb == NULL);
+       skb = alloc_skb(MAX_HEADER + sk->prot->max_header, priority);
+       if (!skb)
+               return;
 
        /* Reserve space for headers and prepare control bits. */
        skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
@@ -681,7 +845,7 @@ void tcp_send_active_reset(struct sock *sk)
        /* Send it off. */
        TCP_SKB_CB(skb)->seq = tp->write_seq;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
-       TCP_SKB_CB(skb)->when = jiffies;
+       TCP_SKB_CB(skb)->when = tcp_time_stamp;
        tcp_transmit_skb(sk, skb);
 }
 
@@ -694,7 +858,7 @@ int tcp_send_synack(struct sock *sk)
 {
        struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff* skb;    
-       
+
        skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
                           1, GFP_ATOMIC);
        if (skb == NULL) 
@@ -711,7 +875,7 @@ int tcp_send_synack(struct sock *sk)
        TCP_SKB_CB(skb)->seq = tp->snd_una;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
        __skb_queue_tail(&sk->write_queue, skb);
-       TCP_SKB_CB(skb)->when = jiffies;
+       TCP_SKB_CB(skb)->when = tcp_time_stamp;
        tp->packets_out++;
        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
        return 0;
@@ -721,7 +885,7 @@ int tcp_send_synack(struct sock *sk)
  * Prepare a SYN-ACK.
  */
 struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
-                                struct open_request *req, int mss)
+                                struct open_request *req)
 {
        struct tcphdr *th;
        int tcp_header_size;
@@ -736,22 +900,6 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 
        skb->dst = dst_clone(dst);
 
-       if (sk->user_mss)
-               mss = min(mss, sk->user_mss);
-       if (req->tstamp_ok)
-               mss -= TCPOLEN_TSTAMP_ALIGNED;
-
-       /* Don't offer more than they did.
-        * This way we don't have to memorize who said what.
-        * FIXME: maybe this should be changed for better performance
-        * with syncookies.
-        */
-       req->mss = min(mss, req->mss);
-       if (req->mss < 1) {
-               printk(KERN_DEBUG "initial req->mss below 1\n");
-               req->mss = 1;
-       }
-
        tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
                           (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
                           (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
@@ -772,7 +920,9 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
                __u8 rcv_wscale; 
                /* Set this up on the first call only */
                req->window_clamp = skb->dst->window;
-               tcp_select_initial_window(sock_rspace(sk)/2,req->mss,
+               /* tcp_full_space because it is guaranteed to be the first packet */
+               tcp_select_initial_window(tcp_full_space(sk), 
+                       dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
                        &req->rcv_wnd,
                        &req->window_clamp,
                        req->wscale_ok,
@@ -783,106 +933,101 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
        /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
        th->window = htons(req->rcv_wnd);
 
-       TCP_SKB_CB(skb)->when = jiffies;
-       tcp_syn_build_options((__u32 *)(th + 1), req->mss, req->tstamp_ok,
+       TCP_SKB_CB(skb)->when = tcp_time_stamp;
+       tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok,
                              req->sack_ok, req->wscale_ok, req->rcv_wscale,
-                             TCP_SKB_CB(skb)->when);
+                             TCP_SKB_CB(skb)->when,
+                             req->ts_recent);
 
        skb->csum = 0;
        th->doff = (tcp_header_size >> 2);
-       tcp_statistics.TcpOutSegs++; 
+       tcp_statistics.TcpOutSegs++;
        return skb;
 }
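
Note the mss handed to tcp_select_initial_window() above: with timestamps negotiated, dst->advmss is reduced by TCPOLEN_TSTAMP_ALIGNED (12 bytes), so an Ethernet advmss of 1460 yields 1448, the true per-segment payload from which rcv_wnd and the window clamp are chosen.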
 
-void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss)
+int tcp_connect(struct sock *sk, struct sk_buff *buff)
 {
-       struct dst_entry *dst = sk->dst_cache;
+       struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
        /* Reserve space for headers. */
        skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
 
-       if (sk->priority == 0)
-               sk->priority = dst->priority;
-
-       tp->snd_wnd = 0;
-       tp->snd_wl1 = 0;
-       tp->snd_wl2 = tp->write_seq;
-       tp->snd_una = tp->write_seq;
-       tp->rcv_nxt = 0;
-
-       sk->err = 0;
-       
        /* We'll fix this up when we get a response from the other end.
         * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
         */
        tp->tcp_header_len = sizeof(struct tcphdr) +
                (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
 
-       mss -= tp->tcp_header_len;
-
-       if (sk->user_mss)
-               mss = min(mss, sk->user_mss);
-
-       if (mss < 1) {
-               printk(KERN_DEBUG "intial sk->mss below 1\n");
-               mss = 1;        /* Sanity limit */
-       }
-
-       sk->mss = mss;
-
-       TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
-       TCP_SKB_CB(buff)->sacked = 0;
-       TCP_SKB_CB(buff)->urg_ptr = 0;
-       buff->csum = 0;
-       TCP_SKB_CB(buff)->seq = tp->write_seq++;
-       TCP_SKB_CB(buff)->end_seq = tp->write_seq;
-       tp->snd_nxt = TCP_SKB_CB(buff)->end_seq;
+       /* If user gave his TCP_MAXSEG, record it to clamp */
+       if (tp->user_mss)
+               tp->mss_clamp = tp->user_mss;
+       tcp_sync_mss(sk, dst->pmtu);
 
        tp->window_clamp = dst->window;
-       tcp_select_initial_window(sock_rspace(sk)/2,sk->mss,
+
+       tcp_select_initial_window(tcp_full_space(sk),
+               dst->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)),
                &tp->rcv_wnd,
                &tp->window_clamp,
                sysctl_tcp_window_scaling,
                &tp->rcv_wscale);
-       /* Ok, now lock the socket before we make it visible to
-        * the incoming packet engine.
-        */
-       lock_sock(sk);
 
        /* Socket identity change complete, no longer
         * in TCP_CLOSE, so enter ourselves into the
         * hash tables.
         */
        tcp_set_state(sk,TCP_SYN_SENT);
-       sk->prot->hash(sk);
+       if (tp->af_specific->hash_connecting(sk))
+               goto err_out;
+
+       sk->err = 0;
+       tp->snd_wnd = 0;
+       tp->snd_wl1 = 0;
+       tp->snd_wl2 = tp->write_seq;
+       tp->snd_una = tp->write_seq;
+       tp->rcv_nxt = 0;
+       tp->rcv_wup = 0;
+       tp->copied_seq = 0;
 
-       tp->rto = dst->rtt;
+       tp->rto = TCP_TIMEOUT_INIT;
        tcp_init_xmit_timers(sk);
        tp->retransmits = 0;
        tp->fackets_out = 0;
        tp->retrans_out = 0;
 
+       TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
+       TCP_SKB_CB(buff)->sacked = 0;
+       TCP_SKB_CB(buff)->urg_ptr = 0;
+       buff->csum = 0;
+       TCP_SKB_CB(buff)->seq = tp->write_seq++;
+       TCP_SKB_CB(buff)->end_seq = tp->write_seq;
+       tp->snd_nxt = tp->write_seq;
+
        /* Send it off. */
+       TCP_SKB_CB(buff)->when = tcp_time_stamp;
        __skb_queue_tail(&sk->write_queue, buff);
-       TCP_SKB_CB(buff)->when = jiffies;
        tp->packets_out++;
        tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
        tcp_statistics.TcpActiveOpens++;
 
        /* Timer for repeating the SYN until an answer. */
        tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+       return 0;
 
-       /* Now, it is safe to release the socket. */
-       release_sock(sk);
+err_out:
+       tcp_set_state(sk,TCP_CLOSE);
+       kfree_skb(buff);
+       return -EADDRNOTAVAIL;
 }
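
With the new signature, failure is visible to the caller: if hash_connecting() fails, the socket is put back into TCP_CLOSE, buff is freed, and -EADDRNOTAVAIL propagates, so the af-specific connect path can simply back out without leaking the SYN skb.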
 
 /* Send out a delayed ack, the caller does the policy checking
  * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
  * for details.
  */
-void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout)
+void tcp_send_delayed_ack(struct sock *sk, int max_timeout)
 {
+       struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        unsigned long timeout;
 
        /* Stay within the limit we were given */
@@ -892,13 +1037,16 @@ void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout)
        timeout += jiffies;
 
        /* Use new timeout only if there wasn't an older one earlier. */
-       if (!tp->delack_timer.prev) {
+       spin_lock_bh(&sk->timer_lock);
+       if (!tp->delack_timer.prev || !del_timer(&tp->delack_timer)) {
+               sock_hold(sk);
                tp->delack_timer.expires = timeout;
-               add_timer(&tp->delack_timer);
-        } else {
-               if (timeout < tp->delack_timer.expires)
-                       mod_timer(&tp->delack_timer, timeout);
+       } else {
+               if (time_before(timeout, tp->delack_timer.expires))
+                       tp->delack_timer.expires = timeout;
        }
+       add_timer(&tp->delack_timer);
+       spin_unlock_bh(&sk->timer_lock);
 }
 
 /* This routine sends an ack and also updates the window. */
@@ -919,8 +1067,14 @@ void tcp_send_ack(struct sock *sk)
                         * (ACK is unreliable) but it's much better use of
                         * bandwidth on slow links to send a spare ack than
                         * resend packets.
+                        *
+                        * This is the one possible way that we can delay an
+                        * ACK and have tp->ato indicate that we are in
+                        * quick ack mode, so clear it.
                         */
-                       tcp_send_delayed_ack(tp, HZ/2);
+                       if(tcp_in_quickack_mode(tp))
+                               tcp_exit_quickack_mode(tp);
+                       tcp_send_delayed_ack(sk, HZ/2);
                        return;
                }
 
@@ -933,7 +1087,7 @@ void tcp_send_ack(struct sock *sk)
 
                /* Send it off, this clears delayed acks for us. */
                TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tp->snd_nxt;
-               TCP_SKB_CB(buff)->when = jiffies;
+               TCP_SKB_CB(buff)->when = tcp_time_stamp;
                tcp_transmit_skb(sk, buff);
        }
 }
@@ -954,7 +1108,7 @@ void tcp_write_wakeup(struct sock *sk)
                 */
                if ((1 << sk->state) &
                    ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
-                     TCPF_LAST_ACK|TCPF_CLOSING))
+                     TCPF_FIN_WAIT2|TCPF_LAST_ACK|TCPF_CLOSING))
                        return;
 
                if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) &&
@@ -971,7 +1125,7 @@ void tcp_write_wakeup(struct sock *sk)
                                        return; /* Let a retransmit get it. */
                        }
                        update_send_head(sk);
-                       TCP_SKB_CB(skb)->when = jiffies;
+                       TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                        tp->packets_out++;
                        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
@@ -997,7 +1151,7 @@ void tcp_write_wakeup(struct sock *sk)
                         */
                        TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1;
                        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
-                       TCP_SKB_CB(skb)->when = jiffies;
+                       TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        tcp_transmit_skb(sk, skb);
                }
        }