[davej-history.git] net/ipv4/tcp_output.c

/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:     $Id: tcp_output.c,v 1.129 2000/11/28 17:04:10 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:     Pedro Roque     :       Retransmit queue handled by TCP.
 *                              :       Fragmentation on mtu decrease
 *                              :       Segment collapse on retransmit
 *                              :       AF independence
 *
 *              Linus Torvalds  :       send_delayed_ack
 *              David S. Miller :       Charge memory using the right skb
 *                                      during syn/ack processing.
 *              David S. Miller :       Output engine completely rewritten.
 *              Andrea Arcangeli:       SYNACK carry ts_recent in tsecr.
 *              Cacophonix Gaul :       draft-minshall-nagle-01
 *              J Hadi Salim    :       ECN support
 */

#include <net/tcp.h>

#include <linux/smp_lock.h>

/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;

static __inline__
void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
{
        tp->send_head = skb->next;
        if (tp->send_head == (struct sk_buff *) &sk->write_queue)
                tp->send_head = NULL;
        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
        if (tp->packets_out++ == 0)
                tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
}

/* SND.NXT, if window was not shrunk.
 * If window has been shrunk, what should we make? It is not clear at all.
 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 * invalid. OK, let's make this for now:
 */
static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
{
        if (!before(tp->snd_una + tp->snd_wnd, tp->snd_nxt))
                return tp->snd_nxt;
        else
                return tp->snd_una + tp->snd_wnd;
}

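/* Editor's note (worked example, not in the original source): with
 * snd_una = 100, snd_wnd = 50 and snd_nxt = 170, the window has shrunk
 * to the left of snd_nxt, so the helper above falls back to
 * snd_una + snd_wnd = 150 as the sequence number used for bare ACKs
 * and RSTs (see tcp_send_ack() and tcp_send_active_reset() below).
 */
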
/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MSS.
 * 4. We do not make 3, we advertise MSS, calculated from first
 *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
 *    This may be overridden via information stored in routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
static __u16 tcp_advertise_mss(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct dst_entry *dst = __sk_dst_get(sk);
        int mss = tp->advmss;

        if (dst && dst->advmss < mss) {
                mss = dst->advmss;
                tp->advmss = mss;
        }

        return (__u16)mss;
}

/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
 * This is the first part of cwnd validation mechanism. */
static void tcp_cwnd_restart(struct tcp_opt *tp)
{
        s32 delta = tcp_time_stamp - tp->lsndtime;
        u32 restart_cwnd = tcp_init_cwnd(tp);
        u32 cwnd = tp->snd_cwnd;

        tp->snd_ssthresh = tcp_current_ssthresh(tp);
        restart_cwnd = min(restart_cwnd, cwnd);

        while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
                cwnd >>= 1;
        tp->snd_cwnd = max(cwnd, restart_cwnd);
        tp->snd_cwnd_stamp = tcp_time_stamp;
        tp->snd_cwnd_used = 0;
}

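/* Editor's note (worked example, not in the original source): with
 * rto = 200 ms and an idle period of 650 ms, the loop above halves cwnd
 * three times (the fourth subtraction drives delta below zero), so a
 * cwnd of 32 decays to 4, but never below the restart window returned
 * by tcp_init_cwnd().
 */
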
static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb)
{
        u32 now = tcp_time_stamp;

        if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
                tcp_cwnd_restart(tp);

        tp->lsndtime = now;

        /* If it is a reply for ato after last received
         * packet, enter pingpong mode.
         */
        if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
                tp->ack.pingpong = 1;
}

static __inline__ void tcp_event_ack_sent(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        tcp_dec_quickack_mode(tp);
        tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
}

/* Choose a new window to advertise, update state in tcp_opt for the
 * socket, and return result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
static __inline__ u16 tcp_select_window(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        u32 cur_win = tcp_receive_window(tp);
        u32 new_win = __tcp_select_window(sk);

        /* Never shrink the offered window */
        if (new_win < cur_win) {
                /* Danger Will Robinson!
                 * Don't update rcv_wup/rcv_wnd here or else
                 * we will not be able to advertise a zero
                 * window in time.  --DaveM
                 *
                 * Relax Will Robinson.
                 */
                new_win = cur_win;
        }
        tp->rcv_wnd = new_win;
        tp->rcv_wup = tp->rcv_nxt;

        /* RFC1323 scaling applied */
        new_win >>= tp->rcv_wscale;

#ifdef TCP_FORMAL_WINDOW
        if (new_win == 0) {
                /* If we advertise zero window, disable fast path. */
                tp->pred_flags = 0;
        } else if (cur_win == 0 && tp->pred_flags == 0 &&
                   skb_queue_len(&tp->out_of_order_queue) == 0 &&
                   !tp->urg_data) {
                /* If we open a zero window, enable fast path.
                 * Without this it would only be opened by the first data
                 * packet, which is too late to merge checksumming into
                 * the copy.
                 */
                tcp_fast_path_on(tp);
        }
#endif

        return new_win;
}

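/* Editor's note (illustration, not in the original source): with
 * rcv_wscale = 2, an internal window of 64000 bytes goes on the wire as
 * 64000 >> 2 = 16000 and is multiplied back by 4 at the receiver; SYN
 * and SYN/ACK segments bypass this helper because their window field is
 * never scaled (see tcp_transmit_skb() below).
 */
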
/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
        if (skb != NULL) {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
                int tcp_header_size = tp->tcp_header_len;
                struct tcphdr *th;
                int sysctl_flags;
                int err;

#define SYSCTL_FLAG_TSTAMPS     0x1
#define SYSCTL_FLAG_WSCALE      0x2
#define SYSCTL_FLAG_SACK        0x4

                sysctl_flags = 0;
                if (tcb->flags & TCPCB_FLAG_SYN) {
                        tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
                        if (sysctl_tcp_timestamps) {
                                tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
                                sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
                        }
                        if (sysctl_tcp_window_scaling) {
                                tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
                                sysctl_flags |= SYSCTL_FLAG_WSCALE;
                        }
                        if (sysctl_tcp_sack) {
                                sysctl_flags |= SYSCTL_FLAG_SACK;
                                if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
                                        tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
                        }
                } else if (tp->eff_sacks) {
                        /* A SACK is 2 pad bytes, a 2 byte header, plus
                         * 2 32-bit sequence numbers for each SACK block.
                         */
                        tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
                                            (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
                }
                th = (struct tcphdr *) skb_push(skb, tcp_header_size);
                skb->h.th = th;
                skb_set_owner_w(skb, sk);

                /* Build TCP header and checksum it. */
                th->source = sk->sport;
                th->dest = sk->dport;
                th->seq = htonl(tcb->seq);
                th->ack_seq = htonl(tp->rcv_nxt);
                *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
                if (tcb->flags & TCPCB_FLAG_SYN) {
                        /* RFC1323: The window in SYN & SYN/ACK segments
                         * is never scaled.
                         */
                        th->window = htons(tp->rcv_wnd);
                } else {
                        th->window = htons(tcp_select_window(sk));
                }
                th->check = 0;
                th->urg_ptr = 0;

                if (tp->urg_mode &&
                    between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF)) {
                        th->urg_ptr = htons(tp->snd_up - tcb->seq);
                        th->urg = 1;
                }

                if (tcb->flags & TCPCB_FLAG_SYN) {
                        tcp_syn_build_options((__u32 *)(th + 1),
                                              tcp_advertise_mss(sk),
                                              (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
                                              (sysctl_flags & SYSCTL_FLAG_SACK),
                                              (sysctl_flags & SYSCTL_FLAG_WSCALE),
                                              tp->rcv_wscale,
                                              tcb->when,
                                              tp->ts_recent);
                } else {
                        tcp_build_and_update_options((__u32 *)(th + 1),
                                                     tp, tcb->when);

                        TCP_ECN_send(sk, tp, skb, tcp_header_size);
                }
                tp->af_specific->send_check(sk, th, skb->len, skb);

                if (tcb->flags & TCPCB_FLAG_ACK)
                        tcp_event_ack_sent(sk);

                if (skb->len != tcp_header_size)
                        tcp_event_data_sent(tp, skb);

                TCP_INC_STATS(TcpOutSegs);

                err = tp->af_specific->queue_xmit(skb);
                if (err <= 0)
                        return err;

                tcp_enter_cwr(tp);

                /* NET_XMIT_CN is special. It does not guarantee that this
                 * packet is lost. It tells us that the device is about to
                 * start dropping packets, or already drops some packets of
                 * the same priority, and invokes us to send less
                 * aggressively.
                 */
                return err == NET_XMIT_CN ? 0 : err;
        }
        return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}

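/* Editor's note (illustration, not in the original source): the raw
 * 16-bit store on "(((__u16 *)th) + 6)" above fills the data-offset and
 * flags bytes in one write. For a 32-byte header carrying only an ACK,
 * tcp_header_size >> 2 = 8 and TCPCB_FLAG_ACK = 0x10, so the stored
 * value is htons(0x8010); the TCPCB_FLAG_* constants are assumed to
 * line up with the on-wire TCP flag bits.
 */
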
/* This is the main buffer sending routine. We queue the buffer
 * and decide whether to queue or transmit now.
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        /* Advance write_seq and place onto the write_queue. */
        tp->write_seq = TCP_SKB_CB(skb)->end_seq;
        __skb_queue_tail(&sk->write_queue, skb);
        tcp_charge_skb(sk, skb);

        if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, tp->nonagle)) {
                /* Send it out now. */
                TCP_SKB_CB(skb)->when = tcp_time_stamp;
                if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                        tcp_minshall_update(tp, cur_mss, skb);
                        if (tp->packets_out++ == 0)
                                tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
                        return;
                }
        }
        /* Queue it, remembering where we must start sending. */
        if (tp->send_head == NULL)
                tp->send_head = skb;
}

/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        struct sk_buff *buff;
        int nsize = skb->len - len;
        u16 flags;

        /* Get a new skb... force flag on. */
        buff = tcp_alloc_skb(sk, nsize + MAX_TCP_HEADER, GFP_ATOMIC);
        if (buff == NULL)
                return -ENOMEM; /* We'll just try again later. */
        tcp_charge_skb(sk, buff);

        /* Reserve space for headers. */
        skb_reserve(buff, MAX_TCP_HEADER);

        /* Correct the sequence numbers. */
        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
        TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;

        /* PSH and FIN should only be set in the second packet. */
        flags = TCP_SKB_CB(skb)->flags;
        TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
        TCP_SKB_CB(buff)->flags = flags;
        TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked & (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL);
        if (TCP_SKB_CB(buff)->sacked & TCPCB_LOST) {
                tp->lost_out++;
                tp->left_out++;
        }
        TCP_SKB_CB(buff)->sacked &= ~TCPCB_AT_TAIL;

        /* Copy and checksum data tail into the new buffer. */
        buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
                                               nsize, 0);

        /* This takes care of the FIN sequence number too. */
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
        skb_trim(skb, len);

        /* Rechecksum original buffer. */
        skb->csum = csum_partial(skb->data, skb->len, 0);

        /* Looks stupid, but our code really uses when of
         * skbs, which it never sent before. --ANK
         */
        TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;

        /* Link BUFF into the send queue. */
        __skb_append(skb, buff);

        return 0;
}

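/* Editor's note (worked example, not in the original source): if an skb
 * spans sequence numbers 1000..3920 (2920 bytes) and tcp_fragment() is
 * called with len = 1460, the original skb is trimmed to 1000..2460 and
 * the new buff covers 2460..3920, with any PSH/FIN flag carried only by
 * the second half.
 */
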
/* This function synchronizes snd mss to the current pmtu/exthdr set.

   tp->user_mss is the mss set by the user via TCP_MAXSEG. It does NOT
   account for TCP options, but includes only the bare TCP header.

   tp->mss_clamp is the mss negotiated at connection setup.
   It is the minimum of user_mss and the mss received with SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is the last pmtu seen by this function.

   tp->mss_cache is the current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account the current pmtu, but never exceeds
   tp->mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
   this function.                       --ANK (980731)
 */

int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        int mss_now;

        /* Calculate base mss without TCP options:
           It is MMS_S - sizeof(tcphdr) of rfc1122
         */

        mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

        /* Clamp it (mss_clamp does not include tcp options) */
        if (mss_now > tp->mss_clamp)
                mss_now = tp->mss_clamp;

        /* Now subtract optional transport overhead */
        mss_now -= tp->ext_header_len;

        /* Then reserve room for full set of TCP options and 8 bytes of data */
        if (mss_now < 48)
                mss_now = 48;

        /* Now subtract TCP options size, not including SACKs */
        mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

        /* Bound mss with half of window */
        if (tp->max_window && mss_now > (tp->max_window >> 1))
                mss_now = max((tp->max_window >> 1), 68 - tp->tcp_header_len);

        /* And store cached results */
        tp->pmtu_cookie = pmtu;
        tp->mss_cache = mss_now;
        return mss_now;
}

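/* Editor's note (worked example, not in the original source): for an
 * IPv4 path with pmtu = 1500 and timestamps enabled, mss_now starts at
 * 1500 - 20 - 20 = 1460; subtracting the 12 bytes of aligned timestamp
 * option (tcp_header_len - sizeof(struct tcphdr)) leaves an effective
 * sending mss of 1448, which is what tp->mss_cache ends up holding.
 */
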
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * Returns 1, if no segments are in flight and we have queued segments, but
 * cannot send anything now because of SWS or another problem.
 */
int tcp_write_xmit(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        unsigned int mss_now;

        /* If we are closed, the bytes will have to remain here.
         * In time closedown will finish, we empty the write queue and all
         * will be happy.
         */
        if (sk->state != TCP_CLOSE) {
                struct sk_buff *skb;
                int sent_pkts = 0;

                /* Account for SACKS, we may need to fragment due to this.
                 * It is just like the real MSS changing on us midstream.
                 * We also handle things correctly when the user adds some
                 * IP options mid-stream.  Silly to do, but cover it.
                 */
                mss_now = tcp_current_mss(sk);

                while ((skb = tp->send_head) &&
                       tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb) ? tp->nonagle : 1)) {
                        if (skb->len > mss_now) {
                                if (tcp_fragment(sk, skb, mss_now))
                                        break;
                        }

                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
                                break;
                        /* Advance the send_head.  This one is sent out. */
                        update_send_head(sk, tp, skb);
                        tcp_minshall_update(tp, mss_now, skb);
                        sent_pkts = 1;
                }

                if (sent_pkts) {
                        tcp_cwnd_validate(sk, tp);
                        return 0;
                }

                return !tp->packets_out && tp->send_head;
        }
        return 0;
}

/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria. The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *      If the free space is less than the 1/4 of the maximum
 *      space available and the free space is less than 1/2 mss,
 *      then set the window to 0.
 *      [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 *      Otherwise, just prevent the window from shrinking
 *      and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue. It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        /* MSS for the peer's data.  Previous versions used mss_clamp
         * here.  I don't know if the value based on our guesses
         * of peer's MSS is better for the performance.  It's more correct
         * but may be worse for the performance because of rcv_mss
         * fluctuations.  --SAW  1998/11/1
         */
        unsigned int mss = tp->ack.rcv_mss;
        int free_space;
        u32 window;

        /* Sometimes free_space can be < 0. */
        free_space = tcp_space(sk);
        if (tp->window_clamp < mss)
                mss = tp->window_clamp;

        if (free_space < (int)min(tp->window_clamp, tcp_full_space(sk)) / 2) {
                tp->ack.quick = 0;

                if (tcp_memory_pressure)
                        tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4*tp->advmss);

                if (free_space < ((int)mss))
                        return 0;
        }

        if (free_space > tp->rcv_ssthresh)
                free_space = tp->rcv_ssthresh;

        /* Get the largest window that is a nice multiple of mss.
         * Window clamp already applied above.
         * If our current window offering is within 1 mss of the
         * free space we just keep it. This prevents the divide
         * and multiply from happening most of the time.
         * We also don't do any window rounding when the free space
         * is too small.
         */
        window = tp->rcv_wnd;
        if ((((int) window) <= (free_space - ((int) mss))) ||
            (((int) window) > free_space))
                window = (((unsigned int) free_space) / mss) * mss;

        return window;
}

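/* Editor's note (worked example, not in the original source): with
 * mss = 1460, free_space = 10000 and a previously offered rcv_wnd of
 * 11680, the "window > free_space" branch above fires and the value
 * returned here is rounded down to (10000 / 1460) * 1460 = 8760, i.e.
 * six full segments (the caller still refuses to shrink the window it
 * has already advertised).
 */
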
/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        struct sk_buff *next_skb = skb->next;

        /* The first test we must make is that neither of these two
         * SKB's are still referenced by someone else.
         */
        if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
                int skb_size = skb->len, next_skb_size = next_skb->len;
                u16 flags = TCP_SKB_CB(skb)->flags;

                /* Also punt if next skb has been SACK'd. */
                if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
                        return;

                /* Next skb is out of window. */
                if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una + tp->snd_wnd))
                        return;

                /* Punt if not enough space exists in the first SKB for
                 * the data in the second, or the total combined payload
                 * would exceed the MSS.
                 */
                if ((next_skb_size > skb_tailroom(skb)) ||
                    ((skb_size + next_skb_size) > mss_now))
                        return;

                /* Ok.  We will be able to collapse the packet. */
                __skb_unlink(next_skb, next_skb->list);

                if (skb->len % 4) {
                        /* Must copy and rechecksum all data. */
                        memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
                        skb->csum = csum_partial(skb->data, skb->len, 0);
                } else {
                        /* Optimize, actually we could also combine next_skb->csum
                         * to skb->csum using a single add w/carry operation too.
                         */
                        skb->csum = csum_partial_copy_nocheck(next_skb->data,
                                                              skb_put(skb, next_skb_size),
                                                              next_skb_size, skb->csum);
                }

                /* Update sequence range on original skb. */
                TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

                /* Merge over control information. */
                flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
                TCP_SKB_CB(skb)->flags = flags;

                /* All done, get rid of second SKB and account for it so
                 * packet counting does not break.
                 */
                TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & (TCPCB_EVER_RETRANS | TCPCB_AT_TAIL);
                if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_RETRANS)
                        tp->retrans_out--;
                if (TCP_SKB_CB(next_skb)->sacked & TCPCB_LOST) {
                        tp->lost_out--;
                        tp->left_out--;
                }
                if (!tp->sack_ok && tp->sacked_out) {
                        /* Reno case is special. Sigh... */
                        tp->sacked_out--;
                        tp->left_out--;
                }

                /* Not quite right: it can be > snd.fack, but
                 * it is better to underestimate fackets.
                 */
                if (tp->fackets_out)
                        tp->fackets_out--;
                tcp_free_skb(sk, next_skb);
                tp->packets_out--;
        }
}

/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer. This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;
        unsigned int mss = tcp_current_mss(sk);
        int lost = 0;

        for_retrans_queue(skb, sk, tp) {
                if (skb->len > mss &&
                    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
                        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
                                TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
                                tp->retrans_out--;
                        }
                        if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) {
                                TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                                tp->lost_out++;
                                lost = 1;
                        }
                }
        }

        if (!lost)
                return;

        tp->left_out = tp->sacked_out + tp->lost_out;

        /* Don't muck with the congestion window here.
         * Reason is that we do not increase amount of _data_
         * in network, but units changed and effective
         * cwnd/ssthresh really reduced now.
         */
        if (tp->ca_state != TCP_CA_Loss) {
                tp->high_seq = tp->snd_nxt;
                tp->snd_ssthresh = tcp_current_ssthresh(tp);
                tp->prior_ssthresh = 0;
                tp->undo_marker = 0;
                tp->ca_state = TCP_CA_Loss;
        }
        tcp_xmit_retransmit_queue(sk);
}

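/* Editor's note (illustration, not in the original source): after a PMTU
 * drop from 1500 to 1006, every queued segment longer than the new mss is
 * tagged TCPCB_LOST above and re-sent in smaller pieces via
 * tcp_xmit_retransmit_queue()/tcp_retransmit_skb(), without the
 * exponential backoff the retransmit timer would apply.
 */
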
/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        unsigned int cur_mss = tcp_current_mss(sk);
        int err;

        /* Do not send more than we queued. 1/4 is reserved for possible
         * copying overhead: fragmentation, tunneling, mangling etc.
         */
        if (atomic_read(&sk->wmem_alloc) > min(sk->wmem_queued + (sk->wmem_queued >> 2), sk->sndbuf))
                return -EAGAIN;

        if (skb->len > cur_mss) {
                if (tcp_fragment(sk, skb, cur_mss))
                        return -ENOMEM; /* We'll try again later. */

                /* New SKB created, account for it. */
                tp->packets_out++;
        }

        /* Collapse two adjacent packets if worthwhile and we can. */
        if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
            (skb->len < (cur_mss >> 1)) &&
            (skb->next != tp->send_head) &&
            (skb->next != (struct sk_buff *) &sk->write_queue) &&
            (sysctl_tcp_retrans_collapse != 0))
                tcp_retrans_try_collapse(sk, skb, cur_mss);

        if (tp->af_specific->rebuild_header(sk))
                return -EHOSTUNREACH; /* Routing failure or similar. */

        /* Some Solaris stacks overoptimize and ignore the FIN on a
         * retransmit when old data is attached.  So strip it off
         * since it is cheap to do so and saves bytes on the network.
         */
        if (skb->len > 0 &&
            (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
            tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
                TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
                skb_trim(skb, 0);
                skb->csum = 0;
        }

        /* Make a copy, if the first transmission SKB clone we made
         * is still in somebody's hands, else make a clone.
         */
        TCP_SKB_CB(skb)->when = tcp_time_stamp;

        err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
                                    skb_copy(skb, GFP_ATOMIC) :
                                    skb_clone(skb, GFP_ATOMIC)));

        if (err == 0) {
                /* Update global TCP statistics. */
                TCP_INC_STATS(TcpRetransSegs);

#if FASTRETRANS_DEBUG > 0
                if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
                        if (net_ratelimit())
                                printk(KERN_DEBUG "retrans_out leaked.\n");
                }
#endif
                TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
                tp->retrans_out++;

                /* Save stamp of the first retransmit. */
                if (!tp->retrans_stamp)
                        tp->retrans_stamp = TCP_SKB_CB(skb)->when;

                tp->undo_retrans++;

                /* snd_nxt is stored to detect loss of retransmitted segment,
                 * see tcp_input.c tcp_sacktag_write_queue().
                 */
                TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
        }
        return err;
}

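/* Editor's note (illustration, not in the original source): the -EAGAIN
 * test at the top of tcp_retransmit_skb() allows roughly 25% slack for
 * clones and copies; with wmem_queued = 65536 and a larger sndbuf,
 * retransmits are deferred once wmem_alloc exceeds 81920 bytes.
 */
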
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;
        int packet_cnt = tp->lost_out;

        /* First pass: retransmit lost packets. */
        if (packet_cnt) {
                for_retrans_queue(skb, sk, tp) {
                        __u8 sacked = TCP_SKB_CB(skb)->sacked;

                        if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
                                return;

                        if (sacked & TCPCB_LOST) {
                                if (!(sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS))) {
                                        if (tcp_retransmit_skb(sk, skb))
                                                return;
                                        if (tp->ca_state != TCP_CA_Loss)
                                                NET_INC_STATS_BH(TCPFastRetrans);
                                        else
                                                NET_INC_STATS_BH(TCPSlowStartRetrans);

                                        if (skb == skb_peek(&sk->write_queue))
                                                tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
                                }

                                if (--packet_cnt <= 0)
                                        break;
                        }
                }
        }

        /* OK, demanded retransmission is finished. */

        /* Forward retransmissions are possible only during Recovery. */
        if (tp->ca_state != TCP_CA_Recovery)
                return;

        /* No forward retransmissions in Reno are possible. */
        if (!tp->sack_ok)
                return;

        /* Yeah, we have to make a difficult choice between forward
         * transmission and retransmission... Both ways have their merits...
         *
         * For now we do not retransmit anything, while we have some new
         * segments to send.
         */

        if (tcp_may_send_now(sk, tp))
                return;

        packet_cnt = 0;

        for_retrans_queue(skb, sk, tp) {
                if (++packet_cnt > tp->fackets_out)
                        break;

                if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
                        break;

                if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
                        continue;

                /* Ok, retransmit it. */
                if (tcp_retransmit_skb(sk, skb))
                        break;

                if (skb == skb_peek(&sk->write_queue))
                        tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);

                NET_INC_STATS_BH(TCPForwardRetrans);
        }
}

/* Send a fin.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
        unsigned int mss_now;

        /* Optimization, tack on the FIN if we have a queue of
         * unsent frames.  But be careful about outgoing SACKS
         * and IP options.
         */
        mss_now = tcp_current_mss(sk);

        /* Please, find the seven differences from 2.3.33 and look at
         * what I broke here. 8) --ANK
         */

        if (tp->send_head != NULL) {
                /* tcp_write_xmit() takes care of the rest. */
                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
                TCP_SKB_CB(skb)->end_seq++;
                tp->write_seq++;

                /* Special case to avoid Nagle bogosity.  If this
                 * segment is the last segment, and it was queued
                 * due to Nagle/SWS-avoidance, send it out now.
                 */
                if (tp->send_head == skb &&
                    !after(tp->write_seq, tp->snd_una + tp->snd_wnd)) {
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        if (!tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)))
                                update_send_head(sk, tp, skb);
                        else
                                tcp_check_probe_timer(sk, tp);
                }
        } else {
                /* Socket is locked, keep trying until memory is available. */
                for (;;) {
                        skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
                        if (skb)
                                break;
                        current->policy |= SCHED_YIELD;
                        schedule();
                }

                /* Reserve space for headers and prepare control bits. */
                skb_reserve(skb, MAX_TCP_HEADER);
                skb->csum = 0;
                TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
                TCP_SKB_CB(skb)->sacked = 0;

                /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
                TCP_SKB_CB(skb)->seq = tp->write_seq;
                TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
                tcp_send_skb(sk, skb, 0, mss_now);
                __tcp_push_pending_frames(sk, tp, mss_now, 1);
        }
}

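/* Editor's note (illustration, not in the original source): both branches
 * of tcp_send_fin() advance write_seq by exactly one because the FIN
 * consumes one unit of sequence space; e.g. with write_seq = 5000 the FIN
 * goes out as seq 5000, end_seq 5001, and the peer's ACK must reach 5001.
 */
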
/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk, int priority)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;

        /* NOTE: No TCP options attached and we never retransmit this. */
        skb = alloc_skb(MAX_TCP_HEADER, priority);
        if (!skb) {
                NET_INC_STATS(TCPAbortFailed);
                return;
        }

        /* Reserve space for headers and prepare control bits. */
        skb_reserve(skb, MAX_TCP_HEADER);
        skb->csum = 0;
        TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
        TCP_SKB_CB(skb)->sacked = 0;

        /* Send it off. */
        TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        if (tcp_transmit_skb(sk, skb))
                NET_INC_STATS(TCPAbortFailed);
}

/* WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called. If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
        struct sk_buff *skb;

        skb = skb_peek(&sk->write_queue);
        if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) {
                printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
                return -EFAULT;
        }
        if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) {
                if (skb_cloned(skb)) {
                        struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
                        if (nskb == NULL)
                                return -ENOMEM;
                        __skb_unlink(skb, &sk->write_queue);
                        __skb_queue_head(&sk->write_queue, nskb);
                        tcp_free_skb(sk, skb);
                        tcp_charge_skb(sk, nskb);
                        skb = nskb;
                }

                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
                TCP_ECN_send_synack(&sk->tp_pinfo.af_tcp, skb);
        }
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
}

/*
 * Prepare a SYN-ACK.
 */
struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
                                struct open_request *req)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct tcphdr *th;
        int tcp_header_size;
        struct sk_buff *skb;

        skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
        if (skb == NULL)
                return NULL;

        /* Reserve space for headers. */
        skb_reserve(skb, MAX_TCP_HEADER);

        skb->dst = dst_clone(dst);

        tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
                           (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
                           (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
                           /* SACK_PERM is in the place of NOP NOP of TS */
                           ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
        skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);

        memset(th, 0, sizeof(struct tcphdr));
        th->syn = 1;
        th->ack = 1;
        TCP_ECN_make_synack(req, th);
        th->source = sk->sport;
        th->dest = req->rmt_port;
        TCP_SKB_CB(skb)->seq = req->snt_isn;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
        th->seq = htonl(TCP_SKB_CB(skb)->seq);
        th->ack_seq = htonl(req->rcv_isn + 1);
        if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
                __u8 rcv_wscale;
                /* Set this up on the first call only */
                req->window_clamp = tp->window_clamp ? : dst->window;
                /* tcp_full_space because it is guaranteed to be the first packet */
                tcp_select_initial_window(tcp_full_space(sk),
                        dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
                        &req->rcv_wnd,
                        &req->window_clamp,
                        req->wscale_ok,
                        &rcv_wscale);
                req->rcv_wscale = rcv_wscale;
        }

        /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
        th->window = htons(req->rcv_wnd);

        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok,
                              req->sack_ok, req->wscale_ok, req->rcv_wscale,
                              TCP_SKB_CB(skb)->when,
                              req->ts_recent);

        skb->csum = 0;
        th->doff = (tcp_header_size >> 2);
        TCP_INC_STATS(TcpOutSegs);
        return skb;
}

int tcp_connect(struct sock *sk, struct sk_buff *buff)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        /* Reserve space for headers. */
        skb_reserve(buff, MAX_TCP_HEADER);

        /* We'll fix this up when we get a response from the other end.
         * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
         */
        tp->tcp_header_len = sizeof(struct tcphdr) +
                (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

        /* If user gave his TCP_MAXSEG, record it to clamp */
        if (tp->user_mss)
                tp->mss_clamp = tp->user_mss;
        tp->max_window = 0;
        tcp_sync_mss(sk, dst->pmtu);

        if (!tp->window_clamp)
                tp->window_clamp = dst->window;
        tp->advmss = dst->advmss;
        tcp_initialize_rcv_mss(sk);

        tcp_select_initial_window(tcp_full_space(sk),
                tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
                &tp->rcv_wnd,
                &tp->window_clamp,
                sysctl_tcp_window_scaling,
                &tp->rcv_wscale);

        tp->rcv_ssthresh = tp->rcv_wnd;

        /* Socket identity change complete, no longer
         * in TCP_CLOSE, so enter ourselves into the
         * hash tables.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        if (tp->af_specific->hash_connecting(sk))
                goto err_out;

        sk->err = 0;
        sk->done = 0;
        tp->snd_wnd = 0;
        tcp_init_wl(tp, tp->write_seq, 0);
        tp->snd_una = tp->write_seq;
        tp->snd_sml = tp->write_seq;
        tp->rcv_nxt = 0;
        tp->rcv_wup = 0;
        tp->copied_seq = 0;

        tp->rto = TCP_TIMEOUT_INIT;
        tp->retransmits = 0;
        tcp_clear_retrans(tp);

        TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
        TCP_ECN_send_syn(tp, buff);
        TCP_SKB_CB(buff)->sacked = 0;
        buff->csum = 0;
        TCP_SKB_CB(buff)->seq = tp->write_seq++;
        TCP_SKB_CB(buff)->end_seq = tp->write_seq;
        tp->snd_nxt = tp->write_seq;
        tp->pushed_seq = tp->write_seq;

        /* Send it off. */
        TCP_SKB_CB(buff)->when = tcp_time_stamp;
        tp->retrans_stamp = TCP_SKB_CB(buff)->when;
        __skb_queue_tail(&sk->write_queue, buff);
        tcp_charge_skb(sk, buff);
        tp->packets_out++;
        tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
        TCP_INC_STATS(TcpActiveOpens);

        /* Timer for repeating the SYN until an answer. */
        tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
        return 0;

err_out:
        tcp_set_state(sk, TCP_CLOSE);
        kfree_skb(buff);
        return -EADDRNOTAVAIL;
}

/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        int ato = tp->ack.ato;
        unsigned long timeout;

        if (ato > TCP_DELACK_MIN) {
                int max_ato = HZ / 2;

                if (tp->ack.pingpong || (tp->ack.pending & TCP_ACK_PUSHED))
                        max_ato = TCP_DELACK_MAX;

                /* Slow path, intersegment interval is "high". */

                /* If some rtt estimate is known, use it to bound delayed ack.
                 * Do not use tp->rto here, use results of rtt measurements
                 * directly.
                 */
                if (tp->srtt) {
                        int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);

                        if (rtt < max_ato)
                                max_ato = rtt;
                }

                ato = min(ato, max_ato);
        }

        /* Stay within the limit we were given */
        timeout = jiffies + ato;

        /* Use the new timeout only if there wasn't an older one already. */
        if (tp->ack.pending & TCP_ACK_TIMER) {
                /* If delack timer was blocked or is about to expire,
                 * send ACK now.
                 */
                if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies + (ato >> 2))) {
                        tcp_send_ack(sk);
                        return;
                }

                if (!time_before(timeout, tp->ack.timeout))
                        timeout = tp->ack.timeout;
        }
        tp->ack.pending |= TCP_ACK_SCHED | TCP_ACK_TIMER;
        tp->ack.timeout = timeout;
        if (!mod_timer(&tp->delack_timer, timeout))
                sock_hold(sk);

#ifdef TCP_FORMAL_WINDOW
        /* Explanation. Header prediction path does not handle
         * the case of a zero window. If we send an ACK immediately,
         * pred_flags are reset when sending the ACK. If rcv_nxt is
         * advanced and the ack is not sent, then a delayed ack is
         * scheduled. Hence, this is the best place to check for a
         * zero window.
         */
        if (tp->pred_flags) {
                if (tcp_receive_window(tp) == 0)
                        tp->pred_flags = 0;
        } else {
                if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
                    !tp->urg_data)
                        tcp_fast_path_on(tp);
        }
#endif
}

/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
        /* If we have been reset, we may not send again. */
        if (sk->state != TCP_CLOSE) {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                struct sk_buff *buff;

                /* We are not putting this on the write queue, so
                 * tcp_transmit_skb() will set the ownership to this
                 * sock.
                 */
                buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
                if (buff == NULL) {
                        tcp_schedule_ack(tp);
                        tp->ack.ato = TCP_ATO_MIN;
                        tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
                        return;
                }

                /* Reserve space for headers and prepare control bits. */
                skb_reserve(buff, MAX_TCP_HEADER);
                buff->csum = 0;
                TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
                TCP_SKB_CB(buff)->sacked = 0;

                /* Send it off, this clears delayed acks for us. */
                TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
                TCP_SKB_CB(buff)->when = tcp_time_stamp;
                tcp_transmit_skb(sk, buff);
        }
}

/* This routine sends a packet with an out of date sequence
 * number. It assumes the other end will try to ack it.
 *
 * Question: what should we do while in urgent mode?
 * 4.4BSD forces sending a single byte of data. We cannot send
 * out of window data, because we have SND.NXT==SND.MAX...
 *
 * Current solution: send TWO zero-length segments in urgent mode:
 * one with SEG.SEQ=SND.UNA to deliver the urgent pointer, another
 * out-of-date with SND.UNA-1 to probe the window.
 */
static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;

        /* We don't queue it, tcp_transmit_skb() sets ownership. */
        skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
        if (skb == NULL)
                return -1;

        /* Reserve space for headers and set control bits. */
        skb_reserve(skb, MAX_TCP_HEADER);
        skb->csum = 0;
        TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
        TCP_SKB_CB(skb)->sacked = urgent;

        /* Use a previous sequence.  This should cause the other
         * end to send an ack.  Don't queue or clone SKB, just
         * send it.
         */
        TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        return tcp_transmit_skb(sk, skb);
}

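/* Editor's note (illustration, not in the original source): with
 * snd_una = 4000 and urgent mode active, tcp_write_wakeup() below emits
 * two such probes: one at seq 4000 to re-deliver the urgent pointer and
 * one at seq 3999, whose out-of-date sequence forces the peer to ACK and
 * thereby report its current window.
 */
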
int tcp_write_wakeup(struct sock *sk)
{
        if (sk->state != TCP_CLOSE) {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                struct sk_buff *skb;

                if ((skb = tp->send_head) != NULL &&
                    before(TCP_SKB_CB(skb)->seq, tp->snd_una + tp->snd_wnd)) {
                        int err;
                        int mss = tcp_current_mss(sk);
                        int seg_size = tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq;

                        if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
                                tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

                        /* We are probing the opening of a window
                         * but the window size is != 0; this must have
                         * been the result of sender-side SWS avoidance.
                         */
                        if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
                            skb->len > mss) {
                                seg_size = min(seg_size, mss);
                                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
                                if (tcp_fragment(sk, skb, seg_size))
                                        return -1;
                        }
                        TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
                        if (!err) {
                                update_send_head(sk, tp, skb);
                        }
                        return err;
                } else {
                        if (tp->urg_mode &&
                            between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
                                tcp_xmit_probe_skb(sk, TCPCB_URG);
                        return tcp_xmit_probe_skb(sk, 0);
                }
        }
        return -1;
}

/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        int err;

        err = tcp_write_wakeup(sk);

        if (tp->packets_out || !tp->send_head) {
                /* Cancel probe timer, if it is not required. */
                tp->probes_out = 0;
                tp->backoff = 0;
                return;
        }

        if (err <= 0) {
                tp->backoff++;
                tp->probes_out++;
                tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
                                     min(tp->rto << tp->backoff, TCP_RTO_MAX));
        } else {
                /* If the packet was not sent due to local congestion,
                 * do not back off and do not remember probes_out.
                 * Let local senders fight for local resources.
                 *
                 * Still use the accumulated backoff, though.
                 */
                if (!tp->probes_out)
                        tp->probes_out = 1;
                tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
                                     min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
        }
}