/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_output.c,v 1.94 1998/09/15 02:11:36 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */
/*
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 */
#include <net/tcp.h>

extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_sack;

/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;
/* Get rid of any delayed acks, we sent one already.. */
static __inline__ void clear_delayed_acks(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	tp->delayed_acks = 0;
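	/* Illustrative arithmetic: the quick-ack timeout set below,
	 * ((HZ/100)*2), is 2 jiffies when HZ == 100, i.e. roughly 20ms,
	 * and stays near 20ms for other HZ values as well.
	 */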
	if (tcp_in_quickack_mode(tp))
		tp->ato = ((HZ/100)*2);
	tcp_clear_xmit_timer(sk, TIME_DACK);
}
static __inline__ void update_send_head(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	tp->send_head = tp->send_head->next;
	if (tp->send_head == (struct sk_buff *) &sk->write_queue)
		tp->send_head = NULL;
}
/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
	if (skb != NULL) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
		int tcp_header_size = tp->tcp_header_len;
		struct tcphdr *th;

		if (tcb->flags & TCPCB_FLAG_SYN) {
			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
			if (sysctl_tcp_timestamps)
				tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
			if (sysctl_tcp_window_scaling)
				tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
			if (sysctl_tcp_sack && !sysctl_tcp_timestamps)
				tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
		} else if (tp->sack_ok && tp->num_sacks) {
			/* A SACK is 2 pad bytes, a 2 byte header, plus
			 * 2 32-bit sequence numbers for each SACK block.
			 */
			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
					    (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
		}
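		/* Worked sizes (illustrative, assuming the usual aligned
		 * option lengths: MSS 4, timestamps 12, wscale 4,
		 * SACK-permitted 4, SACK base 4, 8 per SACK block):
		 * a SYN offering timestamps and window scaling carries
		 * 20 + 4 + 12 + 4 = 40 bytes of TCP header, and a data
		 * segment echoing two SACK blocks adds 4 + 2*8 = 20 bytes
		 * on top of tp->tcp_header_len.
		 */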
		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
		skb->h.th = th;
		skb_set_owner_w(skb, sk);

		/* Build TCP header and checksum it. */
		th->source = sk->sport;
		th->dest = sk->dport;
		th->seq = htonl(TCP_SKB_CB(skb)->seq);
		th->ack_seq = htonl(tp->rcv_nxt);
		th->doff = (tcp_header_size >> 2);
		th->res1 = 0;
		*(((__u8 *)th) + 13) = tcb->flags;
		if (!(tcb->flags & TCPCB_FLAG_SYN))
			th->window = htons(tcp_select_window(sk));
		th->check = 0;
		th->urg_ptr = ntohs(tcb->urg_ptr);
		if (tcb->flags & TCPCB_FLAG_SYN) {
			/* RFC1323: The window in SYN & SYN/ACK segments
			 * is never scaled.
			 */
			th->window = htons(tp->rcv_wnd);
			tcp_syn_build_options((__u32 *)(th + 1), tp->mss_clamp,
					      sysctl_tcp_timestamps,
					      sysctl_tcp_sack,
					      sysctl_tcp_window_scaling,
					      tp->rcv_wscale,
					      TCP_SKB_CB(skb)->when);
		} else {
			tcp_build_and_update_options((__u32 *)(th + 1),
						     tp, TCP_SKB_CB(skb)->when);
		}
		tp->af_specific->send_check(sk, th, skb->len, skb);

		clear_delayed_acks(sk);
		tp->last_ack_sent = tp->rcv_nxt;
		tcp_statistics.TcpOutSegs++;
		tp->af_specific->queue_xmit(skb);
	}
}
/* This is the main buffer sending routine.  We place the buffer on the
 * write queue and decide whether to transmit it immediately or leave it
 * queued for later.
 */
void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Advance write_seq and place onto the write_queue. */
	tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq);
	__skb_queue_tail(&sk->write_queue, skb);

	if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
		/* Send it out now. */
		TCP_SKB_CB(skb)->when = jiffies;
		tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
		tp->packets_out++;
		tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
		if (!tcp_timer_is_set(sk, TIME_RETRANS))
			tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
	} else {
		/* Queue it, remembering where we must start sending. */
		if (tp->send_head == NULL)
			tp->send_head = skb;
		if (!force_queue && tp->packets_out == 0 && !tp->pending) {
			tp->pending = TIME_PROBE0;
			tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
		}
	}
}
/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
	struct sk_buff *buff;
	int nsize = skb->len - len;
	u16 flags;

	/* Get a new skb... force flag on. */
	buff = sock_wmalloc(sk,
			    (nsize + MAX_HEADER + sk->prot->max_header),
			    1, GFP_ATOMIC);
	if (buff == NULL)
		return -1; /* We'll just try again later. */

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_HEADER + sk->prot->max_header);

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
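	/* Illustrative numbers: if the original skb covered
	 * [seq, end_seq) = [1000, 3000) (2000 bytes) and len == 1200,
	 * then nsize == 800, buff takes [2200, 3000) here and the
	 * original skb is trimmed back to [1000, 2200) below.
	 */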
	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->flags;
	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
	if (flags & TCPCB_FLAG_URG) {
		u16 old_urg_ptr = TCP_SKB_CB(skb)->urg_ptr;

		/* Urgent data is always a pain in the ass. */
		if (old_urg_ptr > len) {
			TCP_SKB_CB(skb)->flags &= ~(TCPCB_FLAG_URG);
			TCP_SKB_CB(skb)->urg_ptr = 0;
			TCP_SKB_CB(buff)->urg_ptr = old_urg_ptr - len;
		} else {
			flags &= ~(TCPCB_FLAG_URG);
		}
	}
	if (!(flags & TCPCB_FLAG_URG))
		TCP_SKB_CB(buff)->urg_ptr = 0;
	TCP_SKB_CB(buff)->flags = flags;
	TCP_SKB_CB(buff)->sacked = 0;

	/* Copy and checksum data tail into the new buffer. */
	buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize),
				       nsize, 0);

	TCP_SKB_CB(skb)->end_seq -= nsize;
	skb_trim(skb, skb->len - nsize);

	/* Rechecksum original buffer. */
	skb->csum = csum_partial(skb->data, skb->len, 0);

	/* Link BUFF into the send queue. */
	__skb_append(skb, buff);

	return 0;
}
/* This function synchronizes snd mss to the current pmtu/exthdr set.

   tp->user_mss is the mss set by the user via TCP_MAXSEG.  It does NOT
   account for TCP options; it is relative to the bare TCP header.

   tp->mss_clamp is the mss negotiated at connection setup.
   It is the minimum of user_mss and the mss received with the SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is the last pmtu seen by this function.

   tp->mss_cache is the current effective sending mss, including
   all tcp options except for SACKs.  It is evaluated,
   taking into account the current pmtu, but never exceeds
   tp->mss_clamp.

   NOTE1. rfc1122 clearly states that the advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
   this function.			--ANK (980731)
 */
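/* Illustrative figures (assuming plain IPv4 over Ethernet with no
 * extension headers): with pmtu == 1500, a 20 byte IP header and a
 * 20 byte TCP header, the base mss is 1460; if timestamps are in use
 * (tcp_header_len == 32) the cached sending mss becomes 1460 - 12 = 1448,
 * always bounded above by tp->mss_clamp.
 */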
int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int mss_now;

	/* Calculate base mss without TCP options:
	   It is MMS_S - sizeof(tcphdr) of rfc1122
	 */
	mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

	/* Clamp it (mss_clamp does not include tcp options) */
	if (mss_now > tp->mss_clamp)
		mss_now = tp->mss_clamp;

	/* Now subtract TCP options size, not including SACKs */
	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

	/* Now subtract optional transport overhead */
	mss_now -= tp->ext_header_len;

	/* If we got a too small (or even negative) value,
	   clamp it to 8 from below.  Why 8?
	   Well, it could just as well be 1, but if IP accepted
	   a segment of length 1, it would love 8 even more 8)
	   --ANK (980731)
	 */
	if (mss_now < 8)
		mss_now = 8;

	/* And store cached results */
	tp->pmtu_cookie = pmtu;
	tp->mss_cache = mss_now;
	return mss_now;
}
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 */
void tcp_write_xmit(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	unsigned int mss_now;

	/* Account for SACKs, we may need to fragment due to this.
	 * It is just like the real MSS changing on us midstream.
	 * We also handle things correctly when the user adds some
	 * IP options mid-stream.  Silly to do, but cover it.
	 */
	mss_now = tcp_current_mss(sk);

	/* If we are zapped, the bytes will have to remain here.
	 * In time closedown will empty the write queue and all
	 * will be happy.
	 */
	if (!sk->zapped) {
		struct sk_buff *skb;
		int sent_pkts = 0;

		/* Anything on the transmit queue that fits the window can
		 * be added providing we are:
		 *
		 * a) following SWS avoidance [and Nagle algorithm]
		 * b) not exceeding our congestion window.
		 * c) not retransmitting [Nagle]
		 */
		while ((skb = tp->send_head) && tcp_snd_test(sk, skb)) {
			if (skb->len > mss_now) {
				if (tcp_fragment(sk, skb, mss_now))
					break;
			}

			/* Advance the send_head.  This one is going out. */
			update_send_head(sk);
			TCP_SKB_CB(skb)->when = jiffies;
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			tp->packets_out++;
			tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
			sent_pkts = 1;
		}

		/* If we sent anything, make sure the retransmit
		 * timer is active.
		 */
		if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS))
			tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
	}
}
/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria.  The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *	If the free space is less than 1/4 of the maximum
 *	space available and the free space is less than 1/2 mss,
 *	then set the window to 0.
 *	Otherwise, just prevent the window from shrinking
 *	and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue.  It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 */
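/* Worked example (illustrative figures): with mss == 1460 and
 * free_space == 10000, a current offer that is at least one mss below
 * the free space (or above it) is rounded to (10000/1460)*1460 == 8760,
 * i.e. six full segments, instead of creeping forward byte by byte.
 */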
u32 __tcp_select_window(struct sock *sk, u32 cur_win)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	unsigned int mss = tp->mss_cache;
	int free_space;
	u32 window;

	/* Sometimes free_space can be < 0. */
	free_space = (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) / 2;
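	/* Only half of what remains in the receive buffer is treated as
	 * usable window here; the other half is assumed to be eaten by
	 * struct sk_buff and protocol overhead (a rough heuristic).
	 */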
	if (tp->window_clamp) {
		if (free_space > ((int) tp->window_clamp))
			free_space = tp->window_clamp;
		mss = min(tp->window_clamp, mss);
	} else {
		printk("tcp_select_window: tp->window_clamp == 0.\n");
	}

	if (mss < 1) {
		mss = 1;
		printk("tcp_select_window: sk->mss fell to 0.\n");
	}

	if ((free_space < (sk->rcvbuf/4)) && (free_space < ((int) (mss/2)))) {
		window = 0;
	} else {
		/* Get the largest window that is a nice multiple of mss.
		 * Window clamp already applied above.
		 * If our current window offering is within 1 mss of the
		 * free space we just keep it.  This prevents the divide
		 * and multiply from happening most of the time.
		 * We also don't do any window rounding when the free space
		 * is too small.
		 */
		window = tp->rcv_wnd;
		if ((((int) window) <= (free_space - ((int) mss))) ||
		    (((int) window) > free_space))
			window = (((unsigned int) free_space)/mss)*mss;
	}
	return window;
}
/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
	struct sk_buff *next_skb = skb->next;

	/* The first test we must make is that neither of these two
	 * SKB's are still referenced by someone else.
	 */
	if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
		int skb_size = skb->len, next_skb_size = next_skb->len;
		u16 flags = TCP_SKB_CB(skb)->flags;

		/* Punt if the first SKB has URG set. */
		if (flags & TCPCB_FLAG_URG)
			return;

		/* Also punt if next skb has been SACK'd. */
		if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
			return;

		/* Punt if not enough space exists in the first SKB for
		 * the data in the second, or the total combined payload
		 * would exceed the MSS.
		 */
		if ((next_skb_size > skb_tailroom(skb)) ||
		    ((skb_size + next_skb_size) > mss_now))
			return;

		/* Ok.  We will be able to collapse the packet. */
		__skb_unlink(next_skb, next_skb->list);

		if (skb->len % 4) {
			/* Must copy and rechecksum all data. */
			memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
			skb->csum = csum_partial(skb->data, skb->len, 0);
		} else {
			/* Optimize, actually we could also combine next_skb->csum
			 * to skb->csum using a single add w/carry operation too.
			 */
			skb->csum = csum_partial_copy(next_skb->data,
						      skb_put(skb, next_skb_size),
						      next_skb_size, skb->csum);
		}

		/* Update sequence range on original skb. */
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

		/* Merge over control information. */
		flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
		if (flags & TCPCB_FLAG_URG) {
			u16 urgptr = TCP_SKB_CB(next_skb)->urg_ptr;
			TCP_SKB_CB(skb)->urg_ptr = urgptr + skb_size;
		}
		TCP_SKB_CB(skb)->flags = flags;

		/* All done, get rid of second SKB and account for it so
		 * packet counting does not break.
		 */
		kfree_skb(next_skb);
		sk->tp_pinfo.af_tcp.packets_out--;
	}
}
/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer.  This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;
	unsigned int mss = tcp_current_mss(sk);

	/* Don't muck with the congestion window here. */
	tp->dup_acks = 0;
	tp->high_seq = tp->snd_nxt;
	tp->retrans_head = NULL;

	/* Input control flow will see that this was retransmitted
	 * and not use it for RTT calculation in the absence of
	 * the timestamp option.
	 */
	for (skb = skb_peek(&sk->write_queue);
	     ((skb != tp->send_head) &&
	      (skb != (struct sk_buff *)&sk->write_queue));
	     skb = skb->next)
		if (skb->len > mss)
			tcp_retransmit_skb(sk, skb);
}
static __inline__ void update_retrans_head(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	tp->retrans_head = tp->retrans_head->next;
	if ((tp->retrans_head == tp->send_head) ||
	    (tp->retrans_head == (struct sk_buff *) &sk->write_queue))
		tp->retrans_head = NULL;
}
/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	unsigned int cur_mss = tcp_current_mss(sk);

	if (skb->len > cur_mss) {
		if (tcp_fragment(sk, skb, cur_mss))
			return 1; /* We'll try again later. */

		/* New SKB created, account for it. */
		tp->packets_out++;
	}

	/* Collapse two adjacent packets if worthwhile and we can. */
	if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
	    (skb->len < (cur_mss >> 1)) &&
	    (skb->next != tp->send_head) &&
	    (skb->next != (struct sk_buff *)&sk->write_queue) &&
	    (sysctl_tcp_retrans_collapse != 0))
		tcp_retrans_try_collapse(sk, skb, cur_mss);

	if (tp->af_specific->rebuild_header(sk))
		return 1; /* Routing failure or similar. */

	/* Ok, we're gonna send it out, update state. */
	TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS;
	tp->retrans_out++;

	/* Make a copy, if the first transmission SKB clone we made
	 * is still in somebody's hands, else make a clone.
	 */
	TCP_SKB_CB(skb)->when = jiffies;
	if (skb_cloned(skb))
		skb = skb_copy(skb, GFP_ATOMIC);
	else
		skb = skb_clone(skb, GFP_ATOMIC);

	tcp_transmit_skb(sk, skb);

	/* Update global TCP statistics and return success. */
	sk->prot->retransmits++;
	tcp_statistics.TcpRetransSegs++;

	return 0;
}
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;

	if (tp->retrans_head == NULL)
		tp->retrans_head = skb_peek(&sk->write_queue);
	if (tp->retrans_head == tp->send_head)
		tp->retrans_head = NULL;

	/* Each time, advance the retrans_head if we got
	 * a packet out or we skipped one because it was
	 * SACK'd.  -DaveM
	 */
	while ((skb = tp->retrans_head) != NULL) {
		/* If it has been ack'd by a SACK block, we don't
		 * retransmit it.
		 */
		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
			/* Send it out, punt if error occurred. */
			if (tcp_retransmit_skb(sk, skb))
				break;

			update_retrans_head(sk);

			/* Stop retransmitting if we've hit the congestion
			 * window limit.
			 */
			if (tp->retrans_out >= tp->snd_cwnd)
				break;
		} else {
			update_retrans_head(sk);
		}
	}
}
/* Using FACK information, retransmit all missing frames at the receiver
 * up to the forward most SACK'd packet (tp->fackets_out) if the packet
 * has not been retransmitted already.
 */
void tcp_fack_retransmit(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb = skb_peek(&sk->write_queue);
	int packet_cnt = 0;

	while ((skb != NULL) &&
	       (skb != tp->send_head) &&
	       (skb != (struct sk_buff *)&sk->write_queue)) {
		__u8 sacked = TCP_SKB_CB(skb)->sacked;

		if (sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS))
			goto next_packet;

		/* Ok, retransmit it. */
		if (tcp_retransmit_skb(sk, skb))
			break;

		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
			break;
next_packet:
		packet_cnt++;
		if (packet_cnt >= tp->fackets_out)
			break;
		skb = skb->next;
	}
}
/* Send a fin.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
	unsigned int mss_now;

	/* Optimization, tack on the FIN if we have a queue of
	 * unsent frames.  But be careful about outgoing SACKS
	 * and IP options.
	 */
	mss_now = tcp_current_mss(sk);

	if ((tp->send_head != NULL) && (skb->len < mss_now)) {
		/* tcp_write_xmit() takes care of the rest. */
		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
		TCP_SKB_CB(skb)->end_seq++;
		tp->write_seq++;

		/* Special case to avoid Nagle bogosity.  If this
		 * segment is the last segment, and it was queued
		 * due to Nagle/SWS-avoidance, send it out now.
		 */
		if (tp->send_head == skb &&
		    !sk->nonagle &&
		    skb->len < (tp->mss_cache >> 1) &&
		    tp->packets_out &&
		    !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
			update_send_head(sk);
			TCP_SKB_CB(skb)->when = jiffies;
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			tp->packets_out++;
			tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
			if (!tcp_timer_is_set(sk, TIME_RETRANS))
				tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
		}
	} else {
		/* Socket is locked, keep trying until memory is available. */
		do {
			skb = sock_wmalloc(sk,
					   (MAX_HEADER +
					    sk->prot->max_header),
					   1, GFP_KERNEL);
		} while (skb == NULL);

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
		skb->csum = 0;
		TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
		TCP_SKB_CB(skb)->sacked = 0;
		TCP_SKB_CB(skb)->urg_ptr = 0;

		/* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
		TCP_SKB_CB(skb)->seq = tp->write_seq;
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
		tcp_send_skb(sk, skb, 0);
	}
}
/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;

	/* NOTE: No TCP options attached and we never retransmit this. */
	do {
		skb = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_KERNEL);
	} while (skb == NULL);

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->urg_ptr = 0;

	/* Send it off. */
	TCP_SKB_CB(skb)->seq = tp->write_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = jiffies;
	tcp_transmit_skb(sk, skb);
}
/* WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called.  If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;

	skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
			   1, GFP_ATOMIC);
	if (skb == NULL)
		return -ENOMEM;

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN);
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->urg_ptr = 0;

	/* SYN eats a sequence byte. */
	TCP_SKB_CB(skb)->seq = tp->snd_una;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
	__skb_queue_tail(&sk->write_queue, skb);
	TCP_SKB_CB(skb)->when = jiffies;
	tp->packets_out++;
	tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
	return 0;
}
/*
 * Prepare a SYN-ACK.
 */
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
				 struct open_request *req, int mss)
{
	struct tcphdr *th;
	int tcp_header_size;
	struct sk_buff *skb;

	skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC);
	if (skb == NULL)
		return NULL;

	/* Reserve space for headers. */
	skb_reserve(skb, MAX_HEADER + sk->prot->max_header);

	skb->dst = dst_clone(dst);

	/* Don't offer more than they did.
	 * This way we don't have to memorize who said what.
	 * FIXME: maybe this should be changed for better performance
	 * with syncookies.
	 */
	req->mss = min(mss, req->mss);
	if (req->mss < 8) {
		printk(KERN_DEBUG "initial req->mss below 8\n");
		req->mss = 8;
	}

	tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
			   (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
			   (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
			   /* SACK_PERM is in the place of NOP NOP of TS */
			   ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
	skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);

	memset(th, 0, sizeof(struct tcphdr));
	th->syn = 1;
	th->ack = 1;
	th->source = sk->sport;
	th->dest = req->rmt_port;
	TCP_SKB_CB(skb)->seq = req->snt_isn;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
	th->seq = htonl(TCP_SKB_CB(skb)->seq);
	th->ack_seq = htonl(req->rcv_isn + 1);
	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
		__u8 rcv_wscale;
		/* Set this up on the first call only */
		req->window_clamp = skb->dst->window;
		tcp_select_initial_window(sock_rspace(sk)/2, req->mss,
					  &req->rcv_wnd,
					  &req->window_clamp,
					  req->wscale_ok,
					  &rcv_wscale);
		req->rcv_wscale = rcv_wscale;
	}

	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	th->window = htons(req->rcv_wnd);

	TCP_SKB_CB(skb)->when = jiffies;
	tcp_syn_build_options((__u32 *)(th + 1), req->mss, req->tstamp_ok,
			      req->sack_ok, req->wscale_ok, req->rcv_wscale,
			      TCP_SKB_CB(skb)->when);

	skb->csum = 0;
	th->doff = (tcp_header_size >> 2);
	tcp_statistics.TcpOutSegs++;
	return skb;
}
void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu)
{
	struct dst_entry *dst = sk->dst_cache;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_HEADER + sk->prot->max_header);

	tp->snd_wnd = 0;
	tp->snd_wl1 = 0;
	tp->snd_wl2 = tp->write_seq;
	tp->snd_una = tp->write_seq;
	tp->rcv_nxt = 0;

	sk->err = 0;

	/* We'll fix this up when we get a response from the other end.
	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	 */
	tp->tcp_header_len = sizeof(struct tcphdr) +
		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

	/* If user gave his TCP_MAXSEG, record it to clamp */
	if (tp->user_mss)
		tp->mss_clamp = tp->user_mss;
	tcp_sync_mss(sk, mtu);

	/* Now for the unpleasant part: if the initial pmtu is too low,
	   we set a lower clamp.  I am not sure that this is good.
	   To be more exact, I do not think that clamping to a value
	   which is apparently transient and may improve in the future
	   is a good idea.  It would be better to wait until the peer
	   returns its MSS (probably 65535 too) and then advertise
	   something on the order of 65535, or at least the first hop
	   device mtu.  The point is that we should tell the peer the
	   maximal mss we expect to RECEIVE; it has nothing to do with
	   pmtu.  I am afraid someone will be confused by such a huge
	   value, though.		--ANK (980731)
	 */
	if (tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr) < tp->mss_clamp)
		tp->mss_clamp = tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr);
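	/* Illustrative numbers: with mss_cache == 1460 and timestamps on
	 * (tcp_header_len == 32), the clamp becomes 1460 + 12 == 1472
	 * whenever the previously negotiated clamp was larger.
	 */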
	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
	TCP_SKB_CB(buff)->sacked = 0;
	TCP_SKB_CB(buff)->urg_ptr = 0;
	buff->csum = 0;
	TCP_SKB_CB(buff)->seq = tp->write_seq++;
	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
	tp->snd_nxt = TCP_SKB_CB(buff)->end_seq;

	tp->window_clamp = dst->window;
	tcp_select_initial_window(sock_rspace(sk)/2, tp->mss_clamp,
				  &tp->rcv_wnd,
				  &tp->window_clamp,
				  sysctl_tcp_window_scaling,
				  &tp->rcv_wscale);

	/* Ok, now lock the socket before we make it visible to
	 * the incoming packet engine.
	 */
	lock_sock(sk);

	/* Socket identity change complete, no longer
	 * in TCP_CLOSE, so enter ourselves into the
	 * hash tables.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	sk->prot->hash(sk);

	tp->rto = dst->rtt;
	tcp_init_xmit_timers(sk);
	tp->retransmits = 0;
	tp->fackets_out = 0;
	tp->retrans_out = 0;

	/* Send it off. */
	__skb_queue_tail(&sk->write_queue, buff);
	TCP_SKB_CB(buff)->when = jiffies;
	tp->packets_out++;
	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
	tcp_statistics.TcpActiveOpens++;

	/* Timer for repeating the SYN until an answer. */
	tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);

	/* Now, it is safe to release the socket. */
	release_sock(sk);
}
/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout)
{
	unsigned long timeout;

	/* Stay within the limit we were given */
	timeout = tp->ato;
	if (timeout > max_timeout)
		timeout = max_timeout;
	timeout += jiffies;

	/* Use the new timeout only if there isn't an older one already pending. */
	if (!tp->delack_timer.prev) {
		tp->delack_timer.expires = timeout;
		add_timer(&tp->delack_timer);
	} else {
		if (timeout < tp->delack_timer.expires)
			mod_timer(&tp->delack_timer, timeout);
	}
}
/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
	/* If we have been reset, we may not send again. */
	if (!sk->zapped) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct sk_buff *buff;

		/* We are not putting this on the write queue, so
		 * tcp_transmit_skb() will set the ownership to this
		 * sock.
		 */
		buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC);
		if (buff == NULL) {
			/* Force it to send an ack.  We don't have to do this
			 * (ACK is unreliable) but it's much better use of
			 * bandwidth on slow links to send a spare ack than
			 * resend packets.
			 */
			tcp_send_delayed_ack(tp, HZ/2);
			return;
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
		buff->csum = 0;
		TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
		TCP_SKB_CB(buff)->sacked = 0;
		TCP_SKB_CB(buff)->urg_ptr = 0;

		/* Send it off, this clears delayed acks for us. */
		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tp->snd_nxt;
		TCP_SKB_CB(buff)->when = jiffies;
		tcp_transmit_skb(sk, buff);
	}
}
/* This routine sends a packet with an out of date sequence
 * number.  It assumes the other end will try to ack it.
 */
void tcp_write_wakeup(struct sock *sk)
{
	/* After a valid reset we can send no more. */
	if (!sk->zapped) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct sk_buff *skb;

		/* Write data can still be transmitted/retransmitted in the
		 * following states.  If any other state is encountered, return.
		 * [listen/close will never occur here anyway]
		 */
		if ((1 << sk->state) &
		    ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
		      TCPF_LAST_ACK|TCPF_CLOSING))
			return;

		if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) &&
		    ((skb = tp->send_head) != NULL)) {
			unsigned long win_size;

			/* We are probing the opening of a window which is
			 * != 0; the segment at the head must have been held
			 * back by sender-side SWS avoidance.
			 */
			win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
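			/* Illustrative figures: with snd_wnd == 1000 and
			 * 600 bytes already in flight, win_size == 400, so
			 * an 800 byte segment at the head is fragmented
			 * down to 400 bytes before being sent.
			 */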
			if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
				if (tcp_fragment(sk, skb, win_size))
					return; /* Let a retransmit get it. */
			}
			update_send_head(sk);
			TCP_SKB_CB(skb)->when = jiffies;
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			tp->packets_out++;
			tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
			if (!tcp_timer_is_set(sk, TIME_RETRANS))
				tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
		} else {
			/* We don't queue it, tcp_transmit_skb() sets ownership. */
			skb = alloc_skb(MAX_HEADER + sk->prot->max_header,
					GFP_ATOMIC);
			if (skb == NULL)
				return;

			/* Reserve space for headers and set control bits. */
			skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
			skb->csum = 0;
			TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
			TCP_SKB_CB(skb)->sacked = 0;
			TCP_SKB_CB(skb)->urg_ptr = 0;

			/* Use a previous sequence.  This should cause the other
			 * end to send an ack.  Don't queue or clone SKB, just
			 * send it.
			 */
			TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1;
			TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
			TCP_SKB_CB(skb)->when = jiffies;
			tcp_transmit_skb(sk, skb);
		}
	}
}
/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	tcp_write_wakeup(sk);
	tp->pending = TIME_PROBE0;
	tp->backoff++;
	tp->probes_out++;
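	/* Illustrative backoff: with rto == 3*HZ and backoff == 4 the next
	 * probe fires after min(3*HZ << 4, 120*HZ) == 48*HZ; the interval
	 * doubles each time until it is capped at two minutes.
	 */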
	tcp_reset_xmit_timer(sk, TIME_PROBE0,
			     min(tp->rto << tp->backoff, 120*HZ));
}