/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_output.c,v 1.94 1998/09/15 02:11:36 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */
/*
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 */
#include <net/tcp.h>

extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_sack;

/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;
/* Get rid of any delayed acks, we sent one already.. */
static __inline__ void clear_delayed_acks(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	tp->delayed_acks = 0;
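	/* Illustrative arithmetic: the quick-ack timeout set below,
	 * ((HZ/100)*2), is 2 jiffies when HZ == 100, i.e. roughly 20ms,
	 * and stays near 20ms for other HZ values as well.
	 */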
	if (tcp_in_quickack_mode(tp))
		tp->ato = ((HZ/100)*2);
	tcp_clear_xmit_timer(sk, TIME_DACK);
}
static __inline__ void update_send_head(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	tp->send_head = tp->send_head->next;
	if (tp->send_head == (struct sk_buff *) &sk->write_queue)
		tp->send_head = NULL;
}
/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
	if (skb != NULL) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
		int tcp_header_size = tp->tcp_header_len;
		struct tcphdr *th;

		if (tcb->flags & TCPCB_FLAG_SYN) {
			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
			if (sysctl_tcp_timestamps)
				tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
			if (sysctl_tcp_window_scaling)
				tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
			if (sysctl_tcp_sack && !sysctl_tcp_timestamps)
				tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
		} else if (tp->sack_ok && tp->num_sacks) {
			/* A SACK is 2 pad bytes, a 2 byte header, plus
			 * 2 32-bit sequence numbers for each SACK block.
			 */
			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
					    (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
		}
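		/* Worked sizes (illustrative, assuming the usual aligned
		 * option lengths: MSS 4, timestamps 12, wscale 4,
		 * SACK-permitted 4, SACK base 4, 8 per SACK block):
		 * a SYN offering timestamps and window scaling carries
		 * 20 + 4 + 12 + 4 = 40 bytes of TCP header, and a data
		 * segment echoing two SACK blocks adds 4 + 2*8 = 20 bytes
		 * on top of tp->tcp_header_len.
		 */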
		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
		skb->h.th = th;
		skb_set_owner_w(skb, sk);

		/* Build TCP header and checksum it. */
		th->source = sk->sport;
		th->dest = sk->dport;
		th->seq = htonl(TCP_SKB_CB(skb)->seq);
		th->ack_seq = htonl(tp->rcv_nxt);
		th->doff = (tcp_header_size >> 2);
		th->res1 = 0;
		*(((__u8 *)th) + 13) = tcb->flags;
		if (!(tcb->flags & TCPCB_FLAG_SYN))
			th->window = htons(tcp_select_window(sk));
		th->check = 0;
		th->urg_ptr = ntohs(tcb->urg_ptr);
		if (tcb->flags & TCPCB_FLAG_SYN) {
			/* RFC1323: The window in SYN & SYN/ACK segments
			 * is never scaled.
			 */
			th->window = htons(tp->rcv_wnd);
			tcp_syn_build_options((__u32 *)(th + 1), tp->mss_clamp,
					      sysctl_tcp_timestamps,
					      sysctl_tcp_sack,
					      sysctl_tcp_window_scaling,
					      tp->rcv_wscale,
					      TCP_SKB_CB(skb)->when);
		} else {
			tcp_build_and_update_options((__u32 *)(th + 1),
						     tp, TCP_SKB_CB(skb)->when);
		}
		tp->af_specific->send_check(sk, th, skb->len, skb);

		clear_delayed_acks(sk);
		tp->last_ack_sent = tp->rcv_nxt;
		tcp_statistics.TcpOutSegs++;
		tp->af_specific->queue_xmit(skb);
	}
}
/* This is the main buffer sending routine.  We place the buffer on the
 * write queue and decide whether to transmit it immediately or leave it
 * queued for later.
 */
void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Advance write_seq and place onto the write_queue. */
	tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq);
	__skb_queue_tail(&sk->write_queue, skb);

	if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
		/* Send it out now. */
		TCP_SKB_CB(skb)->when = jiffies;
		tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
		tp->packets_out++;
		tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
		if (!tcp_timer_is_set(sk, TIME_RETRANS))
			tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
	} else {
		/* Queue it, remembering where we must start sending. */
		if (tp->send_head == NULL)
			tp->send_head = skb;
		if (!force_queue && tp->packets_out == 0 && !tp->pending) {
			tp->pending = TIME_PROBE0;
			tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
		}
	}
}
/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
	struct sk_buff *buff;
	int nsize = skb->len - len;
	u16 flags;

	/* Get a new skb... force flag on. */
	buff = sock_wmalloc(sk,
			    (nsize + MAX_HEADER + sk->prot->max_header),
			    1, GFP_ATOMIC);
	if (buff == NULL)
		return -1; /* We'll just try again later. */

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_HEADER + sk->prot->max_header);

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
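	/* Illustrative numbers: if the original skb covered
	 * [seq, end_seq) = [1000, 3000) (2000 bytes) and len == 1200,
	 * then nsize == 800, buff takes [2200, 3000) here and the
	 * original skb is trimmed back to [1000, 2200) below.
	 */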
	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->flags;
	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
	if (flags & TCPCB_FLAG_URG) {
		u16 old_urg_ptr = TCP_SKB_CB(skb)->urg_ptr;

		/* Urgent data is always a pain in the ass. */
		if (old_urg_ptr > len) {
			TCP_SKB_CB(skb)->flags &= ~(TCPCB_FLAG_URG);
			TCP_SKB_CB(skb)->urg_ptr = 0;
			TCP_SKB_CB(buff)->urg_ptr = old_urg_ptr - len;
		} else {
			flags &= ~(TCPCB_FLAG_URG);
		}
	}
	if (!(flags & TCPCB_FLAG_URG))
		TCP_SKB_CB(buff)->urg_ptr = 0;
	TCP_SKB_CB(buff)->flags = flags;
	TCP_SKB_CB(buff)->sacked = 0;

	/* Copy and checksum data tail into the new buffer. */
	buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize),
				       nsize, 0);

	TCP_SKB_CB(skb)->end_seq -= nsize;
	skb_trim(skb, skb->len - nsize);

	/* Rechecksum original buffer. */
	skb->csum = csum_partial(skb->data, skb->len, 0);

	/* Link BUFF into the send queue. */
	__skb_append(skb, buff);

	return 0;
}
/* This function synchronizes snd mss to the current pmtu/exthdr set.

   tp->user_mss is the mss set by the user via TCP_MAXSEG.  It does NOT
   account for TCP options; it is relative to the bare TCP header.

   tp->mss_clamp is the mss negotiated at connection setup.
   It is the minimum of user_mss and the mss received with the SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is the last pmtu seen by this function.

   tp->mss_cache is the current effective sending mss, including
   all tcp options except for SACKs.  It is evaluated,
   taking into account the current pmtu, but never exceeds
   tp->mss_clamp.

   NOTE1. rfc1122 clearly states that the advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
   this function.			--ANK (980731)
 */
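/* Illustrative figures (assuming plain IPv4 over Ethernet with no
 * extension headers): with pmtu == 1500, a 20 byte IP header and a
 * 20 byte TCP header, the base mss is 1460; if timestamps are in use
 * (tcp_header_len == 32) the cached sending mss becomes 1460 - 12 = 1448,
 * always bounded above by tp->mss_clamp.
 */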
int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int mss_now;

	/* Calculate base mss without TCP options:
	   It is MMS_S - sizeof(tcphdr) of rfc1122
	 */
	mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

	/* Clamp it (mss_clamp does not include tcp options) */
	if (mss_now > tp->mss_clamp)
		mss_now = tp->mss_clamp;

	/* Now subtract TCP options size, not including SACKs */
	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

	/* Now subtract optional transport overhead */
	mss_now -= tp->ext_header_len;

	/* If we got a too small (or even negative) value,
	   clamp it to 8 from below.  Why 8?
	   Well, it could just as well be 1, but if IP accepted
	   a segment of length 1, it would love 8 even more 8)
	   --ANK (980731)
	 */
	if (mss_now < 8)
		mss_now = 8;

	/* And store cached results */
	tp->pmtu_cookie = pmtu;
	tp->mss_cache = mss_now;
	return mss_now;
}
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 */
void tcp_write_xmit(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	unsigned int mss_now;

	/* Account for SACKs, we may need to fragment due to this.
	 * It is just like the real MSS changing on us midstream.
	 * We also handle things correctly when the user adds some
	 * IP options mid-stream.  Silly to do, but cover it.
	 */
	mss_now = tcp_current_mss(sk);

	/* If we are zapped, the bytes will have to remain here.
	 * In time closedown will empty the write queue and all
	 * will be happy.
	 */
	if (!sk->zapped) {
		struct sk_buff *skb;
		int sent_pkts = 0;

		/* Anything on the transmit queue that fits the window can
		 * be added providing we are:
		 *
		 * a) following SWS avoidance [and Nagle algorithm]
		 * b) not exceeding our congestion window.
		 * c) not retransmitting [Nagle]
		 */
		while ((skb = tp->send_head) && tcp_snd_test(sk, skb)) {
			if (skb->len > mss_now) {
				if (tcp_fragment(sk, skb, mss_now))
					break;
			}

			/* Advance the send_head.  This one is going out. */
			update_send_head(sk);
			TCP_SKB_CB(skb)->when = jiffies;
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			tp->packets_out++;
			tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
			sent_pkts = 1;
		}

		/* If we sent anything, make sure the retransmit
		 * timer is active.
		 */
		if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS))
			tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
	}
}
/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria.  The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *	If the free space is less than 1/4 of the maximum
 *	space available and the free space is less than 1/2 mss,
 *	then set the window to 0.
 *	Otherwise, just prevent the window from shrinking
 *	and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue.  It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 */
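/* Worked example (illustrative figures): with mss == 1460 and
 * free_space == 10000, a current offer that is at least one mss below
 * the free space (or above it) is rounded to (10000/1460)*1460 == 8760,
 * i.e. six full segments, instead of creeping forward byte by byte.
 */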
u32 __tcp_select_window(struct sock *sk, u32 cur_win)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	unsigned int mss = tp->mss_cache;
	int free_space;
	u32 window;

	/* Sometimes free_space can be < 0. */
	free_space = (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) / 2;
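	/* Only half of what remains in the receive buffer is treated as
	 * usable window here; the other half is assumed to be eaten by
	 * struct sk_buff and protocol overhead (a rough heuristic).
	 */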
	if (tp->window_clamp) {
		if (free_space > ((int) tp->window_clamp))
			free_space = tp->window_clamp;
		mss = min(tp->window_clamp, mss);
	} else {
		printk("tcp_select_window: tp->window_clamp == 0.\n");
	}

	if (mss < 1) {
		mss = 1;
		printk("tcp_select_window: sk->mss fell to 0.\n");
	}

	if ((free_space < (sk->rcvbuf/4)) && (free_space < ((int) (mss/2)))) {
		window = 0;
	} else {
		/* Get the largest window that is a nice multiple of mss.
		 * Window clamp already applied above.
		 * If our current window offering is within 1 mss of the
		 * free space we just keep it.  This prevents the divide
		 * and multiply from happening most of the time.
		 * We also don't do any window rounding when the free space
		 * is too small.
		 */
		window = tp->rcv_wnd;
		if ((((int) window) <= (free_space - ((int) mss))) ||
		    (((int) window) > free_space))
			window = (((unsigned int) free_space)/mss)*mss;
	}
	return window;
}
/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
	struct sk_buff *next_skb = skb->next;

	/* The first test we must make is that neither of these two
	 * SKB's are still referenced by someone else.
	 */
	if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
		int skb_size = skb->len, next_skb_size = next_skb->len;
		u16 flags = TCP_SKB_CB(skb)->flags;

		/* Punt if the first SKB has URG set. */
		if (flags & TCPCB_FLAG_URG)
			return;

		/* Also punt if next skb has been SACK'd. */
		if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
			return;

		/* Punt if not enough space exists in the first SKB for
		 * the data in the second, or the total combined payload
		 * would exceed the MSS.
		 */
		if ((next_skb_size > skb_tailroom(skb)) ||
		    ((skb_size + next_skb_size) > mss_now))
			return;

		/* Ok.  We will be able to collapse the packet. */
		__skb_unlink(next_skb, next_skb->list);

		if (skb->len % 4) {
			/* Must copy and rechecksum all data. */
			memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
			skb->csum = csum_partial(skb->data, skb->len, 0);
		} else {
			/* Optimize, actually we could also combine next_skb->csum
			 * to skb->csum using a single add w/carry operation too.
			 */
			skb->csum = csum_partial_copy(next_skb->data,
						      skb_put(skb, next_skb_size),
						      next_skb_size, skb->csum);
		}

		/* Update sequence range on original skb. */
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

		/* Merge over control information. */
		flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
		if (flags & TCPCB_FLAG_URG) {
			u16 urgptr = TCP_SKB_CB(next_skb)->urg_ptr;
			TCP_SKB_CB(skb)->urg_ptr = urgptr + skb_size;
		}
		TCP_SKB_CB(skb)->flags = flags;

		/* All done, get rid of second SKB and account for it so
		 * packet counting does not break.
		 */
		kfree_skb(next_skb);
		sk->tp_pinfo.af_tcp.packets_out--;
	}
}
/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer.  This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;
	unsigned int mss = tcp_current_mss(sk);

	/* Don't muck with the congestion window here. */
	tp->dup_acks = 0;
	tp->high_seq = tp->snd_nxt;
	tp->retrans_head = NULL;

	/* Input control flow will see that this was retransmitted
	 * and not use it for RTT calculation in the absence of
	 * the timestamp option.
	 */
	for (skb = skb_peek(&sk->write_queue);
	     ((skb != tp->send_head) &&
	      (skb != (struct sk_buff *)&sk->write_queue));
	     skb = skb->next)
		if (skb->len > mss)
			tcp_retransmit_skb(sk, skb);
}
static __inline__ void update_retrans_head(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	tp->retrans_head = tp->retrans_head->next;
	if ((tp->retrans_head == tp->send_head) ||
	    (tp->retrans_head == (struct sk_buff *) &sk->write_queue))
		tp->retrans_head = NULL;
}
/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	unsigned int cur_mss = tcp_current_mss(sk);

	if (skb->len > cur_mss) {
		if (tcp_fragment(sk, skb, cur_mss))
			return 1; /* We'll try again later. */

		/* New SKB created, account for it. */
		tp->packets_out++;
	}

	/* Collapse two adjacent packets if worthwhile and we can. */
	if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
	    (skb->len < (cur_mss >> 1)) &&
	    (skb->next != tp->send_head) &&
	    (skb->next != (struct sk_buff *)&sk->write_queue) &&
	    (sysctl_tcp_retrans_collapse != 0))
		tcp_retrans_try_collapse(sk, skb, cur_mss);

	if (tp->af_specific->rebuild_header(sk))
		return 1; /* Routing failure or similar. */

	/* Ok, we're gonna send it out, update state. */
	TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS;
	tp->retrans_out++;

	/* Make a copy, if the first transmission SKB clone we made
	 * is still in somebody's hands, else make a clone.
	 */
	TCP_SKB_CB(skb)->when = jiffies;
	if (skb_cloned(skb))
		skb = skb_copy(skb, GFP_ATOMIC);
	else
		skb = skb_clone(skb, GFP_ATOMIC);

	tcp_transmit_skb(sk, skb);

	/* Update global TCP statistics and return success. */
	sk->prot->retransmits++;
	tcp_statistics.TcpRetransSegs++;

	return 0;
}
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;

	if (tp->retrans_head == NULL)
		tp->retrans_head = skb_peek(&sk->write_queue);
	if (tp->retrans_head == tp->send_head)
		tp->retrans_head = NULL;

	/* Each time, advance the retrans_head if we got
	 * a packet out or we skipped one because it was
	 * SACK'd.  -DaveM
	 */
	while ((skb = tp->retrans_head) != NULL) {
		/* If it has been ack'd by a SACK block, we don't
		 * retransmit it.
		 */
		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
			/* Send it out, punt if error occurred. */
			if (tcp_retransmit_skb(sk, skb))
				break;

			update_retrans_head(sk);

			/* Stop retransmitting if we've hit the congestion
			 * window limit.
			 */
			if (tp->retrans_out >= tp->snd_cwnd)
				break;
		} else {
			update_retrans_head(sk);
		}
	}
}
/* Using FACK information, retransmit all missing frames at the receiver
 * up to the forward most SACK'd packet (tp->fackets_out) if the packet
 * has not been retransmitted already.
 */
void tcp_fack_retransmit(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb = skb_peek(&sk->write_queue);
	int packet_cnt = 0;

	while ((skb != NULL) &&
	       (skb != tp->send_head) &&
	       (skb != (struct sk_buff *)&sk->write_queue)) {
		__u8 sacked = TCP_SKB_CB(skb)->sacked;

		if (sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS))
			goto next_packet;

		/* Ok, retransmit it. */
		if (tcp_retransmit_skb(sk, skb))
			break;

		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
			break;
next_packet:
		packet_cnt++;
		if (packet_cnt >= tp->fackets_out)
			break;
		skb = skb->next;
	}
}
/* Send a fin.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
	unsigned int mss_now;

	/* Optimization, tack on the FIN if we have a queue of
	 * unsent frames.  But be careful about outgoing SACKS
	 * and IP options.
	 */
	mss_now = tcp_current_mss(sk);

	if ((tp->send_head != NULL) && (skb->len < mss_now)) {
		/* tcp_write_xmit() takes care of the rest. */
		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
		TCP_SKB_CB(skb)->end_seq++;
		tp->write_seq++;

		/* Special case to avoid Nagle bogosity.  If this
		 * segment is the last segment, and it was queued
		 * due to Nagle/SWS-avoidance, send it out now.
		 */
		if (tp->send_head == skb &&
		    !sk->nonagle &&
		    skb->len < (tp->mss_cache >> 1) &&
		    tp->packets_out &&
		    !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
			update_send_head(sk);
			TCP_SKB_CB(skb)->when = jiffies;
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			tp->packets_out++;
			tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
			if (!tcp_timer_is_set(sk, TIME_RETRANS))
				tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
		}
	} else {
		/* Socket is locked, keep trying until memory is available. */
		do {
			skb = sock_wmalloc(sk,
					   (MAX_HEADER +
					    sk->prot->max_header),
					   1, GFP_KERNEL);
		} while (skb == NULL);

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
		skb->csum = 0;
		TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
		TCP_SKB_CB(skb)->sacked = 0;
		TCP_SKB_CB(skb)->urg_ptr = 0;

		/* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
		TCP_SKB_CB(skb)->seq = tp->write_seq;
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
		tcp_send_skb(sk, skb, 0);
	}
}
/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;

	/* NOTE: No TCP options attached and we never retransmit this. */
	do {
		skb = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_KERNEL);
	} while (skb == NULL);

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->urg_ptr = 0;

	/* Send it off. */
	TCP_SKB_CB(skb)->seq = tp->write_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = jiffies;
	tcp_transmit_skb(sk, skb);
}
/* WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called.  If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;

	skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
			   1, GFP_ATOMIC);
	if (skb == NULL)
		return -ENOMEM;

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN);
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->urg_ptr = 0;

	/* SYN eats a sequence byte. */
	TCP_SKB_CB(skb)->seq = tp->snd_una;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
	__skb_queue_tail(&sk->write_queue, skb);
	TCP_SKB_CB(skb)->when = jiffies;
	tp->packets_out++;
	tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
	return 0;
}
/*
 * Prepare a SYN-ACK.
 */
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
				 struct open_request *req, int mss)
{
	struct tcphdr *th;
	int tcp_header_size;
	struct sk_buff *skb;

	skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC);
	if (skb == NULL)
		return NULL;

	/* Reserve space for headers. */
	skb_reserve(skb, MAX_HEADER + sk->prot->max_header);

	skb->dst = dst_clone(dst);

	/* Don't offer more than they did.
	 * This way we don't have to memorize who said what.
	 * FIXME: maybe this should be changed for better performance
	 * with syncookies.
	 */
	req->mss = min(mss, req->mss);
	if (req->mss < 8) {
		printk(KERN_DEBUG "initial req->mss below 8\n");
		req->mss = 8;
	}

	tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
			   (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
			   (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
			   /* SACK_PERM is in the place of NOP NOP of TS */
			   ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
	skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);

	memset(th, 0, sizeof(struct tcphdr));
	th->syn = 1;
	th->ack = 1;
	th->source = sk->sport;
	th->dest = req->rmt_port;
	TCP_SKB_CB(skb)->seq = req->snt_isn;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
	th->seq = htonl(TCP_SKB_CB(skb)->seq);
	th->ack_seq = htonl(req->rcv_isn + 1);
	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
		__u8 rcv_wscale;
		/* Set this up on the first call only */
		req->window_clamp = skb->dst->window;
		tcp_select_initial_window(sock_rspace(sk)/2, req->mss,
					  &req->rcv_wnd,
					  &req->window_clamp,
					  req->wscale_ok,
					  &rcv_wscale);
		req->rcv_wscale = rcv_wscale;
	}

	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	th->window = htons(req->rcv_wnd);

	TCP_SKB_CB(skb)->when = jiffies;
	tcp_syn_build_options((__u32 *)(th + 1), req->mss, req->tstamp_ok,
			      req->sack_ok, req->wscale_ok, req->rcv_wscale,
			      TCP_SKB_CB(skb)->when);

	skb->csum = 0;
	th->doff = (tcp_header_size >> 2);
	tcp_statistics.TcpOutSegs++;
	return skb;
}
void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu)
{
	struct dst_entry *dst = sk->dst_cache;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_HEADER + sk->prot->max_header);

	tp->snd_wnd = 0;
	tp->snd_wl1 = 0;
	tp->snd_wl2 = tp->write_seq;
	tp->snd_una = tp->write_seq;
	tp->rcv_nxt = 0;

	sk->err = 0;

	/* We'll fix this up when we get a response from the other end.
	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	 */
	tp->tcp_header_len = sizeof(struct tcphdr) +
		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

	/* If user gave his TCP_MAXSEG, record it to clamp */
	if (tp->user_mss)
		tp->mss_clamp = tp->user_mss;
	tcp_sync_mss(sk, mtu);

	/* Now for the unpleasant part: if the initial pmtu is too low,
	   we set a lower clamp.  I am not sure that this is good.
	   To be more exact, I do not think that clamping to a value
	   which is apparently transient and may improve in the future
	   is a good idea.  It would be better to wait until the peer
	   returns its MSS (probably 65535 too) and then advertise
	   something on the order of 65535, or at least the first hop
	   device mtu.  The point is that we should tell the peer the
	   maximal mss we expect to RECEIVE; it has nothing to do with
	   pmtu.  I am afraid someone will be confused by such a huge
	   value, though.		--ANK (980731)
	 */
	if (tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr) < tp->mss_clamp)
		tp->mss_clamp = tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr);
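	/* Illustrative numbers: with mss_cache == 1460 and timestamps on
	 * (tcp_header_len == 32), the clamp becomes 1460 + 12 == 1472
	 * whenever the previously negotiated clamp was larger.
	 */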
	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
	TCP_SKB_CB(buff)->sacked = 0;
	TCP_SKB_CB(buff)->urg_ptr = 0;
	buff->csum = 0;
	TCP_SKB_CB(buff)->seq = tp->write_seq++;
	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
	tp->snd_nxt = TCP_SKB_CB(buff)->end_seq;

	tp->window_clamp = dst->window;
	tcp_select_initial_window(sock_rspace(sk)/2, tp->mss_clamp,
				  &tp->rcv_wnd,
				  &tp->window_clamp,
				  sysctl_tcp_window_scaling,
				  &tp->rcv_wscale);

	/* Ok, now lock the socket before we make it visible to
	 * the incoming packet engine.
	 */
	lock_sock(sk);

	/* Socket identity change complete, no longer
	 * in TCP_CLOSE, so enter ourselves into the
	 * hash tables.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	sk->prot->hash(sk);

	tp->rto = dst->rtt;
	tcp_init_xmit_timers(sk);
	tp->retransmits = 0;
	tp->fackets_out = 0;
	tp->retrans_out = 0;

	/* Send it off. */
	__skb_queue_tail(&sk->write_queue, buff);
	TCP_SKB_CB(buff)->when = jiffies;
	tp->packets_out++;
	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
	tcp_statistics.TcpActiveOpens++;

	/* Timer for repeating the SYN until an answer. */
	tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);

	/* Now, it is safe to release the socket. */
	release_sock(sk);
}
/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout)
{
	unsigned long timeout;

	/* Stay within the limit we were given */
	timeout = tp->ato;
	if (timeout > max_timeout)
		timeout = max_timeout;
	timeout += jiffies;

	/* Use the new timeout only if there isn't an older one already pending. */
	if (!tp->delack_timer.prev) {
		tp->delack_timer.expires = timeout;
		add_timer(&tp->delack_timer);
	} else {
		if (timeout < tp->delack_timer.expires)
			mod_timer(&tp->delack_timer, timeout);
	}
}
/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
	/* If we have been reset, we may not send again. */
	if (!sk->zapped) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct sk_buff *buff;

		/* We are not putting this on the write queue, so
		 * tcp_transmit_skb() will set the ownership to this
		 * sock.
		 */
		buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC);
		if (buff == NULL) {
			/* Force it to send an ack.  We don't have to do this
			 * (ACK is unreliable) but it's much better use of
			 * bandwidth on slow links to send a spare ack than
			 * resend packets.
			 */
			tcp_send_delayed_ack(tp, HZ/2);
			return;
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
		buff->csum = 0;
		TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
		TCP_SKB_CB(buff)->sacked = 0;
		TCP_SKB_CB(buff)->urg_ptr = 0;

		/* Send it off, this clears delayed acks for us. */
		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tp->snd_nxt;
		TCP_SKB_CB(buff)->when = jiffies;
		tcp_transmit_skb(sk, buff);
	}
}
/* This routine sends a packet with an out of date sequence
 * number.  It assumes the other end will try to ack it.
 */
void tcp_write_wakeup(struct sock *sk)
{
	/* After a valid reset we can send no more. */
	if (!sk->zapped) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct sk_buff *skb;

		/* Write data can still be transmitted/retransmitted in the
		 * following states.  If any other state is encountered, return.
		 * [listen/close will never occur here anyway]
		 */
		if ((1 << sk->state) &
		    ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
		      TCPF_LAST_ACK|TCPF_CLOSING))
			return;

		if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) &&
		    ((skb = tp->send_head) != NULL)) {
			unsigned long win_size;

			/* We are probing the opening of a window which is
			 * != 0; the segment at the head must have been held
			 * back by sender-side SWS avoidance.
			 */
			win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
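			/* Illustrative figures: with snd_wnd == 1000 and
			 * 600 bytes already in flight, win_size == 400, so
			 * an 800 byte segment at the head is fragmented
			 * down to 400 bytes before being sent.
			 */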
			if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
				if (tcp_fragment(sk, skb, win_size))
					return; /* Let a retransmit get it. */
			}
			update_send_head(sk);
			TCP_SKB_CB(skb)->when = jiffies;
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			tp->packets_out++;
			tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
			if (!tcp_timer_is_set(sk, TIME_RETRANS))
				tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
		} else {
			/* We don't queue it, tcp_transmit_skb() sets ownership. */
			skb = alloc_skb(MAX_HEADER + sk->prot->max_header,
					GFP_ATOMIC);
			if (skb == NULL)
				return;

			/* Reserve space for headers and set control bits. */
			skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
			skb->csum = 0;
			TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
			TCP_SKB_CB(skb)->sacked = 0;
			TCP_SKB_CB(skb)->urg_ptr = 0;

			/* Use a previous sequence.  This should cause the other
			 * end to send an ack.  Don't queue or clone SKB, just
			 * send it.
			 */
			TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1;
			TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
			TCP_SKB_CB(skb)->when = jiffies;
			tcp_transmit_skb(sk, skb);
		}
	}
}
/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	tcp_write_wakeup(sk);
	tp->pending = TIME_PROBE0;
	tp->backoff++;
	tp->probes_out++;
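	/* Illustrative backoff: with rto == 3*HZ and backoff == 4 the next
	 * probe fires after min(3*HZ << 4, 120*HZ) == 48*HZ; the interval
	 * doubles each time until it is capped at two minutes.
	 */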
	tcp_reset_xmit_timer(sk, TIME_PROBE0,
			     min(tp->rto << tp->backoff, 120*HZ));
}