[davej-history.git] net/ipv4/tcp_output.c

/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:     $Id: tcp_output.c,v 1.129 2000/11/28 17:04:10 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:     Pedro Roque     :       Retransmit queue handled by TCP.
 *                              :       Fragmentation on mtu decrease
 *                              :       Segment collapse on retransmit
 *                              :       AF independence
 *
 *              Linus Torvalds  :       send_delayed_ack
 *              David S. Miller :       Charge memory using the right skb
 *                                      during syn/ack processing.
 *              David S. Miller :       Output engine completely rewritten.
 *              Andrea Arcangeli:       SYNACK carry ts_recent in tsecr.
 *              Cacophonix Gaul :       draft-minshall-nagle-01
 *              J Hadi Salim    :       ECN support
 */

#include <net/tcp.h>

#include <linux/smp_lock.h>

/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;

static __inline__
void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
{
        tp->send_head = skb->next;
        if (tp->send_head == (struct sk_buff *) &sk->write_queue)
                tp->send_head = NULL;
        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
        if (tp->packets_out++ == 0)
                tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
}

/* SND.NXT, if window was not shrunk.
 * If window has been shrunk, what should we make? It is not clear at all.
 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 * invalid. OK, let's make this for now:
 */
static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
{
        if (!before(tp->snd_una + tp->snd_wnd, tp->snd_nxt))
                return tp->snd_nxt;
        else
                return tp->snd_una + tp->snd_wnd;
}

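/* Editor's note (worked example, not in the original source): with
 * snd_una = 100, snd_wnd = 50 and snd_nxt = 170, the window has shrunk
 * to the left of snd_nxt, so the helper above falls back to
 * snd_una + snd_wnd = 150 as the sequence number used for bare ACKs
 * and RSTs (see tcp_send_ack() and tcp_send_active_reset() below).
 */
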
/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MSS.
 * 4. We do not make 3, we advertise MSS, calculated from first
 *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
 *    This may be overridden via information stored in routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
static __u16 tcp_advertise_mss(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct dst_entry *dst = __sk_dst_get(sk);
        int mss = tp->advmss;

        if (dst && dst->advmss < mss) {
                mss = dst->advmss;
                tp->advmss = mss;
        }

        return (__u16)mss;
}

/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
 * This is the first part of cwnd validation mechanism. */
static void tcp_cwnd_restart(struct tcp_opt *tp)
{
        s32 delta = tcp_time_stamp - tp->lsndtime;
        u32 restart_cwnd = tcp_init_cwnd(tp);
        u32 cwnd = tp->snd_cwnd;

        tp->snd_ssthresh = tcp_current_ssthresh(tp);
        restart_cwnd = min(restart_cwnd, cwnd);

        while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
                cwnd >>= 1;
        tp->snd_cwnd = max(cwnd, restart_cwnd);
        tp->snd_cwnd_stamp = tcp_time_stamp;
        tp->snd_cwnd_used = 0;
}

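/* Editor's note (worked example, not in the original source): with
 * rto = 200 ms and an idle period of 650 ms, the loop above halves cwnd
 * three times (the fourth subtraction drives delta below zero), so a
 * cwnd of 32 decays to 4, but never below the restart window returned
 * by tcp_init_cwnd().
 */
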
static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb)
{
        u32 now = tcp_time_stamp;

        if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
                tcp_cwnd_restart(tp);

        tp->lsndtime = now;

        /* If it is a reply for ato after last received
         * packet, enter pingpong mode.
         */
        if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
                tp->ack.pingpong = 1;
}

static __inline__ void tcp_event_ack_sent(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        tcp_dec_quickack_mode(tp);
        tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
}

/* Choose a new window to advertise, update state in tcp_opt for the
 * socket, and return result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
static __inline__ u16 tcp_select_window(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        u32 cur_win = tcp_receive_window(tp);
        u32 new_win = __tcp_select_window(sk);

        /* Never shrink the offered window */
        if (new_win < cur_win) {
                /* Danger Will Robinson!
                 * Don't update rcv_wup/rcv_wnd here or else
                 * we will not be able to advertise a zero
                 * window in time.  --DaveM
                 *
                 * Relax Will Robinson.
                 */
                new_win = cur_win;
        }
        tp->rcv_wnd = new_win;
        tp->rcv_wup = tp->rcv_nxt;

        /* RFC1323 scaling applied */
        new_win >>= tp->rcv_wscale;

#ifdef TCP_FORMAL_WINDOW
        if (new_win == 0) {
                /* If we advertise zero window, disable fast path. */
                tp->pred_flags = 0;
        } else if (cur_win == 0 && tp->pred_flags == 0 &&
                   skb_queue_len(&tp->out_of_order_queue) == 0 &&
                   !tp->urg_data) {
                /* If we open a zero window, enable fast path.
                 * Without this it would only be opened by the first data
                 * packet, which is too late to merge checksumming into
                 * the copy.
                 */
                tcp_fast_path_on(tp);
        }
#endif

        return new_win;
}

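/* Editor's note (illustration, not in the original source): with
 * rcv_wscale = 2, an internal window of 64000 bytes goes on the wire as
 * 64000 >> 2 = 16000 and is multiplied back by 4 at the receiver; SYN
 * and SYN/ACK segments bypass this helper because their window field is
 * never scaled (see tcp_transmit_skb() below).
 */
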
/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
        if (skb != NULL) {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
                int tcp_header_size = tp->tcp_header_len;
                struct tcphdr *th;
                int sysctl_flags;
                int err;

#define SYSCTL_FLAG_TSTAMPS     0x1
#define SYSCTL_FLAG_WSCALE      0x2
#define SYSCTL_FLAG_SACK        0x4

                sysctl_flags = 0;
                if (tcb->flags & TCPCB_FLAG_SYN) {
                        tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
                        if (sysctl_tcp_timestamps) {
                                tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
                                sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
                        }
                        if (sysctl_tcp_window_scaling) {
                                tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
                                sysctl_flags |= SYSCTL_FLAG_WSCALE;
                        }
                        if (sysctl_tcp_sack) {
                                sysctl_flags |= SYSCTL_FLAG_SACK;
                                if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
                                        tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
                        }
                } else if (tp->eff_sacks) {
                        /* A SACK is 2 pad bytes, a 2 byte header, plus
                         * 2 32-bit sequence numbers for each SACK block.
                         */
                        tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
                                            (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
                }
                th = (struct tcphdr *) skb_push(skb, tcp_header_size);
                skb->h.th = th;
                skb_set_owner_w(skb, sk);

                /* Build TCP header and checksum it. */
                th->source = sk->sport;
                th->dest = sk->dport;
                th->seq = htonl(tcb->seq);
                th->ack_seq = htonl(tp->rcv_nxt);
                *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
                if (tcb->flags & TCPCB_FLAG_SYN) {
                        /* RFC1323: The window in SYN & SYN/ACK segments
                         * is never scaled.
                         */
                        th->window = htons(tp->rcv_wnd);
                } else {
                        th->window = htons(tcp_select_window(sk));
                }
                th->check = 0;
                th->urg_ptr = 0;

                if (tp->urg_mode &&
                    between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF)) {
                        th->urg_ptr = htons(tp->snd_up - tcb->seq);
                        th->urg = 1;
                }

                if (tcb->flags & TCPCB_FLAG_SYN) {
                        tcp_syn_build_options((__u32 *)(th + 1),
                                              tcp_advertise_mss(sk),
                                              (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
                                              (sysctl_flags & SYSCTL_FLAG_SACK),
                                              (sysctl_flags & SYSCTL_FLAG_WSCALE),
                                              tp->rcv_wscale,
                                              tcb->when,
                                              tp->ts_recent);
                } else {
                        tcp_build_and_update_options((__u32 *)(th + 1),
                                                     tp, tcb->when);

                        TCP_ECN_send(sk, tp, skb, tcp_header_size);
                }
                tp->af_specific->send_check(sk, th, skb->len, skb);

                if (tcb->flags & TCPCB_FLAG_ACK)
                        tcp_event_ack_sent(sk);

                if (skb->len != tcp_header_size)
                        tcp_event_data_sent(tp, skb);

                TCP_INC_STATS(TcpOutSegs);

                err = tp->af_specific->queue_xmit(skb);
                if (err <= 0)
                        return err;

                tcp_enter_cwr(tp);

                /* NET_XMIT_CN is special. It does not guarantee that this
                 * packet is lost. It tells us that the device is about to
                 * start dropping packets, or already drops some packets of
                 * the same priority, and invokes us to send less
                 * aggressively.
                 */
                return err == NET_XMIT_CN ? 0 : err;
        }
        return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}

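/* Editor's note (illustration, not in the original source): the raw
 * 16-bit store on "(((__u16 *)th) + 6)" above fills the data-offset and
 * flags bytes in one write. For a 32-byte header carrying only an ACK,
 * tcp_header_size >> 2 = 8 and TCPCB_FLAG_ACK = 0x10, so the stored
 * value is htons(0x8010); the TCPCB_FLAG_* constants are assumed to
 * line up with the on-wire TCP flag bits.
 */
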
/* This is the main buffer sending routine. We queue the buffer
 * and decide whether to queue or transmit now.
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        /* Advance write_seq and place onto the write_queue. */
        tp->write_seq = TCP_SKB_CB(skb)->end_seq;
        __skb_queue_tail(&sk->write_queue, skb);
        tcp_charge_skb(sk, skb);

        if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, tp->nonagle)) {
                /* Send it out now. */
                TCP_SKB_CB(skb)->when = tcp_time_stamp;
                if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                        tcp_minshall_update(tp, cur_mss, skb);
                        if (tp->packets_out++ == 0)
                                tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
                        return;
                }
        }
        /* Queue it, remembering where we must start sending. */
        if (tp->send_head == NULL)
                tp->send_head = skb;
}

/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        struct sk_buff *buff;
        int nsize = skb->len - len;
        u16 flags;

        /* Get a new skb... force flag on. */
        buff = tcp_alloc_skb(sk, nsize + MAX_TCP_HEADER, GFP_ATOMIC);
        if (buff == NULL)
                return -ENOMEM; /* We'll just try again later. */
        tcp_charge_skb(sk, buff);

        /* Reserve space for headers. */
        skb_reserve(buff, MAX_TCP_HEADER);

        /* Correct the sequence numbers. */
        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
        TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;

        /* PSH and FIN should only be set in the second packet. */
        flags = TCP_SKB_CB(skb)->flags;
        TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
        TCP_SKB_CB(buff)->flags = flags;
        TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked & (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL);
        if (TCP_SKB_CB(buff)->sacked & TCPCB_LOST) {
                tp->lost_out++;
                tp->left_out++;
        }
        TCP_SKB_CB(buff)->sacked &= ~TCPCB_AT_TAIL;

        /* Copy and checksum data tail into the new buffer. */
        buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
                                               nsize, 0);

        /* This takes care of the FIN sequence number too. */
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
        skb_trim(skb, len);

        /* Rechecksum original buffer. */
        skb->csum = csum_partial(skb->data, skb->len, 0);

        /* Looks stupid, but our code really uses when of
         * skbs, which it never sent before. --ANK
         */
        TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;

        /* Link BUFF into the send queue. */
        __skb_append(skb, buff);

        return 0;
}

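/* Editor's note (worked example, not in the original source): if an skb
 * spans sequence numbers 1000..3920 (2920 bytes) and tcp_fragment() is
 * called with len = 1460, the original skb is trimmed to 1000..2460 and
 * the new buff covers 2460..3920, with any PSH/FIN flag carried only by
 * the second half.
 */
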
/* This function synchronizes snd mss to the current pmtu/exthdr set.

   tp->user_mss is the mss set by the user via TCP_MAXSEG. It does NOT
   account for TCP options, but includes only the bare TCP header.

   tp->mss_clamp is the mss negotiated at connection setup.
   It is the minimum of user_mss and the mss received with SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is the last pmtu seen by this function.

   tp->mss_cache is the current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account the current pmtu, but never exceeds
   tp->mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
   this function.                       --ANK (980731)
 */

int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        int mss_now;

        /* Calculate base mss without TCP options:
           It is MMS_S - sizeof(tcphdr) of rfc1122
         */

        mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

        /* Clamp it (mss_clamp does not include tcp options) */
        if (mss_now > tp->mss_clamp)
                mss_now = tp->mss_clamp;

        /* Now subtract optional transport overhead */
        mss_now -= tp->ext_header_len;

        /* Then reserve room for full set of TCP options and 8 bytes of data */
        if (mss_now < 48)
                mss_now = 48;

        /* Now subtract TCP options size, not including SACKs */
        mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

        /* Bound mss with half of window */
        if (tp->max_window && mss_now > (tp->max_window >> 1))
                mss_now = max((tp->max_window >> 1), 68 - tp->tcp_header_len);

        /* And store cached results */
        tp->pmtu_cookie = pmtu;
        tp->mss_cache = mss_now;
        return mss_now;
}

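/* Editor's note (worked example, not in the original source): for an
 * IPv4 path with pmtu = 1500 and timestamps enabled, mss_now starts at
 * 1500 - 20 - 20 = 1460; subtracting the 12 bytes of aligned timestamp
 * option (tcp_header_len - sizeof(struct tcphdr)) leaves an effective
 * sending mss of 1448, which is what tp->mss_cache ends up holding.
 */
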
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * Returns 1, if no segments are in flight and we have queued segments, but
 * cannot send anything now because of SWS or another problem.
 */
int tcp_write_xmit(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        unsigned int mss_now;

        /* If we are closed, the bytes will have to remain here.
         * In time closedown will finish, we empty the write queue and all
         * will be happy.
         */
        if (sk->state != TCP_CLOSE) {
                struct sk_buff *skb;
                int sent_pkts = 0;

                /* Account for SACKS, we may need to fragment due to this.
                 * It is just like the real MSS changing on us midstream.
                 * We also handle things correctly when the user adds some
                 * IP options mid-stream.  Silly to do, but cover it.
                 */
                mss_now = tcp_current_mss(sk);

                while ((skb = tp->send_head) &&
                       tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb) ? tp->nonagle : 1)) {
                        if (skb->len > mss_now) {
                                if (tcp_fragment(sk, skb, mss_now))
                                        break;
                        }

                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
                                break;
                        /* Advance the send_head.  This one is sent out. */
                        update_send_head(sk, tp, skb);
                        tcp_minshall_update(tp, mss_now, skb);
                        sent_pkts = 1;
                }

                if (sent_pkts) {
                        tcp_cwnd_validate(sk, tp);
                        return 0;
                }

                return !tp->packets_out && tp->send_head;
        }
        return 0;
}

/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria. The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *      If the free space is less than the 1/4 of the maximum
 *      space available and the free space is less than 1/2 mss,
 *      then set the window to 0.
 *      [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 *      Otherwise, just prevent the window from shrinking
 *      and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue. It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        /* MSS for the peer's data.  Previous versions used mss_clamp
         * here.  I don't know if the value based on our guesses
         * of peer's MSS is better for the performance.  It's more correct
         * but may be worse for the performance because of rcv_mss
         * fluctuations.  --SAW  1998/11/1
         */
        unsigned int mss = tp->ack.rcv_mss;
        int free_space;
        u32 window;

        /* Sometimes free_space can be < 0. */
        free_space = tcp_space(sk);
        if (tp->window_clamp < mss)
                mss = tp->window_clamp;

        if (free_space < (int)min(tp->window_clamp, tcp_full_space(sk)) / 2) {
                tp->ack.quick = 0;

                if (tcp_memory_pressure)
                        tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4*tp->advmss);

                if (free_space < ((int)mss))
                        return 0;
        }

        if (free_space > tp->rcv_ssthresh)
                free_space = tp->rcv_ssthresh;

        /* Get the largest window that is a nice multiple of mss.
         * Window clamp already applied above.
         * If our current window offering is within 1 mss of the
         * free space we just keep it. This prevents the divide
         * and multiply from happening most of the time.
         * We also don't do any window rounding when the free space
         * is too small.
         */
        window = tp->rcv_wnd;
        if ((((int) window) <= (free_space - ((int) mss))) ||
            (((int) window) > free_space))
                window = (((unsigned int) free_space) / mss) * mss;

        return window;
}

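/* Editor's note (worked example, not in the original source): with
 * mss = 1460, free_space = 10000 and a previously offered rcv_wnd of
 * 11680, the "window > free_space" branch above fires and the value
 * returned here is rounded down to (10000 / 1460) * 1460 = 8760, i.e.
 * six full segments (the caller still refuses to shrink the window it
 * has already advertised).
 */
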
/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        struct sk_buff *next_skb = skb->next;

        /* The first test we must make is that neither of these two
         * SKB's are still referenced by someone else.
         */
        if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
                int skb_size = skb->len, next_skb_size = next_skb->len;
                u16 flags = TCP_SKB_CB(skb)->flags;

                /* Also punt if next skb has been SACK'd. */
                if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
                        return;

                /* Next skb is out of window. */
                if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una + tp->snd_wnd))
                        return;

                /* Punt if not enough space exists in the first SKB for
                 * the data in the second, or the total combined payload
                 * would exceed the MSS.
                 */
                if ((next_skb_size > skb_tailroom(skb)) ||
                    ((skb_size + next_skb_size) > mss_now))
                        return;

                /* Ok.  We will be able to collapse the packet. */
                __skb_unlink(next_skb, next_skb->list);

                if (skb->len % 4) {
                        /* Must copy and rechecksum all data. */
                        memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
                        skb->csum = csum_partial(skb->data, skb->len, 0);
                } else {
                        /* Optimize, actually we could also combine next_skb->csum
                         * to skb->csum using a single add w/carry operation too.
                         */
                        skb->csum = csum_partial_copy_nocheck(next_skb->data,
                                                              skb_put(skb, next_skb_size),
                                                              next_skb_size, skb->csum);
                }

                /* Update sequence range on original skb. */
                TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

                /* Merge over control information. */
                flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
                TCP_SKB_CB(skb)->flags = flags;

                /* All done, get rid of second SKB and account for it so
                 * packet counting does not break.
                 */
                TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & (TCPCB_EVER_RETRANS | TCPCB_AT_TAIL);
                if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_RETRANS)
                        tp->retrans_out--;
                if (TCP_SKB_CB(next_skb)->sacked & TCPCB_LOST) {
                        tp->lost_out--;
                        tp->left_out--;
                }
                if (!tp->sack_ok && tp->sacked_out) {
                        /* Reno case is special. Sigh... */
                        tp->sacked_out--;
                        tp->left_out--;
                }

                /* Not quite right: it can be > snd.fack, but
                 * it is better to underestimate fackets.
                 */
                if (tp->fackets_out)
                        tp->fackets_out--;
                tcp_free_skb(sk, next_skb);
                tp->packets_out--;
        }
}

/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer. This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;
        unsigned int mss = tcp_current_mss(sk);
        int lost = 0;

        for_retrans_queue(skb, sk, tp) {
                if (skb->len > mss &&
                    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
                        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
                                TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
                                tp->retrans_out--;
                        }
                        if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) {
                                TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                                tp->lost_out++;
                                lost = 1;
                        }
                }
        }

        if (!lost)
                return;

        tp->left_out = tp->sacked_out + tp->lost_out;

        /* Don't muck with the congestion window here.
         * Reason is that we do not increase amount of _data_
         * in network, but units changed and effective
         * cwnd/ssthresh really reduced now.
         */
        if (tp->ca_state != TCP_CA_Loss) {
                tp->high_seq = tp->snd_nxt;
                tp->snd_ssthresh = tcp_current_ssthresh(tp);
                tp->prior_ssthresh = 0;
                tp->undo_marker = 0;
                tp->ca_state = TCP_CA_Loss;
        }
        tcp_xmit_retransmit_queue(sk);
}

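/* Editor's note (illustration, not in the original source): after a PMTU
 * drop from 1500 to 1006, every queued segment longer than the new mss is
 * tagged TCPCB_LOST above and re-sent in smaller pieces via
 * tcp_xmit_retransmit_queue()/tcp_retransmit_skb(), without the
 * exponential backoff the retransmit timer would apply.
 */
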
/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        unsigned int cur_mss = tcp_current_mss(sk);
        int err;

        /* Do not send more than we queued. 1/4 is reserved for possible
         * copying overhead: fragmentation, tunneling, mangling etc.
         */
        if (atomic_read(&sk->wmem_alloc) > min(sk->wmem_queued + (sk->wmem_queued >> 2), sk->sndbuf))
                return -EAGAIN;

        if (skb->len > cur_mss) {
                if (tcp_fragment(sk, skb, cur_mss))
                        return -ENOMEM; /* We'll try again later. */

                /* New SKB created, account for it. */
                tp->packets_out++;
        }

        /* Collapse two adjacent packets if worthwhile and we can. */
        if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
            (skb->len < (cur_mss >> 1)) &&
            (skb->next != tp->send_head) &&
            (skb->next != (struct sk_buff *) &sk->write_queue) &&
            (sysctl_tcp_retrans_collapse != 0))
                tcp_retrans_try_collapse(sk, skb, cur_mss);

        if (tp->af_specific->rebuild_header(sk))
                return -EHOSTUNREACH; /* Routing failure or similar. */

        /* Some Solaris stacks overoptimize and ignore the FIN on a
         * retransmit when old data is attached.  So strip it off
         * since it is cheap to do so and saves bytes on the network.
         */
        if (skb->len > 0 &&
            (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
            tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
                TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
                skb_trim(skb, 0);
                skb->csum = 0;
        }

        /* Make a copy, if the first transmission SKB clone we made
         * is still in somebody's hands, else make a clone.
         */
        TCP_SKB_CB(skb)->when = tcp_time_stamp;

        err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
                                    skb_copy(skb, GFP_ATOMIC) :
                                    skb_clone(skb, GFP_ATOMIC)));

        if (err == 0) {
                /* Update global TCP statistics. */
                TCP_INC_STATS(TcpRetransSegs);

#if FASTRETRANS_DEBUG > 0
                if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
                        if (net_ratelimit())
                                printk(KERN_DEBUG "retrans_out leaked.\n");
                }
#endif
                TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
                tp->retrans_out++;

                /* Save stamp of the first retransmit. */
                if (!tp->retrans_stamp)
                        tp->retrans_stamp = TCP_SKB_CB(skb)->when;

                tp->undo_retrans++;

                /* snd_nxt is stored to detect loss of retransmitted segment,
                 * see tcp_input.c tcp_sacktag_write_queue().
                 */
                TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
        }
        return err;
}

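/* Editor's note (illustration, not in the original source): the -EAGAIN
 * test at the top of tcp_retransmit_skb() allows roughly 25% slack for
 * clones and copies; with wmem_queued = 65536 and a larger sndbuf,
 * retransmits are deferred once wmem_alloc exceeds 81920 bytes.
 */
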
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;
        int packet_cnt = tp->lost_out;

        /* First pass: retransmit lost packets. */
        if (packet_cnt) {
                for_retrans_queue(skb, sk, tp) {
                        __u8 sacked = TCP_SKB_CB(skb)->sacked;

                        if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
                                return;

                        if (sacked & TCPCB_LOST) {
                                if (!(sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS))) {
                                        if (tcp_retransmit_skb(sk, skb))
                                                return;
                                        if (tp->ca_state != TCP_CA_Loss)
                                                NET_INC_STATS_BH(TCPFastRetrans);
                                        else
                                                NET_INC_STATS_BH(TCPSlowStartRetrans);

                                        if (skb == skb_peek(&sk->write_queue))
                                                tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
                                }

                                if (--packet_cnt <= 0)
                                        break;
                        }
                }
        }

        /* OK, demanded retransmission is finished. */

        /* Forward retransmissions are possible only during Recovery. */
        if (tp->ca_state != TCP_CA_Recovery)
                return;

        /* No forward retransmissions in Reno are possible. */
        if (!tp->sack_ok)
                return;

        /* Yeah, we have to make a difficult choice between forward
         * transmission and retransmission... Both ways have their merits...
         *
         * For now we do not retransmit anything, while we have some new
         * segments to send.
         */

        if (tcp_may_send_now(sk, tp))
                return;

        packet_cnt = 0;

        for_retrans_queue(skb, sk, tp) {
                if (++packet_cnt > tp->fackets_out)
                        break;

                if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
                        break;

                if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
                        continue;

                /* Ok, retransmit it. */
                if (tcp_retransmit_skb(sk, skb))
                        break;

                if (skb == skb_peek(&sk->write_queue))
                        tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);

                NET_INC_STATS_BH(TCPForwardRetrans);
        }
}

/* Send a fin.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
        unsigned int mss_now;

        /* Optimization, tack on the FIN if we have a queue of
         * unsent frames.  But be careful about outgoing SACKS
         * and IP options.
         */
        mss_now = tcp_current_mss(sk);

        /* Please, find the seven differences from 2.3.33 and look at
         * what I broke here. 8) --ANK
         */

        if (tp->send_head != NULL) {
                /* tcp_write_xmit() takes care of the rest. */
                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
                TCP_SKB_CB(skb)->end_seq++;
                tp->write_seq++;

                /* Special case to avoid Nagle bogosity.  If this
                 * segment is the last segment, and it was queued
                 * due to Nagle/SWS-avoidance, send it out now.
                 */
                if (tp->send_head == skb &&
                    !after(tp->write_seq, tp->snd_una + tp->snd_wnd)) {
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        if (!tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)))
                                update_send_head(sk, tp, skb);
                        else
                                tcp_check_probe_timer(sk, tp);
                }
        } else {
                /* Socket is locked, keep trying until memory is available. */
                for (;;) {
                        skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
                        if (skb)
                                break;
                        current->policy |= SCHED_YIELD;
                        schedule();
                }

                /* Reserve space for headers and prepare control bits. */
                skb_reserve(skb, MAX_TCP_HEADER);
                skb->csum = 0;
                TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
                TCP_SKB_CB(skb)->sacked = 0;

                /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
                TCP_SKB_CB(skb)->seq = tp->write_seq;
                TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
                tcp_send_skb(sk, skb, 0, mss_now);
                __tcp_push_pending_frames(sk, tp, mss_now, 1);
        }
}

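/* Editor's note (illustration, not in the original source): both branches
 * of tcp_send_fin() advance write_seq by exactly one because the FIN
 * consumes one unit of sequence space; e.g. with write_seq = 5000 the FIN
 * goes out as seq 5000, end_seq 5001, and the peer's ACK must reach 5001.
 */
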
/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk, int priority)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;

        /* NOTE: No TCP options attached and we never retransmit this. */
        skb = alloc_skb(MAX_TCP_HEADER, priority);
        if (!skb) {
                NET_INC_STATS(TCPAbortFailed);
                return;
        }

        /* Reserve space for headers and prepare control bits. */
        skb_reserve(skb, MAX_TCP_HEADER);
        skb->csum = 0;
        TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
        TCP_SKB_CB(skb)->sacked = 0;

        /* Send it off. */
        TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        if (tcp_transmit_skb(sk, skb))
                NET_INC_STATS(TCPAbortFailed);
}

/* WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called. If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
        struct sk_buff *skb;

        skb = skb_peek(&sk->write_queue);
        if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) {
                printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
                return -EFAULT;
        }
        if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) {
                if (skb_cloned(skb)) {
                        struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
                        if (nskb == NULL)
                                return -ENOMEM;
                        __skb_unlink(skb, &sk->write_queue);
                        __skb_queue_head(&sk->write_queue, nskb);
                        tcp_free_skb(sk, skb);
                        tcp_charge_skb(sk, nskb);
                        skb = nskb;
                }

                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
                TCP_ECN_send_synack(&sk->tp_pinfo.af_tcp, skb);
        }
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
}

/*
 * Prepare a SYN-ACK.
 */
struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
                                struct open_request *req)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct tcphdr *th;
        int tcp_header_size;
        struct sk_buff *skb;

        skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
        if (skb == NULL)
                return NULL;

        /* Reserve space for headers. */
        skb_reserve(skb, MAX_TCP_HEADER);

        skb->dst = dst_clone(dst);

        tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
                           (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
                           (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
                           /* SACK_PERM is in the place of NOP NOP of TS */
                           ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
        skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);

        memset(th, 0, sizeof(struct tcphdr));
        th->syn = 1;
        th->ack = 1;
        TCP_ECN_make_synack(req, th);
        th->source = sk->sport;
        th->dest = req->rmt_port;
        TCP_SKB_CB(skb)->seq = req->snt_isn;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
        th->seq = htonl(TCP_SKB_CB(skb)->seq);
        th->ack_seq = htonl(req->rcv_isn + 1);
        if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
                __u8 rcv_wscale;
                /* Set this up on the first call only */
                req->window_clamp = tp->window_clamp ? : dst->window;
                /* tcp_full_space because it is guaranteed to be the first packet */
                tcp_select_initial_window(tcp_full_space(sk),
                        dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
                        &req->rcv_wnd,
                        &req->window_clamp,
                        req->wscale_ok,
                        &rcv_wscale);
                req->rcv_wscale = rcv_wscale;
        }

        /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
        th->window = htons(req->rcv_wnd);

        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok,
                              req->sack_ok, req->wscale_ok, req->rcv_wscale,
                              TCP_SKB_CB(skb)->when,
                              req->ts_recent);

        skb->csum = 0;
        th->doff = (tcp_header_size >> 2);
        TCP_INC_STATS(TcpOutSegs);
        return skb;
}

int tcp_connect(struct sock *sk, struct sk_buff *buff)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        /* Reserve space for headers. */
        skb_reserve(buff, MAX_TCP_HEADER);

        /* We'll fix this up when we get a response from the other end.
         * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
         */
        tp->tcp_header_len = sizeof(struct tcphdr) +
                (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

        /* If user gave his TCP_MAXSEG, record it to clamp */
        if (tp->user_mss)
                tp->mss_clamp = tp->user_mss;
        tp->max_window = 0;
        tcp_sync_mss(sk, dst->pmtu);

        if (!tp->window_clamp)
                tp->window_clamp = dst->window;
        tp->advmss = dst->advmss;
        tcp_initialize_rcv_mss(sk);

        tcp_select_initial_window(tcp_full_space(sk),
                tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
                &tp->rcv_wnd,
                &tp->window_clamp,
                sysctl_tcp_window_scaling,
                &tp->rcv_wscale);

        tp->rcv_ssthresh = tp->rcv_wnd;

        /* Socket identity change complete, no longer
         * in TCP_CLOSE, so enter ourselves into the
         * hash tables.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        if (tp->af_specific->hash_connecting(sk))
                goto err_out;

        sk->err = 0;
        sk->done = 0;
        tp->snd_wnd = 0;
        tcp_init_wl(tp, tp->write_seq, 0);
        tp->snd_una = tp->write_seq;
        tp->snd_sml = tp->write_seq;
        tp->rcv_nxt = 0;
        tp->rcv_wup = 0;
        tp->copied_seq = 0;

        tp->rto = TCP_TIMEOUT_INIT;
        tp->retransmits = 0;
        tcp_clear_retrans(tp);

        TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
        TCP_ECN_send_syn(tp, buff);
        TCP_SKB_CB(buff)->sacked = 0;
        buff->csum = 0;
        TCP_SKB_CB(buff)->seq = tp->write_seq++;
        TCP_SKB_CB(buff)->end_seq = tp->write_seq;
        tp->snd_nxt = tp->write_seq;
        tp->pushed_seq = tp->write_seq;

        /* Send it off. */
        TCP_SKB_CB(buff)->when = tcp_time_stamp;
        tp->retrans_stamp = TCP_SKB_CB(buff)->when;
        __skb_queue_tail(&sk->write_queue, buff);
        tcp_charge_skb(sk, buff);
        tp->packets_out++;
        tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
        TCP_INC_STATS(TcpActiveOpens);

        /* Timer for repeating the SYN until an answer. */
        tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
        return 0;

err_out:
        tcp_set_state(sk, TCP_CLOSE);
        kfree_skb(buff);
        return -EADDRNOTAVAIL;
}

/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        int ato = tp->ack.ato;
        unsigned long timeout;

        if (ato > TCP_DELACK_MIN) {
                int max_ato = HZ / 2;

                if (tp->ack.pingpong || (tp->ack.pending & TCP_ACK_PUSHED))
                        max_ato = TCP_DELACK_MAX;

                /* Slow path, intersegment interval is "high". */

                /* If some rtt estimate is known, use it to bound delayed ack.
                 * Do not use tp->rto here, use results of rtt measurements
                 * directly.
                 */
                if (tp->srtt) {
                        int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);

                        if (rtt < max_ato)
                                max_ato = rtt;
                }

                ato = min(ato, max_ato);
        }

        /* Stay within the limit we were given */
        timeout = jiffies + ato;

        /* Use the new timeout only if there wasn't an older one already. */
        if (tp->ack.pending & TCP_ACK_TIMER) {
                /* If delack timer was blocked or is about to expire,
                 * send ACK now.
                 */
                if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies + (ato >> 2))) {
                        tcp_send_ack(sk);
                        return;
                }

                if (!time_before(timeout, tp->ack.timeout))
                        timeout = tp->ack.timeout;
        }
        tp->ack.pending |= TCP_ACK_SCHED | TCP_ACK_TIMER;
        tp->ack.timeout = timeout;
        if (!mod_timer(&tp->delack_timer, timeout))
                sock_hold(sk);

#ifdef TCP_FORMAL_WINDOW
        /* Explanation. Header prediction path does not handle
         * the case of a zero window. If we send an ACK immediately,
         * pred_flags are reset when sending the ACK. If rcv_nxt is
         * advanced and the ack is not sent, then a delayed ack is
         * scheduled. Hence, this is the best place to check for a
         * zero window.
         */
        if (tp->pred_flags) {
                if (tcp_receive_window(tp) == 0)
                        tp->pred_flags = 0;
        } else {
                if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
                    !tp->urg_data)
                        tcp_fast_path_on(tp);
        }
#endif
}

/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
        /* If we have been reset, we may not send again. */
        if (sk->state != TCP_CLOSE) {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                struct sk_buff *buff;

                /* We are not putting this on the write queue, so
                 * tcp_transmit_skb() will set the ownership to this
                 * sock.
                 */
                buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
                if (buff == NULL) {
                        tcp_schedule_ack(tp);
                        tp->ack.ato = TCP_ATO_MIN;
                        tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
                        return;
                }

                /* Reserve space for headers and prepare control bits. */
                skb_reserve(buff, MAX_TCP_HEADER);
                buff->csum = 0;
                TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
                TCP_SKB_CB(buff)->sacked = 0;

                /* Send it off, this clears delayed acks for us. */
                TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
                TCP_SKB_CB(buff)->when = tcp_time_stamp;
                tcp_transmit_skb(sk, buff);
        }
}

/* This routine sends a packet with an out of date sequence
 * number. It assumes the other end will try to ack it.
 *
 * Question: what should we do while in urgent mode?
 * 4.4BSD forces sending a single byte of data. We cannot send
 * out of window data, because we have SND.NXT==SND.MAX...
 *
 * Current solution: send TWO zero-length segments in urgent mode:
 * one with SEG.SEQ=SND.UNA to deliver the urgent pointer, another
 * out-of-date with SND.UNA-1 to probe the window.
 */
static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;

        /* We don't queue it, tcp_transmit_skb() sets ownership. */
        skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
        if (skb == NULL)
                return -1;

        /* Reserve space for headers and set control bits. */
        skb_reserve(skb, MAX_TCP_HEADER);
        skb->csum = 0;
        TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
        TCP_SKB_CB(skb)->sacked = urgent;

        /* Use a previous sequence.  This should cause the other
         * end to send an ack.  Don't queue or clone SKB, just
         * send it.
         */
        TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        return tcp_transmit_skb(sk, skb);
}

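/* Editor's note (illustration, not in the original source): with
 * snd_una = 4000 and urgent mode active, tcp_write_wakeup() below emits
 * two such probes: one at seq 4000 to re-deliver the urgent pointer and
 * one at seq 3999, whose out-of-date sequence forces the peer to ACK and
 * thereby report its current window.
 */
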
int tcp_write_wakeup(struct sock *sk)
{
        if (sk->state != TCP_CLOSE) {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                struct sk_buff *skb;

                if ((skb = tp->send_head) != NULL &&
                    before(TCP_SKB_CB(skb)->seq, tp->snd_una + tp->snd_wnd)) {
                        int err;
                        int mss = tcp_current_mss(sk);
                        int seg_size = tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq;

                        if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
                                tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

                        /* We are probing the opening of a window
                         * but the window size is != 0; this must have
                         * been the result of sender-side SWS avoidance.
                         */
                        if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
                            skb->len > mss) {
                                seg_size = min(seg_size, mss);
                                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
                                if (tcp_fragment(sk, skb, seg_size))
                                        return -1;
                        }
                        TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
                        if (!err) {
                                update_send_head(sk, tp, skb);
                        }
                        return err;
                } else {
                        if (tp->urg_mode &&
                            between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
                                tcp_xmit_probe_skb(sk, TCPCB_URG);
                        return tcp_xmit_probe_skb(sk, 0);
                }
        }
        return -1;
}

/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        int err;

        err = tcp_write_wakeup(sk);

        if (tp->packets_out || !tp->send_head) {
                /* Cancel probe timer, if it is not required. */
                tp->probes_out = 0;
                tp->backoff = 0;
                return;
        }

        if (err <= 0) {
                tp->backoff++;
                tp->probes_out++;
                tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
                                     min(tp->rto << tp->backoff, TCP_RTO_MAX));
        } else {
                /* If the packet was not sent due to local congestion,
                 * do not back off and do not remember probes_out.
                 * Let local senders fight for local resources.
                 *
                 * Still use the accumulated backoff, though.
                 */
                if (!tp->probes_out)
                        tp->probes_out = 1;
                tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
                                     min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
        }
}