- Revert TCP delayed ACK fix, and fix correctly.
[davej-history.git] / net / ipv4 / tcp_timer.c
blob330f8e992fe657001c19fa75b996a7966f8e25f1
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.39 1998/03/13 08:02:17 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */
23 #include <net/tcp.h>
25 int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
26 int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
27 int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
28 int sysctl_tcp_retries1 = TCP_RETR1;
29 int sysctl_tcp_retries2 = TCP_RETR2;
31 static voidtcp_sltimer_handler(unsigned long);
32 static voidtcp_syn_recv_timer(unsigned long);
33 static voidtcp_keepalive(unsigned long data);
34 static voidtcp_bucketgc(unsigned long);
36 struct timer_list tcp_slow_timer = {
37 NULL, NULL,
38 0,0,
39 tcp_sltimer_handler,
43 struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = {
44 {ATOMIC_INIT(0), TCP_SYNACK_PERIOD,0, tcp_syn_recv_timer},/* SYNACK */
45 {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD,0, tcp_keepalive},/* KEEPALIVE */
46 {ATOMIC_INIT(0), TCP_BUCKETGC_PERIOD,0, tcp_bucketgc}/* BUCKETGC */
49 const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
/*
 * Using different timers for retransmit, delayed acks and probes.
 * We may wish to use just one timer maintaining a list of expire jiffies
 * to optimize.
 */
57 voidtcp_init_xmit_timers(struct sock *sk)
59 init_timer(&sk->tp_pinfo.af_tcp.retransmit_timer);
60 sk->tp_pinfo.af_tcp.retransmit_timer.function=&tcp_retransmit_timer;
61 sk->tp_pinfo.af_tcp.retransmit_timer.data = (unsigned long) sk;
63 init_timer(&sk->tp_pinfo.af_tcp.delack_timer);
64 sk->tp_pinfo.af_tcp.delack_timer.function=&tcp_delack_timer;
65 sk->tp_pinfo.af_tcp.delack_timer.data = (unsigned long) sk;
67 init_timer(&sk->tp_pinfo.af_tcp.probe_timer);
68 sk->tp_pinfo.af_tcp.probe_timer.function=&tcp_probe_timer;
69 sk->tp_pinfo.af_tcp.probe_timer.data = (unsigned long) sk;
73 * Reset the retransmission timer
76 voidtcp_reset_xmit_timer(struct sock *sk,int what,unsigned long when)
78 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
80 if((long)when <=0) {
81 printk(KERN_DEBUG "xmit_timer <= 0 - timer:%d when:%lx\n", what, when);
82 when=HZ/50;
85 switch(what) {
86 case TIME_RETRANS:
87 /* When seting the transmit timer the probe timer
88 * should not be set.
89 * The delayed ack timer can be set if we are changing the
90 * retransmit timer when removing acked frames.
92 if(tp->probe_timer.prev)
93 del_timer(&tp->probe_timer);
94 if(tp->retransmit_timer.prev)
95 del_timer(&tp->retransmit_timer);
96 tp->retransmit_timer.expires=jiffies+when;
97 add_timer(&tp->retransmit_timer);
98 break;
100 case TIME_DACK:
101 if(tp->delack_timer.prev)
102 del_timer(&tp->delack_timer);
103 tp->delack_timer.expires=jiffies+when;
104 add_timer(&tp->delack_timer);
105 break;
107 case TIME_PROBE0:
108 if(tp->probe_timer.prev)
109 del_timer(&tp->probe_timer);
110 tp->probe_timer.expires=jiffies+when;
111 add_timer(&tp->probe_timer);
112 break;
114 case TIME_WRITE:
115 printk(KERN_DEBUG "bug: tcp_reset_xmit_timer TIME_WRITE\n");
116 break;
118 default:
119 printk(KERN_DEBUG "bug: unknown timer value\n");
123 voidtcp_clear_xmit_timers(struct sock *sk)
125 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
127 if(tp->retransmit_timer.prev)
128 del_timer(&tp->retransmit_timer);
129 if(tp->delack_timer.prev)
130 del_timer(&tp->delack_timer);
131 if(tp->probe_timer.prev)
132 del_timer(&tp->probe_timer);
135 static inttcp_write_err(struct sock *sk,int force)
137 sk->err = sk->err_soft ? sk->err_soft : ETIMEDOUT;
138 sk->error_report(sk);
140 tcp_clear_xmit_timers(sk);
142 /* Time wait the socket. */
143 if(!force && ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING))) {
144 tcp_time_wait(sk);
145 }else{
146 /* Clean up time. */
147 tcp_set_state(sk, TCP_CLOSE);
148 return0;
150 return1;
154 * A write timeout has occurred. Process the after effects. BROKEN (badly)
157 static inttcp_write_timeout(struct sock *sk)
159 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
162 * Look for a 'soft' timeout.
164 if((sk->state == TCP_ESTABLISHED &&
165 tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) ==0) ||
166 (sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) {
167 dst_negative_advice(&sk->dst_cache);
170 /* Have we tried to SYN too many times (repent repent 8)) */
171 if(tp->retransmits > sysctl_tcp_syn_retries && sk->state==TCP_SYN_SENT) {
172 tcp_write_err(sk,1);
173 /* Don't FIN, we got nothing back */
174 return0;
177 /* Has it gone just too far? */
178 if(tp->retransmits > sysctl_tcp_retries2)
179 returntcp_write_err(sk,0);
181 return1;
184 voidtcp_delack_timer(unsigned long data)
186 struct sock *sk = (struct sock*)data;
188 if(sk->zapped)
189 return;
191 if(sk->tp_pinfo.af_tcp.delayed_acks)
192 tcp_read_wakeup(sk);
195 voidtcp_probe_timer(unsigned long data)
197 struct sock *sk = (struct sock*)data;
198 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
200 if(sk->zapped)
201 return;
203 if(sk->sock_readers) {
204 /* Try again in second. */
205 tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ);
206 return;
210 * *WARNING* RFC 1122 forbids this
211 * It doesn't AFAIK, because we kill the retransmit timer -AK
212 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
213 * this behaviour in Solaris down as a bug fix. [AC]
215 if(tp->probes_out > sysctl_tcp_retries2) {
216 if(sk->err_soft)
217 sk->err = sk->err_soft;
218 else
219 sk->err = ETIMEDOUT;
220 sk->error_report(sk);
222 if((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) {
223 /* Time wait the socket. */
224 tcp_time_wait(sk);
225 }else{
226 /* Clean up time. */
227 tcp_set_state(sk, TCP_CLOSE);
231 tcp_send_probe0(sk);
234 static __inline__ inttcp_keepopen_proc(struct sock *sk)
236 int res =0;
238 if((1<<sk->state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2)) {
239 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
240 __u32 elapsed = jiffies - tp->rcv_tstamp;
242 if(elapsed >= sysctl_tcp_keepalive_time) {
243 if(tp->probes_out > sysctl_tcp_keepalive_probes) {
244 if(sk->err_soft)
245 sk->err = sk->err_soft;
246 else
247 sk->err = ETIMEDOUT;
249 tcp_set_state(sk, TCP_CLOSE);
250 }else{
251 tp->probes_out++;
252 tp->pending = TIME_KEEPOPEN;
253 tcp_write_wakeup(sk);
254 res =1;
258 return res;
261 /* Garbage collect TCP bind buckets. */
262 static voidtcp_bucketgc(unsigned long __unused)
264 int i;
266 for(i =0; i < TCP_BHTABLE_SIZE; i++) {
267 struct tcp_bind_bucket *tb = tcp_bound_hash[i];
269 while(tb) {
270 struct tcp_bind_bucket *next = tb->next;
272 if((tb->owners == NULL) &&
273 !(tb->flags & TCPB_FLAG_LOCKED)) {
274 /* Eat timer reference. */
275 tcp_dec_slow_timer(TCP_SLT_BUCKETGC);
277 /* Unlink bucket. */
278 if(tb->next)
279 tb->next->pprev = tb->pprev;
280 *tb->pprev = tb->next;
282 /* Finally, free it up. */
283 kmem_cache_free(tcp_bucket_cachep, tb);
285 tb = next;
/*
 * Check all sockets for keepalive timer.
 * Called every 75 seconds.
 * This timer is started by the af_inet init routine and is constantly
 * running.
 *
 * It might be better to maintain a count of sockets that need it using
 * setsockopt/tcp_destroy_sk and only set the timer when needed.
 */
/*
 * Don't send over 5 keepopens at a time to avoid burstiness
 * on big servers [AC]
 */
#define MAX_KA_PROBES	5

int sysctl_tcp_max_ka_probes = MAX_KA_PROBES;
/* Keepopen's are only valid for "established" TCP's, nicely our listener
 * hash gets rid of most of the useless testing, so we run through a couple
 * of the established hash chains each clock tick. -DaveM
 *
 * And now, even more magic... TIME_WAIT TCP's cannot have keepalive probes
 * going off for them, so we only need check the first half of the established
 * hash table, even less testing under heavy load.
 *
 * I _really_ would rather do this by adding a new timer_struct to struct sock,
 * and this way only those who set the keepalive option will get the overhead.
 * The idea is you set it for 2 hours when the sock is first connected, when it
 * does fire off (if at all, most sockets die earlier) you check for the keepalive
 * option and also if the sock has been idle long enough to start probing.
 */
322 static voidtcp_keepalive(unsigned long data)
324 static int chain_start =0;
325 int count =0;
326 int i;
328 for(i = chain_start; i < (chain_start + ((TCP_HTABLE_SIZE/2) >>2)); i++) {
329 struct sock *sk = tcp_established_hash[i];
330 while(sk) {
331 if(sk->keepopen) {
332 count +=tcp_keepopen_proc(sk);
333 if(count == sysctl_tcp_max_ka_probes)
334 goto out;
336 sk = sk->next;
339 out:
340 chain_start = ((chain_start + ((TCP_HTABLE_SIZE/2)>>2)) &
341 ((TCP_HTABLE_SIZE/2) -1));
345 * The TCP retransmit timer. This lacks a few small details.
347 * 1. An initial rtt timeout on the probe0 should cause what we can
348 * of the first write queue buffer to be split and sent.
349 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report
350 * ETIMEDOUT if we know an additional 'soft' error caused this.
351 * tcp_err should save a 'soft error' for us.
352 * [Unless someone has broken it then it does, except for one 2.0
353 * broken case of a send when the route/device is directly unreachable,
354 * and we error but should retry! - FIXME] [AC]
357 voidtcp_retransmit_timer(unsigned long data)
359 struct sock *sk = (struct sock*)data;
360 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
362 /* We are reset. We will send no more retransmits. */
363 if(sk->zapped) {
364 tcp_clear_xmit_timer(sk, TIME_RETRANS);
365 return;
368 if(sk->sock_readers) {
369 /* Try again in a second. */
370 tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ);
371 return;
373 lock_sock(sk);
375 /* Clear delay ack timer. */
376 tcp_clear_xmit_timer(sk, TIME_DACK);
378 /* Retransmission. */
379 tp->retrans_head = NULL;
380 if(tp->retransmits ==0) {
381 /* remember window where we lost
382 * "one half of the current window but at least 2 segments"
384 tp->snd_ssthresh =max(tp->snd_cwnd >>1,2);
385 tp->snd_cwnd_cnt =0;
386 tp->snd_cwnd =1;
389 tp->retransmits++;
391 tp->dup_acks =0;
392 tp->high_seq = tp->snd_nxt;
393 tcp_do_retransmit(sk,0);
395 /* Increase the timeout each time we retransmit. Note that
396 * we do not increase the rtt estimate. rto is initialized
397 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
398 * that doubling rto each time is the least we can get away with.
399 * In KA9Q, Karn uses this for the first few times, and then
400 * goes to quadratic. netBSD doubles, but only goes up to *64,
401 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
402 * defined in the protocol as the maximum possible RTT. I guess
403 * we'll have to use something other than TCP to talk to the
404 * University of Mars.
406 * PAWS allows us longer timeouts and large windows, so once
407 * implemented ftp to mars will work nicely. We will have to fix
408 * the 120 second clamps though!
410 tp->backoff++;/* FIXME: always same as retransmits? -- erics */
411 tp->rto =min(tp->rto <<1,120*HZ);
412 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
414 tcp_write_timeout(sk);
416 release_sock(sk);
420 * Slow timer for SYN-RECV sockets
423 /* This now scales very nicely. -DaveM */
424 static voidtcp_syn_recv_timer(unsigned long data)
426 struct sock *sk;
427 unsigned long now = jiffies;
428 int i;
430 for(i =0; i < TCP_LHTABLE_SIZE; i++) {
431 sk = tcp_listening_hash[i];
433 while(sk) {
434 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
436 /* TCP_LISTEN is implied. */
437 if(!sk->sock_readers && tp->syn_wait_queue) {
438 struct open_request *prev = (struct open_request *)(&tp->syn_wait_queue);
439 struct open_request *req = tp->syn_wait_queue;
441 struct open_request *conn;
443 conn = req;
444 req = req->dl_next;
446 if(conn->sk) {
447 prev = conn;
448 continue;
451 if((long)(now - conn->expires) <=0)
452 break;
454 tcp_synq_unlink(tp, conn, prev);
455 if(conn->retrans >= sysctl_tcp_retries1) {
456 #ifdef TCP_DEBUG
457 printk(KERN_DEBUG "syn_recv: "
458 "too many retransmits\n");
459 #endif
460 (*conn->class->destructor)(conn);
461 tcp_dec_slow_timer(TCP_SLT_SYNACK);
462 sk->ack_backlog--;
463 tcp_openreq_free(conn);
465 if(!tp->syn_wait_queue)
466 break;
467 }else{
468 __u32 timeo;
469 struct open_request *op;
471 (*conn->class->rtx_syn_ack)(sk, conn);
473 conn->retrans++;
474 #ifdef TCP_DEBUG
475 printk(KERN_DEBUG "syn_ack rtx %d\n",
476 conn->retrans);
477 #endif
478 timeo =min((TCP_TIMEOUT_INIT
479 << conn->retrans),
480 120*HZ);
481 conn->expires = now + timeo;
482 op = prev->dl_next;
483 tcp_synq_queue(tp, conn);
484 if(op != prev->dl_next)
485 prev = prev->dl_next;
487 /* old prev still valid here */
488 }while(req);
490 sk = sk->next;
495 voidtcp_sltimer_handler(unsigned long data)
497 struct tcp_sl_timer *slt = tcp_slt_array;
498 unsigned long next = ~0UL;
499 unsigned long now = jiffies;
500 int i;
502 for(i=0; i < TCP_SLT_MAX; i++, slt++) {
503 if(atomic_read(&slt->count)) {
504 long trigger;
506 trigger = slt->period - ((long)(now - slt->last));
508 if(trigger <=0) {
509 (*slt->handler)((unsigned long) slt);
510 slt->last = now;
511 trigger = slt->period;
513 next =min(next, trigger);
517 if(next != ~0UL) {
518 tcp_slow_timer.expires = now + next;
519 add_timer(&tcp_slow_timer);
523 void__tcp_inc_slow_timer(struct tcp_sl_timer *slt)
525 unsigned long now = jiffies;
526 unsigned long next =0;
527 unsigned long when;
529 slt->last = now;
531 when = now + slt->period;
532 if(del_timer(&tcp_slow_timer))
533 next = tcp_slow_timer.expires;
535 if(next && ((long)(next - when) <0))
536 when = next;
538 tcp_slow_timer.expires = when;
539 add_timer(&tcp_slow_timer);
close