2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 6 * Implementation of the Transmission Control Protocol(TCP). 8 * Version: $Id: tcp_timer.c,v 1.39 1998/03/13 08:02:17 davem Exp $ 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 25 int sysctl_tcp_syn_retries
= TCP_SYN_RETRIES
; 26 int sysctl_tcp_keepalive_time
= TCP_KEEPALIVE_TIME
; 27 int sysctl_tcp_keepalive_probes
= TCP_KEEPALIVE_PROBES
; 28 int sysctl_tcp_retries1
= TCP_RETR1
; 29 int sysctl_tcp_retries2
= TCP_RETR2
; 31 static voidtcp_sltimer_handler(unsigned long); 32 static voidtcp_syn_recv_timer(unsigned long); 33 static voidtcp_keepalive(unsigned long data
); 34 static voidtcp_bucketgc(unsigned long); 36 struct timer_list tcp_slow_timer
= { 43 struct tcp_sl_timer tcp_slt_array
[TCP_SLT_MAX
] = { 44 {ATOMIC_INIT(0), TCP_SYNACK_PERIOD
,0, tcp_syn_recv_timer
},/* SYNACK */ 45 {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD
,0, tcp_keepalive
},/* KEEPALIVE */ 46 {ATOMIC_INIT(0), TCP_BUCKETGC_PERIOD
,0, tcp_bucketgc
}/* BUCKETGC */ 49 const char timer_bug_msg
[] = KERN_DEBUG
"tcpbug: unknown timer value\n"; 52 * Using different timers for retransmit, delayed acks and probes 53 * We may wish use just one timer maintaining a list of expire jiffies 57 voidtcp_init_xmit_timers(struct sock
*sk
) 59 init_timer(&sk
->tp_pinfo
.af_tcp
.retransmit_timer
); 60 sk
->tp_pinfo
.af_tcp
.retransmit_timer
.function
=&tcp_retransmit_timer
; 61 sk
->tp_pinfo
.af_tcp
.retransmit_timer
.data
= (unsigned long) sk
; 63 init_timer(&sk
->tp_pinfo
.af_tcp
.delack_timer
); 64 sk
->tp_pinfo
.af_tcp
.delack_timer
.function
=&tcp_delack_timer
; 65 sk
->tp_pinfo
.af_tcp
.delack_timer
.data
= (unsigned long) sk
; 67 init_timer(&sk
->tp_pinfo
.af_tcp
.probe_timer
); 68 sk
->tp_pinfo
.af_tcp
.probe_timer
.function
=&tcp_probe_timer
; 69 sk
->tp_pinfo
.af_tcp
.probe_timer
.data
= (unsigned long) sk
; 73 * Reset the retransmission timer 76 voidtcp_reset_xmit_timer(struct sock
*sk
,int what
,unsigned long when
) 78 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
; 81 printk(KERN_DEBUG
"xmit_timer <= 0 - timer:%d when:%lx\n", what
, when
); 87 /* When seting the transmit timer the probe timer 89 * The delayed ack timer can be set if we are changing the 90 * retransmit timer when removing acked frames. 92 if(tp
->probe_timer
.prev
) 93 del_timer(&tp
->probe_timer
); 94 if(tp
->retransmit_timer
.prev
) 95 del_timer(&tp
->retransmit_timer
); 96 tp
->retransmit_timer
.expires
=jiffies
+when
; 97 add_timer(&tp
->retransmit_timer
); 101 if(tp
->delack_timer
.prev
) 102 del_timer(&tp
->delack_timer
); 103 tp
->delack_timer
.expires
=jiffies
+when
; 104 add_timer(&tp
->delack_timer
); 108 if(tp
->probe_timer
.prev
) 109 del_timer(&tp
->probe_timer
); 110 tp
->probe_timer
.expires
=jiffies
+when
; 111 add_timer(&tp
->probe_timer
); 115 printk(KERN_DEBUG
"bug: tcp_reset_xmit_timer TIME_WRITE\n"); 119 printk(KERN_DEBUG
"bug: unknown timer value\n"); 123 voidtcp_clear_xmit_timers(struct sock
*sk
) 125 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
; 127 if(tp
->retransmit_timer
.prev
) 128 del_timer(&tp
->retransmit_timer
); 129 if(tp
->delack_timer
.prev
) 130 del_timer(&tp
->delack_timer
); 131 if(tp
->probe_timer
.prev
) 132 del_timer(&tp
->probe_timer
); 135 static inttcp_write_err(struct sock
*sk
,int force
) 137 sk
->err
= sk
->err_soft
? sk
->err_soft
: ETIMEDOUT
; 138 sk
->error_report(sk
); 140 tcp_clear_xmit_timers(sk
); 142 /* Time wait the socket. */ 143 if(!force
&& ((1<<sk
->state
) & (TCPF_FIN_WAIT1
|TCPF_FIN_WAIT2
|TCPF_CLOSING
))) { 147 tcp_set_state(sk
, TCP_CLOSE
); 154 * A write timeout has occurred. Process the after effects. BROKEN (badly) 157 static inttcp_write_timeout(struct sock
*sk
) 159 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
); 162 * Look for a 'soft' timeout. 164 if((sk
->state
== TCP_ESTABLISHED
&& 165 tp
->retransmits
&& (tp
->retransmits
% TCP_QUICK_TRIES
) ==0) || 166 (sk
->state
!= TCP_ESTABLISHED
&& tp
->retransmits
> sysctl_tcp_retries1
)) { 167 dst_negative_advice(&sk
->dst_cache
); 170 /* Have we tried to SYN too many times (repent repent 8)) */ 171 if(tp
->retransmits
> sysctl_tcp_syn_retries
&& sk
->state
==TCP_SYN_SENT
) { 173 /* Don't FIN, we got nothing back */ 177 /* Has it gone just too far? */ 178 if(tp
->retransmits
> sysctl_tcp_retries2
) 179 returntcp_write_err(sk
,0); 184 voidtcp_delack_timer(unsigned long data
) 186 struct sock
*sk
= (struct sock
*)data
; 191 if(sk
->tp_pinfo
.af_tcp
.delayed_acks
) 195 voidtcp_probe_timer(unsigned long data
) 197 struct sock
*sk
= (struct sock
*)data
; 198 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
; 203 if(sk
->sock_readers
) { 204 /* Try again in second. */ 205 tcp_reset_xmit_timer(sk
, TIME_PROBE0
, HZ
); 210 * *WARNING* RFC 1122 forbids this 211 * It doesn't AFAIK, because we kill the retransmit timer -AK 212 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing 213 * this behaviour in Solaris down as a bug fix. [AC] 215 if(tp
->probes_out
> sysctl_tcp_retries2
) { 217 sk
->err
= sk
->err_soft
; 220 sk
->error_report(sk
); 222 if((1<<sk
->state
) & (TCPF_FIN_WAIT1
|TCPF_FIN_WAIT2
|TCPF_CLOSING
)) { 223 /* Time wait the socket. */ 227 tcp_set_state(sk
, TCP_CLOSE
); 234 static __inline__
inttcp_keepopen_proc(struct sock
*sk
) 238 if((1<<sk
->state
) & (TCPF_ESTABLISHED
|TCPF_CLOSE_WAIT
|TCPF_FIN_WAIT2
)) { 239 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
; 240 __u32 elapsed
= jiffies
- tp
->rcv_tstamp
; 242 if(elapsed
>= sysctl_tcp_keepalive_time
) { 243 if(tp
->probes_out
> sysctl_tcp_keepalive_probes
) { 245 sk
->err
= sk
->err_soft
; 249 tcp_set_state(sk
, TCP_CLOSE
); 252 tp
->pending
= TIME_KEEPOPEN
; 253 tcp_write_wakeup(sk
); 261 /* Garbage collect TCP bind buckets. */ 262 static voidtcp_bucketgc(unsigned long __unused
) 266 for(i
=0; i
< TCP_BHTABLE_SIZE
; i
++) { 267 struct tcp_bind_bucket
*tb
= tcp_bound_hash
[i
]; 270 struct tcp_bind_bucket
*next
= tb
->next
; 272 if((tb
->owners
== NULL
) && 273 !(tb
->flags
& TCPB_FLAG_LOCKED
)) { 274 /* Eat timer reference. */ 275 tcp_dec_slow_timer(TCP_SLT_BUCKETGC
); 279 tb
->next
->pprev
= tb
->pprev
; 280 *tb
->pprev
= tb
->next
; 282 /* Finally, free it up. */ 283 kmem_cache_free(tcp_bucket_cachep
, tb
); 291 * Check all sockets for keepalive timer 292 * Called every 75 seconds 293 * This timer is started by af_inet init routine and is constantly 296 * It might be better to maintain a count of sockets that need it using 297 * setsockopt/tcp_destroy_sk and only set the timer when needed. 301 * don't send over 5 keepopens at a time to avoid burstiness 302 * on big servers [AC] 304 #define MAX_KA_PROBES 5 306 int sysctl_tcp_max_ka_probes
= MAX_KA_PROBES
; 308 /* Keepopen's are only valid for "established" TCP's, nicely our listener 309 * hash gets rid of most of the useless testing, so we run through a couple 310 * of the established hash chains each clock tick. -DaveM 312 * And now, even more magic... TIME_WAIT TCP's cannot have keepalive probes 313 * going off for them, so we only need check the first half of the established 314 * hash table, even less testing under heavy load. 316 * I _really_ would rather do this by adding a new timer_struct to struct sock, 317 * and this way only those who set the keepalive option will get the overhead. 318 * The idea is you set it for 2 hours when the sock is first connected, when it 319 * does fire off (if at all, most sockets die earlier) you check for the keepalive 320 * option and also if the sock has been idle long enough to start probing. 322 static voidtcp_keepalive(unsigned long data
) 324 static int chain_start
=0; 328 for(i
= chain_start
; i
< (chain_start
+ ((TCP_HTABLE_SIZE
/2) >>2)); i
++) { 329 struct sock
*sk
= tcp_established_hash
[i
]; 332 count
+=tcp_keepopen_proc(sk
); 333 if(count
== sysctl_tcp_max_ka_probes
) 340 chain_start
= ((chain_start
+ ((TCP_HTABLE_SIZE
/2)>>2)) & 341 ((TCP_HTABLE_SIZE
/2) -1)); 345 * The TCP retransmit timer. This lacks a few small details. 347 * 1. An initial rtt timeout on the probe0 should cause what we can 348 * of the first write queue buffer to be split and sent. 349 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report 350 * ETIMEDOUT if we know an additional 'soft' error caused this. 351 * tcp_err should save a 'soft error' for us. 352 * [Unless someone has broken it then it does, except for one 2.0 353 * broken case of a send when the route/device is directly unreachable, 354 * and we error but should retry! - FIXME] [AC] 357 voidtcp_retransmit_timer(unsigned long data
) 359 struct sock
*sk
= (struct sock
*)data
; 360 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
; 362 /* We are reset. We will send no more retransmits. */ 364 tcp_clear_xmit_timer(sk
, TIME_RETRANS
); 368 if(sk
->sock_readers
) { 369 /* Try again in a second. */ 370 tcp_reset_xmit_timer(sk
, TIME_RETRANS
, HZ
); 375 /* Clear delay ack timer. */ 376 tcp_clear_xmit_timer(sk
, TIME_DACK
); 378 /* Retransmission. */ 379 tp
->retrans_head
= NULL
; 380 if(tp
->retransmits
==0) { 381 /* remember window where we lost 382 * "one half of the current window but at least 2 segments" 384 tp
->snd_ssthresh
=max(tp
->snd_cwnd
>>1,2); 392 tp
->high_seq
= tp
->snd_nxt
; 393 tcp_do_retransmit(sk
,0); 395 /* Increase the timeout each time we retransmit. Note that 396 * we do not increase the rtt estimate. rto is initialized 397 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 398 * that doubling rto each time is the least we can get away with. 399 * In KA9Q, Karn uses this for the first few times, and then 400 * goes to quadratic. netBSD doubles, but only goes up to *64, 401 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 402 * defined in the protocol as the maximum possible RTT. I guess 403 * we'll have to use something other than TCP to talk to the 404 * University of Mars. 406 * PAWS allows us longer timeouts and large windows, so once 407 * implemented ftp to mars will work nicely. We will have to fix 408 * the 120 second clamps though! 410 tp
->backoff
++;/* FIXME: always same as retransmits? -- erics */ 411 tp
->rto
=min(tp
->rto
<<1,120*HZ
); 412 tcp_reset_xmit_timer(sk
, TIME_RETRANS
, tp
->rto
); 414 tcp_write_timeout(sk
); 420 * Slow timer for SYN-RECV sockets 423 /* This now scales very nicely. -DaveM */ 424 static voidtcp_syn_recv_timer(unsigned long data
) 427 unsigned long now
= jiffies
; 430 for(i
=0; i
< TCP_LHTABLE_SIZE
; i
++) { 431 sk
= tcp_listening_hash
[i
]; 434 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
; 436 /* TCP_LISTEN is implied. */ 437 if(!sk
->sock_readers
&& tp
->syn_wait_queue
) { 438 struct open_request
*prev
= (struct open_request
*)(&tp
->syn_wait_queue
); 439 struct open_request
*req
= tp
->syn_wait_queue
; 441 struct open_request
*conn
; 451 if((long)(now
- conn
->expires
) <=0) 454 tcp_synq_unlink(tp
, conn
, prev
); 455 if(conn
->retrans
>= sysctl_tcp_retries1
) { 457 printk(KERN_DEBUG
"syn_recv: " 458 "too many retransmits\n"); 460 (*conn
->class->destructor
)(conn
); 461 tcp_dec_slow_timer(TCP_SLT_SYNACK
); 463 tcp_openreq_free(conn
); 465 if(!tp
->syn_wait_queue
) 469 struct open_request
*op
; 471 (*conn
->class->rtx_syn_ack
)(sk
, conn
); 475 printk(KERN_DEBUG
"syn_ack rtx %d\n", 478 timeo
=min((TCP_TIMEOUT_INIT
481 conn
->expires
= now
+ timeo
; 483 tcp_synq_queue(tp
, conn
); 484 if(op
!= prev
->dl_next
) 485 prev
= prev
->dl_next
; 487 /* old prev still valid here */ 495 voidtcp_sltimer_handler(unsigned long data
) 497 struct tcp_sl_timer
*slt
= tcp_slt_array
; 498 unsigned long next
= ~0UL; 499 unsigned long now
= jiffies
; 502 for(i
=0; i
< TCP_SLT_MAX
; i
++, slt
++) { 503 if(atomic_read(&slt
->count
)) { 506 trigger
= slt
->period
- ((long)(now
- slt
->last
)); 509 (*slt
->handler
)((unsigned long) slt
); 511 trigger
= slt
->period
; 513 next
=min(next
, trigger
); 518 tcp_slow_timer
.expires
= now
+ next
; 519 add_timer(&tcp_slow_timer
); 523 void__tcp_inc_slow_timer(struct tcp_sl_timer
*slt
) 525 unsigned long now
= jiffies
; 526 unsigned long next
=0; 531 when
= now
+ slt
->period
; 532 if(del_timer(&tcp_slow_timer
)) 533 next
= tcp_slow_timer
.expires
; 535 if(next
&& ((long)(next
- when
) <0)) 538 tcp_slow_timer
.expires
= when
; 539 add_timer(&tcp_slow_timer
);