- Revert TCP delayed ACK fix, and fix correctly.
[davej-history.git] / net / ipv4 / tcp_timer.c
blob330f8e992fe657001c19fa75b996a7966f8e25f1
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.39 1998/03/13 08:02:17 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */
23 #include <net/tcp.h>
25 int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
26 int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
27 int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
28 int sysctl_tcp_retries1 = TCP_RETR1;
29 int sysctl_tcp_retries2 = TCP_RETR2;
31 static voidtcp_sltimer_handler(unsigned long);
32 static voidtcp_syn_recv_timer(unsigned long);
33 static voidtcp_keepalive(unsigned long data);
34 static voidtcp_bucketgc(unsigned long);
36 struct timer_list tcp_slow_timer = {
37 NULL, NULL,
38 0,0,
39 tcp_sltimer_handler,
43 struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = {
44 {ATOMIC_INIT(0), TCP_SYNACK_PERIOD,0, tcp_syn_recv_timer},/* SYNACK */
45 {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD,0, tcp_keepalive},/* KEEPALIVE */
46 {ATOMIC_INIT(0), TCP_BUCKETGC_PERIOD,0, tcp_bucketgc}/* BUCKETGC */
49 const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
/*
 * Using different timers for retransmit, delayed acks and probes.
 * We may wish to use just one timer maintaining a list of expire jiffies
 * to optimize.
 */
57 voidtcp_init_xmit_timers(struct sock *sk)
59 init_timer(&sk->tp_pinfo.af_tcp.retransmit_timer);
60 sk->tp_pinfo.af_tcp.retransmit_timer.function=&tcp_retransmit_timer;
61 sk->tp_pinfo.af_tcp.retransmit_timer.data = (unsigned long) sk;
63 init_timer(&sk->tp_pinfo.af_tcp.delack_timer);
64 sk->tp_pinfo.af_tcp.delack_timer.function=&tcp_delack_timer;
65 sk->tp_pinfo.af_tcp.delack_timer.data = (unsigned long) sk;
67 init_timer(&sk->tp_pinfo.af_tcp.probe_timer);
68 sk->tp_pinfo.af_tcp.probe_timer.function=&tcp_probe_timer;
69 sk->tp_pinfo.af_tcp.probe_timer.data = (unsigned long) sk;
73 * Reset the retransmission timer
76 voidtcp_reset_xmit_timer(struct sock *sk,int what,unsigned long when)
78 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
80 if((long)when <=0) {
81 printk(KERN_DEBUG "xmit_timer <= 0 - timer:%d when:%lx\n", what, when);
82 when=HZ/50;
85 switch(what) {
86 case TIME_RETRANS:
87 /* When seting the transmit timer the probe timer
88 * should not be set.
89 * The delayed ack timer can be set if we are changing the
90 * retransmit timer when removing acked frames.
92 if(tp->probe_timer.prev)
93 del_timer(&tp->probe_timer);
94 if(tp->retransmit_timer.prev)
95 del_timer(&tp->retransmit_timer);
96 tp->retransmit_timer.expires=jiffies+when;
97 add_timer(&tp->retransmit_timer);
98 break;
100 case TIME_DACK:
101 if(tp->delack_timer.prev)
102 del_timer(&tp->delack_timer);
103 tp->delack_timer.expires=jiffies+when;
104 add_timer(&tp->delack_timer);
105 break;
107 case TIME_PROBE0:
108 if(tp->probe_timer.prev)
109 del_timer(&tp->probe_timer);
110 tp->probe_timer.expires=jiffies+when;
111 add_timer(&tp->probe_timer);
112 break;
114 case TIME_WRITE:
115 printk(KERN_DEBUG "bug: tcp_reset_xmit_timer TIME_WRITE\n");
116 break;
118 default:
119 printk(KERN_DEBUG "bug: unknown timer value\n");
123 voidtcp_clear_xmit_timers(struct sock *sk)
125 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
127 if(tp->retransmit_timer.prev)
128 del_timer(&tp->retransmit_timer);
129 if(tp->delack_timer.prev)
130 del_timer(&tp->delack_timer);
131 if(tp->probe_timer.prev)
132 del_timer(&tp->probe_timer);
135 static inttcp_write_err(struct sock *sk,int force)
137 sk->err = sk->err_soft ? sk->err_soft : ETIMEDOUT;
138 sk->error_report(sk);
140 tcp_clear_xmit_timers(sk);
142 /* Time wait the socket. */
143 if(!force && ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING))) {
144 tcp_time_wait(sk);
145 }else{
146 /* Clean up time. */
147 tcp_set_state(sk, TCP_CLOSE);
148 return0;
150 return1;
154 * A write timeout has occurred. Process the after effects. BROKEN (badly)
157 static inttcp_write_timeout(struct sock *sk)
159 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
162 * Look for a 'soft' timeout.
164 if((sk->state == TCP_ESTABLISHED &&
165 tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) ==0) ||
166 (sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) {
167 dst_negative_advice(&sk->dst_cache);
170 /* Have we tried to SYN too many times (repent repent 8)) */
171 if(tp->retransmits > sysctl_tcp_syn_retries && sk->state==TCP_SYN_SENT) {
172 tcp_write_err(sk,1);
173 /* Don't FIN, we got nothing back */
174 return0;
177 /* Has it gone just too far? */
178 if(tp->retransmits > sysctl_tcp_retries2)
179 returntcp_write_err(sk,0);
181 return1;
184 voidtcp_delack_timer(unsigned long data)
186 struct sock *sk = (struct sock*)data;
188 if(sk->zapped)
189 return;
191 if(sk->tp_pinfo.af_tcp.delayed_acks)
192 tcp_read_wakeup(sk);
195 voidtcp_probe_timer(unsigned long data)
197 struct sock *sk = (struct sock*)data;
198 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
200 if(sk->zapped)
201 return;
203 if(sk->sock_readers) {
204 /* Try again in second. */
205 tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ);
206 return;
210 * *WARNING* RFC 1122 forbids this
211 * It doesn't AFAIK, because we kill the retransmit timer -AK
212 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
213 * this behaviour in Solaris down as a bug fix. [AC]
215 if(tp->probes_out > sysctl_tcp_retries2) {
216 if(sk->err_soft)
217 sk->err = sk->err_soft;
218 else
219 sk->err = ETIMEDOUT;
220 sk->error_report(sk);
222 if((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) {
223 /* Time wait the socket. */
224 tcp_time_wait(sk);
225 }else{
226 /* Clean up time. */
227 tcp_set_state(sk, TCP_CLOSE);
231 tcp_send_probe0(sk);
234 static __inline__ inttcp_keepopen_proc(struct sock *sk)
236 int res =0;
238 if((1<<sk->state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2)) {
239 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
240 __u32 elapsed = jiffies - tp->rcv_tstamp;
242 if(elapsed >= sysctl_tcp_keepalive_time) {
243 if(tp->probes_out > sysctl_tcp_keepalive_probes) {
244 if(sk->err_soft)
245 sk->err = sk->err_soft;
246 else
247 sk->err = ETIMEDOUT;
249 tcp_set_state(sk, TCP_CLOSE);
250 }else{
251 tp->probes_out++;
252 tp->pending = TIME_KEEPOPEN;
253 tcp_write_wakeup(sk);
254 res =1;
258 return res;
261 /* Garbage collect TCP bind buckets. */
262 static voidtcp_bucketgc(unsigned long __unused)
264 int i;
266 for(i =0; i < TCP_BHTABLE_SIZE; i++) {
267 struct tcp_bind_bucket *tb = tcp_bound_hash[i];
269 while(tb) {
270 struct tcp_bind_bucket *next = tb->next;
272 if((tb->owners == NULL) &&
273 !(tb->flags & TCPB_FLAG_LOCKED)) {
274 /* Eat timer reference. */
275 tcp_dec_slow_timer(TCP_SLT_BUCKETGC);
277 /* Unlink bucket. */
278 if(tb->next)
279 tb->next->pprev = tb->pprev;
280 *tb->pprev = tb->next;
282 /* Finally, free it up. */
283 kmem_cache_free(tcp_bucket_cachep, tb);
285 tb = next;
/*
 * Check all sockets for keepalive timer.
 * Called every 75 seconds.
 * This timer is started by the af_inet init routine and is constantly
 * running.
 *
 * It might be better to maintain a count of sockets that need it using
 * setsockopt/tcp_destroy_sk and only set the timer when needed.
 */
/*
 * Don't send over 5 keepopens at a time to avoid burstiness
 * on big servers [AC]
 */
#define MAX_KA_PROBES	5

int sysctl_tcp_max_ka_probes = MAX_KA_PROBES;
/* Keepopen's are only valid for "established" TCP's, nicely our listener
 * hash gets rid of most of the useless testing, so we run through a couple
 * of the established hash chains each clock tick. -DaveM
 *
 * And now, even more magic... TIME_WAIT TCP's cannot have keepalive probes
 * going off for them, so we only need check the first half of the established
 * hash table, even less testing under heavy load.
 *
 * I _really_ would rather do this by adding a new timer_struct to struct sock,
 * and this way only those who set the keepalive option will get the overhead.
 * The idea is you set it for 2 hours when the sock is first connected, when it
 * does fire off (if at all, most sockets die earlier) you check for the keepalive
 * option and also if the sock has been idle long enough to start probing.
 */
322 static voidtcp_keepalive(unsigned long data)
324 static int chain_start =0;
325 int count =0;
326 int i;
328 for(i = chain_start; i < (chain_start + ((TCP_HTABLE_SIZE/2) >>2)); i++) {
329 struct sock *sk = tcp_established_hash[i];
330 while(sk) {
331 if(sk->keepopen) {
332 count +=tcp_keepopen_proc(sk);
333 if(count == sysctl_tcp_max_ka_probes)
334 goto out;
336 sk = sk->next;
339 out:
340 chain_start = ((chain_start + ((TCP_HTABLE_SIZE/2)>>2)) &
341 ((TCP_HTABLE_SIZE/2) -1));
345 * The TCP retransmit timer. This lacks a few small details.
347 * 1. An initial rtt timeout on the probe0 should cause what we can
348 * of the first write queue buffer to be split and sent.
349 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report
350 * ETIMEDOUT if we know an additional 'soft' error caused this.
351 * tcp_err should save a 'soft error' for us.
352 * [Unless someone has broken it then it does, except for one 2.0
353 * broken case of a send when the route/device is directly unreachable,
354 * and we error but should retry! - FIXME] [AC]
357 voidtcp_retransmit_timer(unsigned long data)
359 struct sock *sk = (struct sock*)data;
360 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
362 /* We are reset. We will send no more retransmits. */
363 if(sk->zapped) {
364 tcp_clear_xmit_timer(sk, TIME_RETRANS);
365 return;
368 if(sk->sock_readers) {
369 /* Try again in a second. */
370 tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ);
371 return;
373 lock_sock(sk);
375 /* Clear delay ack timer. */
376 tcp_clear_xmit_timer(sk, TIME_DACK);
378 /* Retransmission. */
379 tp->retrans_head = NULL;
380 if(tp->retransmits ==0) {
381 /* remember window where we lost
382 * "one half of the current window but at least 2 segments"
384 tp->snd_ssthresh =max(tp->snd_cwnd >>1,2);
385 tp->snd_cwnd_cnt =0;
386 tp->snd_cwnd =1;
389 tp->retransmits++;
391 tp->dup_acks =0;
392 tp->high_seq = tp->snd_nxt;
393 tcp_do_retransmit(sk,0);
395 /* Increase the timeout each time we retransmit. Note that
396 * we do not increase the rtt estimate. rto is initialized
397 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
398 * that doubling rto each time is the least we can get away with.
399 * In KA9Q, Karn uses this for the first few times, and then
400 * goes to quadratic. netBSD doubles, but only goes up to *64,
401 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
402 * defined in the protocol as the maximum possible RTT. I guess
403 * we'll have to use something other than TCP to talk to the
404 * University of Mars.
406 * PAWS allows us longer timeouts and large windows, so once
407 * implemented ftp to mars will work nicely. We will have to fix
408 * the 120 second clamps though!
410 tp->backoff++;/* FIXME: always same as retransmits? -- erics */
411 tp->rto =min(tp->rto <<1,120*HZ);
412 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
414 tcp_write_timeout(sk);
416 release_sock(sk);
420 * Slow timer for SYN-RECV sockets
423 /* This now scales very nicely. -DaveM */
424 static voidtcp_syn_recv_timer(unsigned long data)
426 struct sock *sk;
427 unsigned long now = jiffies;
428 int i;
430 for(i =0; i < TCP_LHTABLE_SIZE; i++) {
431 sk = tcp_listening_hash[i];
433 while(sk) {
434 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
436 /* TCP_LISTEN is implied. */
437 if(!sk->sock_readers && tp->syn_wait_queue) {
438 struct open_request *prev = (struct open_request *)(&tp->syn_wait_queue);
439 struct open_request *req = tp->syn_wait_queue;
441 struct open_request *conn;
443 conn = req;
444 req = req->dl_next;
446 if(conn->sk) {
447 prev = conn;
448 continue;
451 if((long)(now - conn->expires) <=0)
452 break;
454 tcp_synq_unlink(tp, conn, prev);
455 if(conn->retrans >= sysctl_tcp_retries1) {
456 #ifdef TCP_DEBUG
457 printk(KERN_DEBUG "syn_recv: "
458 "too many retransmits\n");
459 #endif
460 (*conn->class->destructor)(conn);
461 tcp_dec_slow_timer(TCP_SLT_SYNACK);
462 sk->ack_backlog--;
463 tcp_openreq_free(conn);
465 if(!tp->syn_wait_queue)
466 break;
467 }else{
468 __u32 timeo;
469 struct open_request *op;
471 (*conn->class->rtx_syn_ack)(sk, conn);
473 conn->retrans++;
474 #ifdef TCP_DEBUG
475 printk(KERN_DEBUG "syn_ack rtx %d\n",
476 conn->retrans);
477 #endif
478 timeo =min((TCP_TIMEOUT_INIT
479 << conn->retrans),
480 120*HZ);
481 conn->expires = now + timeo;
482 op = prev->dl_next;
483 tcp_synq_queue(tp, conn);
484 if(op != prev->dl_next)
485 prev = prev->dl_next;
487 /* old prev still valid here */
488 }while(req);
490 sk = sk->next;
495 voidtcp_sltimer_handler(unsigned long data)
497 struct tcp_sl_timer *slt = tcp_slt_array;
498 unsigned long next = ~0UL;
499 unsigned long now = jiffies;
500 int i;
502 for(i=0; i < TCP_SLT_MAX; i++, slt++) {
503 if(atomic_read(&slt->count)) {
504 long trigger;
506 trigger = slt->period - ((long)(now - slt->last));
508 if(trigger <=0) {
509 (*slt->handler)((unsigned long) slt);
510 slt->last = now;
511 trigger = slt->period;
513 next =min(next, trigger);
517 if(next != ~0UL) {
518 tcp_slow_timer.expires = now + next;
519 add_timer(&tcp_slow_timer);
523 void__tcp_inc_slow_timer(struct tcp_sl_timer *slt)
525 unsigned long now = jiffies;
526 unsigned long next =0;
527 unsigned long when;
529 slt->last = now;
531 when = now + slt->period;
532 if(del_timer(&tcp_slow_timer))
533 next = tcp_slow_timer.expires;
535 if(next && ((long)(next - when) <0))
536 when = next;
538 tcp_slow_timer.expires = when;
539 add_timer(&tcp_slow_timer);
close