include/net/tcp.h

Name: Public Git Hosting - davej-history.git/blob - include/net/tcp.h
Rating: 4.5 (8141 reviews)
 1 /*
 2  * INET An implementation of the TCP/IP protocol suite for the LINUX
 3  * operating system. INET is implemented using the BSD Socket
 4  * interface as the means of communication with the user level.
 5  *
 6  * Definitions for the TCP module.
 7  *
 8  * Version: @(#)tcp.h 1.0.5 05/23/93
 9  *
 10  * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
 11  * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 12  *
 13  * This program is free software; you can redistribute it and/or
 14  * modify it under the terms of the GNU General Public License
 15  * as published by the Free Software Foundation; either version
 16  * 2 of the License, or (at your option) any later version.
 17  */
 18 #ifndef _TCP_H
 19 #define _TCP_H
 20
 21 #include <linux/config.h>
 22 #include <linux/tcp.h>
 23 #include <linux/slab.h>
 24 #include <net/checksum.h>
 25
 26 /* This is for all connections with a full identity, no wildcards.
 27  * New scheme, half the table is for TIME_WAIT, the other half is
 28  * for the rest. I'll experiment with dynamic table growth later.
 29  */
 30 #define TCP_HTABLE_SIZE 512
 31
 32 /* This is for listening sockets, thus all sockets which possess wildcards. */
 33 #define TCP_LHTABLE_SIZE 32/* Yes, really, this is all you need. */
 34
 35 /* This is for all sockets, to keep track of the local port allocations. */
 36 #define TCP_BHTABLE_SIZE 512
 37
 38 /* tcp_ipv4.c: These need to be shared by v4 and v6 because the lookup
 39  * and hashing code needs to work with different AF's yet
 40  * the port space is shared.
 41  */
 42 externstruct sock *tcp_established_hash[TCP_HTABLE_SIZE];
 43 externstruct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
 44
 45 /* There are a few simple rules, which allow for local port reuse by
 46  * an application. In essence:
 47  *
 48  * 1) Sockets bound to different interfaces may share a local port.
 49  * Failing that, goto test 2.
 50  * 2) If all sockets have sk->reuse set, and none of them are in
 51  * TCP_LISTEN state, the port may be shared.
 52  * Failing that, goto test 3.
 53  * 3) If all sockets are bound to a specific sk->rcv_saddr local
 54  * address, and none of them are the same, the port may be
 55  * shared.
 56  * Failing this, the port cannot be shared.
 57  *
 58  * The interesting point, is test #2. This is what an FTP server does
 59  * all day. To optimize this case we use a specific flag bit defined
 60  * below. As we add sockets to a bind bucket list, we perform a
 61  * check of: (newsk->reuse && (newsk->state != TCP_LISTEN))
 62  * As long as all sockets added to a bind bucket pass this test,
 63  * the flag bit will be set.
 64  * The resulting situation is that tcp_v[46]_verify_bind() can just check
 65  * for this flag bit, if it is set and the socket trying to bind has
 66  * sk->reuse set, we don't even have to walk the owners list at all,
 67  * we return that it is ok to bind this socket to the requested local port.
 68  *
 69  * Sounds like a lot of work, but it is worth it. In a more naive
 70  * implementation (ie. current FreeBSD etc.) the entire list of ports
 71  * must be walked for each data port opened by an ftp server. Needless
 72  * to say, this does not scale at all. With a couple thousand FTP
 73  * users logged onto your box, isn't it nice to know that new data
 74  * ports are created in O(1) time? I thought so. ;-) -DaveM
 75  */
 76 struct tcp_bind_bucket {
 77 unsigned short port;
 78 unsigned short flags;
 79 #define TCPB_FLAG_LOCKED 0x0001
 80 #define TCPB_FLAG_FASTREUSE 0x0002
 81
 82 struct tcp_bind_bucket *next;
 83 struct sock *owners;
 84 struct tcp_bind_bucket **pprev;
 85 };
 86
 87 externstruct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE];
 88 extern kmem_cache_t *tcp_bucket_cachep;
 89 externstruct tcp_bind_bucket *tcp_bucket_create(unsigned short snum);
 90 externvoidtcp_bucket_unlock(struct sock *sk);
 91 externint tcp_port_rover;
 92
 93 /* Level-1 socket-demux cache. */
 94 #define TCP_NUM_REGS 32
 95 externstruct sock *tcp_regs[TCP_NUM_REGS];
 96
 97 #define TCP_RHASH_FN(__fport) \
 98  ((((__fport) >> 7) ^ (__fport)) & (TCP_NUM_REGS - 1))
 99 #define TCP_RHASH(__fport) tcp_regs[TCP_RHASH_FN((__fport))]
 100 #define TCP_SK_RHASH_FN(__sock) TCP_RHASH_FN((__sock)->dport)
 101 #define TCP_SK_RHASH(__sock) tcp_regs[TCP_SK_RHASH_FN((__sock))]
 102
 103 static __inline__ voidtcp_reg_zap(struct sock *sk)
 104 {
 105 struct sock **rpp;
 106
 107  rpp = &(TCP_SK_RHASH(sk));
 108 if(*rpp == sk)
 109 *rpp = NULL;
 110 }
 111
 112 /* These are AF independent. */
 113 static __inline__ inttcp_bhashfn(__u16 lport)
 114 {
 115 return(lport & (TCP_BHTABLE_SIZE -1));
 116 }
 117
 118 static __inline__ voidtcp_sk_bindify(struct sock *sk)
 119 {
 120 struct tcp_bind_bucket *tb;
 121 unsigned short snum = sk->num;
 122
 123 for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; tb->port != snum; tb = tb->next)
 124 ;
 125 /* Update bucket flags. */
 126 if(tb->owners == NULL) {
 127 /* We're the first. */
 128 if(sk->reuse && sk->state != TCP_LISTEN)
 129  tb->flags = TCPB_FLAG_FASTREUSE;
 130 else
 131  tb->flags =0;
 132 }else{
 133 if((tb->flags & TCPB_FLAG_FASTREUSE) &&
 134 ((sk->reuse ==0) || (sk->state == TCP_LISTEN)))
 135  tb->flags &= ~TCPB_FLAG_FASTREUSE;
 136 }
 137 if((sk->bind_next = tb->owners) != NULL)
 138  tb->owners->bind_pprev = &sk->bind_next;
 139  tb->owners = sk;
 140  sk->bind_pprev = &tb->owners;
 141  sk->prev = (struct sock *) tb;
 142 }
 143
 144 /* This is a TIME_WAIT bucket. It works around the memory consumption
 145  * problems of sockets in such a state on heavily loaded servers, but
 146  * without violating the protocol specification.
 147  */
 148 struct tcp_tw_bucket {
 149 /* These _must_ match the beginning of struct sock precisely.
 150  * XXX Yes I know this is gross, but I'd have to edit every single
 151  * XXX networking file if I created a "struct sock_header". -DaveM
 152  */
 153 struct sock *sklist_next;
 154 struct sock *sklist_prev;
 155 struct sock *bind_next;
 156 struct sock **bind_pprev;
 157 struct sock *next;
 158 struct sock **pprev;
 159  __u32 daddr;
 160  __u32 rcv_saddr;
 161 int bound_dev_if;
 162 unsigned short num;
 163 unsigned char state,
 164  zapped;
 165  __u16 sport;
 166  __u16 dport;
 167 unsigned short family;
 168 unsigned char reuse,
 169  nonagle;
 170
 171 /* And these are ours. */
 172  __u32 rcv_nxt;
 173 struct tcp_func *af_specific;
 174 struct tcp_bind_bucket *tb;
 175 struct timer_list timer;
 176 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 177 struct in6_addr v6_daddr;
 178 struct in6_addr v6_rcv_saddr;
 179 #endif
 180 };
 181
 182 extern kmem_cache_t *tcp_timewait_cachep;
 183
 184 /* tcp_ipv4.c: These sysctl variables need to be shared between v4 and v6
 185  * because the v6 tcp code to intialize a connection needs to interoperate
 186  * with the v4 code using the same variables.
 187  * FIXME: It would be better to rewrite the connection code to be
 188  * address family independent and just leave one copy in the ipv4 section.
 189  * This would also clean up some code duplication. -- erics
 190  */
 191 externint sysctl_tcp_timestamps;
 192 externint sysctl_tcp_window_scaling;
 193 externint sysctl_tcp_sack;
 194
 195 /* These can have wildcards, don't try too hard. */
 196 static __inline__ inttcp_lhashfn(unsigned short num)
 197 {
 198 return num & (TCP_LHTABLE_SIZE -1);
 199 }
 200
 201 static __inline__ inttcp_sk_listen_hashfn(struct sock *sk)
 202 {
 203 returntcp_lhashfn(sk->num);
 204 }
 205
 206 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 207 #define NETHDR_SIZE sizeof(struct ipv6hdr)
 208 #else
 209 #define NETHDR_SIZE sizeof(struct iphdr) + 40
 210 #endif
 211
 212 /*
 213  * 40 is maximal IP options size
 214  * 20 is the maximum TCP options size we can currently construct on a SYN.
 215  * 40 is the maximum possible TCP options size.
 216  */
 217
 218 #define MAX_SYN_SIZE (NETHDR_SIZE + sizeof(struct tcphdr) + 20 + MAX_HEADER + 15)
 219 #define MAX_FIN_SIZE (NETHDR_SIZE + sizeof(struct tcphdr) + MAX_HEADER + 15)
 220 #define BASE_ACK_SIZE (NETHDR_SIZE + MAX_HEADER + 15)
 221 #define MAX_ACK_SIZE (NETHDR_SIZE + sizeof(struct tcphdr) + MAX_HEADER + 15)
 222 #define MAX_RESET_SIZE (NETHDR_SIZE + sizeof(struct tcphdr) + MAX_HEADER + 15)
 223 #define MAX_TCPHEADER_SIZE (NETHDR_SIZE + sizeof(struct tcphdr) + 20 + MAX_HEADER + 15)
 224
 225 #define MAX_WINDOW 32767/* Never offer a window over 32767 without using
 226  window scaling (not yet supported). Some poor
 227  stacks do signed 16bit maths! */
 228 #define MIN_WINDOW 2048
 229 #define MAX_ACK_BACKLOG 2
 230 #define MAX_DELAY_ACK 2
 231 #define MIN_WRITE_SPACE 2048
 232 #define TCP_WINDOW_DIFF 2048
 233
 234 /* urg_data states */
 235 #define URG_VALID 0x0100
 236 #define URG_NOTYET 0x0200
 237 #define URG_READ 0x0400
 238
 239 #define TCP_RETR1 7/*
 240  * This is how many retries it does before it
 241  * tries to figure out if the gateway is
 242  * down.
 243  */
 244
 245 #define TCP_RETR2 15/*
 246  * This should take at least
 247  * 90 minutes to time out.
 248  */
 249
 250 #define TCP_TIMEOUT_LEN (15*60*HZ)/* should be about 15 mins */
 251 #define TCP_TIMEWAIT_LEN (60*HZ)/* how long to wait to successfully
 252  * close the socket, about 60 seconds */
 253 #define TCP_FIN_TIMEOUT (3*60*HZ)/* BSD style FIN_WAIT2 deadlock breaker */
 254
 255 #define TCP_ACK_TIME (3*HZ)/* time to delay before sending an ACK */
 256 #define TCP_DONE_TIME (5*HZ/2)/* maximum time to wait before actually
 257  * destroying a socket */
 258 #define TCP_WRITE_TIME (30*HZ)/* initial time to wait for an ACK,
 259  * after last transmit */
 260 #define TCP_TIMEOUT_INIT (3*HZ)/* RFC 1122 initial timeout value */
 261 #define TCP_SYN_RETRIES 10/* number of times to retry opening a
 262  * connection (TCP_RETR2-....) */
 263 #define TCP_PROBEWAIT_LEN (1*HZ)/* time to wait between probes when
 264  * I've got something to write and
 265  * there is no window */
 266 #define TCP_KEEPALIVE_TIME (180*60*HZ)/* two hours */
 267 #define TCP_KEEPALIVE_PROBES 9/* Max of 9 keepalive probes */
 268 #define TCP_KEEPALIVE_PERIOD ((75*HZ)>>2)/* period of keepalive check */
 269 #define TCP_NO_CHECK 0/* turn to one if you want the default
 270  * to be no checksum */
 271
 272 #define TCP_SYNACK_PERIOD (HZ/2)
 273 #define TCP_QUICK_TRIES 8/* How often we try to retransmit, until
 274  * we tell the LL layer that it is something
 275  * wrong (e.g. that it can expire redirects) */
 276
 277 #define TCP_BUCKETGC_PERIOD (HZ)
 278
 279 /*
 280  * TCP option
 281  */
 282
 283 #define TCPOPT_NOP 1/* Padding */
 284 #define TCPOPT_EOL 0/* End of options */
 285 #define TCPOPT_MSS 2/* Segment size negotiating */
 286 #define TCPOPT_WINDOW 3/* Window scaling */
 287 #define TCPOPT_SACK_PERM 4/* SACK Permitted */
 288 #define TCPOPT_SACK 5/* SACK Block */
 289 #define TCPOPT_TIMESTAMP 8/* Better RTT estimations/PAWS */
 290
 291 /*
 292  * TCP option lengths
 293  */
 294
 295 #define TCPOLEN_MSS 4
 296 #define TCPOLEN_WINDOW 3
 297 #define TCPOLEN_SACK_PERM 2
 298 #define TCPOLEN_TIMESTAMP 10
 299
 300 /* But this is what stacks really send out. */
 301 #define TCPOLEN_TSTAMP_ALIGNED 12
 302 #define TCPOLEN_WSCALE_ALIGNED 4
 303 #define TCPOLEN_SACKPERM_ALIGNED 4
 304 #define TCPOLEN_SACK_BASE 2
 305 #define TCPOLEN_SACK_BASE_ALIGNED 4
 306 #define TCPOLEN_SACK_PERBLOCK 8
 307
 308 /*
 309  * TCP Vegas constants
 310  */
 311
 312 #define TCP_VEGAS_ALPHA 2/* v_cong_detect_top_nseg */
 313 #define TCP_VEGAS_BETA 4/* v_cong_detect_bot_nseg */
 314 #define TCP_VEGAS_GAMMA 1/* v_exp_inc_nseg */
 315
 316 struct open_request;
 317
 318 struct or_calltable {
 319 void(*rtx_syn_ack) (struct sock *sk,struct open_request *req);
 320 void(*destructor) (struct open_request *req);
 321 void(*send_reset) (struct sk_buff *skb);
 322 };
 323
 324 struct tcp_v4_open_req {
 325  __u32 loc_addr;
 326  __u32 rmt_addr;
 327 struct ip_options *opt;
 328 };
 329
 330 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 331 struct tcp_v6_open_req {
 332 struct in6_addr loc_addr;
 333 struct in6_addr rmt_addr;
 334 struct ipv6_options *opt;
 335 int iif;
 336 };
 337 #endif
 338
 339 /* this structure is too big */
 340 struct open_request {
 341 struct open_request *dl_next;/* Must be first member! */
 342  __u32 rcv_isn;
 343  __u32 snt_isn;
 344  __u16 rmt_port;
 345  __u16 mss;
 346  __u8 retrans;
 347  __u8 __pad;
 348 unsigned snd_wscale :4,
 349  rcv_wscale :4,
 350  tstamp_ok :1,
 351  sack_ok :1,
 352  wscale_ok :1;
 353 /* The following two fields can be easily recomputed I think -AK */
 354  __u32 window_clamp;/* window clamp at creation time */
 355  __u32 rcv_wnd;/* rcv_wnd offered first time */
 356  __u32 ts_recent;
 357 unsigned long expires;
 358 struct or_calltable *class;
 359 struct sock *sk;
 360 union{
 361 struct tcp_v4_open_req v4_req;
 362 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 363 struct tcp_v6_open_req v6_req;
 364 #endif
 365 } af;
 366 #ifdef CONFIG_IP_TRANSPARENT_PROXY
 367  __u16 lcl_port;/* LVE */
 368 #endif
 369 };
 370
 371 /* SLAB cache for open requests. */
 372 extern kmem_cache_t *tcp_openreq_cachep;
 373
 374 #define tcp_openreq_alloc() kmem_cache_alloc(tcp_openreq_cachep, SLAB_ATOMIC)
 375 #define tcp_openreq_free(req) kmem_cache_free(tcp_openreq_cachep, req)
 376
 377 /*
 378  * Pointers to address related TCP functions
 379  * (i.e. things that depend on the address family)
 380  */
 381
 382 struct tcp_func {
 383 void(*queue_xmit) (struct sk_buff *skb);
 384
 385 void(*send_check) (struct sock *sk,
 386 struct tcphdr *th,
 387 int len,
 388 struct sk_buff *skb);
 389
 390 int(*rebuild_header) (struct sock *sk);
 391
 392 int(*conn_request) (struct sock *sk,
 393 struct sk_buff *skb,
 394 void*opt, __u32 isn);
 395
 396 struct sock * (*syn_recv_sock) (struct sock *sk,
 397 struct sk_buff *skb,
 398 struct open_request *req,
 399 struct dst_entry *dst);
 400
 401 struct sock * (*get_sock) (struct sk_buff *skb,
 402 struct tcphdr *th);
 403
 404 int(*setsockopt) (struct sock *sk,
 405 int level,
 406 int optname,
 407 char*optval,
 408 int optlen);
 409
 410 int(*getsockopt) (struct sock *sk,
 411 int level,
 412 int optname,
 413 char*optval,
 414 int*optlen);
 415
 416
 417 void(*addr2sockaddr) (struct sock *sk,
 418 struct sockaddr *);
 419
 420 int sockaddr_len;
 421 };
 422
 423 /*
 424  * The next routines deal with comparing 32 bit unsigned ints
 425  * and worry about wraparound (automatic with unsigned arithmetic).
 426  */
 427
 428 extern __inline intbefore(__u32 seq1, __u32 seq2)
 429 {
 430 return(__s32)(seq1-seq2) <0;
 431 }
 432
 433 extern __inline intafter(__u32 seq1, __u32 seq2)
 434 {
 435 return(__s32)(seq2-seq1) <0;
 436 }
 437
 438
 439 /* is s2<=s1<=s3 ? */
 440 extern __inline intbetween(__u32 seq1, __u32 seq2, __u32 seq3)
 441 {
 442 return seq3 - seq2 >= seq1 - seq2;
 443 }
 444
 445
 446 externstruct proto tcp_prot;
 447 externstruct tcp_mib tcp_statistics;
 448
 449 externunsigned shorttcp_good_socknum(void);
 450
 451 externvoidtcp_v4_err(struct sk_buff *skb,
 452 unsigned char*,int);
 453
 454 externvoidtcp_shutdown(struct sock *sk,int how);
 455
 456 externinttcp_v4_rcv(struct sk_buff *skb,
 457 unsigned short len);
 458
 459 externinttcp_do_sendmsg(struct sock *sk,
 460 int iovlen,struct iovec *iov,
 461 int flags);
 462
 463 externinttcp_ioctl(struct sock *sk,
 464 int cmd,
 465 unsigned long arg);
 466
 467 externinttcp_rcv_state_process(struct sock *sk,
 468 struct sk_buff *skb,
 469 struct tcphdr *th,
 470 void*opt, __u16 len);
 471
 472 externinttcp_rcv_established(struct sock *sk,
 473 struct sk_buff *skb,
 474 struct tcphdr *th,
 475  __u16 len);
 476
 477 externinttcp_timewait_state_process(struct tcp_tw_bucket *tw,
 478 struct sk_buff *skb,
 479 struct tcphdr *th,
 480 void*opt, __u16 len);
 481
 482 externvoidtcp_close(struct sock *sk,
 483 unsigned long timeout);
 484 externstruct sock *tcp_accept(struct sock *sk,int flags);
 485 externunsigned inttcp_poll(struct file * file,struct socket *sock,struct poll_table_struct *wait);
 486 externinttcp_getsockopt(struct sock *sk,int level,
 487 int optname,char*optval,
 488 int*optlen);
 489 externinttcp_setsockopt(struct sock *sk,int level,
 490 int optname,char*optval,
 491 int optlen);
 492 externvoidtcp_set_keepalive(struct sock *sk,int val);
 493 externinttcp_recvmsg(struct sock *sk,
 494 struct msghdr *msg,
 495 int len,int nonblock,
 496 int flags,int*addr_len);
 497
 498 externvoidtcp_parse_options(struct sock *sk,struct tcphdr *th,
 499 struct tcp_opt *tp,int no_fancy);
 500
 501 /*
 502  * TCP v4 functions exported for the inet6 API
 503  */
 504
 505 externinttcp_v4_rebuild_header(struct sock *sk);
 506
 507 externinttcp_v4_build_header(struct sock *sk,
 508 struct sk_buff *skb);
 509
 510 externvoidtcp_v4_send_check(struct sock *sk,
 511 struct tcphdr *th,int len,
 512 struct sk_buff *skb);
 513
 514 externinttcp_v4_conn_request(struct sock *sk,
 515 struct sk_buff *skb,
 516 void*ptr, __u32 isn);
 517
 518 externstruct sock *tcp_create_openreq_child(struct sock *sk,
 519 struct open_request *req,
 520 struct sk_buff *skb,
 521 int mss);
 522
 523 externstruct sock *tcp_v4_syn_recv_sock(struct sock *sk,
 524 struct sk_buff *skb,
 525 struct open_request *req,
 526 struct dst_entry *dst);
 527
 528 externinttcp_v4_do_rcv(struct sock *sk,
 529 struct sk_buff *skb);
 530
 531 externinttcp_v4_connect(struct sock *sk,
 532 struct sockaddr *uaddr,
 533 int addr_len);
 534
 535 externvoidtcp_connect(struct sock *sk,
 536 struct sk_buff *skb,
 537 int est_mss);
 538
 539 externstruct sk_buff *tcp_make_synack(struct sock *sk,
 540 struct dst_entry *dst,
 541 struct open_request *req,
 542 int mss);
 543
 544
 545 /* From syncookies.c */
 546 externstruct sock *cookie_v4_check(struct sock *sk,struct sk_buff *skb,
 547 struct ip_options *opt);
 548 extern __u32 cookie_v4_init_sequence(struct sock *sk,struct sk_buff *skb,
 549  __u16 *mss);
 550
 551 externvoidtcp_read_wakeup(struct sock *);
 552 externvoidtcp_write_xmit(struct sock *);
 553 externvoidtcp_time_wait(struct sock *);
 554 externinttcp_retransmit_skb(struct sock *,struct sk_buff *);
 555 externvoidtcp_xmit_retransmit_queue(struct sock *);
 556 externvoidtcp_simple_retransmit(struct sock *);
 557
 558 /* tcp_output.c */
 559
 560 externvoidtcp_send_probe0(struct sock *);
 561 externvoidtcp_send_partial(struct sock *);
 562 externvoidtcp_write_wakeup(struct sock *);
 563 externvoidtcp_send_fin(struct sock *sk);
 564 externvoidtcp_send_active_reset(struct sock *sk);
 565 externinttcp_send_synack(struct sock *);
 566 externvoidtcp_transmit_skb(struct sock *,struct sk_buff *);
 567 externvoidtcp_send_skb(struct sock *,struct sk_buff *,int force_queue);
 568 externvoidtcp_send_ack(struct sock *sk);
 569 externvoidtcp_send_delayed_ack(struct tcp_opt *tp,int max_timeout);
 570
 571 /* CONFIG_IP_TRANSPARENT_PROXY */
 572 externinttcp_chkaddr(struct sk_buff *);
 573
 574 /* tcp_timer.c */
 575 #define tcp_reset_msl_timer(x,y,z) net_reset_timer(x,y,z)
 576 externvoidtcp_reset_xmit_timer(struct sock *,int,unsigned long);
 577 externvoidtcp_init_xmit_timers(struct sock *);
 578 externvoidtcp_clear_xmit_timers(struct sock *);
 579
 580 externvoidtcp_retransmit_timer(unsigned long);
 581 externvoidtcp_delack_timer(unsigned long);
 582 externvoidtcp_probe_timer(unsigned long);
 583
 584 externstruct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 585 struct open_request *req);
 586
 587 /*
 588  * TCP slow timer
 589  */
 590 externstruct timer_list tcp_slow_timer;
 591
 592 struct tcp_sl_timer {
 593  atomic_t count;
 594 unsigned long period;
 595 unsigned long last;
 596 void(*handler) (unsigned long);
 597 };
 598
 599 #define TCP_SLT_SYNACK 0
 600 #define TCP_SLT_KEEPALIVE 1
 601 #define TCP_SLT_BUCKETGC 2
 602 #define TCP_SLT_MAX 3
 603
 604 externstruct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX];
 605
 606 /* Compute the actual receive window we are currently advertising. */
 607 static __inline__ u32 tcp_receive_window(struct tcp_opt *tp)
 608 {
 609 return tp->rcv_wup - (tp->rcv_nxt - tp->rcv_wnd);
 610 }
 611
 612 /* Choose a new window, without checks for shrinking, and without
 613  * scaling applied to the result. The caller does these things
 614  * if necessary. This is a "raw" window selection.
 615  */
 616 extern u32 __tcp_select_window(struct sock *sk);
 617
 618 /* Chose a new window to advertise, update state in tcp_opt for the
 619  * socket, and return result with RFC1323 scaling applied. The return
 620  * value can be stuffed directly into th->window for an outgoing
 621  * frame.
 622  */
 623 extern __inline__ u16 tcp_select_window(struct sock *sk)
 624 {
 625 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 626  u32 new_win =__tcp_select_window(sk);
 627  u32 cur_win =tcp_receive_window(tp);
 628
 629 /* Never shrink the offered window */
 630 if(new_win < cur_win)
 631  new_win = cur_win;
 632  tp->rcv_wnd = new_win;
 633  tp->rcv_wup = tp->rcv_nxt;
 634
 635 /* RFC1323 scaling applied */
 636 return new_win >> tp->rcv_wscale;
 637 }
 638
 639 /* See if we can advertise non-zero, and if so how much we
 640  * can increase our advertisement. If it becomes more than
 641  * twice what we are talking about right now, return true.
 642  */
 643 extern __inline__ inttcp_raise_window(struct sock *sk)
 644 {
 645 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 646  u32 new_win =__tcp_select_window(sk);
 647  u32 cur_win =tcp_receive_window(tp);
 648
 649 return(new_win && (new_win > (cur_win <<1)));
 650 }
 651
 652 /* This is what the send packet queueing engine uses to pass
 653  * TCP per-packet control information to the transmission
 654  * code.
 655  */
 656 struct tcp_skb_cb {
 657  __u8 flags;/* TCP header flags. */
 658
 659 /* NOTE: These must match up to the flags byte in a
 660  * real TCP header.
 661  */
 662 #define TCPCB_FLAG_FIN 0x01
 663 #define TCPCB_FLAG_SYN 0x02
 664 #define TCPCB_FLAG_RST 0x04
 665 #define TCPCB_FLAG_PSH 0x08
 666 #define TCPCB_FLAG_ACK 0x10
 667 #define TCPCB_FLAG_URG 0x20
 668
 669  __u8 sacked;/* State flags for SACK/FACK. */
 670 #define TCPCB_SACKED_ACKED 0x01/* SKB ACK'd by a SACK block */
 671 #define TCPCB_SACKED_RETRANS 0x02/* SKB retransmitted */
 672
 673  __u16 urg_ptr;/* Valid w/URG flags is set. */
 674 };
 675
 676 #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
 677
 678 /* This checks if the data bearing packet SKB (usually tp->send_head)
 679  * should be put on the wire right now.
 680  */
 681 static __inline__ inttcp_snd_test(struct sock *sk,struct sk_buff *skb)
 682 {
 683 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 684 int nagle_check =1;
 685 int len;
 686
 687 /* RFC 1122 - section 4.2.3.4
 688  *
 689  * We must queue if
 690  *
 691  * a) The right edge of this frame exceeds the window
 692  * b) There are packets in flight and we have a small segment
 693  * [SWS avoidance and Nagle algorithm]
 694  * (part of SWS is done on packetization)
 695  * c) We are retransmiting [Nagle]
 696  * d) We have too many packets 'in flight'
 697  *
 698  * Don't use the nagle rule for urgent data.
 699  */
 700  len = skb->end_seq - skb->seq;
 701 if(!sk->nonagle && len < (sk->mss >>1) && tp->packets_out &&
 702 !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG))
 703  nagle_check =0;
 704
 705 return(nagle_check && tp->packets_out < tp->snd_cwnd &&
 706 !after(skb->end_seq, tp->snd_una + tp->snd_wnd) &&
 707  tp->retransmits ==0);
 708 }
 709
 710 /* This tells the input processing path that an ACK should go out
 711  * right now.
 712  */
 713 #define tcp_enter_quickack_mode(__tp) ((__tp)->ato = (HZ/100))
 714 #define tcp_in_quickack_mode(__tp) ((__tp)->ato == (HZ/100))
 715
 716 /*
 717  * List all states of a TCP socket that can be viewed as a "connected"
 718  * state. This now includes TCP_SYN_RECV, although I am not yet fully
 719  * convinced that this is the solution for the 'getpeername(2)'
 720  * problem. Thanks to Stephen A. Wood <saw@cebaf.gov> -FvK
 721  */
 722
 723 extern __inline const inttcp_connected(const int state)
 724 {
 725 return((1<< state) &
 726 (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
 727  TCPF_FIN_WAIT2|TCPF_SYN_RECV));
 728 }
 729
 730 /*
 731  * Calculate(/check) TCP checksum
 732  */
 733 static __inline__ u16 tcp_v4_check(struct tcphdr *th,int len,
 734 unsigned long saddr,unsigned long daddr,
 735 unsigned long base)
 736 {
 737 returncsum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
 738 }
 739
 740 #undef STATE_TRACE
 741
 742 #ifdef STATE_TRACE
 743 static char*statename[]={
 744 "Unused","Established","Syn Sent","Syn Recv",
 745 "Fin Wait 1","Fin Wait 2","Time Wait","Close",
 746 "Close Wait","Last ACK","Listen","Closing"
 747 };
 748 #endif
 749
 750 static __inline__ voidtcp_set_state(struct sock *sk,int state)
 751 {
 752 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 753 int oldstate = sk->state;
 754
 755  sk->state = state;
 756
 757 #ifdef STATE_TRACE
 758 SOCK_DEBUG(sk,"TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]);
 759 #endif
 760
 761 switch(state) {
 762 case TCP_ESTABLISHED:
 763 if(oldstate != TCP_ESTABLISHED)
 764  tcp_statistics.TcpCurrEstab++;
 765 break;
 766
 767 case TCP_CLOSE:
 768 /* Should be about 2 rtt's */
 769 net_reset_timer(sk, TIME_DONE,min(tp->srtt *2, TCP_DONE_TIME));
 770  sk->prot->unhash(sk);
 771 /* fall through */
 772 default:
 773 if(oldstate==TCP_ESTABLISHED)
 774  tcp_statistics.TcpCurrEstab--;
 775 }
 776 }
 777
 778 static __inline__ voidtcp_build_and_update_options(__u32 *ptr,struct tcp_opt *tp, __u32 tstamp)
 779 {
 780 if(tp->tstamp_ok) {
 781 *ptr++ =__constant_htonl((TCPOPT_NOP <<24) |
 782 (TCPOPT_NOP <<16) |
 783 (TCPOPT_TIMESTAMP <<8) |
 784  TCPOLEN_TIMESTAMP);
 785 *ptr++ =htonl(tstamp);
 786 *ptr++ =htonl(tp->ts_recent);
 787 }
 788 if(tp->sack_ok && tp->num_sacks) {
 789 int this_sack;
 790
 791 *ptr++ =__constant_htonl((TCPOPT_NOP <<24) |
 792 (TCPOPT_NOP <<16) |
 793 (TCPOPT_SACK <<8) |
 794 (TCPOLEN_SACK_BASE +
 795 (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)));
 796 for(this_sack =0; this_sack < tp->num_sacks; this_sack++) {
 797 *ptr++ =htonl(tp->selective_acks[this_sack].start_seq);
 798 *ptr++ =htonl(tp->selective_acks[this_sack].end_seq);
 799 }
 800 }
 801 }
 802
 803 /* Construct a tcp options header for a SYN or SYN_ACK packet.
 804  * If this is every changed make sure to change the definition of
 805  * MAX_SYN_SIZE to match the new maximum number of options that you
 806  * can generate.
 807  */
 808 extern __inline__ voidtcp_syn_build_options(__u32 *ptr,int mss,int ts,int sack,
 809 int offer_wscale,int wscale, __u32 tstamp)
 810 {
 811 /* We always get an MSS option.
 812  * The option bytes which will be seen in normal data
 813  * packets should timestamps be used, must be in the MSS
 814  * advertised. But we subtract them from sk->mss so
 815  * that calculations in tcp_sendmsg are simpler etc.
 816  * So account for this fact here if necessary. If we
 817  * don't do this correctly, as a receiver we won't
 818  * recognize data packets as being full sized when we
 819  * should, and thus we won't abide by the delayed ACK
 820  * rules correctly.
 821  * SACKs don't matter, we never delay an ACK when we
 822  * have any of those going out.
 823  */
 824 if(ts)
 825  mss += TCPOLEN_TSTAMP_ALIGNED;
 826 *ptr++ =htonl((TCPOPT_MSS <<24) | (TCPOLEN_MSS <<16) | mss);
 827 if(ts) {
 828 if(sack)
 829 *ptr++ =__constant_htonl((TCPOPT_SACK_PERM <<24) | (TCPOLEN_SACK_PERM <<16) |
 830 (TCPOPT_TIMESTAMP <<8) | TCPOLEN_TIMESTAMP);
 831 else
 832 *ptr++ =__constant_htonl((TCPOPT_NOP <<24) | (TCPOPT_NOP <<16) |
 833 (TCPOPT_TIMESTAMP <<8) | TCPOLEN_TIMESTAMP);
 834 *ptr++ =htonl(tstamp);/* TSVAL */
 835 *ptr++ =__constant_htonl(0);/* TSECR */
 836 }else if(sack)
 837 *ptr++ =__constant_htonl((TCPOPT_NOP <<24) | (TCPOPT_NOP <<16) |
 838 (TCPOPT_SACK_PERM <<8) | TCPOLEN_SACK_PERM);
 839 if(offer_wscale)
 840 *ptr++ =htonl((TCPOPT_NOP <<24) | (TCPOPT_WINDOW <<16) | (TCPOLEN_WINDOW <<8) | (wscale));
 841 }
 842
 843 /* Determine a window scaling and initial window to offer.
 844  * Based on the assumption that the given amount of space
 845  * will be offered. Store the results in the tp structure.
 846  * NOTE: for smooth operation initial space offering should
 847  * be a multiple of mss if possible. We assume here that mss >= 1.
 848  * This MUST be enforced by all callers.
 849  */
 850 extern __inline__ voidtcp_select_initial_window(__u32 space, __u16 mss,
 851  __u32 *rcv_wnd,
 852  __u32 *window_clamp,
 853 int wscale_ok,
 854  __u8 *rcv_wscale)
 855 {
 856 /* If no clamp set the clamp to the max possible scaled window */
 857 if(*window_clamp ==0)
 858 (*window_clamp) = (65535<<14);
 859  space =min(*window_clamp,space);
 860
 861 /* Quantize space offering to a multiple of mss if possible. */
 862 if(space > mss)
 863  space = (space/mss)*mss;
 864
 865 /* NOTE: offering an initial window larger than 32767
 866  * will break some buggy TCP stacks. We try to be nice.
 867  * If we are not window scaling, then this truncates
 868  * our initial window offering to 32k. There should also
 869  * be a sysctl option to stop being nice.
 870  */
 871 (*rcv_wnd) =min(space,32767);
 872 (*rcv_wscale) =0;
 873 if(wscale_ok) {
 874 /* See RFC1323 for an explanation of the limit to 14 */
 875 while(space >65535&& (*rcv_wscale) <14) {
 876  space >>=1;
 877 (*rcv_wscale)++;
 878 }
 879 }
 880 /* Set the clamp no higher than max representable value */
 881 (*window_clamp) =min(65535<<(*rcv_wscale),*window_clamp);
 882 }
 883
 884 extern __inline__ voidtcp_synq_unlink(struct tcp_opt *tp,struct open_request *req,struct open_request *prev)
 885 {
 886 if(!req->dl_next)
 887  tp->syn_wait_last = (struct open_request **)prev;
 888  prev->dl_next = req->dl_next;
 889 }
 890
 891 extern __inline__ voidtcp_synq_queue(struct tcp_opt *tp,struct open_request *req)
 892 {
 893  req->dl_next = NULL;
 894 *tp->syn_wait_last = req;
 895  tp->syn_wait_last = &req->dl_next;
 896 }
 897
 898 extern __inline__ voidtcp_synq_init(struct tcp_opt *tp)
 899 {
 900  tp->syn_wait_queue = NULL;
 901  tp->syn_wait_last = &tp->syn_wait_queue;
 902 }
 903
 904 extern __inline__ struct open_request *tcp_synq_unlink_tail(struct tcp_opt *tp)
 905 {
 906 struct open_request *head = tp->syn_wait_queue;
 907 #if 0
 908 /* Should be a net-ratelimit'd thing, not all the time. */
 909 printk(KERN_DEBUG "synq tail drop with expire=%ld\n",
 910  head->expires-jiffies);
 911 #endif
 912 if(head->dl_next == NULL)
 913  tp->syn_wait_last = &tp->syn_wait_queue;
 914  tp->syn_wait_queue = head->dl_next;
 915 return head;
 916 }
 917
 918 externvoid__tcp_inc_slow_timer(struct tcp_sl_timer *slt);
 919 extern __inline__ voidtcp_inc_slow_timer(int timer)
 920 {
 921 struct tcp_sl_timer *slt = &tcp_slt_array[timer];
 922
 923 if(atomic_read(&slt->count) ==0)
 924 {
 925 __tcp_inc_slow_timer(slt);
 926 }
 927
 928 atomic_inc(&slt->count);
 929 }
 930
 931 extern __inline__ voidtcp_dec_slow_timer(int timer)
 932 {
 933 struct tcp_sl_timer *slt = &tcp_slt_array[timer];
 934
 935 atomic_dec(&slt->count);
 936 }
 937
 938 /* This needs to use a slow timer, so it is here. */
 939 static __inline__ voidtcp_sk_unbindify(struct sock *sk)
 940 {
 941 struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *) sk->prev;
 942 if(sk->bind_next)
 943  sk->bind_next->bind_pprev = sk->bind_pprev;
 944 *sk->bind_pprev = sk->bind_next;
 945 if(tb->owners == NULL)
 946 tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
 947 }
 948
 949 externconst char timer_bug_msg[];
 950
 951 staticinlinevoidtcp_clear_xmit_timer(struct sock *sk,int what)
 952 {
 953 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 954 struct timer_list *timer;
 955
 956 switch(what) {
 957 case TIME_RETRANS:
 958  timer = &tp->retransmit_timer;
 959 break;
 960 case TIME_DACK:
 961  timer = &tp->delack_timer;
 962 break;
 963 case TIME_PROBE0:
 964  timer = &tp->probe_timer;
 965 break;
 966 default:
 967 printk(timer_bug_msg);
 968 return;
 969 };
 970 if(timer->prev != NULL)
 971 del_timer(timer);
 972 }
 973
 974 staticinlineinttcp_timer_is_set(struct sock *sk,int what)
 975 {
 976 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 977
 978 switch(what) {
 979 case TIME_RETRANS:
 980 return tp->retransmit_timer.prev != NULL;
 981 break;
 982 case TIME_DACK:
 983 return tp->delack_timer.prev != NULL;
 984 break;
 985 case TIME_PROBE0:
 986 return tp->probe_timer.prev != NULL;
 987 break;
 988 default:
 989 printk(timer_bug_msg);
 990 };
 991 return0;
 992 }
 993
 994
 995 #endif/* _TCP_H */