/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Definitions for the TCP module.
 *
 * Version:	@(#)tcp.h	1.0.5	05/23/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/tcp.h>
#include <linux/slab.h>
#include <net/checksum.h>

/* This is for all connections with a full identity, no wildcards.
 * New scheme, half the table is for TIME_WAIT, the other half is
 * for the rest.  I'll experiment with dynamic table growth later.
 */
struct tcp_ehash_bucket {
} __attribute__((__aligned__(8)));

extern int tcp_ehash_size;
extern struct tcp_ehash_bucket *tcp_ehash;

/* This is for listening sockets, thus all sockets which possess wildcards. */
#define TCP_LHTABLE_SIZE	32	/* Yes, really, this is all you need. */

/* tcp_ipv4.c: These need to be shared by v4 and v6 because the lookup
 * and hashing code needs to work with different AF's yet
 * the port space is shared.
 */
extern struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
extern rwlock_t tcp_lhash_lock;
extern atomic_t tcp_lhash_users;
extern wait_queue_head_t tcp_lhash_wait;
/* There are a few simple rules, which allow for local port reuse by
 * an application.  In essence:
 *
 *	1) Sockets bound to different interfaces may share a local port.
 *	   Failing that, goto test 2.
 *	2) If all sockets have sk->reuse set, and none of them are in
 *	   TCP_LISTEN state, the port may be shared.
 *	   Failing that, goto test 3.
 *	3) If all sockets are bound to a specific sk->rcv_saddr local
 *	   address, and none of them are the same, the port may be
 *	   shared.
 *	   Failing this, the port cannot be shared.
 *
 * The interesting point is test #2.  This is what an FTP server does
 * all day.  To optimize this case we use a specific flag bit defined
 * below.  As we add sockets to a bind bucket list, we perform a
 * check of: (newsk->reuse && (newsk->state != TCP_LISTEN))
 * As long as all sockets added to a bind bucket pass this test,
 * the flag bit will be set.
 * The resulting situation is that tcp_v[46]_verify_bind() can just check
 * for this flag bit; if it is set and the socket trying to bind has
 * sk->reuse set, we don't even have to walk the owners list at all,
 * we return that it is ok to bind this socket to the requested local port.
 *
 * Sounds like a lot of work, but it is worth it.  In a more naive
 * implementation (i.e. current FreeBSD etc.) the entire list of ports
 * must be walked for each data port opened by an ftp server.  Needless
 * to say, this does not scale at all.  With a couple thousand FTP
 * users logged onto your box, isn't it nice to know that new data
 * ports are created in O(1) time?  I thought so. ;-)  -DaveM
 */
struct tcp_bind_bucket {
	unsigned short		fastreuse;
	struct tcp_bind_bucket	*next;
	struct tcp_bind_bucket	**pprev;
};

struct tcp_bind_hashbucket {
	struct tcp_bind_bucket	*chain;
};

extern struct tcp_bind_hashbucket *tcp_bhash;
extern int tcp_bhash_size;
extern spinlock_t tcp_portalloc_lock;
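/* Illustrative sketch (added for exposition, not part of the original header):
 * how the fastreuse flag described in the comment above tcp_bind_bucket can be
 * maintained and consulted.  The types and helpers below are simplified
 * stand-ins, not kernel structures.
 */
#if 0
struct demo_bind_bucket {
	unsigned short	fastreuse;	/* every owner so far passed the test */
	int		num_owners;
};

/* Called when a socket joins the bucket; fastreuse stays set only while every
 * added socket has SO_REUSEADDR and is not listening (test #2 above).
 */
static void demo_bucket_add(struct demo_bind_bucket *tb, int sk_reuse, int sk_listening)
{
	if (tb->num_owners == 0)
		tb->fastreuse = (sk_reuse && !sk_listening);
	else if (!(sk_reuse && !sk_listening))
		tb->fastreuse = 0;
	tb->num_owners++;
}

/* Fast path of the bind check: if the flag is set and the new socket also has
 * SO_REUSEADDR, there is no need to walk the owner list at all.
 */
static int demo_bind_fast_ok(const struct demo_bind_bucket *tb, int sk_reuse)
{
	return tb->fastreuse && sk_reuse;
}
#endif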
extern kmem_cache_t *tcp_bucket_cachep;
extern struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
						 unsigned short snum);
extern void tcp_bucket_unlock(struct sock *sk);
extern int tcp_port_rover;
extern struct sock *tcp_v4_lookup_listener(u32 addr, unsigned short hnum, int dif);

/* These are AF independent. */
static __inline__ int tcp_bhashfn(__u16 lport)
{
	return (lport & (tcp_bhash_size - 1));
}

/* This is a TIME_WAIT bucket.  It works around the memory consumption
 * problems of sockets in such a state on heavily loaded servers, but
 * without violating the protocol specification.
 */
struct tcp_tw_bucket {
	/* These _must_ match the beginning of struct sock precisely.
	 * XXX Yes I know this is gross, but I'd have to edit every single
	 * XXX networking file if I created a "struct sock_header". -DaveM
	 */
	struct sock		*bind_next;
	struct sock		**bind_pprev;
	unsigned short		family;

	/* And these are ours. */
	long			ts_recent_stamp;
	struct tcp_bind_bucket	*tb;
	struct tcp_tw_bucket	*next_death;
	struct tcp_tw_bucket	**pprev_death;
#ifdef CONFIG_TCP_TW_RECYCLE
#endif
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
	struct in6_addr		v6_daddr;
	struct in6_addr		v6_rcv_saddr;
#endif
};

extern kmem_cache_t *tcp_timewait_cachep;

extern __inline__ void tcp_tw_put(struct tcp_tw_bucket *tw)
{
	if (atomic_dec_and_test(&tw->refcnt)) {
#ifdef INET_REFCNT_DEBUG
		printk(KERN_DEBUG "tw_bucket %p released\n", tw);
#endif
		kmem_cache_free(tcp_timewait_cachep, tw);
	}
}

extern int tcp_tw_death_row_slot;
extern void tcp_timewait_kill(struct tcp_tw_bucket *tw);
extern void tcp_tw_schedule(struct tcp_tw_bucket *tw);
extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw);
extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);

/* Socket demux engine toys. */
#ifdef __BIG_ENDIAN
#define TCP_COMBINED_PORTS(__sport, __dport) \
	(((__u32)(__sport) << 16) | (__u32)(__dport))
#else /* __LITTLE_ENDIAN */
#define TCP_COMBINED_PORTS(__sport, __dport) \
	(((__u32)(__dport) << 16) | (__u32)(__sport))
#endif

#if (BITS_PER_LONG == 64)
#ifdef __BIG_ENDIAN
#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \
	__u64 __name = (((__u64)(__saddr)) << 32) | ((__u64)(__daddr));
#else /* __LITTLE_ENDIAN */
#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \
	__u64 __name = (((__u64)(__daddr)) << 32) | ((__u64)(__saddr));
#endif /* __BIG_ENDIAN */
#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)	\
	(((*((__u64 *)&((__sk)->daddr))) == (__cookie))	&&			\
	 ((*((__u32 *)&((__sk)->dport))) == (__ports))	&&			\
	 (!((__sk)->bound_dev_if) || ((__sk)->bound_dev_if == (__dif))))
#else /* 32-bit arch */
#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr)
#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)	\
	(((__sk)->daddr == (__saddr))			&&			\
	 ((__sk)->rcv_saddr == (__daddr))		&&			\
	 ((*((__u32 *)&((__sk)->dport))) == (__ports))	&&			\
	 (!((__sk)->bound_dev_if) || ((__sk)->bound_dev_if == (__dif))))
#endif /* 64-bit arch */

#define TCP_IPV6_MATCH(__sk, __saddr, __daddr, __ports, __dif)			\
	(((*((__u32 *)&((__sk)->dport))) == (__ports))	&&			\
	 ((__sk)->family == AF_INET6)			&&			\
	 !ipv6_addr_cmp(&(__sk)->net_pinfo.af_inet6.daddr, (__saddr))	&&	\
	 !ipv6_addr_cmp(&(__sk)->net_pinfo.af_inet6.rcv_saddr, (__daddr)) &&	\
	 (!((__sk)->bound_dev_if) || ((__sk)->bound_dev_if == (__dif))))
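/* Illustrative, self-contained sketch (added for exposition, not part of the
 * original header): folding the two 16-bit ports into one 32-bit quantity, as
 * TCP_COMBINED_PORTS does above, lets the demux hot path compare both ports
 * with a single 32-bit comparison.  Plain host-order layout here; the real
 * macro picks the layout that matches how dport and sport sit in struct sock.
 */
#if 0
static unsigned int demo_combine_ports(unsigned short sport, unsigned short dport)
{
	return ((unsigned int)sport << 16) | (unsigned int)dport;
}

static int demo_ports_match(unsigned int have_combined,
			    unsigned short sport, unsigned short dport)
{
	/* One compare instead of two. */
	return have_combined == demo_combine_ports(sport, dport);
}
#endif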
/* These can have wildcards, don't try too hard. */
static __inline__ int tcp_lhashfn(unsigned short num)
{
	return num & (TCP_LHTABLE_SIZE - 1);
}

static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
{
	return tcp_lhashfn(sk->num);
}
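/* Illustrative, self-contained sketch (added for exposition, not part of the
 * original header): the masking in tcp_lhashfn()/tcp_bhashfn() is equivalent
 * to a modulo only because the table sizes are powers of two.
 */
#if 0
static unsigned int demo_hash_slot(unsigned short port, unsigned int table_size)
{
	/* For table_size == 32: port & 31 == port % 32.
	 * For a non-power-of-two size the mask trick would not work.
	 */
	return port & (table_size - 1);
}
#endif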
/* Note that this is larger than the IPv6 header. */
#define NETHDR_SIZE	(sizeof(struct iphdr) + 40)

/*
 * 40 is maximal IP options size
 * 20 is the maximum TCP options size we can currently construct on a SYN.
 * 40 is the maximum possible TCP options size.
 */
#define MAX_SYN_SIZE	(NETHDR_SIZE + sizeof(struct tcphdr) + 20 + MAX_HEADER + 15)
#define MAX_FIN_SIZE	(NETHDR_SIZE + sizeof(struct tcphdr) + MAX_HEADER + 15)
#define BASE_ACK_SIZE	(NETHDR_SIZE + MAX_HEADER + 15)
#define MAX_ACK_SIZE	(NETHDR_SIZE + sizeof(struct tcphdr) + MAX_HEADER + 15)
#define MAX_RESET_SIZE	(NETHDR_SIZE + sizeof(struct tcphdr) + MAX_HEADER + 15)
#define MAX_TCPHEADER_SIZE (NETHDR_SIZE + sizeof(struct tcphdr) + 20 + MAX_HEADER + 15)

/*
 * Never offer a window over 32767 without using window scaling.  Some
 * poor stacks do signed 16-bit maths!
 */
#define MAX_WINDOW	32767
#define MAX_DELAY_ACK	2

/*
 * How much of the receive buffer do we advertise
 * (the rest is reserved for headers and driver packet overhead)
 */
#define WINDOW_ADVERTISE_DIVISOR 2

/* urg_data states */
#define URG_VALID	0x0100
#define URG_NOTYET	0x0200
#define URG_READ	0x0400

#define TCP_RETR1	7	/*
				 * This is how many retries it does before it
				 * tries to figure out if the gateway is
				 * down.
				 */

#define TCP_RETR2	15	/*
				 * This should take at least
				 * 90 minutes to time out.
				 */

#define TCP_TIMEOUT_LEN	(15*60*HZ)	/* should be about 15 mins		*/
#define TCP_TIMEWAIT_LEN (60*HZ)	/* how long to wait to successfully
					 * close the socket, about 60 seconds	*/
#define TCP_FIN_TIMEOUT	(3*60*HZ)	/* BSD style FIN_WAIT2 deadlock breaker	*/

#define TCP_ACK_TIME	(3*HZ)		/* time to delay before sending an ACK	*/
#define TCP_WRITE_TIME	(30*HZ)		/* initial time to wait for an ACK,
					 * after last transmit			*/
#define TCP_TIMEOUT_INIT (3*HZ)		/* RFC 1122 initial timeout value	*/
#define TCP_SYN_RETRIES	10		/* number of times to retry opening a
					 * connection (TCP_RETR2-....)		*/
#define TCP_PROBEWAIT_LEN (1*HZ)	/* time to wait between probes when
					 * I've got something to write and
					 * there is no window			*/
#define TCP_KEEPALIVE_TIME (120*60*HZ)	/* two hours				*/
#define TCP_KEEPALIVE_PROBES 9		/* Max of 9 keepalive probes		*/
#define TCP_KEEPALIVE_INTVL (75*HZ)

#define MAX_TCP_KEEPIDLE	32767
#define MAX_TCP_KEEPINTVL	32767
#define MAX_TCP_KEEPCNT		127
#define MAX_TCP_SYNCNT		127

#define TCP_SYNACK_PERIOD (HZ/2)	/* How often to run the synack slow timer */
#define TCP_QUICK_TRIES	8		/* How often we try to retransmit, until
					 * we tell the link layer that something is
					 * wrong (e.g. that it can expire redirects) */

/* TIME_WAIT reaping mechanism. */
#define TCP_TWKILL_SLOTS	8	/* Please keep this a power of 2. */
#define TCP_TWKILL_PERIOD	((HZ*60)/TCP_TWKILL_SLOTS)

#define TCPOPT_NOP		1	/* Padding */
#define TCPOPT_EOL		0	/* End of options */
#define TCPOPT_MSS		2	/* Segment size negotiating */
#define TCPOPT_WINDOW		3	/* Window scaling */
#define TCPOPT_SACK_PERM	4	/* SACK Permitted */
#define TCPOPT_SACK		5	/* SACK Block */
#define TCPOPT_TIMESTAMP	8	/* Better RTT estimations/PAWS */

#define TCPOLEN_MSS		4
#define TCPOLEN_WINDOW		3
#define TCPOLEN_SACK_PERM	2
#define TCPOLEN_TIMESTAMP	10

/* But this is what stacks really send out. */
#define TCPOLEN_TSTAMP_ALIGNED		12
#define TCPOLEN_WSCALE_ALIGNED		4
#define TCPOLEN_SACKPERM_ALIGNED	4
#define TCPOLEN_SACK_BASE		2
#define TCPOLEN_SACK_BASE_ALIGNED	4
#define TCPOLEN_SACK_PERBLOCK		8
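/* Illustrative arithmetic (added for exposition, not part of the original
 * header): the aligned option sizes above add up to the 20-byte SYN option
 * budget mentioned earlier -- MSS (4) plus timestamps padded to 12 (which also
 * carry SACK-permitted when both are offered, as tcp_syn_build_options() below
 * shows) plus window scale padded to 4.
 */
#if 0
enum {
	demo_syn_opt_bytes = 4	/* TCPOLEN_MSS */
			   + 12	/* TCPOLEN_TSTAMP_ALIGNED */
			   + 4	/* TCPOLEN_WSCALE_ALIGNED */
	/* == 20, the "maximum TCP options size we can currently construct on a SYN" */
};
#endif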
#define TIME_WRITE	1	/* Not yet used */
#define TIME_RETRANS	2	/* Retransmit timer */
#define TIME_DACK	3	/* Delayed ack timer */
#define TIME_PROBE0	4
#define TIME_KEEPOPEN	5

/* sysctl variables for tcp */
extern int sysctl_tcp_keepalive_time;
extern int sysctl_tcp_keepalive_probes;
extern int sysctl_tcp_keepalive_intvl;
extern int sysctl_tcp_syn_retries;

struct or_calltable {
	void	(*rtx_syn_ack)	(struct sock *sk, struct open_request *req);
	void	(*send_ack)	(struct sk_buff *skb, struct open_request *req);
	void	(*destructor)	(struct open_request *req);
	void	(*send_reset)	(struct sk_buff *skb);
};

struct tcp_v4_open_req {
	struct ip_options	*opt;
};

#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
struct tcp_v6_open_req {
	struct in6_addr		loc_addr;
	struct in6_addr		rmt_addr;
	struct sk_buff		*pktopts;
};
#endif

/* this structure is too big */
struct open_request {
	struct open_request	*dl_next;	/* Must be first member! */
	unsigned		snd_wscale : 4;
	/* The following two fields can be easily recomputed I think -AK */
	__u32			window_clamp;	/* window clamp at creation time */
	__u32			rcv_wnd;	/* rcv_wnd offered first time */
	unsigned long		expires;
	struct or_calltable	*class;
	struct tcp_v4_open_req	v4_req;
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
	struct tcp_v6_open_req	v6_req;
#endif
};

/* SLAB cache for open requests. */
extern kmem_cache_t *tcp_openreq_cachep;

#define tcp_openreq_alloc()	kmem_cache_alloc(tcp_openreq_cachep, SLAB_ATOMIC)
#define tcp_openreq_free(req)	kmem_cache_free(tcp_openreq_cachep, req)

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
#define TCP_INET_FAMILY(fam)	((fam) == AF_INET)
#else
#define TCP_INET_FAMILY(fam)	1
#endif

/*
 * Pointers to address related TCP functions
 * (i.e. things that depend on the address family)
 *
 * BUGGG_FUTURE: the whole idea behind this struct is wrong.
 * It mixes socket frontend with transport function.
 * With port sharing between IPv6/v4 its only advantage is that
 * poor IPv6 needs to permanently recheck that it
 * is still IPv6 8)8)  It must be cleaned up as soon as possible.
 */
struct tcp_func {
	int			(*queue_xmit)		(struct sk_buff *skb);

	void			(*send_check)		(struct sock *sk,
							 struct tcphdr *th,
							 int len,
							 struct sk_buff *skb);

	int			(*rebuild_header)	(struct sock *sk);

	int			(*conn_request)		(struct sock *sk,
							 struct sk_buff *skb);

	struct sock *		(*syn_recv_sock)	(struct sock *sk,
							 struct open_request *req,
							 struct dst_entry *dst);

	int			(*hash_connecting)	(struct sock *sk);

	__u16			net_header_len;

	int			(*setsockopt)		(struct sock *sk,
							 int level,
							 int optname,
							 char *optval,
							 int optlen);

	int			(*getsockopt)		(struct sock *sk,
							 int level,
							 int optname,
							 char *optval,
							 int *optlen);

	void			(*addr2sockaddr)	(struct sock *sk,
							 struct sockaddr *addr);
};

/*
 * The next routines deal with comparing 32 bit unsigned ints
 * and worry about wraparound (automatic with unsigned arithmetic).
 */
extern __inline int before(__u32 seq1, __u32 seq2)
{
	return (__s32)(seq1 - seq2) < 0;
}

extern __inline int after(__u32 seq1, __u32 seq2)
{
	return (__s32)(seq2 - seq1) < 0;
}

/* is s2<=s1<=s3 ? */
extern __inline int between(__u32 seq1, __u32 seq2, __u32 seq3)
{
	return seq3 - seq2 >= seq1 - seq2;
}
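/* Illustrative, self-contained example (added for exposition, not part of the
 * original header): the signed-difference trick above keeps working across
 * sequence number wraparound, and between() reduces both ends to the same
 * unsigned origin before comparing.
 */
#if 0
static int demo_before(unsigned int a, unsigned int b)
{
	return (int)(a - b) < 0;
}

static void demo_wraparound(void)
{
	/* 0xfffffff0 is "before" 0x10: numerically larger, but the sequence
	 * space has wrapped, so the signed difference is negative.
	 */
	int r1 = demo_before(0xfffffff0u, 0x00000010u);	/* r1 == 1 */
	int r2 = demo_before(0x00000010u, 0xfffffff0u);	/* r2 == 0 */
	(void)r1;
	(void)r2;
}
#endif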
extern struct proto tcp_prot;
extern struct tcp_mib tcp_statistics;

extern void tcp_put_port(struct sock *sk);
extern void __tcp_put_port(struct sock *sk);
extern void tcp_inherit_port(struct sock *sk, struct sock *child);

extern void tcp_v4_err(struct sk_buff *skb);

extern void tcp_shutdown(struct sock *sk, int how);

extern int tcp_v4_rcv(struct sk_buff *skb);

extern int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg);

extern int tcp_ioctl(struct sock *sk);

extern int tcp_rcv_state_process(struct sock *sk);

extern int tcp_rcv_established(struct sock *sk);

extern enum tcp_tw_status tcp_timewait_state_process(struct tcp_tw_bucket *tw);

extern struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
				  struct open_request *req,
				  struct open_request *prev);

extern void tcp_close(struct sock *sk);
extern struct sock *tcp_accept(struct sock *sk, int flags, int *err);
extern unsigned int tcp_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait);
extern void tcp_write_space(struct sock *sk);

extern int tcp_getsockopt(struct sock *sk, int level,
			  int optname, char *optval);
extern int tcp_setsockopt(struct sock *sk, int level,
			  int optname, char *optval);
extern void tcp_set_keepalive(struct sock *sk, int val);
extern int tcp_recvmsg(struct sock *sk,
		       int len, int nonblock,
		       int flags, int *addr_len);

extern void tcp_parse_options(struct sock *sk, struct tcphdr *th,
			      struct tcp_opt *tp, int no_fancy);

/*
 *	TCP v4 functions exported for the inet6 API
 */
extern int tcp_v4_rebuild_header(struct sock *sk);

extern int tcp_v4_build_header(struct sock *sk,
			       struct sk_buff *skb);

extern void tcp_v4_send_check(struct sock *sk,
			      struct tcphdr *th, int len,
			      struct sk_buff *skb);

extern int tcp_v4_conn_request(struct sock *sk,
			       struct sk_buff *skb);

extern struct sock *tcp_create_openreq_child(struct sock *sk,
					     struct open_request *req,
					     struct sk_buff *skb);

extern struct sock *tcp_v4_syn_recv_sock(struct sock *sk,
					 struct open_request *req,
					 struct dst_entry *dst);

extern int tcp_v4_do_rcv(struct sock *sk,
			 struct sk_buff *skb);

extern int tcp_v4_connect(struct sock *sk,
			  struct sockaddr *uaddr);

extern int tcp_connect(struct sock *sk,
		       struct sk_buff *skb);

extern struct sk_buff *tcp_make_synack(struct sock *sk,
				       struct dst_entry *dst,
				       struct open_request *req);

extern int tcp_disconnect(struct sock *sk, int flags);

extern void tcp_unhash(struct sock *sk);

extern int tcp_v4_hash_connecting(struct sock *sk);

/* From syncookies.c */
extern struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
				    struct ip_options *opt);
extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb);
extern void tcp_read_wakeup(struct sock *);
extern void tcp_write_xmit(struct sock *);
extern void tcp_time_wait(struct sock *);
extern int tcp_retransmit_skb(struct sock *, struct sk_buff *);
extern void tcp_fack_retransmit(struct sock *);
extern void tcp_xmit_retransmit_queue(struct sock *);
extern void tcp_simple_retransmit(struct sock *);

extern void tcp_send_probe0(struct sock *);
extern void tcp_send_partial(struct sock *);
extern void tcp_write_wakeup(struct sock *);
extern void tcp_send_fin(struct sock *sk);
extern void tcp_send_active_reset(struct sock *sk, int priority);
extern int tcp_send_synack(struct sock *);
extern void tcp_transmit_skb(struct sock *, struct sk_buff *);
extern void tcp_send_skb(struct sock *, struct sk_buff *, int force_queue);
extern void tcp_send_ack(struct sock *sk);
extern void tcp_send_delayed_ack(struct sock *sk, int max_timeout);

extern void tcp_reset_xmit_timer(struct sock *, int, unsigned long);
extern void tcp_init_xmit_timers(struct sock *);
extern void tcp_clear_xmit_timers(struct sock *);

extern void tcp_retransmit_timer(unsigned long);
extern void tcp_delack_timer(unsigned long);
extern void tcp_probe_timer(unsigned long);

extern void tcp_delete_keepalive_timer(struct sock *);
extern void tcp_reset_keepalive_timer(struct sock *, unsigned long);
extern void tcp_keepalive_timer(unsigned long);

extern struct timer_list tcp_slow_timer;

struct tcp_sl_timer {
	atomic_t	count;
	unsigned long	period;
	void		(*handler) (unsigned long);
};

#define TCP_SLT_SYNACK		0
#define TCP_SLT_TWKILL		1
#define TCP_SLT_MAX		2

extern struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX];

extern int tcp_sync_mss(struct sock *sk, u32 pmtu);
/* Compute the current effective MSS, taking SACKs and IP options,
 * and even PMTU discovery events into account.
 */
static __inline__ unsigned int tcp_current_mss(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct dst_entry *dst = sk->dst_cache;
	int mss_now = tp->mss_cache;

	if (dst && dst->pmtu != tp->pmtu_cookie)
		mss_now = tcp_sync_mss(sk, dst->pmtu);

	if (tp->sack_ok && tp->num_sacks)
		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
			    (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
	return mss_now > 8 ? mss_now : 8;
}
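/* Illustrative arithmetic (added for exposition, not part of the original
 * header): with SACK in use the sender sheds option space from the MSS, e.g.
 * two in-flight SACK blocks cost 4 + 2*8 = 20 bytes, so a 1460-byte MSS
 * temporarily becomes 1440 for data.
 */
#if 0
static int demo_mss_with_sacks(int mss_cache, int num_sacks)
{
	int mss_now = mss_cache;

	if (num_sacks)
		mss_now -= 4 /* TCPOLEN_SACK_BASE_ALIGNED */ +
			   num_sacks * 8 /* TCPOLEN_SACK_PERBLOCK */;
	return mss_now > 8 ? mss_now : 8;
}
#endif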
/* Initialize RCV_MSS value.
 * RCV_MSS is our guess about the MSS used by the peer.
 * We don't have any direct information about the MSS.
 * It's better to underestimate RCV_MSS rather than overestimate it.
 * Overestimations make us ACK less frequently than needed.
 * Underestimations are easier to detect and fix by tcp_measure_rcv_mss().
 */
extern __inline__ void tcp_initialize_rcv_mss(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct dst_entry *dst = __sk_dst_get(sk);

	tp->rcv_mss = max(min(mss, 536), 8);
}
/* Compute the actual receive window we are currently advertising.
 * Rcv_nxt can be after the window if our peer pushes more data
 * than the offered window.
 */
static __inline__ u32 tcp_receive_window(struct tcp_opt *tp)
{
	s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;

	if (win < 0)
		win = 0;
	return (u32) win;
}

/* Choose a new window, without checks for shrinking, and without
 * scaling applied to the result.  The caller does these things
 * if necessary.  This is a "raw" window selection.
 */
extern u32 __tcp_select_window(struct sock *sk);

/* Choose a new window to advertise, update state in tcp_opt for the
 * socket, and return result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
extern __inline__ u16 tcp_select_window(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	/* Never shrink the offered window */
	if (new_win < cur_win) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero
		 * window in time.  --DaveM
		 */
		new_win = cur_win;
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* RFC1323 scaling applied */
	return new_win >> tp->rcv_wscale;
}
/* See if we can advertise non-zero, and if so how much we
 * can increase our advertisement.  If it becomes more than
 * twice what we are talking about right now, return true.
 */
extern __inline__ int tcp_raise_window(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	return (new_win && (new_win > (cur_win << 1)));
}

/* TCP timestamps are only 32-bits, this causes a slight
 * complication on 64-bit systems since we store a snapshot
 * of jiffies in the buffer control blocks below.  We decidedly
 * use only the low 32 bits of jiffies and hide the ugly
 * casts with the following macro.
 */
#define tcp_time_stamp		((__u32)(jiffies))

/* This is what the send packet queueing engine uses to pass
 * TCP per-packet control information to the transmission
 * code.  We also store the host-order sequence numbers in
 * here too.  This is 36 bytes on 32-bit architectures,
 * 40 bytes on 64-bit machines, if this grows please adjust
 * skbuff.h:skbuff->cb[xxx] size appropriately.
 */
struct tcp_skb_cb {
	union {
		struct inet_skb_parm	h4;
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
		struct inet6_skb_parm	h6;
#endif
	} header;	/* For incoming frames		*/
	__u32		seq;		/* Starting sequence number	*/
	__u32		end_seq;	/* SEQ + FIN + SYN + datalen	*/
	__u32		when;		/* used to compute rtt's	*/
	__u8		flags;		/* TCP header flags.		*/

	/* NOTE: These must match up to the flags byte in a
	 *       real TCP header.
	 */
#define TCPCB_FLAG_FIN		0x01
#define TCPCB_FLAG_SYN		0x02
#define TCPCB_FLAG_RST		0x04
#define TCPCB_FLAG_PSH		0x08
#define TCPCB_FLAG_ACK		0x10
#define TCPCB_FLAG_URG		0x20

	__u8		sacked;		/* State flags for SACK/FACK.	*/
#define TCPCB_SACKED_ACKED	0x01	/* SKB ACK'd by a SACK block	*/
#define TCPCB_SACKED_RETRANS	0x02	/* SKB retransmitted		*/

	__u16		urg_ptr;	/* Valid w/ URG flag set.	*/
	__u32		ack_seq;	/* Sequence number ACK'd	*/
};

#define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
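/* Illustrative, self-contained sketch (added for exposition, not part of the
 * original header): per-packet TCP state rides inside the skb's fixed-size
 * cb[] scratch area via a cast, which is exactly what TCP_SKB_CB() does.
 * The demo types below are stand-ins, not kernel structures; a real user must
 * keep the control structure no larger than the cb[] array.
 */
#if 0
struct demo_skb {
	char cb[48];			/* opaque per-packet scratch space */
};

struct demo_tcp_cb {
	unsigned int	seq, end_seq;	/* host-order sequence numbers */
	unsigned char	flags, sacked;
};

#define DEMO_TCP_CB(__skb) ((struct demo_tcp_cb *)&((__skb)->cb[0]))

static void demo_tag_packet(struct demo_skb *skb, unsigned int seq, unsigned int len)
{
	DEMO_TCP_CB(skb)->seq = seq;
	DEMO_TCP_CB(skb)->end_seq = seq + len;
}
#endif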
/* This determines how many packets are "in the network" to the best
 * of our knowledge.  In many cases it is conservative, but where
 * detailed information is available from the receiver (via SACK
 * blocks etc.) we can make more aggressive calculations.
 *
 * Use this for decisions involving congestion control, use just
 * tp->packets_out to determine if the send queue is empty or not.
 *
 * Read this equation as:
 *
 *	"Packets sent once on transmission queue" MINUS
 *	"Packets acknowledged by FACK information" PLUS
 *	"Packets fast retransmitted"
 */
static __inline__ int tcp_packets_in_flight(struct tcp_opt *tp)
{
	return tp->packets_out - tp->fackets_out + tp->retrans_out;
}

/* Recalculate snd_ssthresh, we want to set it to:
 *
 *	one half the current congestion window, but no
 *	less than two segments
 *
 * We must take into account the current send window
 * as well, however we keep track of that using different
 * units so a conversion is necessary.  -DaveM
 *
 * RFC 2581: "an easy mistake to make is to simply use cwnd,
 * rather than FlightSize"
 * I see no references to FlightSize here.  snd_wnd is not FlightSize,
 * it is also an a priori characteristic.
 *
 * FlightSize = min((snd_nxt - snd_una) / mss, packets_out) ?
 */
extern __inline__ __u32 tcp_recalc_ssthresh(struct tcp_opt *tp)
{
	u32 FlightSize = (tp->snd_nxt - tp->snd_una) / tp->mss_cache;

	FlightSize = min(FlightSize, tcp_packets_in_flight(tp));

	return max(min(FlightSize, tp->snd_cwnd) >> 1, 2);
}
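/* Illustrative arithmetic (added for exposition, not part of the original
 * header): halving min(FlightSize, cwnd) with a floor of two segments.
 */
#if 0
static unsigned int demo_min(unsigned int a, unsigned int b) { return a < b ? a : b; }
static unsigned int demo_max(unsigned int a, unsigned int b) { return a > b ? a : b; }

static unsigned int demo_recalc_ssthresh(unsigned int flight_size, unsigned int snd_cwnd)
{
	/* e.g. flight_size = 9, snd_cwnd = 16  ->  ssthresh = 4
	 *      flight_size = 1, snd_cwnd = 16  ->  ssthresh = 2 (the floor)
	 */
	return demo_max(demo_min(flight_size, snd_cwnd) >> 1, 2);
}
#endif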
/* This checks if the data bearing packet SKB (usually tp->send_head)
 * should be put on the wire right now.
 */
static __inline__ int tcp_snd_test(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int nagle_check = 1;

	/* RFC 1122 - section 4.2.3.4
	 *
	 * We must queue if:
	 *
	 * a) The right edge of this frame exceeds the window
	 * b) There are packets in flight and we have a small segment
	 *    [SWS avoidance and Nagle algorithm]
	 *    (part of SWS is done on packetization)
	 * c) We are retransmitting [Nagle]
	 * d) We have too many packets 'in flight'
	 *
	 * Don't use the nagle rule for urgent data (or
	 * for the final FIN -DaveM).
	 */
	if ((sk->nonagle == 2 && (skb->len < tp->mss_cache)) ||
	    (!sk->nonagle &&
	     skb->len < (tp->mss_cache >> 1) &&
	     tp->packets_out &&
	     !(TCP_SKB_CB(skb)->flags & (TCPCB_FLAG_URG|TCPCB_FLAG_FIN))))
		nagle_check = 0;

	/*
	 * Reset CWND after an idle period longer than rto.  Actually, it would
	 * be better to save the last send time, but VJ in SIGCOMM'88 proposes
	 * to use the keepalive timestamp.  Well, it is not good, certainly,
	 * because SMTP is still broken, but it is better than nothing yet.
	 */
	if (tp->packets_out == 0 && (s32)(tcp_time_stamp - tp->rcv_tstamp) > tp->rto)
		tp->snd_cwnd = min(tp->snd_cwnd, 2);

	/* Don't be strict about the congestion window for the
	 * final FIN frame.  -DaveM
	 */
	return (nagle_check &&
		((tcp_packets_in_flight(tp) < tp->snd_cwnd) ||
		 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) &&
		!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
		tp->retransmits == 0);
}
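/* Illustrative, self-contained sketch (added for exposition, not part of the
 * original header): the shape of the final decision above -- Nagle says go,
 * the congestion window has room (relaxed for a FIN), the right edge fits in
 * the receiver's window, and nothing is awaiting retransmission.
 */
#if 0
static int demo_snd_test(int nagle_ok, unsigned int in_flight, unsigned int cwnd,
			 int is_fin, unsigned int end_seq, unsigned int snd_una,
			 unsigned int snd_wnd, unsigned int retransmits)
{
	int cwnd_ok   = (in_flight < cwnd) || is_fin;
	int window_ok = !((int)(end_seq - (snd_una + snd_wnd)) > 0);

	return nagle_ok && cwnd_ok && window_ok && retransmits == 0;
}
#endif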
/* Push out any pending frames which were held back due to
 * TCP_CORK or attempt at coalescing tiny packets.
 * The socket must be locked by the caller.
 */
static __inline__ void tcp_push_pending_frames(struct sock *sk, struct tcp_opt *tp)
{
	if (tp->send_head) {
		if (tcp_snd_test(sk, tp->send_head))
			tcp_write_xmit(sk);
		else if (tp->packets_out == 0 && !tp->pending) {
			/* We held off on this in tcp_send_skb() */
			tp->pending = TIME_PROBE0;
			tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
		}
	}
}

/* This tells the input processing path that an ACK should go out
 * now.
 */
#define tcp_enter_quickack_mode(__tp)	((__tp)->ato |= (1 << 31))
#define tcp_exit_quickack_mode(__tp)	((__tp)->ato &= ~(1 << 31))
#define tcp_in_quickack_mode(__tp)	(((__tp)->ato & (1 << 31)) != 0)
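/* Illustrative, self-contained sketch (added for exposition, not part of the
 * original header): quick-ack state is kept in the top bit of the ato field,
 * as the three macros above do, so no separate flag word is needed.
 */
#if 0
#define DEMO_QUICKACK_BIT	(1U << 31)

static unsigned int demo_enter_quickack(unsigned int ato) { return ato | DEMO_QUICKACK_BIT; }
static unsigned int demo_exit_quickack(unsigned int ato)  { return ato & ~DEMO_QUICKACK_BIT; }
static int demo_in_quickack(unsigned int ato)             { return (ato & DEMO_QUICKACK_BIT) != 0; }
#endif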
/*
 * List all states of a TCP socket that can be viewed as a "connected"
 * state.  This now includes TCP_SYN_RECV, although I am not yet fully
 * convinced that this is the solution for the 'getpeername(2)'
 * problem.  Thanks to Stephen A. Wood <saw@cebaf.gov>  -FvK
 */
extern __inline const int tcp_connected(const int state)
{
	return ((1 << state) &
		(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
		 TCPF_FIN_WAIT2 | TCPF_SYN_RECV));
}

extern __inline const int tcp_established(const int state)
{
	return ((1 << state) &
		(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
		 TCPF_FIN_WAIT2));
}

extern void tcp_destroy_sock(struct sock *sk);

/*
 * Calculate(/check) TCP checksum
 */
static __inline__ u16 tcp_v4_check(struct tcphdr *th, int len,
				   unsigned long saddr, unsigned long daddr,
				   unsigned long base)
{
	return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base);
}
static char *statename[] = {
	"Unused", "Established", "Syn Sent", "Syn Recv",
	"Fin Wait 1", "Fin Wait 2", "Time Wait", "Close",
	"Close Wait", "Last ACK", "Listen", "Closing"
};

static __inline__ void tcp_set_state(struct sock *sk, int state)
{
	int oldstate = sk->state;

	switch (state) {
	case TCP_ESTABLISHED:
		if (oldstate != TCP_ESTABLISHED)
			tcp_statistics.TcpCurrEstab++;
		break;

	case TCP_CLOSE:
		sk->prot->unhash(sk);
		/* fall through */
	default:
		if (oldstate == TCP_ESTABLISHED)
			tcp_statistics.TcpCurrEstab--;
	}

	/* Change state AFTER socket is unhashed to avoid closed
	 * socket sitting in hash tables.
	 */
	sk->state = state;

	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk,
		   statename[oldstate], statename[state]);
}

static __inline__ void tcp_done(struct sock *sk)
{
	sk->shutdown = SHUTDOWN_MASK;

	sk->state_change(sk);

	tcp_destroy_sock(sk);
}

static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_opt *tp, __u32 tstamp)
{
	int this_sack;

	*ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
				  (TCPOPT_NOP << 16) |
				  (TCPOPT_TIMESTAMP << 8) |
				  TCPOLEN_TIMESTAMP);
	*ptr++ = htonl(tstamp);
	*ptr++ = htonl(tp->ts_recent);

	if (tp->sack_ok && tp->num_sacks) {
		*ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_SACK << 8) |
					  (TCPOLEN_SACK_BASE +
					   (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)));
		for (this_sack = 0; this_sack < tp->num_sacks; this_sack++) {
			*ptr++ = htonl(tp->selective_acks[this_sack].start_seq);
			*ptr++ = htonl(tp->selective_acks[this_sack].end_seq);
		}
	}
}
/* Construct a tcp options header for a SYN or SYN_ACK packet.
 * If this is ever changed make sure to change the definition of
 * MAX_SYN_SIZE to match the new maximum number of options that you
 * can generate.
 */
extern __inline__ void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack,
					     int offer_wscale, int wscale,
					     __u32 tstamp, __u32 ts_recent)
{
	/* We always get an MSS option.
	 * The option bytes which will be seen in normal data
	 * packets should timestamps be used, must be in the MSS
	 * advertised.  But we subtract them from tp->mss_cache so
	 * that calculations in tcp_sendmsg are simpler etc.
	 * So account for this fact here if necessary.  If we
	 * don't do this correctly, as a receiver we won't
	 * recognize data packets as being full sized when we
	 * should, and thus we won't abide by the delayed ACK
	 * rules.
	 * SACKs don't matter, we never delay an ACK when we
	 * have any of those going out.
	 */
	*ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
	if (ts) {
		if (sack)
			*ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) |
						  (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
		else
			*ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
						  (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
		*ptr++ = htonl(tstamp);		/* TSVAL */
		*ptr++ = htonl(ts_recent);	/* TSECR */
	} else if (sack)
		*ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
					  (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM);
	if (offer_wscale)
		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale));
}
/* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space
 * will be offered.  Store the results in the tp structure.
 * NOTE: for smooth operation initial space offering should
 * be a multiple of mss if possible.  We assume here that mss >= 1.
 * This MUST be enforced by all callers.
 */
extern __inline__ void tcp_select_initial_window(int space, __u32 mss,
						 __u32 *rcv_wnd,
						 __u32 *window_clamp,
						 __u8 *rcv_wscale)
{
	/* If no clamp set the clamp to the max possible scaled window */
	if (*window_clamp == 0)
		(*window_clamp) = (65535 << 14);
	space = min(*window_clamp, space);

	/* Quantize space offering to a multiple of mss if possible. */
	space = (space / mss) * mss;

	/* NOTE: offering an initial window larger than 32767
	 * will break some buggy TCP stacks.  We try to be nice.
	 * If we are not window scaling, then this truncates
	 * our initial window offering to 32k.  There should also
	 * be a sysctl option to stop being nice.
	 */
	(*rcv_wnd) = min(space, MAX_WINDOW);

	(*rcv_wscale) = 0;
	/* See RFC1323 for an explanation of the limit to 14 */
	while (space > 65535 && (*rcv_wscale) < 14) {
		space >>= 1;
		(*rcv_wscale)++;
	}

	/* Set the clamp no higher than max representable value */
	(*window_clamp) = min(65535 << (*rcv_wscale), *window_clamp);
}
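/* Illustrative, self-contained sketch (added for exposition, not part of the
 * original header): picking the window scale the same way as the loop above,
 * shifting until the space fits the 16-bit window field, capped at 14 per
 * RFC 1323.  E.g. a 128 KB buffer ends up with a scale of 2.
 */
#if 0
static unsigned int demo_pick_wscale(unsigned int space)
{
	unsigned int wscale = 0;

	while (space > 65535 && wscale < 14) {
		space >>= 1;
		wscale++;
	}
	return wscale;	/* demo_pick_wscale(131072) == 2 */
}
#endif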
/* Note: caller must be prepared to deal with negative returns */
extern __inline__ int tcp_space(struct sock *sk)
{
	return (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) /
		WINDOW_ADVERTISE_DIVISOR;
}

extern __inline__ int tcp_full_space(struct sock *sk)
{
	return sk->rcvbuf / WINDOW_ADVERTISE_DIVISOR;
}
extern __inline__ void tcp_synq_unlink(struct tcp_opt *tp, struct open_request *req, struct open_request *prev)
{
	if (!req->dl_next)
		tp->syn_wait_last = (struct open_request **)prev;
	prev->dl_next = req->dl_next;
}

extern __inline__ void tcp_synq_queue(struct tcp_opt *tp, struct open_request *req)
{
	req->dl_next = NULL;
	*tp->syn_wait_last = req;
	tp->syn_wait_last = &req->dl_next;
}

extern __inline__ void tcp_synq_init(struct tcp_opt *tp)
{
	tp->syn_wait_queue = NULL;
	tp->syn_wait_last = &tp->syn_wait_queue;
}
extern void __tcp_inc_slow_timer(struct tcp_sl_timer *slt);

extern __inline__ void tcp_inc_slow_timer(int timer)
{
	struct tcp_sl_timer *slt = &tcp_slt_array[timer];

	if (atomic_read(&slt->count) == 0)
		__tcp_inc_slow_timer(slt);

	atomic_inc(&slt->count);
}

extern __inline__ void tcp_dec_slow_timer(int timer)
{
	struct tcp_sl_timer *slt = &tcp_slt_array[timer];

	atomic_dec(&slt->count);
}

extern const char timer_bug_msg[];
static inline void tcp_clear_xmit_timer(struct sock *sk, int what)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct timer_list *timer;

	switch (what) {
	case TIME_RETRANS:
		timer = &tp->retransmit_timer;
		break;
	case TIME_DACK:
		timer = &tp->delack_timer;
		break;
	case TIME_PROBE0:
		timer = &tp->probe_timer;
		break;
	default:
		printk(timer_bug_msg);
		return;
	}

	spin_lock_bh(&sk->timer_lock);
	if (timer->prev != NULL)
		del_timer(timer);
	spin_unlock_bh(&sk->timer_lock);
}
/* This function does not return a reliable answer.  Use it only as advice.
 */
static inline int tcp_timer_is_set(struct sock *sk, int what)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int ret = 0;

	switch (what) {
	case TIME_RETRANS:
		ret = tp->retransmit_timer.prev != NULL;
		break;
	case TIME_DACK:
		ret = tp->delack_timer.prev != NULL;
		break;
	case TIME_PROBE0:
		ret = tp->probe_timer.prev != NULL;
		break;
	default:
		printk(timer_bug_msg);
	}
	return ret;
}

extern void tcp_listen_wlock(void);

/* - We may sleep inside this lock.
 * - If sleeping is not required (or called from BH),
 *   use plain read_(un)lock(&tcp_lhash_lock).
 */
extern __inline__ void tcp_listen_lock(void)
{
	/* read_lock synchronizes with candidate writers */
	read_lock(&tcp_lhash_lock);
	atomic_inc(&tcp_lhash_users);
	read_unlock(&tcp_lhash_lock);
}

extern __inline__ void tcp_listen_unlock(void)
{
	if (atomic_dec_and_test(&tcp_lhash_users))
		wake_up(&tcp_lhash_wait);
}

static inline int keepalive_intvl_when(struct tcp_opt *tp)
{
	if (tp->keepalive_intvl)
		return tp->keepalive_intvl;

	return sysctl_tcp_keepalive_intvl;
}

static inline int keepalive_time_when(struct tcp_opt *tp)
{
	if (tp->keepalive_time)
		return tp->keepalive_time;

	return sysctl_tcp_keepalive_time;
}