/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Definitions for the TCP module.
 *
 * Version:     @(#)tcp.h       1.0.5   05/23/93
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */
#ifndef _TCP_H
#define _TCP_H

#include <linux/config.h>
#include <linux/tcp.h>
#include <linux/slab.h>
#include <net/checksum.h>
/* This is for all connections with a full identity, no wildcards.
 * New scheme, half the table is for TIME_WAIT, the other half is
 * for the rest.  I'll experiment with dynamic table growth later.
 */
#define TCP_HTABLE_SIZE         512

/* This is for listening sockets, thus all sockets which possess wildcards. */
#define TCP_LHTABLE_SIZE        32      /* Yes, really, this is all you need. */

/* This is for all sockets, to keep track of the local port allocations. */
#define TCP_BHTABLE_SIZE        512

/* tcp_ipv4.c: These need to be shared by v4 and v6 because the lookup
 *             and hashing code needs to work with different AF's yet
 *             the port space is shared.
 */
extern struct sock *tcp_established_hash[TCP_HTABLE_SIZE];
extern struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
/* There are a few simple rules, which allow for local port reuse by
 * an application.  In essence:
 *
 *      1) Sockets bound to different interfaces may share a local port.
 *         Failing that, goto test 2.
 *      2) If all sockets have sk->reuse set, and none of them are in
 *         TCP_LISTEN state, the port may be shared.
 *         Failing that, goto test 3.
 *      3) If all sockets are bound to a specific sk->rcv_saddr local
 *         address, and none of them are the same, the port may be
 *         shared.
 *         Failing this, the port cannot be shared.
 *
 * The interesting point, is test #2.  This is what an FTP server does
 * all day.  To optimize this case we use a specific flag bit defined
 * below.  As we add sockets to a bind bucket list, we perform a
 * check of: (newsk->reuse && (newsk->state != TCP_LISTEN))
 * As long as all sockets added to a bind bucket pass this test,
 * the flag bit will be set.
 * The resulting situation is that tcp_v[46]_verify_bind() can just check
 * for this flag bit, if it is set and the socket trying to bind has
 * sk->reuse set, we don't even have to walk the owners list at all,
 * we return that it is ok to bind this socket to the requested local port.
 *
 * Sounds like a lot of work, but it is worth it.  In a more naive
 * implementation (ie. current FreeBSD etc.) the entire list of ports
 * must be walked for each data port opened by an ftp server.  Needless
 * to say, this does not scale at all.  With a couple thousand FTP
 * users logged onto your box, isn't it nice to know that new data
 * ports are created in O(1) time?  I thought so. ;-)   -DaveM
 */
struct tcp_bind_bucket {
        unsigned short          port;
        unsigned short          flags;
#define TCPB_FLAG_LOCKED        0x0001
#define TCPB_FLAG_FASTREUSE     0x0002
        struct tcp_bind_bucket  *next;
        struct sock             *owners;
        struct tcp_bind_bucket  **pprev;
};
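/* Illustrative sketch (the helper name is hypothetical; the real check
 * lives in tcp_v[46]_verify_bind()): the O(1) fast path the comment
 * above describes reduces to a flag test plus the same reuse test.
 */
#if 0
static int tcp_bind_fastpath(struct sock *newsk, struct tcp_bind_bucket *tb)
{
        /* Every current owner passed (reuse && !listening), so a new
         * reusing, non-listening socket may share the port without
         * walking tb->owners at all.
         */
        return (tb->flags & TCPB_FLAG_FASTREUSE) &&
               newsk->reuse && (newsk->state != TCP_LISTEN);
}
#endif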
extern struct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE];
extern kmem_cache_t *tcp_bucket_cachep;
extern struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum);
extern void tcp_bucket_unlock(struct sock *sk);
extern int tcp_port_rover;
/* Level-1 socket-demux cache. */
#define TCP_NUM_REGS            32
extern struct sock *tcp_regs[TCP_NUM_REGS];

#define TCP_RHASH_FN(__fport) \
        ((((__fport) >> 7) ^ (__fport)) & (TCP_NUM_REGS - 1))
#define TCP_RHASH(__fport)      tcp_regs[TCP_RHASH_FN((__fport))]
#define TCP_SK_RHASH_FN(__sock) TCP_RHASH_FN((__sock)->dport)
#define TCP_SK_RHASH(__sock)    tcp_regs[TCP_SK_RHASH_FN((__sock))]
static __inline__ void tcp_reg_zap(struct sock *sk)
{
        struct sock **rpp;

        rpp = &(TCP_SK_RHASH(sk));
        if (*rpp == sk)
                *rpp = NULL;
}
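/* Sketch (an assumption about intended use, not code from this tree):
 * a receive-path fast demux would probe the cache by foreign port
 * before falling back to the full established-hash lookup; tcp_reg_zap()
 * above keeps a dying socket from being returned by such a probe.
 */
#if 0
        struct sock *sk = TCP_RHASH(th->source);
        /* A full identity check of sk against the segment goes here;
         * on a miss, fall back to the regular tcp_established_hash walk.
         */
#endif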
/* These are AF independent. */
static __inline__ int tcp_bhashfn(__u16 lport)
{
        return (lport & (TCP_BHTABLE_SIZE - 1));
}
static __inline__ void tcp_sk_bindify(struct sock *sk)
{
        struct tcp_bind_bucket *tb;
        unsigned short snum = sk->num;

        for (tb = tcp_bound_hash[tcp_bhashfn(snum)]; tb->port != snum; tb = tb->next)
                ;
        /* Update bucket flags. */
        if (tb->owners == NULL) {
                /* We're the first. */
                if (sk->reuse && sk->state != TCP_LISTEN)
                        tb->flags = TCPB_FLAG_FASTREUSE;
                else
                        tb->flags = 0;
        } else {
                if ((tb->flags & TCPB_FLAG_FASTREUSE) &&
                    ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
                        tb->flags &= ~TCPB_FLAG_FASTREUSE;
        }
        if ((sk->bind_next = tb->owners) != NULL)
                tb->owners->bind_pprev = &sk->bind_next;
        tb->owners = sk;
        sk->bind_pprev = &tb->owners;
        sk->prev = (struct sock *) tb;
}
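/* Usage sketch (hypothetical, simplified; error handling omitted):
 * after picking a local port and making sure a bind bucket exists via
 * tcp_bucket_create(), the socket is threaded onto that bucket:
 */
#if 0
        if (tcp_bucket_create(snum) != NULL) {
                sk->num = snum;
                tcp_sk_bindify(sk);
        }
#endif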
/* This is a TIME_WAIT bucket.  It works around the memory consumption
 * problems of sockets in such a state on heavily loaded servers, but
 * without violating the protocol specification.
 */
struct tcp_tw_bucket {
        /* These _must_ match the beginning of struct sock precisely.
         * XXX Yes I know this is gross, but I'd have to edit every single
         * XXX networking file if I created a "struct sock_header". -DaveM
         */
        struct sock             *sklist_next;
        struct sock             *sklist_prev;
        struct sock             *bind_next;
        struct sock             **bind_pprev;
        struct sock             *next;
        struct sock             **pprev;
        __u32                   daddr;
        __u32                   rcv_saddr;
        int                     bound_dev_if;
        unsigned short          num;
        unsigned char           state,
                                zapped;
        __u16                   sport;
        __u16                   dport;
        unsigned short          family;
        unsigned char           reuse,
                                nonagle;

        /* And these are ours. */
        __u32                   rcv_nxt;
        struct tcp_func         *af_specific;
        struct tcp_bind_bucket  *tb;
        struct timer_list       timer;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        struct in6_addr         v6_daddr;
        struct in6_addr         v6_rcv_saddr;
#endif
};
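/* Illustration (a sketch of the intended use, not code from this file):
 * since the members above mirror the start of struct sock, lookup code
 * that finds a TIME_WAIT "socket" can simply recast the pointer and
 * hand it to tcp_timewait_state_process(), declared below.
 */
#if 0
        if (sk->state == TCP_TIME_WAIT) {
                struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
                tcp_timewait_state_process(tw, skb, th, opt, len);
        }
#endif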
extern kmem_cache_t *tcp_timewait_cachep;

/* tcp_ipv4.c: These sysctl variables need to be shared between v4 and v6
 * because the v6 tcp code to initialize a connection needs to interoperate
 * with the v4 code using the same variables.
 * FIXME: It would be better to rewrite the connection code to be
 * address family independent and just leave one copy in the ipv4 section.
 * This would also clean up some code duplication. -- erics
 */
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_sack;
/* These can have wildcards, don't try too hard. */
static __inline__ int tcp_lhashfn(unsigned short num)
{
        return num & (TCP_LHTABLE_SIZE - 1);
}

static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
{
        return tcp_lhashfn(sk->num);
}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
#define NETHDR_SIZE     sizeof(struct ipv6hdr)
#else
#define NETHDR_SIZE     (sizeof(struct iphdr) + 40)
#endif
/*
 * 40 is maximal IP options size
 * 20 is the maximum TCP options size we can currently construct on a SYN.
 * 40 is the maximum possible TCP options size.
 */

#define MAX_SYN_SIZE    (NETHDR_SIZE + sizeof(struct tcphdr) + 20 + MAX_HEADER + 15)
#define MAX_FIN_SIZE    (NETHDR_SIZE + sizeof(struct tcphdr) + MAX_HEADER + 15)
#define BASE_ACK_SIZE   (NETHDR_SIZE + MAX_HEADER + 15)
#define MAX_ACK_SIZE    (NETHDR_SIZE + sizeof(struct tcphdr) + MAX_HEADER + 15)
#define MAX_RESET_SIZE  (NETHDR_SIZE + sizeof(struct tcphdr) + MAX_HEADER + 15)
#define MAX_TCPHEADER_SIZE (NETHDR_SIZE + sizeof(struct tcphdr) + 20 + MAX_HEADER + 15)
#define MAX_WINDOW      32767   /* Never offer a window over 32767 without using
                                 * window scaling (not yet supported). Some poor
                                 * stacks do signed 16bit maths! */
#define MIN_WINDOW      2048
#define MAX_ACK_BACKLOG 2
#define MAX_DELAY_ACK   2
#define MIN_WRITE_SPACE 2048
#define TCP_WINDOW_DIFF 2048

/* urg_data states */
#define URG_VALID       0x0100
#define URG_NOTYET      0x0200
#define URG_READ        0x0400
#define TCP_RETR1       7       /*
                                 * This is how many retries it does before it
                                 * tries to figure out if the gateway is
                                 * down.
                                 */

#define TCP_RETR2       15      /*
                                 * This should take at least
                                 * 90 minutes to time out.
                                 */

#define TCP_TIMEOUT_LEN (15*60*HZ)      /* should be about 15 mins              */
#define TCP_TIMEWAIT_LEN (60*HZ)        /* how long to wait to successfully
                                         * close the socket, about 60 seconds   */
#define TCP_FIN_TIMEOUT (3*60*HZ)       /* BSD style FIN_WAIT2 deadlock breaker */

#define TCP_ACK_TIME    (3*HZ)          /* time to delay before sending an ACK  */
#define TCP_DONE_TIME   (5*HZ/2)        /* maximum time to wait before actually
                                         * destroying a socket                  */
#define TCP_WRITE_TIME  (30*HZ)         /* initial time to wait for an ACK,
                                         * after last transmit                  */
#define TCP_TIMEOUT_INIT (3*HZ)         /* RFC 1122 initial timeout value       */
#define TCP_SYN_RETRIES 10              /* number of times to retry opening a
                                         * connection (TCP_RETR2-....)          */
#define TCP_PROBEWAIT_LEN (1*HZ)        /* time to wait between probes when
                                         * I've got something to write and
                                         * there is no window                   */
#define TCP_KEEPALIVE_TIME (180*60*HZ)  /* three hours                          */
#define TCP_KEEPALIVE_PROBES 9          /* Max of 9 keepalive probes            */
#define TCP_KEEPALIVE_PERIOD ((75*HZ)>>2) /* period of keepalive check          */
#define TCP_NO_CHECK    0               /* turn to one if you want the default
                                         * to be no checksum                    */

#define TCP_SYNACK_PERIOD       (HZ/2)
#define TCP_QUICK_TRIES         8       /* How often we try to retransmit, until
                                         * we tell the LL layer that it is something
                                         * wrong (e.g. that it can expire redirects) */

#define TCP_BUCKETGC_PERIOD     (HZ)
/*
 *      TCP option
 */

#define TCPOPT_NOP              1       /* Padding */
#define TCPOPT_EOL              0       /* End of options */
#define TCPOPT_MSS              2       /* Segment size negotiating */
#define TCPOPT_WINDOW           3       /* Window scaling */
#define TCPOPT_SACK_PERM        4       /* SACK Permitted */
#define TCPOPT_SACK             5       /* SACK Block */
#define TCPOPT_TIMESTAMP        8       /* Better RTT estimations/PAWS */

/*
 *      TCP option lengths
 */

#define TCPOLEN_MSS             4
#define TCPOLEN_WINDOW          3
#define TCPOLEN_SACK_PERM       2
#define TCPOLEN_TIMESTAMP       10

/* But this is what stacks really send out. */
#define TCPOLEN_TSTAMP_ALIGNED          12
#define TCPOLEN_WSCALE_ALIGNED          4
#define TCPOLEN_SACKPERM_ALIGNED        4
#define TCPOLEN_SACK_BASE               2
#define TCPOLEN_SACK_BASE_ALIGNED       4
#define TCPOLEN_SACK_PERBLOCK           8

/*
 *      TCP Vegas constants
 */

#define TCP_VEGAS_ALPHA         2       /* v_cong_detect_top_nseg */
#define TCP_VEGAS_BETA          4       /* v_cong_detect_bot_nseg */
#define TCP_VEGAS_GAMMA         1       /* v_exp_inc_nseg */
struct open_request;

struct or_calltable {
        void (*rtx_syn_ack)     (struct sock *sk, struct open_request *req);
        void (*destructor)      (struct open_request *req);
        void (*send_reset)      (struct sk_buff *skb);
};
struct tcp_v4_open_req {
        __u32                   loc_addr;
        __u32                   rmt_addr;
        struct ip_options       *opt;
};

#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
struct tcp_v6_open_req {
        struct in6_addr         loc_addr;
        struct in6_addr         rmt_addr;
        struct ipv6_options     *opt;
        int                     iif;
};
#endif
/* this structure is too big */
struct open_request {
        struct open_request     *dl_next;       /* Must be first member! */
        __u32                   rcv_isn;
        __u32                   snt_isn;
        __u16                   rmt_port;
        __u16                   mss;
        __u8                    retrans;
        __u8                    __pad;
        unsigned                snd_wscale : 4,
                                rcv_wscale : 4,
                                tstamp_ok : 1,
                                sack_ok : 1,
                                wscale_ok : 1;
        /* The following two fields can be easily recomputed I think -AK */
        __u32                   window_clamp;   /* window clamp at creation time */
        __u32                   rcv_wnd;        /* rcv_wnd offered first time */
        __u32                   ts_recent;
        unsigned long           expires;
        struct or_calltable     *class;
        struct sock             *sk;
        union {
                struct tcp_v4_open_req v4_req;
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
                struct tcp_v6_open_req v6_req;
#endif
        } af;
#ifdef CONFIG_IP_TRANSPARENT_PROXY
        __u16                   lcl_port;       /* LVE */
#endif
};

/* SLAB cache for open requests. */
extern kmem_cache_t *tcp_openreq_cachep;

#define tcp_openreq_alloc()     kmem_cache_alloc(tcp_openreq_cachep, SLAB_ATOMIC)
#define tcp_openreq_free(req)   kmem_cache_free(tcp_openreq_cachep, req)
/*
 *      Pointers to address related TCP functions
 *      (i.e. things that depend on the address family)
 */

struct tcp_func {
        void            (*queue_xmit)           (struct sk_buff *skb);

        void            (*send_check)           (struct sock *sk,
                                                 struct tcphdr *th,
                                                 int len,
                                                 struct sk_buff *skb);

        int             (*rebuild_header)       (struct sock *sk);

        int             (*conn_request)         (struct sock *sk,
                                                 struct sk_buff *skb,
                                                 void *opt, __u32 isn);

        struct sock *   (*syn_recv_sock)        (struct sock *sk,
                                                 struct sk_buff *skb,
                                                 struct open_request *req,
                                                 struct dst_entry *dst);

        struct sock *   (*get_sock)             (struct sk_buff *skb,
                                                 struct tcphdr *th);

        int             (*setsockopt)           (struct sock *sk,
                                                 int level,
                                                 int optname,
                                                 char *optval,
                                                 int optlen);

        int             (*getsockopt)           (struct sock *sk,
                                                 int level,
                                                 int optname,
                                                 char *optval,
                                                 int *optlen);

        void            (*addr2sockaddr)        (struct sock *sk,
                                                 struct sockaddr *);

        int sockaddr_len;
};
/*
 * The next routines deal with comparing 32 bit unsigned ints
 * and worry about wraparound (automatic with unsigned arithmetic).
 */

extern __inline int before(__u32 seq1, __u32 seq2)
{
        return (__s32)(seq1 - seq2) < 0;
}

extern __inline int after(__u32 seq1, __u32 seq2)
{
        return (__s32)(seq2 - seq1) < 0;
}

/* is s2<=s1<=s3 ? */
extern __inline int between(__u32 seq1, __u32 seq2, __u32 seq3)
{
        return seq3 - seq2 >= seq1 - seq2;
}
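/* Worked example (illustration only): 0xfffffff0 - 0x00000010 is
 * 0xffffffe0, which is negative as an __s32, so before() is true even
 * though the first value is numerically larger -- exactly what is
 * wanted when sequence numbers are about to wrap past 2^32.
 */
#if 0
        before(0xfffffff0, 0x00000010);                 /* 1 */
        after(0x00000010, 0xfffffff0);                  /* 1 */
        between(0x00000005, 0x00000001, 0x00000009);    /* 1: 1 <= 5 <= 9 */
#endif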
extern struct proto tcp_prot;
extern struct tcp_mib tcp_statistics;

extern unsigned short   tcp_good_socknum(void);

extern void     tcp_v4_err(struct sk_buff *skb,
                           unsigned char *, int);

extern void     tcp_shutdown(struct sock *sk, int how);

extern int      tcp_v4_rcv(struct sk_buff *skb,
                           unsigned short len);

extern int      tcp_do_sendmsg(struct sock *sk,
                               int iovlen, struct iovec *iov,
                               int flags);

extern int      tcp_ioctl(struct sock *sk,
                          int cmd,
                          unsigned long arg);

extern int      tcp_rcv_state_process(struct sock *sk,
                                      struct sk_buff *skb,
                                      struct tcphdr *th,
                                      void *opt, __u16 len);

extern int      tcp_rcv_established(struct sock *sk,
                                    struct sk_buff *skb,
                                    struct tcphdr *th,
                                    __u16 len);

extern int      tcp_timewait_state_process(struct tcp_tw_bucket *tw,
                                           struct sk_buff *skb,
                                           struct tcphdr *th,
                                           void *opt, __u16 len);

extern void     tcp_close(struct sock *sk,
                          unsigned long timeout);
extern struct sock *tcp_accept(struct sock *sk, int flags);
extern unsigned int tcp_poll(struct file *file, struct socket *sock,
                             struct poll_table_struct *wait);
extern int      tcp_getsockopt(struct sock *sk, int level,
                               int optname, char *optval,
                               int *optlen);
extern int      tcp_setsockopt(struct sock *sk, int level,
                               int optname, char *optval,
                               int optlen);
extern void     tcp_set_keepalive(struct sock *sk, int val);
extern int      tcp_recvmsg(struct sock *sk,
                            struct msghdr *msg,
                            int len, int nonblock,
                            int flags, int *addr_len);

extern void     tcp_parse_options(struct sock *sk, struct tcphdr *th,
                                  struct tcp_opt *tp, int no_fancy);
/*
 *      TCP v4 functions exported for the inet6 API
 */

extern int      tcp_v4_rebuild_header(struct sock *sk);

extern int      tcp_v4_build_header(struct sock *sk,
                                    struct sk_buff *skb);

extern void     tcp_v4_send_check(struct sock *sk,
                                  struct tcphdr *th, int len,
                                  struct sk_buff *skb);

extern int      tcp_v4_conn_request(struct sock *sk,
                                    struct sk_buff *skb,
                                    void *ptr, __u32 isn);

extern struct sock *tcp_create_openreq_child(struct sock *sk,
                                             struct open_request *req,
                                             struct sk_buff *skb,
                                             int mss);

extern struct sock *tcp_v4_syn_recv_sock(struct sock *sk,
                                         struct sk_buff *skb,
                                         struct open_request *req,
                                         struct dst_entry *dst);

extern int      tcp_v4_do_rcv(struct sock *sk,
                              struct sk_buff *skb);

extern int      tcp_v4_connect(struct sock *sk,
                               struct sockaddr *uaddr,
                               int addr_len);

extern void     tcp_connect(struct sock *sk,
                            struct sk_buff *skb,
                            int est_mss);

extern struct sk_buff *tcp_make_synack(struct sock *sk,
                                       struct dst_entry *dst,
                                       struct open_request *req,
                                       int mss);

/* From syncookies.c */
extern struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
                                    struct ip_options *opt);
extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb,
                                     __u16 *mss);

extern void tcp_read_wakeup(struct sock *);
extern void tcp_write_xmit(struct sock *);
extern void tcp_time_wait(struct sock *);
extern int  tcp_retransmit_skb(struct sock *, struct sk_buff *);
extern void tcp_xmit_retransmit_queue(struct sock *);
extern void tcp_simple_retransmit(struct sock *);
/* tcp_output.c */

extern void tcp_send_probe0(struct sock *);
extern void tcp_send_partial(struct sock *);
extern void tcp_write_wakeup(struct sock *);
extern void tcp_send_fin(struct sock *sk);
extern void tcp_send_active_reset(struct sock *sk);
extern int  tcp_send_synack(struct sock *);
extern void tcp_transmit_skb(struct sock *, struct sk_buff *);
extern void tcp_send_skb(struct sock *, struct sk_buff *, int force_queue);
extern void tcp_send_ack(struct sock *sk);
extern void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout);

/* CONFIG_IP_TRANSPARENT_PROXY */
extern int tcp_chkaddr(struct sk_buff *);

/* tcp_timer.c */
#define tcp_reset_msl_timer(x,y,z) net_reset_timer(x,y,z)
extern void tcp_reset_xmit_timer(struct sock *, int, unsigned long);
extern void tcp_init_xmit_timers(struct sock *);
extern void tcp_clear_xmit_timers(struct sock *);

extern void tcp_retransmit_timer(unsigned long);
extern void tcp_delack_timer(unsigned long);
extern void tcp_probe_timer(unsigned long);

extern struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                                  struct open_request *req);
/*
 *      TCP slow timer
 */
extern struct timer_list        tcp_slow_timer;

struct tcp_sl_timer {
        atomic_t        count;
        unsigned long   period;
        unsigned long   last;
        void (*handler) (unsigned long);
};

#define TCP_SLT_SYNACK          0
#define TCP_SLT_KEEPALIVE       1
#define TCP_SLT_BUCKETGC        2
#define TCP_SLT_MAX             3

extern struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX];
/* Compute the actual receive window we are currently advertising. */
static __inline__ u32 tcp_receive_window(struct tcp_opt *tp)
{
        return tp->rcv_wup - (tp->rcv_nxt - tp->rcv_wnd);
}
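/* Worked example (hypothetical numbers): the last advertisement told
 * the peer it may send up to rcv_wup + rcv_wnd.
 */
#if 0
        u32 rcv_wup = 1000, rcv_wnd = 4000, rcv_nxt = 1500;
        u32 open = rcv_wup - (rcv_nxt - rcv_wnd); /* 3500: right edge 5000 minus rcv_nxt */
#endif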
/* Choose a new window, without checks for shrinking, and without
 * scaling applied to the result.  The caller does these things
 * if necessary.  This is a "raw" window selection.
 */
extern u32      __tcp_select_window(struct sock *sk);
/* Choose a new window to advertise, update state in tcp_opt for the
 * socket, and return result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
extern __inline__ u16 tcp_select_window(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        u32 new_win = __tcp_select_window(sk);
        u32 cur_win = tcp_receive_window(tp);

        /* Never shrink the offered window */
        if (new_win < cur_win)
                new_win = cur_win;
        tp->rcv_wnd = new_win;
        tp->rcv_wup = tp->rcv_nxt;

        /* RFC1323 scaling applied */
        return new_win >> tp->rcv_wscale;
}
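/* Usage sketch (an assumption about the transmit path, not a quote
 * from it): the scaled result goes straight into the header field.
 */
#if 0
        th->window = htons(tcp_select_window(sk));
#endif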
/* See if we can advertise non-zero, and if so how much we
 * can increase our advertisement.  If it becomes more than
 * twice what we are talking about right now, return true.
 */
extern __inline__ int tcp_raise_window(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        u32 new_win = __tcp_select_window(sk);
        u32 cur_win = tcp_receive_window(tp);

        return (new_win && (new_win > (cur_win << 1)));
}
/* This is what the send packet queueing engine uses to pass
 * TCP per-packet control information to the transmission
 * code.
 */
struct tcp_skb_cb {
        __u8            flags;          /* TCP header flags.            */

        /* NOTE: These must match up to the flags byte in a
         *       real TCP header.
         */
#define TCPCB_FLAG_FIN          0x01
#define TCPCB_FLAG_SYN          0x02
#define TCPCB_FLAG_RST          0x04
#define TCPCB_FLAG_PSH          0x08
#define TCPCB_FLAG_ACK          0x10
#define TCPCB_FLAG_URG          0x20

        __u8            sacked;         /* State flags for SACK/FACK.   */
#define TCPCB_SACKED_ACKED      0x01    /* SKB ACK'd by a SACK block    */
#define TCPCB_SACKED_RETRANS    0x02    /* SKB retransmitted            */

        __u16           urg_ptr;        /* Valid if the URG flag is set. */
};

#define TCP_SKB_CB(__skb)       ((struct tcp_skb_cb *)&((__skb)->cb[0]))
/* This checks if the data bearing packet SKB (usually tp->send_head)
 * should be put on the wire right now.
 */
static __inline__ int tcp_snd_test(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        int nagle_check = 1;
        int len;

        /*      RFC 1122 - section 4.2.3.4
         *
         *      We must queue if
         *
         *      a) The right edge of this frame exceeds the window
         *      b) There are packets in flight and we have a small segment
         *         [SWS avoidance and Nagle algorithm]
         *         (part of SWS is done on packetization)
         *      c) We are retransmitting [Nagle]
         *      d) We have too many packets 'in flight'
         *
         *      Don't use the Nagle rule for urgent data.
         */
        len = skb->end_seq - skb->seq;
        if (!sk->nonagle && len < (sk->mss >> 1) && tp->packets_out &&
            !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG))
                nagle_check = 0;

        return (nagle_check && tp->packets_out < tp->snd_cwnd &&
                !after(skb->end_seq, tp->snd_una + tp->snd_wnd) &&
                tp->retransmits == 0);
}
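/* Caller sketch (hypothetical, simplified): the queueing engine keeps
 * the segment queued and transmits only when the tests above pass.
 * tcp_transmit_skb() is declared earlier in this header.
 */
#if 0
        if (tcp_snd_test(sk, skb))
                tcp_transmit_skb(sk, skb);
#endif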
/* This tells the input processing path that an ACK should go out
 * right now.
 */
#define tcp_enter_quickack_mode(__tp)   ((__tp)->ato = (HZ/100))
#define tcp_in_quickack_mode(__tp)      ((__tp)->ato == (HZ/100))

/*
 * List all states of a TCP socket that can be viewed as a "connected"
 * state.  This now includes TCP_SYN_RECV, although I am not yet fully
 * convinced that this is the solution for the 'getpeername(2)'
 * problem. Thanks to Stephen A. Wood <saw@cebaf.gov>  -FvK
 */
extern __inline const int tcp_connected(const int state)
{
        return ((1 << state) &
                (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
                 TCPF_FIN_WAIT2|TCPF_SYN_RECV));
}
/*
 * Calculate(/check) TCP checksum
 */
static __inline__ u16 tcp_v4_check(struct tcphdr *th, int len,
                                   unsigned long saddr, unsigned long daddr,
                                   unsigned long base)
{
        return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base);
}

#undef STATE_TRACE

#ifdef STATE_TRACE
static char *statename[] = {
        "Unused", "Established", "Syn Sent", "Syn Recv",
        "Fin Wait 1", "Fin Wait 2", "Time Wait", "Close",
        "Close Wait", "Last ACK", "Listen", "Closing"
};
#endif
static __inline__ void tcp_set_state(struct sock *sk, int state)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        int oldstate = sk->state;

        sk->state = state;

#ifdef STATE_TRACE
        SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk,
                   statename[oldstate], statename[state]);
#endif

        switch (state) {
        case TCP_ESTABLISHED:
                if (oldstate != TCP_ESTABLISHED)
                        tcp_statistics.TcpCurrEstab++;
                break;

        case TCP_CLOSE:
                /* Should be about 2 rtt's */
                net_reset_timer(sk, TIME_DONE, min(tp->srtt * 2, TCP_DONE_TIME));
                sk->prot->unhash(sk);
                /* fall through */
        default:
                if (oldstate == TCP_ESTABLISHED)
                        tcp_statistics.TcpCurrEstab--;
        }
}
static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_opt *tp, __u32 tstamp)
{
        if (tp->tstamp_ok) {
                *ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_TIMESTAMP << 8) |
                                          TCPOLEN_TIMESTAMP);
                *ptr++ = htonl(tstamp);
                *ptr++ = htonl(tp->ts_recent);
        }
        if (tp->sack_ok && tp->num_sacks) {
                int this_sack;

                *ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_SACK << 8) |
                                          (TCPOLEN_SACK_BASE +
                                           (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)));
                for (this_sack = 0; this_sack < tp->num_sacks; this_sack++) {
                        *ptr++ = htonl(tp->selective_acks[this_sack].start_seq);
                        *ptr++ = htonl(tp->selective_acks[this_sack].end_seq);
                }
        }
}
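/* Byte-level view (illustration only): the first word written in the
 * timestamp case lands on the wire as 0x01 0x01 0x08 0x0a, i.e. NOP,
 * NOP, kind 8, length 10; with the two timestamp words that is 12
 * bytes, matching TCPOLEN_TSTAMP_ALIGNED.
 */
#if 0
        unsigned char tsopt[4] = { TCPOPT_NOP, TCPOPT_NOP,
                                   TCPOPT_TIMESTAMP, TCPOLEN_TIMESTAMP };
#endif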
/* Construct a tcp options header for a SYN or SYN_ACK packet.
 * If this is ever changed make sure to change the definition of
 * MAX_SYN_SIZE to match the new maximum number of options that you
 * can generate.
 */
extern __inline__ void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack,
                                             int offer_wscale, int wscale, __u32 tstamp)
{
        /* We always get an MSS option.
         * The option bytes which will be seen in normal data
         * packets should timestamps be used, must be in the MSS
         * advertised.  But we subtract them from sk->mss so
         * that calculations in tcp_sendmsg are simpler etc.
         * So account for this fact here if necessary.  If we
         * don't do this correctly, as a receiver we won't
         * recognize data packets as being full sized when we
         * should, and thus we won't abide by the delayed ACK
         * rules correctly.
         * SACKs don't matter, we never delay an ACK when we
         * have any of those going out.
         */
        if (ts)
                mss += TCPOLEN_TSTAMP_ALIGNED;
        *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
        if (ts) {
                if (sack)
                        *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) |
                                                  (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
                else
                        *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                                  (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
                *ptr++ = htonl(tstamp);         /* TSVAL */
                *ptr++ = __constant_htonl(0);   /* TSECR */
        } else if (sack)
                *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                          (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM);
        if (offer_wscale)
                *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale));
}
/* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space
 * will be offered.  Store the results in the tp structure.
 * NOTE: for smooth operation initial space offering should
 * be a multiple of mss if possible.  We assume here that mss >= 1.
 * This MUST be enforced by all callers.
 */
extern __inline__ void tcp_select_initial_window(__u32 space, __u16 mss,
                                                 __u32 *rcv_wnd,
                                                 __u32 *window_clamp,
                                                 int wscale_ok,
                                                 __u8 *rcv_wscale)
{
        /* If no clamp set the clamp to the max possible scaled window */
        if (*window_clamp == 0)
                (*window_clamp) = (65535 << 14);
        space = min(*window_clamp, space);

        /* Quantize space offering to a multiple of mss if possible. */
        if (space > mss)
                space = (space / mss) * mss;

        /* NOTE: offering an initial window larger than 32767
         * will break some buggy TCP stacks. We try to be nice.
         * If we are not window scaling, then this truncates
         * our initial window offering to 32k. There should also
         * be a sysctl option to stop being nice.
         */
        (*rcv_wnd) = min(space, 32767);
        (*rcv_wscale) = 0;
        if (wscale_ok) {
                /* See RFC1323 for an explanation of the limit to 14 */
                while (space > 65535 && (*rcv_wscale) < 14) {
                        space >>= 1;
                        (*rcv_wscale)++;
                }
        }

        /* Set the clamp no higher than max representable value */
        (*window_clamp) = min(65535 << (*rcv_wscale), *window_clamp);
}
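/* Worked example (hypothetical numbers): space = 200000, mss = 1460,
 * wscale_ok = 1.  Space quantizes to 198560, the unscaled offer is
 * clipped to 32767, and two halvings bring 198560 under 65536, so
 * rcv_wscale comes out as 2.
 */
#if 0
        __u32 rcv_wnd, window_clamp = 0;
        __u8 rcv_wscale;

        tcp_select_initial_window(200000, 1460, &rcv_wnd, &window_clamp,
                                  1, &rcv_wscale);
        /* rcv_wnd == 32767, rcv_wscale == 2 */
#endif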
extern __inline__ void tcp_synq_unlink(struct tcp_opt *tp, struct open_request *req, struct open_request *prev)
{
        if (!req->dl_next)
                tp->syn_wait_last = (struct open_request **)prev;
        prev->dl_next = req->dl_next;
}

extern __inline__ void tcp_synq_queue(struct tcp_opt *tp, struct open_request *req)
{
        req->dl_next = NULL;
        *tp->syn_wait_last = req;
        tp->syn_wait_last = &req->dl_next;
}

extern __inline__ void tcp_synq_init(struct tcp_opt *tp)
{
        tp->syn_wait_queue = NULL;
        tp->syn_wait_last = &tp->syn_wait_queue;
}

extern __inline__ struct open_request *tcp_synq_unlink_tail(struct tcp_opt *tp)
{
        struct open_request *head = tp->syn_wait_queue;
#if 0
        /* Should be a net-ratelimit'd thing, not all the time. */
        printk(KERN_DEBUG "synq tail drop with expire=%ld\n",
               head->expires - jiffies);
#endif
        if (head->dl_next == NULL)
                tp->syn_wait_last = &tp->syn_wait_queue;
        tp->syn_wait_queue = head->dl_next;
        return head;
}
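/* Usage sketch (assumption): the SYN queue is a simple FIFO.  New
 * open_requests are appended with tcp_synq_queue(); under pressure the
 * oldest entry is removed from the front by tcp_synq_unlink_tail().
 */
#if 0
        tcp_synq_init(tp);              /* once, when the socket listens */
        tcp_synq_queue(tp, req);        /* new request goes on the tail */
        req = tcp_synq_unlink_tail(tp); /* oldest request comes off the front */
#endif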
extern void __tcp_inc_slow_timer(struct tcp_sl_timer *slt);
extern __inline__ void tcp_inc_slow_timer(int timer)
{
        struct tcp_sl_timer *slt = &tcp_slt_array[timer];

        if (atomic_read(&slt->count) == 0)
        {
                __tcp_inc_slow_timer(slt);
        }

        atomic_inc(&slt->count);
}

extern __inline__ void tcp_dec_slow_timer(int timer)
{
        struct tcp_sl_timer *slt = &tcp_slt_array[timer];

        atomic_dec(&slt->count);
}
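/* Usage sketch (an assumption along the lines of tcp_set_keepalive()):
 * the shared slow timer only needs to run while at least one socket
 * holds a reference on the corresponding count.
 */
#if 0
        if (val)
                tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
        else
                tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
#endif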
/* This needs to use a slow timer, so it is here. */
static __inline__ void tcp_sk_unbindify(struct sock *sk)
{
        struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *) sk->prev;

        if (sk->bind_next)
                sk->bind_next->bind_pprev = sk->bind_pprev;
        *sk->bind_pprev = sk->bind_next;
        if (tb->owners == NULL)
                tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
}
extern const char timer_bug_msg[];

static inline void tcp_clear_xmit_timer(struct sock *sk, int what)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        struct timer_list *timer;

        switch (what) {
        case TIME_RETRANS:
                timer = &tp->retransmit_timer;
                break;
        case TIME_DACK:
                timer = &tp->delack_timer;
                break;
        case TIME_PROBE0:
                timer = &tp->probe_timer;
                break;
        default:
                printk(timer_bug_msg);
                return;
        }
        if (timer->prev != NULL)
                del_timer(timer);
}
static inline int tcp_timer_is_set(struct sock *sk, int what)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

        switch (what) {
        case TIME_RETRANS:
                return tp->retransmit_timer.prev != NULL;
        case TIME_DACK:
                return tp->delack_timer.prev != NULL;
        case TIME_PROBE0:
                return tp->probe_timer.prev != NULL;
        default:
                printk(timer_bug_msg);
        }
        return 0;
}

#endif  /* _TCP_H */