/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	$Id: tcp.c,v 1.119 1998/08/26 12:04:14 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() calls
 *		Alan Cox	:	Set the ACK bit on a reset
 *		Alan Cox	:	Stopped it crashing if it closed while
 *					sk->inuse=1 and was trying to connect
 *					(tcp_err()).
 *		Alan Cox	:	All icmp error handling was broken
 *					pointers passed where wrong and the
 *					socket was looked up backwards. Nobody
 *					tested any icmp error code obviously.
 *		Alan Cox	:	tcp_err() now handled properly. It
 *					wakes people on errors. poll
 *					behaves and the icmp error race
 *					has gone by moving it into sock.c
 *		Alan Cox	:	tcp_send_reset() fixed to work for
 *					everything not just packets for
 *					unknown sockets.
 *		Alan Cox	:	tcp option processing.
 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
 *					syn rule wrong]
 *		Herp Rosmanith	:	More reset fixes
 *		Alan Cox	:	No longer acks invalid rst frames.
 *					Acking any kind of RST is right out.
 *		Alan Cox	:	Sets an ignore me flag on an rst
 *					receive otherwise odd bits of prattle
 *					escape still
 *		Alan Cox	:	Fixed another acking RST frame bug.
 *					Should stop LAN workplace lockups.
 *		Alan Cox	:	Some tidyups using the new skb list
 *					facilities
 *		Alan Cox	:	sk->keepopen now seems to work
 *		Alan Cox	:	Pulls options out correctly on accepts
 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
 *					bit to skb ops.
 *		Alan Cox	:	Tidied tcp_data to avoid a potential
 *					nasty.
 *		Alan Cox	:	Added some better commenting, as the
 *					tcp is hard to follow
 *		Alan Cox	:	Removed incorrect check for 20 * psh
 *	Michael O'Reilly	:	ack < copied bug fix.
 *	Johannes Stille		:	Misc tcp fixes (not all in yet).
 *		Alan Cox	:	FIN with no memory -> CRASH
 *		Alan Cox	:	Added socket option proto entries.
 *					Also added awareness of them to accept.
 *		Alan Cox	:	Added TCP options (SOL_TCP)
 *		Alan Cox	:	Switched wakeup calls to callbacks,
 *					so the kernel can layer network
 *					sockets.
 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
 *		Alan Cox	:	Handle FIN (more) properly (we hope).
 *		Alan Cox	:	RST frames sent on unsynchronised
 *					state ack error.
 *		Alan Cox	:	Put in missing check for SYN bit.
 *		Alan Cox	:	Added tcp_select_window() aka NET2E
 *					window non shrink trick.
 *		Alan Cox	:	Added a couple of small NET2E timer
 *					fixes
 *		Charles Hedrick	:	TCP fixes
 *		Toomas Tamm	:	TCP window fixes
 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
 *		Charles Hedrick	:	Rewrote most of it to actually work
 *		Linus		:	Rewrote tcp_read() and URG handling
 *					entirely
 *		Gerhard Koerting:	Fixed some missing timer handling
 *		Matthew Dillon	:	Reworked TCP machine states as per RFC
 *		Gerhard Koerting:	PC/TCP workarounds
 *		Adam Caldwell	:	Assorted timer/timing errors
 *		Matthew Dillon	:	Fixed another RST bug
 *		Alan Cox	:	Move to kernel side addressing changes.
 *		Alan Cox	:	Beginning work on TCP fastpathing
 *					(not yet usable)
 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
 *		Alan Cox	:	TCP fast path debugging
 *		Alan Cox	:	Window clamping
 *		Michael Riepe	:	Bug in tcp_check()
 *		Matt Dillon	:	More TCP improvements and RST bug fixes
 *		Matt Dillon	:	Yet more small nasties removed from the
 *					TCP code (Be very nice to this man if
 *					tcp finally works 100%) 8)
 *		Alan Cox	:	BSD accept semantics.
 *		Alan Cox	:	Reset on closedown bug.
 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
 *		Michael Pall	:	Handle poll() after URG properly in
 *					all cases.
 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
 *					(multi URG PUSH broke rlogin).
 *		Michael Pall	:	Fix the multi URG PUSH problem in
 *					tcp_readable(), poll() after URG
 *					works now.
 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
 *					BSD api.
 *		Alan Cox	:	Changed the semantics of sk->socket to
 *					fix a race and a signal problem with
 *					accept() and async I/O.
 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
 *	Craig I. Hagan		:	Allow for BSD compatible TIME_WAIT for
 *					clients/servers which listen in on
 *					fixed ports.
 *		Alan Cox	:	Cleaned the above up and shrank it to
 *					a sensible code size.
 *		Alan Cox	:	Self connect lockup fix.
 *		Alan Cox	:	No connect to multicast.
 *		Ross Biro	:	Close unaccepted children on master
 *					socket close.
 *		Alan Cox	:	Reset tracing code.
 *		Alan Cox	:	Spurious resets on shutdown.
 *		Alan Cox	:	Giant 15 minute/60 second timer error
 *		Alan Cox	:	Small whoops in polling before an
 *					accept.
 *		Alan Cox	:	Kept the state trace facility since
 *					it's handy for debugging.
 *		Alan Cox	:	More reset handler fixes.
 *		Alan Cox	:	Started rewriting the code based on
 *					the RFC's for other useful protocol
 *					references see: Comer, KA9Q NOS, and
 *					for a reference on the difference
 *					between specifications and how BSD
 *					works see the 4.4lite source.
 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
 *					close.
 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
 *		Alan Cox	:	Reimplemented timers as per the RFC
 *					and using multiple timers for sanity.
 *		Alan Cox	:	Small bug fixes, and a lot of new
 *					comments.
 *		Alan Cox	:	Fixed dual reader crash by locking
 *					the buffers (much like datagram.c)
 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
 *					now gets fed up of retrying without
 *					(even a no space) answer.
 *		Alan Cox	:	Extracted closing code better
 *		Alan Cox	:	Fixed the closing state machine to
 *					resemble the RFC.
 *		Alan Cox	:	More 'per spec' fixes.
 *		Jorge Cwik	:	Even faster checksumming.
 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
 *					only frames. At least one pc tcp stack
 *					generates them.
 *		Alan Cox	:	Cache last socket.
 *		Alan Cox	:	Per route irtt.
 *		Matt Day	:	poll()->select() match BSD precisely on error
 *		Alan Cox	:	New buffers
 *		Marc Tamsky	:	Various sk->prot->retransmits and
 *					sk->retransmits misupdating fixed.
 *					Fixed tcp_write_timeout: stuck close,
 *					and TCP syn retries gets used now.
 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
 *					ack if state is TCP_CLOSED.
 *		Alan Cox	:	Look up device on a retransmit - routes may
 *					change. Doesn't yet cope with MSS shrink right
 *					yet.
 *		Marc Tamsky	:	Closing in closing fixes.
 *		Mike Shaver	:	RFC1122 verifications.
 *		Alan Cox	:	rcv_saddr errors.
 *		Alan Cox	:	Block double connect().
 *		Alan Cox	:	Small hooks for enSKIP.
 *		Alexey Kuznetsov:	Path MTU discovery.
 *		Alan Cox	:	Support soft errors.
 *		Alan Cox	:	Fix MTU discovery pathological case
 *					when the remote claims no mtu!
 *		Marc Tamsky	:	TCP_CLOSE fix.
 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
 *					window but wrong (fixes NT lpd problems)
 *		Pedro Roque	:	Better TCP window handling, delayed ack.
 *		Joerg Reuter	:	No modification of locked buffers in
 *					tcp_do_retransmit()
 *		Eric Schenk	:	Changed receiver side silly window
 *					avoidance algorithm to BSD style
 *					algorithm. This doubles throughput
 *					against machines running Solaris,
 *					and seems to result in general
 *					improvement.
 *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
 *	Willy Konynenberg	:	Transparent proxying support.
 *	Mike McLagan		:	Routing by source
 *		Keith Owens	:	Do proper merging with partial SKB's in
 *					tcp_do_sendmsg to avoid burstiness.
 *		Eric Schenk	:	Fix fast close down bug with
 *					shutdown() followed by close().
 *		Andi Kleen	:	Make poll agree with SIGIO
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

/*
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown (active close)
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING.  Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown.  There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */

/*
 * RFC1122 status:
 * NOTE: I'm not going to be doing comments in the code for this one except
 * for violations and the like.  tcp.c is just too big... If I say something
 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 * with Alan. -- MS 950903
 * [Note: Most of the TCP code has been rewritten/redesigned since this
 *  RFC1122 check. It is probably not correct anymore. It should be redone
 *  before 2.2. -AK]
 *
 * Use of PSH (4.2.2.2)
 *   MAY aggregate data sent without the PSH flag. (does)
 *   MAY queue data received without the PSH flag. (does)
 *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 *   MAY implement PSH on send calls. (doesn't, thus:)
 *     MUST NOT buffer data indefinitely (doesn't [1 second])
 *     MUST set PSH on last segment (does)
 *   MAY pass received PSH to application layer (doesn't)
 *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 *
 * Window Size (4.2.2.3, 4.2.2.16)
 *   MUST treat window size as an unsigned number (does)
 *   SHOULD treat window size as a 32-bit number (does not)
 *   MUST NOT shrink window once it is offered (does not normally)
 *
 * Urgent Pointer (4.2.2.4)
 *   **MUST point urgent pointer to last byte of urgent data (not right
 *     after). (doesn't, to be like BSD. That's configurable, but defaults
 *     to off)
 *   MUST inform application layer asynchronously of incoming urgent
 *     data. (does)
 *   MUST provide application with means of determining the amount of
 *     urgent data pending. (does)
 *   **MUST support urgent data sequence of arbitrary length. (doesn't, but
 *     it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 *     [Follows BSD 1 byte of urgent data]
 *
 * TCP Options (4.2.2.5)
 *   MUST be able to receive TCP options in any segment. (does)
 *   MUST ignore unsupported options (does)
 *
 * Maximum Segment Size Option (4.2.2.6)
 *   MUST implement both sending and receiving MSS. (does, but currently
 *     only uses the smaller of both of them)
 *   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 *     it always). (does, even when MSS == 536, which is legal)
 *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 *   MUST calculate "effective send MSS" correctly:
 *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 *     (does - but allows operator override)
 *
 * TCP Checksum (4.2.2.7)
 *   MUST generate and check TCP checksum. (does)
 *
 * Initial Sequence Number Selection (4.2.2.8)
 *   MUST use the RFC 793 clock selection mechanism. (doesn't, but it's
 *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 *     necessary for 10Mbps networks - and harder than BSD to spoof!
 *     With syncookies we don't)
 *
 * Simultaneous Open Attempts (4.2.2.10)
 *   MUST support simultaneous open attempts (does)
 *
 * Recovery from Old Duplicate SYN (4.2.2.11)
 *   MUST keep track of active vs. passive open (does)
 *
 * RST segment (4.2.2.12)
 *   SHOULD allow an RST segment to contain data (does, but doesn't do
 *     anything with it, which is standard)
 *
 * Closing a Connection (4.2.2.13)
 *   MUST inform application of whether connection was closed by RST or
 *     normal close. (does)
 *   MAY allow "half-duplex" close (treat connection as closed for the
 *     local app, even before handshake is done). (does)
 *   MUST linger in TIME_WAIT for 2 * MSL (does)
 *
 * Retransmission Timeout (4.2.2.15)
 *   MUST implement Jacobson's slow start and congestion avoidance
 *     stuff. (does)
 *
 * Probing Zero Windows (4.2.2.17)
 *   MUST support probing of zero windows. (does)
 *   MAY keep offered window closed indefinitely. (does)
 *   MUST allow remote window to stay closed indefinitely. (does)
 *
 * Passive Open Calls (4.2.2.18)
 *   MUST NOT let new passive open affect other connections. (doesn't)
 *   MUST support passive opens (LISTENs) concurrently. (does)
 *
 * Time to Live (4.2.2.19)
 *   MUST make TCP TTL configurable. (does - IP_TTL option)
 *
 * Event Processing (4.2.2.20)
 *   SHOULD queue out-of-order segments. (does)
 *   MUST aggregate ACK segments whenever possible. (does but badly)
 *
 * Retransmission Timeout Calculation (4.2.3.1)
 *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 *     calculation. (does, or at least explains them in the comments 8*b)
 *   SHOULD initialize RTO to 0 and RTT to 3. (does)
 *
 * When to Send an ACK Segment (4.2.3.2)
 *   SHOULD implement delayed ACK. (does)
 *   MUST keep ACK delay < 0.5 sec. (does)
 *
 * When to Send a Window Update (4.2.3.3)
 *   MUST implement receiver-side SWS. (does)
 *
 * When to Send Data (4.2.3.4)
 *   MUST implement sender-side SWS. (does)
 *   SHOULD implement Nagle algorithm. (does)
 *
 * TCP Connection Failures (4.2.3.5)
 *   MUST handle excessive retransmissions "properly" (see the RFC). (does)
 *   SHOULD inform application layer of soft errors. (does)
 *
 * TCP Keep-Alives (4.2.3.6)
 *   MAY provide keep-alives. (does)
 *   MUST make keep-alives configurable on a per-connection basis. (does)
 *   MUST default to no keep-alives. (does)
 *   MUST make keep-alive interval configurable. (does)
 *   MUST make default keep-alive interval > 2 hours. (does)
 *   MUST NOT interpret failure to ACK keep-alive packet as dead
 *     connection. (doesn't)
 *   SHOULD send keep-alive with no data. (does)
 *
 * TCP Multihoming (4.2.3.7)
 *   MUST get source address from IP layer before sending first
 *     SYN. (does)
 *   MUST use same local address for all segments of a connection. (does)
 *
 * IP Options (4.2.3.8)
 *   MUST ignore unsupported IP options. (does)
 *   MAY support Time Stamp and Record Route. (does)
 *   MUST allow application to specify a source route. (does)
 *   MUST allow received Source Route option to set route for all future
 *     segments on this connection. (does not (security issues))
 *
 * ICMP messages (4.2.3.9)
 *   MUST act on ICMP errors. (does)
 *   MUST slow transmission upon receipt of a Source Quench. (doesn't anymore
 *     because that is deprecated now by the IETF, can be turned on)
 *   MUST NOT abort connection upon receipt of soft Destination
 *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 *     Problems. (doesn't)
 *   SHOULD report soft Destination Unreachables etc. to the
 *     application. (does, except during SYN_RECV and may drop messages
 *     in some rare cases before accept() - ICMP is unreliable)
 *   SHOULD abort connection upon receipt of hard Destination Unreachable
 *     messages (2, 3, 4). (does, but see above)
 *
 * Remote Address Validation (4.2.3.10)
 *   MUST reject as an error OPEN for invalid remote IP address. (does)
 *   MUST ignore SYN with invalid source address. (does)
 *   MUST silently discard incoming SYN for broadcast/multicast
 *     address. (does)
 *
 * Asynchronous Reports (4.2.4.1)
 *   MUST provide mechanism for reporting soft errors to application
 *     layer. (does)
 *
 * Type of Service (4.2.4.2)
 *   MUST allow application layer to set Type of Service. (does IP_TOS)
 *
 * (Whew. -- MS 950903)
 * (Updated by AK, but not complete yet.)
 */
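/*
 * Illustrative sketch (not part of the original file): the RFC 1122
 * "effective send MSS" rule quoted in the checklist above, written out
 * as plain C.  The function and parameter names are hypothetical; in
 * this file the real calculation lives in tcp_current_mss().
 *
 *	int effective_send_mss(int physical_mtu, int remote_mss,
 *			       int tcphdr_len, int ipopts_len)
 *	{
 *		// min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 *		int mtu = physical_mtu;
 *		if (remote_mss + 20 < mtu)
 *			mtu = remote_mss + 20;
 *		return mtu - tcphdr_len - ipopts_len;
 *	}
 *
 * For Ethernet (MTU 1500), remote MSS 1460, a bare 20-byte TCP header
 * and no IP options: min(1500, 1480) - 20 - 0 = 1460 data bytes.
 */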
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>

#include <net/icmp.h>
#include <net/tcp.h>

#include <asm/uaccess.h>

int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;

struct tcp_mib tcp_statistics;

kmem_cache_t *tcp_openreq_cachep;
kmem_cache_t *tcp_bucket_cachep;
kmem_cache_t *tcp_timewait_cachep;
/*
 *	Find someone to 'accept'. Must be called with
 *	the socket locked or with interrupts disabled
 */
static struct open_request *tcp_find_established(struct tcp_opt *tp,
						 struct open_request **prevp)
{
	struct open_request *req = tp->syn_wait_queue;
	struct open_request *prev = (struct open_request *)&tp->syn_wait_queue;

	while (req) {
		if (req->sk &&
		    ((1 << req->sk->state) &
		     ~(TCPF_SYN_SENT|TCPF_SYN_RECV)))
			break;
		prev = req;
		req = req->dl_next;
	}
	*prevp = prev;
	return req;
}

/*
 *	Walk down the receive queue counting readable data.
 *
 *	Must be called with the socket lock held.
 */
static int tcp_readable(struct sock *sk)
{
	unsigned long counted;
	unsigned long amount;
	struct sk_buff *skb;
	int sum;

	SOCK_DEBUG(sk, "tcp_readable: %p - ", sk);

	skb = skb_peek(&sk->receive_queue);
	if (skb == NULL) {
		SOCK_DEBUG(sk, "empty\n");
		return(0);
	}

	counted = sk->tp_pinfo.af_tcp.copied_seq;	/* Where we are at the moment */
	amount = 0;

	/* Do until a push or until we are out of data. */
	do {
		/* Found a hole so stop here. */
		if (before(counted, TCP_SKB_CB(skb)->seq))	/* should not happen */
			break;

		/* Length - header but start from where we are up to
		 * avoid overlaps.
		 */
		sum = skb->len - (counted - TCP_SKB_CB(skb)->seq);
		if (sum >= 0) {
			/* Add it up, move on. */
			amount += sum;
			counted += sum;
			if (skb->h.th->syn)
				counted++;
		}

		/* Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop.  This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that poll() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use poll(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */

		/* Don't count urg data. */
		if (skb->h.th->urg)
			amount--;
#if 0
		if (amount && skb->h.th->psh) break;
#endif
		skb = skb->next;
	} while (skb != (struct sk_buff *)&sk->receive_queue);

	SOCK_DEBUG(sk, "got %lu bytes.\n", amount);
	return(amount);
}
/*
 *	LISTEN is a special case for poll..
 */
static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
{
	struct open_request *req, *dummy;

	lock_sock(sk);
	req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy);
	release_sock(sk);
	if (req)
		return POLLIN | POLLRDNORM;
	return 0;
}

/*
 *	Compute minimal free write space needed to queue new packets.
 */
static inline int tcp_min_write_space(struct sock *sk, struct tcp_opt *tp)
{
	int space;
#if 1 /* This needs benchmarking and real world tests */
	space = max(tp->mss_cache + 128, MIN_WRITE_SPACE);
#else /* More than half of the socket queue free? */
	space = atomic_read(&sk->wmem_alloc) / 2;
#endif
	return space;
}
/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
{
	unsigned int mask = 0;
	struct sock *sk = sock->sk;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	poll_wait(file, sk->sleep, wait);
	if (sk->state == TCP_LISTEN)
		return tcp_listen_poll(sk, wait);

	if (sk->err)
		mask = POLLERR;

	/* Connected? */
	if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE)) {
		if (sk->shutdown & RCV_SHUTDOWN)
			mask |= POLLHUP;

		if ((tp->rcv_nxt != tp->copied_seq) &&
		    (tp->urg_seq != tp->copied_seq ||
		     tp->rcv_nxt != tp->copied_seq+1 ||
		     sk->urginline || !tp->urg_data))
			mask |= POLLIN | POLLRDNORM;

		/* Always wake the user up when an error occurred */
		if (sock_wspace(sk) >= tcp_min_write_space(sk, tp) || sk->err)
			mask |= POLLOUT | POLLWRNORM;

		if (tp->urg_data & URG_VALID)
			mask |= POLLPRI;
	}
	return mask;
}
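/*
 * Illustrative userspace counterpart (not from the original file): the
 * mask assembled by tcp_poll() above is exactly what a poll() caller
 * gets back.  A minimal sketch, assuming fd is a connected TCP socket:
 *
 *	#include <poll.h>
 *
 *	// Block until any TCP event; returns the revents mask or -1.
 *	int wait_for_tcp_event(int fd)
 *	{
 *		struct pollfd pfd = { fd, POLLIN | POLLOUT | POLLPRI, 0 };
 *
 *		if (poll(&pfd, 1, -1) < 0)
 *			return -1;
 *		return pfd.revents;
 *	}
 *
 * POLLPRI here corresponds to URG_VALID above, POLLIN|POLLRDNORM to
 * rcv_nxt having advanced past copied_seq, and POLLOUT|POLLWRNORM to
 * sock_wspace() clearing tcp_min_write_space().
 */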
/*
 *	Socket write_space callback.
 *	This (or rather the sock_wake_async) should agree with poll.
 */
void tcp_write_space(struct sock *sk)
{
	if (sk->dead)
		return;

	wake_up_interruptible(sk->sleep);
	if (sock_wspace(sk) >=
	    tcp_min_write_space(sk, &(sk->tp_pinfo.af_tcp)))
		sock_wake_async(sk->socket, 2);
}
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int answ;

	switch (cmd) {
	case TIOCINQ:
#ifdef FIXME	/* FIXME: */
	case FIONREAD:
#endif
		if (sk->state == TCP_LISTEN)
			return(-EINVAL);
		lock_sock(sk);
		answ = tcp_readable(sk);
		release_sock(sk);
		break;
	case SIOCATMARK:
		{
			struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
			answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
			break;
		}
	case TIOCOUTQ:
		if (sk->state == TCP_LISTEN)
			return(-EINVAL);
		answ = sock_wspace(sk);
		break;
	default:
		return(-ENOIOCTLCMD);
	};

	return put_user(answ, (int *)arg);
}
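/*
 * Illustrative userspace view of the ioctls above (not from the
 * original file).  A sketch, assuming fd is a connected TCP socket:
 *
 *	#include <sys/ioctl.h>
 *
 *	// Bytes readable now, or 0 when the read pointer sits at the
 *	// urgent mark; -1 on error.
 *	int readable_before_mark(int fd)
 *	{
 *		int pending, at_mark;
 *
 *		if (ioctl(fd, FIONREAD, &pending) < 0)	// aka TIOCINQ
 *			return -1;
 *		if (ioctl(fd, SIOCATMARK, &at_mark) < 0)
 *			return -1;
 *		return at_mark ? 0 : pending;
 *	}
 *
 * FIONREAD/TIOCINQ maps onto the tcp_readable() walk above, and
 * SIOCATMARK reports urg_seq == copied_seq.
 */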
/*
 *	Wait for a socket to get into the connected state
 *
 *	Note: must be called with the socket locked.
 */
static int wait_for_tcp_connect(struct sock * sk, int flags)
{
	struct task_struct *tsk = current;
	struct wait_queue wait = { tsk, NULL };

	while ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
		if (sk->err)
			return sock_error(sk);
		if ((1 << sk->state) &
		    ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
			if (sk->keepopen && !(flags&MSG_NOSIGNAL))
				send_sig(SIGPIPE, tsk, 0);
			return -EPIPE;
		}
		if (flags & MSG_DONTWAIT)
			return -EAGAIN;
		if (signal_pending(tsk))
			return -ERESTARTSYS;

		tsk->state = TASK_INTERRUPTIBLE;
		add_wait_queue(sk->sleep, &wait);
		release_sock(sk);

		if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) &&
		    sk->err == 0)
			schedule();

		tsk->state = TASK_RUNNING;
		remove_wait_queue(sk->sleep, &wait);
		lock_sock(sk);
	}
	return 0;
}
static inline int tcp_memory_free(struct sock *sk)
{
	return atomic_read(&sk->wmem_alloc) < sk->sndbuf;
}

/*
 *	Wait for more memory for a socket
 */
static void wait_for_tcp_memory(struct sock * sk)
{
	release_sock(sk);
	if (!tcp_memory_free(sk)) {
		struct wait_queue wait = { current, NULL };

		sk->socket->flags &= ~SO_NOSPACE;
		add_wait_queue(sk->sleep, &wait);
		for (;;) {
			if (signal_pending(current))
				break;
			current->state = TASK_INTERRUPTIBLE;
			if (tcp_memory_free(sk))
				break;
			if (sk->shutdown & SEND_SHUTDOWN)
				break;
			if (sk->err)
				break;
			schedule();
		}
		current->state = TASK_RUNNING;
		remove_wait_queue(sk->sleep, &wait);
	}
	lock_sock(sk);
}
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	Note: must be called with the socket locked.
 */
int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb = NULL;
	int mss_now;
	int err = 0;
	int copied = 0;

	/* Verify that the socket is locked */
	if (!atomic_read(&sk->sock_readers))
		printk("tcp_do_sendmsg: socket not locked!\n");

	/* Wait for a connection to finish. */
	if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
		if ((err = wait_for_tcp_connect(sk, flags)) != 0)
			return err;

	mss_now = tcp_current_mss(sk);

	/* Ok commence sending. */
	while (--iovlen >= 0) {
		int seglen = iov->iov_len;
		unsigned char * from = iov->iov_base;

		iov++;

		while (seglen > 0) {
			int copy, tmp, queue_it;

			if (err)
				goto do_fault2;

			/* Stop on errors. */
			if (sk->err)
				goto do_sock_err;

			/* Make sure that we are established. */
			if (sk->shutdown & SEND_SHUTDOWN)
				goto do_shutdown;

			/* Now we need to check if we have a half
			 * built packet we can tack some data onto.
			 */
			if (tp->send_head && !(flags & MSG_OOB)) {
				skb = sk->write_queue.prev;
				copy = skb->len;
				/* If the remote does SWS avoidance we should
				 * queue the best we can if not we should in
				 * fact send multiple packets...
				 * A method for detecting this would be most
				 * welcome.
				 */
				if (skb_tailroom(skb) > 0 &&
				    (mss_now - copy) > 0 &&
				    tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) {
					int last_byte_was_odd = (copy % 4);

					copy = mss_now - copy;
					if (copy > skb_tailroom(skb))
						copy = skb_tailroom(skb);
					if (copy > seglen)
						copy = seglen;
					if (last_byte_was_odd) {
						if (copy_from_user(skb_put(skb, copy),
								   from, copy))
							err = -EFAULT;
						skb->csum = csum_partial(skb->data,
									 skb->len, 0);
					} else {
						skb->csum =
							csum_and_copy_from_user(
							from, skb_put(skb, copy),
							copy, skb->csum, &err);
					}
					tp->write_seq += copy;
					TCP_SKB_CB(skb)->end_seq += copy;
					from += copy;
					copied += copy;
					seglen -= copy;
					if (!seglen && !iovlen)
						TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
					continue;
				}
			}

			/* We also need to worry about the window. If
			 * window < 1/2 the maximum window we've seen
			 * from this host, don't use it.  This is
			 * sender side silly window prevention, as
			 * specified in RFC1122.  (Note that this is
			 * different than earlier versions of SWS
			 * prevention, e.g. RFC813.).  What we
			 * actually do is use the whole MSS.  Since
			 * this results in the right edge of the packet
			 * being outside the window, it will be queued
			 * for later rather than sent.
			 */
			copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
			if (copy >= (tp->max_window >> 1))
				copy = min(copy, mss_now);
			else
				copy = mss_now;
			if (copy > seglen)
				copy = seglen;

			tmp = MAX_HEADER + sk->prot->max_header;
			queue_it = 0;
			if (copy < min(mss_now, tp->max_window >> 1) &&
			    !(flags & MSG_OOB)) {
				tmp += min(mss_now, tp->max_window);

				/* What is happening here is that we want to
				 * tack on later members of the users iovec
				 * if possible into a single frame.  When we
				 * leave this loop our caller checks to see if
				 * we can send queued frames onto the wire.
				 * See tcp_v[46]_sendmsg() for this.
				 */
				queue_it = 1;
			} else {
				tmp += copy;
			}
			skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);

			/* If we didn't get any memory, we need to sleep. */
			if (skb == NULL) {
				sk->socket->flags |= SO_NOSPACE;
				if (flags&MSG_DONTWAIT) {
					err = -EAGAIN;
					goto do_interrupted;
				}
				if (signal_pending(current)) {
					err = -ERESTARTSYS;
					goto do_interrupted;
				}
				wait_for_tcp_memory(sk);

				/* If SACK's were formed or PMTU events happened,
				 * we must find out about it.
				 */
				mss_now = tcp_current_mss(sk);
				continue;
			}

			seglen -= copy;

			/* Prepare control bits for TCP header creation engine. */
			TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK |
						  ((!seglen && !iovlen) ?
						   TCPCB_FLAG_PSH : 0));
			TCP_SKB_CB(skb)->sacked = 0;
			if (flags & MSG_OOB) {
				TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG;
				TCP_SKB_CB(skb)->urg_ptr = copy;
			} else
				TCP_SKB_CB(skb)->urg_ptr = 0;

			/* TCP data bytes are SKB_PUT() on top, later
			 * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
			 * Reserve header space and checksum the data.
			 */
			skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
			skb->csum = csum_and_copy_from_user(from,
					skb_put(skb, copy), copy, 0, &err);

			if (err)
				goto do_fault;

			from += copy;
			copied += copy;

			TCP_SKB_CB(skb)->seq = tp->write_seq;
			TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;

			/* This advances tp->write_seq for us. */
			tcp_send_skb(sk, skb, queue_it);
		}
	}
	return copied;

do_sock_err:
	if (copied)
		return copied;
	return sock_error(sk);

do_shutdown:
	if (copied)
		return copied;
	if (!(flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	return -EPIPE;

do_interrupted:
	if (copied)
		return copied;
	return err;

do_fault:
	kfree_skb(skb);
do_fault2:
	return -EFAULT;
}
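/*
 * Illustrative userspace counterpart (not from the original file): the
 * iovec walk above is driven by sendmsg().  A minimal sketch gathering
 * two buffers into one stream write:
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *
 *	ssize_t send_two_parts(int fd, const char *hdr, const char *body)
 *	{
 *		struct iovec iov[2];
 *		struct msghdr msg;
 *
 *		iov[0].iov_base = (void *) hdr;
 *		iov[0].iov_len  = strlen(hdr);
 *		iov[1].iov_base = (void *) body;
 *		iov[1].iov_len  = strlen(body);
 *		memset(&msg, 0, sizeof(msg));
 *		msg.msg_iov    = iov;
 *		msg.msg_iovlen = 2;
 *		return sendmsg(fd, &msg, 0);
 *	}
 *
 * The partial-skb merging credited to Keith Owens in the changelog is
 * what lets both iovec elements end up in a single segment instead of
 * two bursty ones.
 */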
/*
 *	Send an ack if one is backlogged at this point. Ought to merge
 *	this with tcp_send_ack().
 *	This is called for delayed acks also.
 */
void tcp_read_wakeup(struct sock *sk)
{
	/* If we're closed, don't send an ack, or we'll get a RST
	 * from the closed destination.
	 */
	if (sk->state != TCP_CLOSE)
		tcp_send_ack(sk);
}
/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 */
static int tcp_recv_urg(struct sock * sk, int nonblock,
			struct msghdr *msg, int len, int flags,
			int *addr_len)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* No URG data to read. */
	if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
		return sock_error(sk);

	if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) {
		sk->done = 1;
		return 0;
	}

	if (tp->urg_data & URG_VALID) {
		int err = 0;
		char c = tp->urg_data;

		if (!(flags & MSG_PEEK))
			tp->urg_data = URG_READ;

		if (msg->msg_name)
			tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
						       msg->msg_name);

		if (addr_len)
			*addr_len = tp->af_specific->sockaddr_len;

		/* Read urgent data. */
		msg->msg_flags |= MSG_OOB;

		if (len > 0) {
			err = memcpy_toiovec(msg->msg_iov, &c, 1);
			/* N.B. already set above ... */
			msg->msg_flags |= MSG_OOB;
		} else
			msg->msg_flags |= MSG_TRUNC;

		/* N.B. Is this right?? If len == 0 we didn't read any data */
		return err ? -EFAULT : 1;
	}

	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
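/*
 * Illustrative userspace counterpart (not from the original file): the
 * BSD-style urgent read implemented by tcp_recv_urg() above.  A sketch:
 *
 *	#include <sys/socket.h>
 *
 *	// Fetch the one byte of urgent data; never blocks.
 *	int read_oob_byte(int fd)
 *	{
 *		char c;
 *
 *		if (recv(fd, &c, 1, MSG_OOB) != 1)
 *			return -1;
 *		return (unsigned char) c;
 *	}
 *
 * With SO_OOBINLINE off this returns the urgent byte exactly once;
 * afterwards (URG_READ above) it fails immediately, e.g. with EINVAL,
 * rather than blocking, matching the BSD semantics described above.
 */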
/*
 *	Release a skb if it is no longer needed. This routine
 *	must be called with interrupts disabled or with the
 *	socket locked so that the sk_buff queue operation is ok.
 */
static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
{
	__skb_unlink(skb, &sk->receive_queue);
	kfree_skb(skb);
}

/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */
static void cleanup_rbuf(struct sock *sk, int copied)
{
	struct sk_buff *skb;

	/* NOTE! The socket must be locked, so that we don't get
	 * a messed-up receive queue.
	 */
	while ((skb = skb_peek(&sk->receive_queue)) != NULL) {
		if (!skb->used || atomic_read(&skb->users) > 1)
			break;
		tcp_eat_skb(sk, skb);
	}

	SOCK_DEBUG(sk, "sk->rspace = %lu\n", sock_rspace(sk));

	/* We send an ACK if we can now advertise a non-zero window
	 * which has been raised "significantly".
	 */
	{
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		__u32 rcv_window_now = tcp_receive_window(tp);

		/* We won't be raising the window any further than
		 * the window-clamp allows.  Our window selection
		 * also keeps things a nice multiple of MSS.  These
		 * checks are necessary to prevent spurious ACKs
		 * which don't advertise a larger window.
		 */
		if ((copied >= rcv_window_now) &&
		    ((rcv_window_now + tp->mss_cache) <= tp->window_clamp))
			tcp_read_wakeup(sk);
	}
}
/*
 *	This routine copies from a sock struct into the user buffer.
 */
int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
		int len, int nonblock, int flags, int *addr_len)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	int err = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;
	int target = 1;		/* Read at least this many bytes */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/* Urgent data needs to be handled specially. */
	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/* Copying sequence to update. This is volatile to handle
	 * the multi-reader case neatly (memcpy_to/fromfs might be
	 * inline and thus not flush cached variables otherwise).
	 */
	peek_seq = tp->copied_seq;
	seq = &tp->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	/* Handle the POSIX bogosity MSG_WAITALL. */
	if (flags & MSG_WAITALL)
		target = len;

	add_wait_queue(sk->sleep, &wait);
	lock_sock(sk);

	/*
	 *	This violates 1003.1g compliance. We must wait for
	 *	data to exist even if we read none!
	 */
	while (len > 0) {
		struct sk_buff * skb;
		u32 offset;

		/* Are we at urgent data? Stop if we have read anything. */
		if (copied && tp->urg_data && tp->urg_seq == *seq)
			break;

		/* We need to check signals first, to get correct SIGURG
		 * handling. FIXME: Need to check this doesn't impact 1003.1g
		 * and move it down to the bottom of the loop
		 */
		if (signal_pending(current)) {
			if (copied)
				break;
			copied = -ERESTARTSYS;
			break;
		}

		/* Next get a buffer. */
		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do {
			if (!skb)
				break;

			/* Now that we have two receive queues this
			 * shouldn't happen.
			 */
			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
				printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
				       *seq, TCP_SKB_CB(skb)->seq);
				break;
			}
			offset = *seq - TCP_SKB_CB(skb)->seq;
			if (skb->h.th->syn)
				offset--;
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;
			skb = skb->next;
		} while (skb != (struct sk_buff *)&sk->receive_queue);

		if (copied >= target)
			break;

		if (sk->err && !(flags&MSG_PEEK)) {
			copied = sock_error(sk);
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN) {
			sk->done = 1;
			break;
		}

		if (sk->state == TCP_CLOSE) {
			if (!sk->done) {
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (nonblock) {
			copied = -EAGAIN;
			break;
		}

		cleanup_rbuf(sk, copied);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		lock_sock(sk);
		continue;

	found_ok_skb:
		/* Lock the buffer. We can be fairly relaxed as
		 * an interrupt will never steal a buffer we are
		 * using unless I've missed something serious in
		 * tcp_data.
		 */
		atomic_inc(&skb->users);

		/* Ok so how much can we use? */
		used = skb->len - offset;
		if (len < used)
			used = len;

		/* Do we have urgent data here? */
		if (tp->urg_data) {
			u32 urg_offset = tp->urg_seq - *seq;
			if (urg_offset < used) {
				if (!urg_offset) {
					if (!sk->urginline) {
						++*seq;	/* skip the urgent byte */
						offset++;
						used--;
					}
				} else
					used = urg_offset;
			}
		}

		/* Copy it - We _MUST_ update *seq first so that we
		 * don't ever double read when we have dual readers
		 */
		*seq += used;

		/* This memcpy_toiovec can sleep. If it sleeps and we
		 * do a second read it relies on the skb->users to avoid
		 * a crash when cleanup_rbuf() gets called.
		 */
		err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) +
				     skb->h.th->doff*4 + offset, used);
		if (err) {
			/* Exception. Bailout! */
			atomic_dec(&skb->users);
			copied = -EFAULT;
			break;
		}

		copied += used;
		len -= used;

		/* We now will not sleep again until we are finished
		 * with skb. Sorry if you are doing the SMP port
		 * but you'll just have to fix it neatly ;)
		 */
		atomic_dec(&skb->users);

		if (after(tp->copied_seq, tp->urg_seq))
			tp->urg_data = 0;
		if (used + offset < skb->len)
			continue;

		/* Process the FIN. We may also need to handle PSH
		 * here and make it break out of MSG_WAITALL.
		 */
		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		if (atomic_read(&skb->users) == 1)
			tcp_eat_skb(sk, skb);
		continue;

	found_fin_ok:
		++*seq;
		if (flags & MSG_PEEK)
			break;

		/* All is done. */
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;
	}

	if (copied > 0 && msg->msg_name)
		tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
					       msg->msg_name);

	if (addr_len)
		*addr_len = tp->af_specific->sockaddr_len;

	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames. */
	cleanup_rbuf(sk, copied);
	release_sock(sk);
	return copied;
}
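/*
 * Illustrative userspace note (not from the original file): the
 * peek_seq handling above is what makes MSG_PEEK non-destructive.
 * A sketch:
 *
 *	#include <sys/socket.h>
 *
 *	// Look at up to len bytes; a later plain recv() sees them again.
 *	ssize_t peek_bytes(int fd, char *buf, size_t len)
 *	{
 *		return recv(fd, buf, len, MSG_PEEK);
 *	}
 *
 * MSG_WAITALL, by contrast, raises "target" from 1 to the full request,
 * so the loop above keeps sleeping until len bytes arrive (or an error,
 * EOF, a signal, or urgent data cuts the read short).
 */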
/*
 *	Check whether to renew the timer.
 */
static inline void tcp_check_fin_timer(struct sock *sk)
{
	if (sk->state == TCP_FIN_WAIT2 && !sk->timer.prev)
		tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
}
/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	in TIME_WAIT.
 */
static unsigned char new_state[16] = {
  /* current state:        new state:      action:	*/
  /* (Invalid)		*/ TCP_CLOSE,
  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_SYN_SENT	*/ TCP_CLOSE,
  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
  /* TCP_CLOSE		*/ TCP_CLOSE,
  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
  /* TCP_LISTEN		*/ TCP_CLOSE,
  /* TCP_CLOSING	*/ TCP_CLOSING,
};

static int tcp_close_state(struct sock *sk, int dead)
{
	int next = (int) new_state[sk->state];
	int ns = (next & TCP_STATE_MASK);

	tcp_set_state(sk, ns);

	/* This is a (useful) BSD violation of the RFC. There is a
	 * problem with TCP as specified in that the other end could
	 * keep a socket open forever with no application left this end.
	 * We use a 3 minute timeout (about the same as BSD) then kill
	 * our end. If they send after that then tough - BUT: long enough
	 * that we won't make the old 4*rto = almost no time - whoops
	 * reset mistake.
	 */
	if (dead)
		tcp_check_fin_timer(sk);

	return (next & TCP_ACTION_FIN);
}
/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or set sk->dead.
 */
void tcp_shutdown(struct sock *sk, int how)
{
	/*	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
	 */
	if (!(how & SEND_SHUTDOWN))
		return;

	/* If we've already sent a FIN, or it's a closed state, skip this. */
	if ((1 << sk->state) &
	    (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
		lock_sock(sk);

		/* Flag that the sender has shutdown. */
		sk->shutdown |= SEND_SHUTDOWN;

		/* Clear out any half completed packets.  FIN if needed. */
		if (tcp_close_state(sk, 0))
			tcp_send_fin(sk);

		release_sock(sk);
	}
}

/*
 *	Return 1 if we still have things to send in our buffers.
 */
static inline int closing(struct sock * sk)
{
	return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
}
/*
 *	This routine closes sockets which have been at least partially
 *	opened, but not yet accepted. Currently it is only called by
 *	tcp_close, and timeout mirrors the value there.
 */
static void tcp_close_pending(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct open_request *req = tp->syn_wait_queue;

	while (req) {
		struct open_request *iter;

		if (req->sk)
			tcp_close(req->sk, 0);

		iter = req;
		req = req->dl_next;

		(*iter->class->destructor)(iter);
		tcp_dec_slow_timer(TCP_SLT_SYNACK);
		sk->ack_backlog--;
		tcp_openreq_free(iter);
	}
}

void tcp_close(struct sock *sk, unsigned long timeout)
{
	struct sk_buff *skb;
	int data_was_unread = 0;

	/*
	 * Check whether the socket is locked ... supposedly
	 * it's impossible to tcp_close() a locked socket.
	 */
	if (atomic_read(&sk->sock_readers))
		printk("tcp_close: socket already locked!\n");

	/* We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */
	lock_sock(sk);
	if (sk->state == TCP_LISTEN) {
		/* Special case. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		sk->dead = 1;
		return;
	}

	/* It is questionable, what the role of this is now.
	 * In any event either it should be removed, or
	 * increment of SLT_KEEPALIVE be done, this is causing
	 * big problems.  For now I comment it out.  -DaveM
	 */
	/* sk->keepopen = 1; */
	sk->shutdown = SHUTDOWN_MASK;

	if (!sk->dead)
		sk->state_change(sk);

	/* We need to flush the recv. buffs.  We do this only on the
	 * descriptor close, not protocol-sourced closes, because the
	 * reader process may not have drained the data yet!
	 */
	while ((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
		data_was_unread++;
		kfree_skb(skb);
	}

	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
	 * 3.10, we send a RST here because data was lost.  To
	 * witness the awful effects of the old behavior of always
	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
	 * a bulk GET in an FTP client, suspend the process, wait
	 * for the client to advertise a zero window, then kill -9
	 * the FTP client, wheee...  Note: timeout is always zero
	 * in such a case.
	 */
	if (data_was_unread != 0) {
		/* Unread data was tossed, zap the connection. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_send_active_reset(sk);
	} else if (tcp_close_state(sk, 1)) {
		/* We FIN if the application ate all the data before
		 * zapping the connection.
		 */
		tcp_send_fin(sk);
	}

	if (timeout) {
		struct task_struct *tsk = current;
		struct wait_queue wait = { tsk, NULL };

		tsk->timeout = timeout;
		add_wait_queue(sk->sleep, &wait);
		release_sock(sk);

		while (closing(sk)) {
			tsk->state = TASK_INTERRUPTIBLE;
			schedule();
			if (signal_pending(tsk) || !tsk->timeout)
				break;
		}

		tsk->timeout = 0;
		tsk->state = TASK_RUNNING;
		remove_wait_queue(sk->sleep, &wait);

		lock_sock(sk);
	}

	/* Now that the socket is dead, if we are in the FIN_WAIT2 state
	 * we may need to set up a timer.
	 */
	tcp_check_fin_timer(sk);

	sk->dead = 1;
	release_sock(sk);
}
/*
 *	Wait for an incoming connection, avoid race
 *	conditions. This must be called with the socket locked.
 */
static struct open_request * wait_for_connect(struct sock * sk,
					      struct open_request **pprev)
{
	struct wait_queue wait = { current, NULL };
	struct open_request *req;

	add_wait_queue(sk->sleep, &wait);
	for (;;) {
		current->state = TASK_INTERRUPTIBLE;
		release_sock(sk);
		schedule();
		lock_sock(sk);
		req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev);
		if (req)
			break;
		if (signal_pending(current))
			break;
	}
	current->state = TASK_RUNNING;
	remove_wait_queue(sk->sleep, &wait);
	return req;
}

/*
 *	This will accept the next outstanding connection.
 *
 *	Be careful about race conditions here - this is subtle.
 */
struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct open_request *req, *prev;
	struct sock *newsk = NULL;
	int error;

	lock_sock(sk);

	/* We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	error = EINVAL;
	if (sk->state != TCP_LISTEN)
		goto out;

	/* Find already established connection */
	req = tcp_find_established(tp, &prev);
	if (!req) {
		/* If this is a non blocking socket don't sleep */
		error = EAGAIN;
		if (flags & O_NONBLOCK)
			goto out;

		error = ERESTARTSYS;
		req = wait_for_connect(sk, &prev);
		if (!req)
			goto out;
	}

	tcp_synq_unlink(tp, req, prev);
	newsk = req->sk;
	req->class->destructor(req);
	tcp_openreq_free(req);
	sk->ack_backlog--;

	/*
	 * This does not pass any already set errors on the new socket
	 * to the user, but they will be returned on the first socket operation
	 * after the accept.
	 */
	error = 0;
out:
	release_sock(sk);
	sk->err = error;
	return newsk;
}
/*
 *	Socket option code for TCP.
 */
int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
		   int optlen)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int val;

	if (level != SOL_TCP)
		return tp->af_specific->setsockopt(sk, level, optname,
						   optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int *)optval))
		return -EFAULT;

	switch (optname) {
	case TCP_MAXSEG:
		/* values greater than interface MTU won't take effect. however at
		 * the point when this call is done we typically don't yet know
		 * which interface is going to be used
		 */
		if (val < 1 || val > MAX_WINDOW)
			return -EINVAL;
		tp->user_mss = val;
		return 0;
	case TCP_NODELAY:
		sk->nonagle = (val == 0) ? 0 : 1;
		return 0;
	default:
		return(-ENOPROTOOPT);
	};
}
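/*
 * Illustrative userspace counterpart (not from the original file): the
 * TCP_NODELAY case above just flips sk->nonagle.  A sketch:
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	// Disable Nagle's algorithm for latency-sensitive small writes.
 *	int set_nodelay(int fd)
 *	{
 *		int one = 1;
 *		return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
 *				  &one, sizeof(one));
 *	}
 *
 * IPPROTO_TCP and the Linux-specific SOL_TCP have the same value (6),
 * so either spelling reaches the switch above.
 */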
int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
		   int *optlen)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int val, len;

	if (level != SOL_TCP)
		return tp->af_specific->getsockopt(sk, level, optname,
						   optval, optlen);

	if (get_user(len, optlen))
		return -EFAULT;

	len = min(len, sizeof(int));

	switch (optname) {
	case TCP_MAXSEG:
		val = tp->user_mss;
		break;
	case TCP_NODELAY:
		val = (sk->nonagle == 1);
		break;
	default:
		return(-ENOPROTOOPT);
	};

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, &val, len))
		return -EFAULT;
	return 0;
}
void tcp_set_keepalive(struct sock *sk, int val)
{
	if (!sk->keepopen && val)
		tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
	else if (sk->keepopen && !val)
		tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
}
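/*
 * Illustrative userspace counterpart (not from the original file):
 * tcp_set_keepalive() above is reached through the generic
 * SO_KEEPALIVE socket option.  A sketch:
 *
 *	#include <sys/socket.h>
 *
 *	// Enable keepalive probes on an otherwise idle connection.
 *	int set_keepalive(int fd)
 *	{
 *		int one = 1;
 *		return setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE,
 *				  &one, sizeof(one));
 *	}
 *
 * The slow-timer reference counting above means the keepalive machinery
 * only runs while at least one socket has the option set.
 */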
extern void __skb_cb_too_small_for_tcp(int, int);

void __init tcp_init(void)
{
	struct sk_buff *skb = NULL;

	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
					   sizeof(skb->cb));

	tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
					       sizeof(struct open_request),
					       0, SLAB_HWCACHE_ALIGN,
					       NULL, NULL);
	if (!tcp_openreq_cachep)
		panic("tcp_init: Cannot alloc open_request cache.");

	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
					      sizeof(struct tcp_bind_bucket),
					      0, SLAB_HWCACHE_ALIGN,
					      NULL, NULL);
	if (!tcp_bucket_cachep)
		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");

	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
						sizeof(struct tcp_tw_bucket),
						0, SLAB_HWCACHE_ALIGN,
						NULL, NULL);
	if (!tcp_timewait_cachep)
		panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
}