1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while 26 * sk->inuse=1 and was trying to connect 27 * (tcp_err()). 28 * Alan Cox : All icmp error handling was broken 29 * pointers passed where wrong and the 30 * socket was looked up backwards. Nobody 31 * tested any icmp error code obviously. 32 * Alan Cox : tcp_err() now handled properly. It 33 * wakes people on errors. select 34 * behaves and the icmp error race 35 * has gone by moving it into sock.c 36 * Alan Cox : tcp_send_reset() fixed to work for 37 * everything not just packets for 38 * unknown sockets. 39 * Alan Cox : tcp option processing. 40 * Alan Cox : Reset tweaked (still not 100%) [Had 41 * syn rule wrong] 42 * Herp Rosmanith : More reset fixes 43 * Alan Cox : No longer acks invalid rst frames. 44 * Acking any kind of RST is right out. 45 * Alan Cox : Sets an ignore me flag on an rst 46 * receive otherwise odd bits of prattle 47 * escape still 48 * Alan Cox : Fixed another acking RST frame bug. 49 * Should stop LAN workplace lockups. 
50 * Alan Cox : Some tidyups using the new skb list 51 * facilities 52 * Alan Cox : sk->keepopen now seems to work 53 * Alan Cox : Pulls options out correctly on accepts 54 * Alan Cox : Fixed assorted sk->rqueue->next errors 55 * Alan Cox : PSH doesn't end a TCP read. Switched a 56 * bit to skb ops. 57 * Alan Cox : Tidied tcp_data to avoid a potential 58 * nasty. 59 * Alan Cox : Added some better commenting, as the 60 * tcp is hard to follow 61 * Alan Cox : Removed incorrect check for 20 * psh 62 * Michael O'Reilly : ack < copied bug fix. 63 * Johannes Stille : Misc tcp fixes (not all in yet). 64 * Alan Cox : FIN with no memory -> CRASH 65 * Alan Cox : Added socket option proto entries. 66 * Also added awareness of them to accept. 67 * Alan Cox : Added TCP options (SOL_TCP) 68 * Alan Cox : Switched wakeup calls to callbacks, 69 * so the kernel can layer network 70 * sockets. 71 * Alan Cox : Use ip_tos/ip_ttl settings. 72 * Alan Cox : Handle FIN (more) properly (we hope). 73 * Alan Cox : RST frames sent on unsynchronised 74 * state ack error. 75 * Alan Cox : Put in missing check for SYN bit. 76 * Alan Cox : Added tcp_select_window() aka NET2E 77 * window non shrink trick. 78 * Alan Cox : Added a couple of small NET2E timer 79 * fixes 80 * Charles Hedrick : TCP fixes 81 * Toomas Tamm : TCP window fixes 82 * Alan Cox : Small URG fix to rlogin ^C ack fight 83 * Charles Hedrick : Rewrote most of it to actually work 84 * Linus : Rewrote tcp_read() and URG handling 85 * completely 86 * Gerhard Koerting: Fixed some missing timer handling 87 * Matthew Dillon : Reworked TCP machine states as per RFC 88 * Gerhard Koerting: PC/TCP workarounds 89 * Adam Caldwell : Assorted timer/timing errors 90 * Matthew Dillon : Fixed another RST bug 91 * Alan Cox : Move to kernel side addressing changes. 92 * Alan Cox : Beginning work on TCP fastpathing 93 * (not yet usable) 94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
95 * Alan Cox : TCP fast path debugging 96 * Alan Cox : Window clamping 97 * Michael Riepe : Bug in tcp_check() 98 * Matt Dillon : More TCP improvements and RST bug fixes 99 * Matt Dillon : Yet more small nasties remove from the 100 * TCP code (Be very nice to this man if 101 * tcp finally works 100%) 8) 102 * Alan Cox : BSD accept semantics. 103 * Alan Cox : Reset on closedown bug. 104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 105 * Michael Pall : Handle select() after URG properly in 106 * all cases. 107 * Michael Pall : Undo the last fix in tcp_read_urg() 108 * (multi URG PUSH broke rlogin). 109 * Michael Pall : Fix the multi URG PUSH problem in 110 * tcp_readable(), select() after URG 111 * works now. 112 * Michael Pall : recv(...,MSG_OOB) never blocks in the 113 * BSD api. 114 * Alan Cox : Changed the semantics of sk->socket to 115 * fix a race and a signal problem with 116 * accept() and async I/O. 117 * Alan Cox : Relaxed the rules on tcp_sendto(). 118 * Yury Shevchuk : Really fixed accept() blocking problem. 119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 120 * clients/servers which listen in on 121 * fixed ports. 122 * Alan Cox : Cleaned the above up and shrank it to 123 * a sensible code size. 124 * Alan Cox : Self connect lockup fix. 125 * Alan Cox : No connect to multicast. 126 * Ross Biro : Close unaccepted children on master 127 * socket close. 128 * Alan Cox : Reset tracing code. 129 * Alan Cox : Spurious resets on shutdown. 130 * Alan Cox : Giant 15 minute/60 second timer error 131 * Alan Cox : Small whoops in selecting before an 132 * accept. 133 * Alan Cox : Kept the state trace facility since 134 * it's handy for debugging. 135 * Alan Cox : More reset handler fixes. 
136 * Alan Cox : Started rewriting the code based on 137 * the RFC's for other useful protocol 138 * references see: Comer, KA9Q NOS, and 139 * for a reference on the difference 140 * between specifications and how BSD 141 * works see the 4.4lite source. 142 * A.N.Kuznetsov : Don't time wait on completion of tidy 143 * close. 144 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 145 * Linus Torvalds : Fixed BSD port reuse to work first syn 146 * Alan Cox : Reimplemented timers as per the RFC 147 * and using multiple timers for sanity. 148 * Alan Cox : Small bug fixes, and a lot of new 149 * comments. 150 * Alan Cox : Fixed dual reader crash by locking 151 * the buffers (much like datagram.c) 152 * Alan Cox : Fixed stuck sockets in probe. A probe 153 * now gets fed up of retrying without 154 * (even a no space) answer. 155 * Alan Cox : Extracted closing code better 156 * Alan Cox : Fixed the closing state machine to 157 * resemble the RFC. 158 * Alan Cox : More 'per spec' fixes. 159 * Jorge Cwik : Even faster checksumming. 160 * Alan Cox : tcp_data() doesn't ack illegal PSH 161 * only frames. At least one pc tcp stack 162 * generates them. 163 * Alan Cox : Cache last socket. 164 * Alan Cox : Per route irtt. 165 * Matt Day : Select() match BSD precisely on error 166 * Alan Cox : New buffers 167 * Marc Tamsky : Various sk->prot->retransmits and 168 * sk->retransmits misupdating fixed. 169 * Fixed tcp_write_timeout: stuck close, 170 * and TCP syn retries gets used now. 171 * Mark Yarvis : In tcp_read_wakeup(), don't send an 172 * ack if stat is TCP_CLOSED. 173 * Alan Cox : Look up device on a retransmit - routes may 174 * change. Doesn't yet cope with MSS shrink right 175 * but its a start! 176 * Marc Tamsky : Closing in closing fixes. 177 * Mike Shaver : RFC1122 verifications. 178 * Alan Cox : rcv_saddr errors. 179 * Alan Cox : Block double connect(). 180 * Alan Cox : Small hooks for enSKIP. 181 * Alexey Kuznetsov: Path MTU discovery. 
 *		Alan Cox	:	Support soft errors.
 *		Alan Cox	:	Fix MTU discovery pathological case
 *					when the remote claims no mtu!
 *		Marc Tamsky	:	TCP_CLOSE fix.
 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
 *					window but wrong (fixes NT lpd problems)
 *		Pedro Roque	:	Better TCP window handling, delayed ack.
 *		Joerg Reuter	:	No modification of locked buffers in
 *					tcp_do_retransmit()
 *		Eric Schenk	:	Changed receiver side silly window
 *					avoidance algorithm to BSD style
 *					algorithm. This doubles throughput
 *					against machines running Solaris,
 *					and seems to result in general
 *					improvement.
 *
 * To Fix:
 *		Fast path the code. Two things here - fix the window calculation
 *		so it doesn't iterate over the queue, also spot packets with no funny
 *		options arriving in order and process directly.
 *
 *		Rewrite output state machine to use a single queue.
 *		Speed up input assembly algorithm.
 *		RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 *		could do with it working on IPv4
 *		User settable/learned rtt/max window/mtu
 *
 *		Change the fundamental structure to a single send queue maintained
 *		by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 *		active routes too]). Cut the queue off in tcp_retransmit/
 *		tcp_transmit.
 *		Change the receive queue to assemble as it goes. This lets us
 *		dispose of most of tcp_sequence, half of tcp_ack and chunks of
 *		tcp_data/tcp_read as well as the window shrink crud.
 *		Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 *		tcp_queue_skb seem obvious routines to extract.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING.  Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown.  There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */

265 /* 266 * RFC1122 status: 267 * NOTE: I'm not going to be doing comments in the code for this one except 268 * for violations and the like. tcp.c is just too big... If I say something 269 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out 270 * with Alan. -- MS 950903 271 * 272 * Use of PSH (4.2.2.2) 273 * MAY aggregate data sent without the PSH flag. (does) 274 * MAY queue data received without the PSH flag. (does) 275 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) 276 * MAY implement PSH on send calls. (doesn't, thus:) 277 * MUST NOT buffer data indefinitely (doesn't [1 second]) 278 * MUST set PSH on last segment (does) 279 * MAY pass received PSH to application layer (doesn't) 280 * SHOULD send maximum-sized segment whenever possible. (almost always does) 281 * 282 * Window Size (4.2.2.3, 4.2.2.16) 283 * MUST treat window size as an unsigned number (does) 284 * SHOULD treat window size as a 32-bit number (does not) 285 * MUST NOT shrink window once it is offered (does not normally) 286 * 287 * Urgent Pointer (4.2.2.4) 288 * **MUST point urgent pointer to last byte of urgent data (not right 289 * after). (doesn't, to be like BSD) 290 * MUST inform application layer asynchronously of incoming urgent 291 * data. (does) 292 * MUST provide application with means of determining the amount of 293 * urgent data pending. (does) 294 * **MUST support urgent data sequence of arbitrary length. (doesn't, but 295 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) 296 * [Follows BSD 1 byte of urgent data] 297 * 298 * TCP Options (4.2.2.5) 299 * MUST be able to receive TCP options in any segment. (does) 300 * MUST ignore unsupported options (does) 301 * 302 * Maximum Segment Size Option (4.2.2.6) 303 * MUST implement both sending and receiving MSS. (does) 304 * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send 305 * it always). 
(does, even when MSS == 536, which is legal) 306 * MUST assume MSS == 536 if no MSS received at connection setup (does) 307 * MUST calculate "effective send MSS" correctly: 308 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) 309 * (does - but allows operator override) 310 * 311 * TCP Checksum (4.2.2.7) 312 * MUST generate and check TCP checksum. (does) 313 * 314 * Initial Sequence Number Selection (4.2.2.8) 315 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's 316 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is 317 * necessary for 10Mbps networks - and harder than BSD to spoof!) 318 * 319 * Simultaneous Open Attempts (4.2.2.10) 320 * MUST support simultaneous open attempts (does) 321 * 322 * Recovery from Old Duplicate SYN (4.2.2.11) 323 * MUST keep track of active vs. passive open (does) 324 * 325 * RST segment (4.2.2.12) 326 * SHOULD allow an RST segment to contain data (does, but doesn't do 327 * anything with it, which is standard) 328 * 329 * Closing a Connection (4.2.2.13) 330 * MUST inform application of whether connection was closed by RST or 331 * normal close. (does) 332 * MAY allow "half-duplex" close (treat connection as closed for the 333 * local app, even before handshake is done). (does) 334 * MUST linger in TIME_WAIT for 2 * MSL (does) 335 * 336 * Retransmission Timeout (4.2.2.15) 337 * MUST implement Jacobson's slow start and congestion avoidance 338 * stuff. (does) 339 * 340 * Probing Zero Windows (4.2.2.17) 341 * MUST support probing of zero windows. (does) 342 * MAY keep offered window closed indefinitely. (does) 343 * MUST allow remote window to stay closed indefinitely. (does) 344 * 345 * Passive Open Calls (4.2.2.18) 346 * MUST NOT let new passive open affect other connections. (doesn't) 347 * MUST support passive opens (LISTENs) concurrently. (does) 348 * 349 * Time to Live (4.2.2.19) 350 * MUST make TCP TTL configurable. 
(does - IP_TTL option) 351 * 352 * Event Processing (4.2.2.20) 353 * SHOULD queue out-of-order segments. (does) 354 * MUST aggregate ACK segments whenever possible. (does but badly) 355 * 356 * Retransmission Timeout Calculation (4.2.3.1) 357 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO 358 * calculation. (does, or at least explains them in the comments 8*b) 359 * SHOULD initialize RTO to 0 and RTT to 3. (does) 360 * 361 * When to Send an ACK Segment (4.2.3.2) 362 * SHOULD implement delayed ACK. (does) 363 * MUST keep ACK delay < 0.5 sec. (does) 364 * 365 * When to Send a Window Update (4.2.3.3) 366 * MUST implement receiver-side SWS. (does) 367 * 368 * When to Send Data (4.2.3.4) 369 * MUST implement sender-side SWS. (does) 370 * SHOULD implement Nagle algorithm. (does) 371 * 372 * TCP Connection Failures (4.2.3.5) 373 * MUST handle excessive retransmissions "properly" (see the RFC). (does) 374 * SHOULD inform application layer of soft errors. (does) 375 * 376 * TCP Keep-Alives (4.2.3.6) 377 * MAY provide keep-alives. (does) 378 * MUST make keep-alives configurable on a per-connection basis. (does) 379 * MUST default to no keep-alives. (does) 380 * **MUST make keep-alive interval configurable. (doesn't) 381 * **MUST make default keep-alive interval > 2 hours. (doesn't) 382 * MUST NOT interpret failure to ACK keep-alive packet as dead 383 * connection. (doesn't) 384 * SHOULD send keep-alive with no data. (does) 385 * 386 * TCP Multihoming (4.2.3.7) 387 * MUST get source address from IP layer before sending first 388 * SYN. (does) 389 * MUST use same local address for all segments of a connection. (does) 390 * 391 * IP Options (4.2.3.8) 392 * MUST ignore unsupported IP options. (does) 393 * MAY support Time Stamp and Record Route. (does) 394 * MUST allow application to specify a source route. (does) 395 * MUST allow received Source Route option to set route for all future 396 * segments on this connection. 
(does not (security issues)) 397 * 398 * ICMP messages (4.2.3.9) 399 * MUST act on ICMP errors. (does) 400 * MUST slow transmission upon receipt of a Source Quench. (does) 401 * MUST NOT abort connection upon receipt of soft Destination 402 * Unreachables (0, 1, 5), Time Exceededs and Parameter 403 * Problems. (doesn't) 404 * SHOULD report soft Destination Unreachables etc. to the 405 * application. (does) 406 * SHOULD abort connection upon receipt of hard Destination Unreachable 407 * messages (2, 3, 4). (does) 408 * 409 * Remote Address Validation (4.2.3.10) 410 * MUST reject as an error OPEN for invalid remote IP address. (does) 411 * MUST ignore SYN with invalid source address. (does) 412 * MUST silently discard incoming SYN for broadcast/multicast 413 * address. (does) 414 * 415 * Asynchronous Reports (4.2.4.1) 416 * MUST provide mechanism for reporting soft errors to application 417 * layer. (does) 418 * 419 * Type of Service (4.2.4.2) 420 * MUST allow application layer to set Type of Service. (does IP_TOS) 421 * 422 * (Whew. -- MS 950903) 423 **/ 424
425 #include <linux/config.h>
426 #include <linux/types.h>
427 #include <linux/fcntl.h>
428
429 #include <net/icmp.h>
430 #include <net/tcp.h>
431
432 #include <asm/segment.h>
433
434 unsignedlongseq_offset;
435 structtcp_mibtcp_statistics;
436
437 staticvoidtcp_close(structsock *sk, unsignedlongtimeout);
438
439 /* 440 * The less said about this the better, but it works and will do for 1.2 (and 1.4 ;)) 441 */ 442
443 structwait_queue *master_select_wakeup;
444
445 /* 446 * Find someone to 'accept'. Must be called with 447 * the socket locked or with interrupts disabled 448 */ 449
450 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 451 { 452 structsk_buff *p=skb_peek(&s->receive_queue);
453 if(p==NULL)
454 returnNULL;
455 do 456 { 457 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
458 returnp;
459 p=p->next;
460 } 461 while(p!=(structsk_buff *)&s->receive_queue);
462 returnNULL;
463 } 464
465 /* 466 * Remove a completed connection and return it. This is used by 467 * tcp_accept() to get connections from the queue. 468 */ 469
470 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 471 { 472 structsk_buff *skb;
473 unsignedlongflags;
474 save_flags(flags);
475 cli();
476 skb=tcp_find_established(s);
477 if(skb!=NULL)
478 skb_unlink(skb); /* Take it off the queue */ 479 restore_flags(flags);
480 returnskb;
481 } 482
483 /* 484 * This routine closes sockets which have been at least partially 485 * opened, but not yet accepted. Currently it is only called by 486 * tcp_close, and timeout mirrors the value there. 487 */ 488
489 staticvoidtcp_close_pending (structsock *sk)
/* */ 490 { 491 structsk_buff *skb;
492
493 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
494 { 495 tcp_close(skb->sk, 0);
496 kfree_skb(skb, FREE_READ);
497 } 498 return;
499 } 500
501 /* 502 * Enter the time wait state. 503 */ 504
505 voidtcp_time_wait(structsock *sk)
/* */ 506 { 507 tcp_set_state(sk,TCP_TIME_WAIT);
508 sk->shutdown = SHUTDOWN_MASK;
509 if (!sk->dead)
510 sk->state_change(sk);
511 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
512 } 513
514
515 /* 516 * This routine is called by the ICMP module when it gets some 517 * sort of error condition. If err < 0 then the socket should 518 * be closed and the error returned to the user. If err > 0 519 * it's just the icmp type << 8 | icmp code. After adjustment 520 * header points to the first 8 bytes of the tcp header. We need 521 * to find the appropriate port. 522 */ 523
524 voidtcp_err(inttype, intcode, unsignedchar *header, __u32daddr,
/* */ 525 __u32saddr, structinet_protocol *protocol)
526 { 527 structtcphdr *th = (structtcphdr *)header;
528 structsock *sk;
529
530 /* 531 * This one is _WRONG_. FIXME urgently. 532 */ 533 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 534 structiphdr *iph=(structiphdr *)(header-sizeof(structiphdr));
535 #endif 536 th =(structtcphdr *)header;
537 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
538
539 if (sk == NULL)
540 return;
541
542 if (type == ICMP_SOURCE_QUENCH)
543 { 544 /* 545 * FIXME: 546 * For now we will just trigger a linear backoff. 547 * The slow start code should cause a real backoff here. 548 */ 549 if (sk->cong_window > 4)
550 sk->cong_window--;
551 return;
552 } 553
554 if (type == ICMP_PARAMETERPROB)
555 { 556 sk->err=EPROTO;
557 sk->error_report(sk);
558 } 559
560 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 561 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
562 { 563 structrtable * rt;
564 /* 565 * Ugly trick to pass MTU to protocol layer. 566 * Really we should add argument "info" to error handler. 567 */ 568 unsignedshortnew_mtu = ntohs(iph->id);
569
570 if ((rt = sk->ip_route_cache) != NULL)
571 if (rt->rt_mtu > new_mtu)
572 rt->rt_mtu = new_mtu;
573
574 if (sk->mtu > new_mtu - sizeof(structiphdr) - sizeof(structtcphdr)
575 && new_mtu > sizeof(structiphdr)+sizeof(structtcphdr))
576 sk->mtu = new_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
577
578 return;
579 } 580 #endif 581
582 /* 583 * If we've already connected we will keep trying 584 * until we time out, or the user gives up. 585 */ 586
587 if (code < 13)
588 { 589 if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
590 { 591 sk->err = icmp_err_convert[code].errno;
592 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
593 { 594 tcp_statistics.TcpAttemptFails++;
595 tcp_set_state(sk,TCP_CLOSE);
596 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ 597 } 598 } 599 else/* Only an error on timeout */ 600 sk->err_soft = icmp_err_convert[code].errno;
601 } 602 } 603
604
605 /* 606 * Walk down the receive queue counting readable data until we hit the end or we find a gap 607 * in the received data queue (ie a frame missing that needs sending to us). Not 608 * sorting using two queues as data arrives makes life so much harder. 609 */ 610
611 staticinttcp_readable(structsock *sk)
/* */ 612 { 613 unsignedlongcounted;
614 unsignedlongamount;
615 structsk_buff *skb;
616 intsum;
617 unsignedlongflags;
618
619 if(sk && sk->debug)
620 printk("tcp_readable: %p - ",sk);
621
622 save_flags(flags);
623 cli();
624 if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
625 { 626 restore_flags(flags);
627 if(sk && sk->debug)
628 printk("empty\n");
629 return(0);
630 } 631
632 counted = sk->copied_seq; /* Where we are at the moment */ 633 amount = 0;
634
635 /* 636 * Do until a push or until we are out of data. 637 */ 638
639 do 640 { 641 if (before(counted, skb->seq)) /* Found a hole so stops here */ 642 break;
643 sum = skb->len - (counted - skb->seq); /* Length - header but start from where we are up to (avoid overlaps) */ 644 if (skb->h.th->syn)
645 sum++;
646 if (sum > 0)
647 {/* Add it up, move on */ 648 amount += sum;
649 if (skb->h.th->syn)
650 amount--;
651 counted += sum;
652 } 653 /* 654 * Don't count urg data ... but do it in the right place! 655 * Consider: "old_data (ptr is here) URG PUSH data" 656 * The old code would stop at the first push because 657 * it counted the urg (amount==1) and then does amount-- 658 * *after* the loop. This means tcp_readable() always 659 * returned zero if any URG PUSH was in the queue, even 660 * though there was normal data available. If we subtract 661 * the urg data right here, we even get it to work for more 662 * than one URG PUSH skb without normal data. 663 * This means that select() finally works now with urg data 664 * in the queue. Note that rlogin was never affected 665 * because it doesn't use select(); it uses two processes 666 * and a blocking read(). And the queue scan in tcp_read() 667 * was correct. Mike <pall@rz.uni-karlsruhe.de> 668 */ 669 if (skb->h.th->urg)
670 amount--; /* don't count urg data */ 671 if (amount && skb->h.th->psh) break;
672 skb = skb->next;
673 } 674 while(skb != (structsk_buff *)&sk->receive_queue);
675
676 restore_flags(flags);
677 if(sk->debug)
678 printk("got %lu bytes.\n",amount);
679 return(amount);
680 } 681
682 /* 683 * LISTEN is a special case for select.. 684 */ 685 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */ 686 { 687 if (sel_type == SEL_IN) { 688 intretval;
689
690 lock_sock(sk);
691 retval = (tcp_find_established(sk) != NULL);
692 release_sock(sk);
693 if (!retval)
694 select_wait(&master_select_wakeup,wait);
695 returnretval;
696 } 697 return 0;
698 } 699
700
701 /* 702 * Wait for a TCP event. 703 * 704 * Note that we don't need to lock the socket, as the upper select layers 705 * take care of normal races (between the test and the event) and we don't 706 * go look at any of the socket buffers directly. 707 */ 708 staticinttcp_select(structsock *sk, intsel_type, select_table *wait)
/* */ 709 { 710 if (sk->state == TCP_LISTEN)
711 returntcp_listen_select(sk, sel_type, wait);
712
713 switch(sel_type) { 714 caseSEL_IN:
715 if (sk->err)
716 return 1;
717 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
718 break;
719
720 if (sk->shutdown & RCV_SHUTDOWN)
721 return 1;
722
723 if (sk->acked_seq == sk->copied_seq)
724 break;
725
726 if (sk->urg_seq != sk->copied_seq ||
727 sk->acked_seq != sk->copied_seq+1 ||
728 sk->urginline || !sk->urg_data)
729 return 1;
730 break;
731
732 caseSEL_OUT:
733 if (sk->err)
734 return 1;
735 if (sk->shutdown & SEND_SHUTDOWN)
736 return 0;
737 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
738 break;
739 /* 740 * This is now right thanks to a small fix 741 * by Matt Dillon. 742 */ 743
744 if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
745 break;
746 return 1;
747
748 caseSEL_EX:
749 if (sk->urg_data)
750 return 1;
751 break;
752 } 753 select_wait(sk->sleep, wait);
754 return 0;
755 } 756
757 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */ 758 { 759 interr;
760 switch(cmd)
761 { 762
763 caseTIOCINQ:
764 #ifdef FIXME /* FIXME: */ 765 caseFIONREAD:
766 #endif 767 { 768 unsignedlongamount;
769
770 if (sk->state == TCP_LISTEN)
771 return(-EINVAL);
772
773 lock_sock(sk);
774 amount = tcp_readable(sk);
775 release_sock(sk);
776 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
777 if(err)
778 returnerr;
779 put_user(amount, (int *)arg);
780 return(0);
781 } 782 caseSIOCATMARK:
783 { 784 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
785
786 err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
787 if (err)
788 returnerr;
789 put_user(answ,(int *) arg);
790 return(0);
791 } 792 caseTIOCOUTQ:
793 { 794 unsignedlongamount;
795
796 if (sk->state == TCP_LISTEN) return(-EINVAL);
797 amount = sock_wspace(sk);
798 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
799 if(err)
800 returnerr;
801 put_user(amount, (int *)arg);
802 return(0);
803 } 804 default:
805 return(-EINVAL);
806 } 807 } 808
809
810 /* 811 * This routine computes a TCP checksum. 812 * 813 * Modified January 1995 from a go-faster DOS routine by 814 * Jorge Cwik <jorge@laser.satlink.net> 815 */ 816 #undefDEBUG_TCP_CHECK 817 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */ 818 unsignedlongdaddr, intlen, structsk_buff *skb)
819 { 820 #ifdefDEBUG_TCP_CHECK 821 u16check;
822 #endif 823 th->check = 0;
824 th->check = tcp_check(th, len, saddr, daddr,
825 csum_partial((char *)th,sizeof(*th),skb->csum));
826
827 #ifdefDEBUG_TCP_CHECK 828 check = th->check;
829 th->check = 0;
830 th->check = tcp_check(th, len, saddr, daddr,
831 csum_partial((char *)th,len,0));
832 if (check != th->check) { 833 staticintcount = 0;
834 if (++count < 10) { 835 printk("Checksum %x (%x) from %p\n", th->check, check,
836 (&th)[-1]);
837 printk("TCP=<off:%d a:%d s:%d f:%d>\n", th->doff*4, th->ack, th->syn, th->fin);
838 } 839 } 840 #endif 841 } 842
843
844 /* 845 * This routine builds a generic TCP header. 846 */ 847
848 staticinlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */ 849 { 850 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
851 th->psh = (push == 0) ? 1 : 0;
852 th->seq = htonl(sk->write_seq);
853 th->ack_seq = htonl(sk->acked_seq);
854 th->window = htons(tcp_select_window(sk));
855
856 return(sizeof(*th));
857 } 858
859 /* 860 * Wait for a socket to get into the connected state 861 */ 862 staticvoidwait_for_tcp_connect(structsock * sk)
/* */ 863 { 864 release_sock(sk);
865 cli();
866 if (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT && sk->err == 0)
867 { 868 interruptible_sleep_on(sk->sleep);
869 } 870 sti();
871 lock_sock(sk);
872 } 873
874 /* 875 * Wait for more memory for a socket 876 */ 877 staticvoidwait_for_tcp_memory(structsock * sk)
/* */ 878 { 879 release_sock(sk);
880 cli();
881 if (sk->wmem_alloc*2 > sk->sndbuf &&
882 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
883 && sk->err == 0)
884 { 885 sk->socket->flags &= ~SO_NOSPACE;
886 interruptible_sleep_on(sk->sleep);
887 } 888 sti();
889 lock_sock(sk);
890 } 891
892
893 /* 894 * This routine copies from a user buffer into a socket, 895 * and starts the transmit system. 896 */ 897
898 staticintdo_tcp_sendmsg(structsock *sk, structmsghdr *msg,
/* */ 899 intlen, intnonblock, intflags)
900 { 901 intcopied = 0;
902 intcopy;
903 inttmp;
904 intseglen;
905 intiovct=0;
906 structsk_buff *skb;
907 structsk_buff *send_tmp;
908 structproto *prot;
909 structdevice *dev = NULL;
910 unsignedchar *from;
911
912 /* 913 * Ok commence sending 914 */ 915
916 while(iovct<msg->msg_iovlen)
917 { 918 seglen=msg->msg_iov[iovct].iov_len;
919 from=msg->msg_iov[iovct++].iov_base;
920 prot = sk->prot;
921 while(seglen > 0)
922 { 923 /* 924 * Stop on errors 925 */ 926 if (sk->err)
927 { 928 if (copied)
929 returncopied;
930 returnsock_error(sk);
931 } 932
933 /* 934 * Make sure that we are established. 935 */ 936 if (sk->shutdown & SEND_SHUTDOWN)
937 { 938 if (copied)
939 returncopied;
940 return -EPIPE;
941 } 942
943 /* 944 * Wait for a connection to finish. 945 */ 946 while (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
947 { 948 if (copied)
949 returncopied;
950
951 if (sk->err)
952 returnsock_error(sk);
953
954 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
955 { 956 if (sk->keepopen)
957 send_sig(SIGPIPE, current, 0);
958 return -EPIPE;
959 } 960
961 if (nonblock)
962 return -EAGAIN;
963
964 if (current->signal & ~current->blocked)
965 return -ERESTARTSYS;
966
967 wait_for_tcp_connect(sk);
968 } 969
970 /* 971 * The following code can result in copy <= if sk->mss is ever 972 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window). 973 * sk->mtu is constant once SYN processing is finished. I.e. we 974 * had better not get here until we've seen his SYN and at least one 975 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.) 976 * But ESTABLISHED should guarantee that. sk->max_window is by definition 977 * non-decreasing. Note that any ioctl to set user_mss must be done 978 * before the exchange of SYN's. If the initial ack from the other 979 * end has a window of 0, max_window and thus mss will both be 0. 980 */ 981
982 /* 983 * Now we need to check if we have a half built packet. 984 */ 985 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 986 /* 987 * FIXME: I'm almost sure that this fragment is BUG, 988 * but it works... I do not know why 8) --ANK 989 * 990 * Really, we should rebuild all the queues... 991 * It's difficult. Temporary hack is to send all 992 * queued segments with allowed fragmentation. 993 */ 994 { 995 intnew_mss = min(sk->mtu, sk->max_window);
996 if (new_mss < sk->mss)
997 { 998 tcp_send_partial(sk);
999 sk->mss = new_mss;
1000 }1001 }1002 #endif1003
1004 if ((skb = tcp_dequeue_partial(sk)) != NULL)
1005 {1006 inttcp_size;
1007
1008 tcp_size = skb->tail - (unsignedchar *)(skb->h.th + 1);
1009
1010 /* Add more stuff to the end of skb->len */1011 if (!(flags & MSG_OOB))
1012 {1013 copy = min(sk->mss - tcp_size, seglen);
1014 if (copy <= 0)
1015 {1016 printk("TCP: **bug**: \"copy\" <= 0\n");
1017 return -EFAULT;
1018 }1019 tcp_size += copy;
1020 memcpy_fromfs(skb_put(skb,copy), from, copy);
1021 skb->csum = csum_partial(skb->tail - tcp_size, tcp_size, 0);
1022 from += copy;
1023 copied += copy;
1024 len -= copy;
1025 sk->write_seq += copy;
1026 seglen -= copy;
1027 }1028 if (tcp_size >= sk->mss || (flags & MSG_OOB) || !sk->packets_out)
1029 tcp_send_skb(sk, skb);
1030 else1031 tcp_enqueue_partial(skb, sk);
1032 continue;
1033 }1034
1035 /*1036 * We also need to worry about the window.1037 * If window < 1/2 the maximum window we've seen from this1038 * host, don't use it. This is sender side1039 * silly window prevention, as specified in RFC1122.1040 * (Note that this is different than earlier versions of1041 * SWS prevention, e.g. RFC813.). What we actually do is1042 * use the whole MSS. Since the results in the right1043 * edge of the packet being outside the window, it will1044 * be queued for later rather than sent.1045 */1046
1047 copy = sk->window_seq - sk->write_seq;
1048 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1049 copy = sk->mss;
1050 if (copy > seglen)
1051 copy = seglen;
1052 if (copy <= 0)
1053 {1054 printk("TCP: **bug**: copy=%d, sk->mss=%d\n", copy, sk->mss);
1055 return -EFAULT;
1056 }1057
1058 /*1059 * We should really check the window here also.1060 */1061
1062 send_tmp = NULL;
1063 if (copy < sk->mss && !(flags & MSG_OOB) && sk->packets_out)
1064 {1065 skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1066 send_tmp = skb;
1067 }1068 else1069 {1070 skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1071 }1072
1073 /*1074 * If we didn't get any memory, we need to sleep.1075 */1076
1077 if (skb == NULL)
1078 {1079 sk->socket->flags |= SO_NOSPACE;
1080 if (nonblock)
1081 {1082 if (copied)
1083 returncopied;
1084 return -EAGAIN;
1085 }1086
1087 if (current->signal & ~current->blocked)
1088 {1089 if (copied)
1090 returncopied;
1091 return -ERESTARTSYS;
1092 }1093
1094 wait_for_tcp_memory(sk);
1095 continue;
1096 }1097
1098 skb->sk = sk;
1099 skb->free = 0;
1100 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1101
1102 /*1103 * FIXME: we need to optimize this.1104 * Perhaps some hints here would be good.1105 */1106
1107 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1108 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1109 if (tmp < 0 )
1110 {1111 sock_wfree(sk, skb);
1112 if (copied)
1113 return(copied);
1114 return(tmp);
1115 }1116 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1117 skb->ip_hdr->frag_off |= htons(IP_DF);
1118 #endif1119 skb->dev = dev;
1120 skb->h.th =(structtcphdr *)skb_put(skb,sizeof(structtcphdr));
1121 tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
1122 if (tmp < 0)
1123 {1124 sock_wfree(sk, skb);
1125 if (copied)
1126 return(copied);
1127 return(tmp);
1128 }1129
1130 if (flags & MSG_OOB)
1131 {1132 skb->h.th->urg = 1;
1133 skb->h.th->urg_ptr = ntohs(copy);
1134 }1135
1136 skb->csum = csum_partial_copy_fromuser(from,
1137 skb_put(skb,copy), copy, 0);
1138
1139 from += copy;
1140 copied += copy;
1141 len -= copy;
1142 seglen -= copy;
1143 skb->free = 0;
1144 sk->write_seq += copy;
1145
1146 if (send_tmp != NULL)
1147 {1148 tcp_enqueue_partial(send_tmp, sk);
1149 continue;
1150 }1151 tcp_send_skb(sk, skb);
1152 }1153 }1154 sk->err = 0;
1155
1156 returncopied;
1157 }1158
1159
/*
 *	tcp_sendmsg - entry point for sendmsg/sendto/send on a TCP socket.
 *
 *	Validates flags and any supplied destination address against the
 *	connection state, then hands the real work to do_tcp_sendmsg()
 *	under the socket lock.  After sending, applies Nagle's rule to any
 *	half-built (partial) packet left queued.
 *
 *	Returns bytes accepted, or a negative errno.
 */
static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags)
{
	int retval = -EINVAL;

	/*
	 *	Do sanity checking for sendmsg/sendto/send
	 */

	/* Only MSG_OOB and MSG_DONTROUTE make sense for TCP. */
	if (flags & ~(MSG_OOB|MSG_DONTROUTE))
		goto out;
	if (msg->msg_name) {
		struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;

		if (msg->msg_namelen < sizeof(*addr))
			goto out;
		/* sin_family of 0 is tolerated (old BSD apps do this). */
		if (addr->sin_family && addr->sin_family != AF_INET)
			goto out;
		retval = -ENOTCONN;
		if(sk->state == TCP_CLOSE)
			goto out;
		/* An explicit address must match the connected peer exactly. */
		retval = -EISCONN;
		if (addr->sin_port != sk->dummy_th.dest)
			goto out;
		if (addr->sin_addr.s_addr != sk->daddr)
			goto out;
	}

	lock_sock(sk);
	retval = do_tcp_sendmsg(sk, msg, len, nonblock, flags);

	/*
	 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
	 *	interactive fast network servers. It's meant to be on and
	 *	it really improves the throughput though not the echo time
	 *	on my slow slip link - Alan
	 *
	 *	If not nagling we can send on the before case too..
	 */

	if (sk->partial) {
		if (!sk->packets_out ||
		    (sk->nonagle && before(sk->write_seq , sk->window_seq))) {
			tcp_send_partial(sk);
		}
	}

	release_sock(sk);

out:
	return retval;
}
1213
1214 /*1215 * Send an ack if one is backlogged at this point.1216 *1217 * This is called for delayed acks also.1218 */1219
1220 voidtcp_read_wakeup(structsock *sk)
/* */1221 {1222 if (!sk->ack_backlog)
1223 return;
1224
1225 /*1226 * If we're closed, don't send an ack, or we'll get a RST1227 * from the closed destination.1228 */1229 if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
1230 return;
1231
1232 tcp_send_ack(sk);
1233 }1234
1235
1236 /*1237 * Handle reading urgent data. BSD has very simple semantics for1238 * this, no blocking and very strange errors 8)1239 */1240
/*
 *	tcp_recv_urg - read the single byte of TCP urgent data.
 *
 *	Implements BSD's very simple OOB semantics: never blocks, returns
 *	strange-looking errors by design.  Returns 1 when the urgent byte
 *	was copied, 0 at EOF/shutdown, or a negative errno.
 */
static int tcp_recv_urg(struct sock * sk, int nonblock,
	 struct msghdr *msg, int len, int flags, int *addr_len)
{
	/*
	 *	No URG data to read: either the byte is delivered inline,
	 *	there is none pending, or it was already consumed.
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
		return sock_error(sk);

	if (sk->state == TCP_CLOSE || sk->done)
	{
		/* First read after close reports EOF, later ones ENOTCONN. */
		if (!sk->done)
		{
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	lock_sock(sk);
	if (sk->urg_data & URG_VALID)
	{
		/* Low byte of urg_data holds the urgent octet itself. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;	/* Mark consumed */
		memcpy_toiovec(msg->msg_iov, &c, 1);
		if(msg->msg_name)
		{
			struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
			sin->sin_family=AF_INET;
			sin->sin_addr.s_addr=sk->daddr;
			sin->sin_port=sk->dummy_th.dest;
		}
		if(addr_len)
			*addr_len=sizeof(struct sockaddr_in);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 *	Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
	 *	the available implementations agree in this case:
	 *	this call should never block, independent of the
	 *	blocking state of the socket.
	 *	Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
1299 /*1300 * Release a skb if it is no longer needed. This routine1301 * must be called with interrupts disabled or with the1302 * socket locked so that the sk_buff queue operation is ok.1303 */1304
/*
 *	tcp_eat_skb - unlink a consumed buffer from the receive queue and
 *	free it.  Caller must hold the socket lock or have interrupts
 *	disabled so the queue operation is safe.
 */
static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
{
	skb->sk = sk;	/* Ensure the free is accounted to this socket */
	__skb_unlink(skb, &sk->receive_queue);
	kfree_skb(skb, FREE_READ);
}
1312 /*1313 * FIXME:1314 * This routine frees used buffers.1315 * It should consider sending an ACK to let the1316 * other end know we now have a bigger window.1317 */1318
/*
 *	cleanup_rbuf - free fully-consumed buffers at the head of the
 *	receive queue, then advertise any window increase this produced.
 */
static void cleanup_rbuf(struct sock *sk)
{
	/*
	 * NOTE! The socket must be locked, so that we don't get
	 * a messed-up receive queue.
	 */
	while (!skb_queue_empty(&sk->receive_queue)) {
		struct sk_buff *skb = sk->receive_queue.next;
		/* Stop at the first buffer still unread or still in use
		   by a sleeping reader (skb->users). */
		if (!skb->used || skb->users)
			break;
		tcp_eat_skb(sk, skb);
	}

	/*
	 * Tell the world if we raised the window.
	 */
	if (tcp_raise_window(sk))
		tcp_send_ack(sk);
}
1339
1340 /*1341 * This routine copies from a sock struct into the user buffer.1342 */1343
/*
 *	tcp_recvmsg - copy received data from the socket into the user's
 *	iovec.
 *
 *	Handles MSG_PEEK (reads through a local sequence copy so nothing
 *	is consumed), MSG_OOB (delegated to tcp_recv_urg), the urgent-mark
 *	stop point, blocking and nonblocking operation.
 *
 *	Returns bytes copied, or a negative errno.
 */
static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags, int *addr_len)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;		/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */

	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 *	MSG_PEEK advances only the local copy.
	 */

	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	lock_sock(sk);
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 *	Are we at urgent data? Stop if we have read anything.
		 */

		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	We need to check signals first, to get correct SIGURG
		 *	handling.
		 */
		if (current->signal & ~current->blocked) {
			if (copied)
				break;
			copied = -ERESTARTSYS;
			break;
		}

		/*
		 *	Next get a buffer.  Set INTERRUPTIBLE before scanning
		 *	so a wakeup between scan and schedule() isn't lost.
		 */

		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* Sequence hole ahead of us - must wait for it. */
			if (before(*seq, skb->seq))
				break;
			offset = *seq - skb->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies a sequence number */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* Fully consumed; reapable */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* Nothing more to read right now; return what we have. */
		if (copied)
			break;

		if (sk->err)
		{
			copied = sock_error(sk);
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;	/* First read reports EOF */
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Release queue space / send window updates, then sleep. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		lock_sock(sk);
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */

		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */

		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?
		 */

		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					/* Skip over the urgent byte unless
					   it is delivered inline. */
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;	/* Read up to the mark only */
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */

		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */

		memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */

		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;	/* Urgent byte is behind us */
		if (used + offset < skb->len)
			continue;		/* Buffer not yet exhausted */

		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		if (!skb->users)
			tcp_eat_skb(sk, skb);
		continue;

	found_fin_ok:
		++*seq;			/* FIN consumes a sequence number */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */

		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}

	/* Report the peer's address if the caller asked for it. */
	if(copied>0 && msg->msg_name)
	{
		struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
		sin->sin_family=AF_INET;
		sin->sin_addr.s_addr=sk->daddr;
		sin->sin_port=sk->dummy_th.dest;
	}
	if(addr_len)
		*addr_len=sizeof(struct sockaddr_in);

	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
1587
1588
1589 /*1590 * State processing on a close. This implements the state shift for1591 * sending our FIN frame. Note that we only send a FIN for some1592 * states. A shutdown() may have already sent the FIN, or we may be1593 * closed.1594 */1595
/*
 *	tcp_close_state - perform the state shift for sending our FIN.
 *
 *	@sk:   socket being closed / shut down
 *	@dead: non-zero when called from a full close (no application left)
 *
 *	Moves the socket to the appropriate closing state and returns 1 if
 *	a FIN frame should be transmitted, 0 otherwise.
 */
static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;
	int send_fin=0;
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
	}

	tcp_set_state(sk,ns);

	/*
	 *	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 */
	if(dead && ns==TCP_FIN_WAIT2)
	{
		/* Only start the FIN_WAIT2 reaper if no timer is already
		   pending; del_timer tells us whether one was running and
		   we immediately re-add it if so. */
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);
		else
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}

	return send_fin;
}
1646 /*1647 * Shutdown the sending side of a connection. Much like close except1648 * that we don't receive shut down or set sk->dead.1649 */1650
/*
 *	tcp_shutdown - shut down the sending side of a connection.
 *
 *	Much like close except that we don't receive shut down or set
 *	sk->dead.  Flushes any half-built packet and emits a FIN when the
 *	state machine calls for one.
 */
void tcp_shutdown(struct sock *sk, int how)
{
	/*
	 *	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 *	Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
	 */

	/* Only SEND_SHUTDOWN concerns us here. */
	if (!(how & SEND_SHUTDOWN))
		return;

	/*
	 *	If we've already sent a FIN, or it's a closed state
	 */

	if (sk->state == TCP_FIN_WAIT1 ||
	    sk->state == TCP_FIN_WAIT2 ||
	    sk->state == TCP_CLOSING ||
	    sk->state == TCP_LAST_ACK ||
	    sk->state == TCP_TIME_WAIT ||
	    sk->state == TCP_CLOSE ||
	    sk->state == TCP_LISTEN
	  )
	{
		return;
	}
	lock_sock(sk);

	/*
	 *	flag that the sender has shutdown
	 */

	sk->shutdown |= SEND_SHUTDOWN;

	/*
	 *	Clear out any half completed packets.
	 */

	if (sk->partial)
		tcp_send_partial(sk);

	/*
	 *	FIN if needed
	 */

	if (tcp_close_state(sk,0))
		tcp_send_fin(sk);

	release_sock(sk);
}
1702
1703 /*1704 * Return 1 if we still have things to send in our buffers.1705 */1706
1707 staticinlineintclosing(structsock * sk)
/* */1708 {1709 switch (sk->state) {1710 caseTCP_FIN_WAIT1:
1711 caseTCP_CLOSING:
1712 caseTCP_LAST_ACK:
1713 return 1;
1714 }1715 return 0;
1716 }1717
1718
/*
 *	tcp_close - close a TCP socket.
 *
 *	@timeout: if non-zero, linger (sleep) until our FIN is acked or
 *	the timeout expires.
 *
 *	Flushes unread receive data, sends a FIN when the state machine
 *	requires one, and marks the socket dead.
 */
static void tcp_close(struct sock *sk, unsigned long timeout)
{
	struct sk_buff *skb;

	/*
	 *	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 */

	lock_sock(sk);

	tcp_cache_zap();
	if(sk->state == TCP_LISTEN)
	{
		/* Special case: no FIN needed, just reap pending
		   half-open connections. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		sk->dead = 1;
		return;
	}

	/* NOTE(review): keepopen is forced on here, presumably so the
	   keepalive timer can reap a dead peer during close - confirm. */
	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;

	if (!sk->dead)
		sk->state_change(sk);

	/*
	 *	We need to flush the recv. buffs. We do this only on the
	 *	descriptor close, not protocol-sourced closes, because the
	 *	reader process may not have drained the data yet!
	 */

	while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
		kfree_skb(skb, FREE_READ);

	/*
	 *	Get rid off any half-completed packets.
	 */

	if (sk->partial)
		tcp_send_partial(sk);

	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */

	if (tcp_close_state(sk,1)==1)
	{
		tcp_send_fin(sk);
	}

	if (timeout) {
		/* Linger: sleep until the FIN is acked, a signal arrives,
		   or the timeout runs out. */
		cli();
		release_sock(sk);
		current->timeout = timeout;
		while(closing(sk) && current->timeout)
		{
			interruptible_sleep_on(sk->sleep);
			if (current->signal & ~current->blocked)
			{
				break;
			}
		}
		current->timeout=0;
		lock_sock(sk);
		sti();
	}

	/*
	 *	This will destroy it. The timers will take care of actually
	 *	free'ing up the memory.
	 */
	tcp_cache_zap();	/* Kill the cache again. */
	release_sock(sk);
	sk->dead = 1;
}
1799
1800 /*1801 * This will accept the next outstanding connection.1802 */1803
/*
 *	tcp_accept - accept the next outstanding connection on a
 *	listening socket.
 *
 *	Blocks (unless O_NONBLOCK) until an established connection is
 *	queued.  Returns the new socket, or NULL with sk->err set.
 */
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 *	We need to make sure that this socket is listening,
	 *	and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race: interrupts stay off between testing the queue
	   and going to sleep, so an arriving connection can't be missed. */
	cli();
	lock_sock(sk);

	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		lock_sock(sk);
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk.
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
1858 /*1859 * This will initiate an outgoing connection.1860 */1861
/*
 *	tcp_connect - initiate an outgoing connection.
 *
 *	Validates the destination, picks an initial sequence number,
 *	builds and transmits the SYN (with an MSS option), and moves the
 *	socket to SYN_SENT with the retransmit timer armed.
 *
 *	Returns 0 on success or a negative errno.
 */
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
		return(-EISCONN);

	/*
	 *	Don't allow a double connect.
	 */

	if(sk->daddr)
		return -EINVAL;

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	lock_sock(sk);
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->rcv_ack_cnt = 1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	/* Drop the lock across the GFP_KERNEL allocation below. */
	release_sock(sk);

	buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	lock_sock(sk);
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;


	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, sk->opt, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		sock_wfree(sk, buff);
		release_sock(sk);
		return(-ENETUNREACH);
	}
	/* Routing chose our source address for us. */
	if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
		sk->saddr = rt->rt_src;
	sk->rcv_saddr = sk->saddr;

	t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	buff->seq = sk->write_seq++;
	t1->seq = htonl(buff->seq);
	sk->sent_seq = sk->write_seq;
	buff->end_seq = sk->write_seq;
	t1->ack = 0;
	/* NOTE(review): raw value 2, not htons() - looks like a tiny
	   initial window on purpose; confirm before "fixing". */
	t1->window = 2;
	t1->syn = 1;
	t1->doff = 6;	/* 24 bytes: header plus the 4-byte MSS option */
	/* use 512 or whatever user asked for */

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if (rt)
		sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else
		sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 *	but not bigger than device MTU
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

#ifdef CONFIG_SKIP

	/*
	 *	SKIP devices set their MTU to 65535. This is so they can take packets
	 *	unfragmented to security process then fragment. They could lie to the
	 *	TCP layer about a suitable MTU, but its easier to let skip sort it out
	 *	simply because the final package we want unfragmented is going to be
	 *
	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */

	if(skip_pick_mtu!=NULL)		/* If SKIP is loaded.. */
		sk->mtu=skip_pick_mtu(sk->mtu,dev);
#endif

	/*
	 *	Put in the TCP options to say MTU.
	 */

	ptr = skb_put(buff,4);
	ptr[0] = 2;			/* Option kind: MSS */
	ptr[1] = 4;			/* Option length */
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	buff->csum = csum_partial(ptr, 4, 0);
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, buff);

	/*
	 *	This must go first otherwise a really quick response will get reset.
	 */

	tcp_cache_zap();
	tcp_set_state(sk,TCP_SYN_SENT);
	if(rt&&rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->delack_timer.function = tcp_delack_timer;
	sk->delack_timer.data = (unsigned long) sk;
	sk->retransmit_timer.function = tcp_retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = 0;	/* Now works the right way instead of a hacked
				   initial setting */

	sk->prot->queue_xmit(sk, dev, buff, 0);
	/* NOTE(review): timer is reset a second time after queue_xmit;
	   appears redundant with the call above - confirm before removing. */
	tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
2028 /*2029 * Socket option code for TCP.2030 */2031
2032 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */2033 {2034 intval,err;
2035
2036 if(level!=SOL_TCP)
2037 returnip_setsockopt(sk,level,optname,optval,optlen);
2038
2039 if (optval == NULL)
2040 return(-EINVAL);
2041
2042 err=verify_area(VERIFY_READ, optval, sizeof(int));
2043 if(err)
2044 returnerr;
2045
2046 val = get_user((int *)optval);
2047
2048 switch(optname)
2049 {2050 caseTCP_MAXSEG:
2051 /*2052 * values greater than interface MTU won't take effect. however at2053 * the point when this call is done we typically don't yet know2054 * which interface is going to be used2055 */2056 if(val<1||val>MAX_WINDOW)
2057 return -EINVAL;
2058 sk->user_mss=val;
2059 return 0;
2060 caseTCP_NODELAY:
2061 sk->nonagle=(val==0)?0:1;
2062 return 0;
2063 default:
2064 return(-ENOPROTOOPT);
2065 }2066 }2067
/*
 *	tcp_getsockopt - read back a TCP-level socket option.
 *
 *	Supports TCP_MAXSEG and TCP_NODELAY; other levels are handed to
 *	the IP layer.  Returns 0 or a negative errno.
 */
int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
{
	int val,err;

	if(level!=SOL_TCP)
		return ip_getsockopt(sk,level,optname,optval,optlen);

	switch(optname)
	{
		case TCP_MAXSEG:
			val=sk->user_mss;
			break;
		case TCP_NODELAY:
			val=sk->nonagle;
			break;
		default:
			return(-ENOPROTOOPT);
	}
	/* Copy the length, then the value, out to user space; each
	   destination is verified first (pre-put_user-fault era API). */
	err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
	if(err)
		return err;
	put_user(sizeof(int),(int *) optlen);

	err=verify_area(VERIFY_WRITE, optval, sizeof(int));
	if(err)
		return err;
	put_user(val,(int *)optval);

	return(0);
}
2099
/*
 *	TCP protocol operations table.  Fields are positional; the role
 *	of each slot is inferred from the handler assigned to it - the
 *	authoritative order is the struct proto declaration in sock.h.
 */
struct proto tcp_prot = {
	tcp_close,		/* close */
	ip_build_header,	/* build_header */
	tcp_connect,		/* connect */
	tcp_accept,		/* accept */
	ip_queue_xmit,		/* queue_xmit */
	tcp_retransmit,		/* retransmit */
	tcp_write_wakeup,	/* write_wakeup */
	tcp_read_wakeup,	/* read_wakeup */
	tcp_rcv,		/* rcv */
	tcp_select,		/* select */
	tcp_ioctl,		/* ioctl */
	NULL,			/* init: none needed */
	tcp_shutdown,		/* shutdown */
	tcp_setsockopt,		/* setsockopt */
	tcp_getsockopt,		/* getsockopt */
	tcp_sendmsg,		/* sendmsg */
	tcp_recvmsg,		/* recvmsg */
	NULL,			/* No special bind() */
	128,			/* max_header */
	0,
	"TCP",			/* protocol name */
	0, 0,
	{NULL,}
};