root/net/ipv4/tcp.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. tcp_find_established
  2. tcp_dequeue_established
  3. tcp_close_pending
  4. tcp_time_wait
  5. tcp_err
  6. tcp_readable
  7. tcp_listen_select
  8. tcp_select
  9. tcp_ioctl
  10. tcp_send_check
  11. tcp_build_header
  12. wait_for_tcp_connect
  13. wait_for_tcp_memory
  14. do_tcp_sendmsg
  15. tcp_sendmsg
  16. tcp_read_wakeup
  17. tcp_recv_urg
  18. tcp_eat_skb
  19. cleanup_rbuf
  20. tcp_recvmsg
  21. tcp_close_state
  22. tcp_shutdown
  23. closing
  24. tcp_close
  25. tcp_accept
  26. tcp_connect
  27. tcp_setsockopt
  28. tcp_getsockopt

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:       
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect 
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed where wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It 
  33  *                                      wakes people on errors. select 
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_send_reset() fixed to work for 
  37  *                                      everything not just packets for 
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had 
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames. 
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst 
  46  *                                      receive otherwise odd bits of prattle 
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug. 
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list 
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential 
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the 
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries. 
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks, 
  69  *                                      so the kernel can layer network 
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised 
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer 
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing 
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties remove from the
 100  *                                      TCP code (Be very nice to this man if 
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics. 
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle select() after URG properly in 
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg() 
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in 
 110  *                                      tcp_readable(), select() after URG 
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the 
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to 
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in selecting before an 
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since 
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on 
 137  *                                      the RFC's for other useful protocol 
 138  *                                      references see: Comer, KA9Q NOS, and 
 139  *                                      for a reference on the difference 
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC 
 147  *                                      and using multiple timers for sanity. 
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       Select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and 
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if stat is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but its a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *              Alan Cox        :       Block double connect().
 180  *              Alan Cox        :       Small hooks for enSKIP.
 181  *              Alexey Kuznetsov:       Path MTU discovery.
 182  *              Alan Cox        :       Support soft errors.
  183  *              Alan Cox        :       Fix MTU discovery pathological case
 184  *                                      when the remote claims no mtu!
 185  *              Marc Tamsky     :       TCP_CLOSE fix.
 186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in 
 187  *                                      window but wrong (fixes NT lpd problems)
 188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189  *              Joerg Reuter    :       No modification of locked buffers in 
 190  *                                      tcp_do_retransmit()
 191  *
 192  * To Fix:
 193  *              Fast path the code. Two things here - fix the window calculation
 194  *              so it doesn't iterate over the queue, also spot packets with no funny
 195  *              options arriving in order and process directly.
 196  *
 197  *              Rewrite output state machine to use a single queue.
 198  *              Speed up input assembly algorithm.
 199  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 200  *              could do with it working on IPv4
 201  *              User settable/learned rtt/max window/mtu
 202  *
 203  *              Change the fundamental structure to a single send queue maintained
 204  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 205  *              active routes too]). Cut the queue off in tcp_retransmit/
 206  *              tcp_transmit.
 207  *              Change the receive queue to assemble as it goes. This lets us
 208  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 209  *              tcp_data/tcp_read as well as the window shrink crud.
 210  *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 211  *              tcp_queue_skb seem obvious routines to extract.
 212  *      
 213  *              This program is free software; you can redistribute it and/or
 214  *              modify it under the terms of the GNU General Public License
 215  *              as published by the Free Software Foundation; either version
 216  *              2 of the License, or(at your option) any later version.
 217  *
 218  * Description of States:
 219  *
 220  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 221  *
 222  *      TCP_SYN_RECV            received a connection request, sent ack,
 223  *                              waiting for final ack in three-way handshake.
 224  *
 225  *      TCP_ESTABLISHED         connection established
 226  *
 227  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 228  *                              transmission of remaining buffered data
 229  *
 230  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 231  *                              to shutdown
 232  *
 233  *      TCP_CLOSING             both sides have shutdown but we still have
 234  *                              data we have to finish sending
 235  *
 236  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 237  *                              closed, can only be entered from FIN_WAIT2
 238  *                              or CLOSING.  Required because the other end
 239  *                              may not have gotten our last ACK causing it
 240  *                              to retransmit the data packet (which we ignore)
 241  *
 242  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 243  *                              us to finish writing our data and to shutdown
 244  *                              (we have to close() to move on to LAST_ACK)
 245  *
  246  *      TCP_LAST_ACK            our side has shutdown after remote has
 247  *                              shutdown.  There may still be data in our
 248  *                              buffer that we have to finish sending
 249  *              
 250  *      TCP_CLOSE               socket is finished
 251  */
 252 
 253 /*
 254  * RFC1122 status:
 255  * NOTE: I'm not going to be doing comments in the code for this one except
 256  * for violations and the like.  tcp.c is just too big... If I say something
 257  * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 258  * with Alan. -- MS 950903
 259  * 
 260  * Use of PSH (4.2.2.2)
 261  *   MAY aggregate data sent without the PSH flag. (does)
 262  *   MAY queue data received without the PSH flag. (does)
 263  *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 264  *   MAY implement PSH on send calls. (doesn't, thus:)
 265  *     MUST NOT buffer data indefinitely (doesn't [1 second])
 266  *     MUST set PSH on last segment (does)
 267  *   MAY pass received PSH to application layer (doesn't)
 268  *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 269  * 
 270  * Window Size (4.2.2.3, 4.2.2.16)
 271  *   MUST treat window size as an unsigned number (does)
 272  *   SHOULD treat window size as a 32-bit number (does not)
 273  *   MUST NOT shrink window once it is offered (does not normally)
 274  *   
 275  * Urgent Pointer (4.2.2.4)
 276  * **MUST point urgent pointer to last byte of urgent data (not right
 277  *     after). (doesn't, to be like BSD)
 278  *   MUST inform application layer asynchronously of incoming urgent
 279  *     data. (does)
 280  *   MUST provide application with means of determining the amount of
 281  *     urgent data pending. (does)
 282  * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 283  *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 284  *      [Follows BSD 1 byte of urgent data]
 285  * 
 286  * TCP Options (4.2.2.5)
 287  *   MUST be able to receive TCP options in any segment. (does)
 288  *   MUST ignore unsupported options (does)
 289  *   
 290  * Maximum Segment Size Option (4.2.2.6)
 291  *   MUST implement both sending and receiving MSS. (does)
 292  *   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 293  *     it always). (does, even when MSS == 536, which is legal)
 294  *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 295  *   MUST calculate "effective send MSS" correctly:
 296  *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 297  *     (does - but allows operator override)
 298  *  
 299  * TCP Checksum (4.2.2.7)
 300  *   MUST generate and check TCP checksum. (does)
 301  * 
 302  * Initial Sequence Number Selection (4.2.2.8)
 303  *   MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 304  *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 305  *     necessary for 10Mbps networks - and harder than BSD to spoof!)
 306  * 
 307  * Simultaneous Open Attempts (4.2.2.10)
 308  *   MUST support simultaneous open attempts (does)
 309  * 
 310  * Recovery from Old Duplicate SYN (4.2.2.11)
 311  *   MUST keep track of active vs. passive open (does)
 312  * 
 313  * RST segment (4.2.2.12)
 314  *   SHOULD allow an RST segment to contain data (does, but doesn't do
 315  *     anything with it, which is standard)
 316  * 
 317  * Closing a Connection (4.2.2.13)
  318  *   MUST inform application of whether connection was closed by RST or
 319  *     normal close. (does)
 320  *   MAY allow "half-duplex" close (treat connection as closed for the
 321  *     local app, even before handshake is done). (does)
 322  *   MUST linger in TIME_WAIT for 2 * MSL (does)
 323  * 
 324  * Retransmission Timeout (4.2.2.15)
 325  *   MUST implement Jacobson's slow start and congestion avoidance
 326  *     stuff. (does) 
 327  * 
 328  * Probing Zero Windows (4.2.2.17)
 329  *   MUST support probing of zero windows. (does)
 330  *   MAY keep offered window closed indefinitely. (does)
 331  *   MUST allow remote window to stay closed indefinitely. (does)
 332  * 
 333  * Passive Open Calls (4.2.2.18)
 334  *   MUST NOT let new passive open affect other connections. (doesn't)
 335  *   MUST support passive opens (LISTENs) concurrently. (does)
 336  *   
 337  * Time to Live (4.2.2.19)
 338  *   MUST make TCP TTL configurable. (does - IP_TTL option)
 339  * 
 340  * Event Processing (4.2.2.20)
 341  *   SHOULD queue out-of-order segments. (does)
 342  *   MUST aggregate ACK segments whenever possible. (does but badly)
 343  *   
 344  * Retransmission Timeout Calculation (4.2.3.1)
 345  *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 346  *     calculation. (does, or at least explains them in the comments 8*b)
 347  *  SHOULD initialize RTO to 0 and RTT to 3. (does)
 348  * 
 349  * When to Send an ACK Segment (4.2.3.2)
 350  *   SHOULD implement delayed ACK. (does)
 351  *   MUST keep ACK delay < 0.5 sec. (does)
 352  * 
 353  * When to Send a Window Update (4.2.3.3)
 354  *   MUST implement receiver-side SWS. (does)
 355  *   
 356  * When to Send Data (4.2.3.4)
 357  *   MUST implement sender-side SWS. (does)
 358  *   SHOULD implement Nagle algorithm. (does)
 359  * 
 360  * TCP Connection Failures (4.2.3.5)
 361  *  MUST handle excessive retransmissions "properly" (see the RFC). (does)
 362  *   SHOULD inform application layer of soft errors. (does)
 363  *   
 364  * TCP Keep-Alives (4.2.3.6)
 365  *   MAY provide keep-alives. (does)
 366  *   MUST make keep-alives configurable on a per-connection basis. (does)
 367  *   MUST default to no keep-alives. (does)
 368  * **MUST make keep-alive interval configurable. (doesn't)
 369  * **MUST make default keep-alive interval > 2 hours. (doesn't)
 370  *   MUST NOT interpret failure to ACK keep-alive packet as dead
 371  *     connection. (doesn't)
 372  *   SHOULD send keep-alive with no data. (does)
 373  * 
 374  * TCP Multihoming (4.2.3.7)
 375  *   MUST get source address from IP layer before sending first
 376  *     SYN. (does)
 377  *   MUST use same local address for all segments of a connection. (does)
 378  * 
 379  * IP Options (4.2.3.8)
 380  *   MUST ignore unsupported IP options. (does)
 381  *   MAY support Time Stamp and Record Route. (does)
 382  *   MUST allow application to specify a source route. (does)
  383  *   MUST allow received Source Route option to set route for all future
 384  *     segments on this connection. (does not (security issues))
 385  * 
 386  * ICMP messages (4.2.3.9)
 387  *   MUST act on ICMP errors. (does)
 388  *   MUST slow transmission upon receipt of a Source Quench. (does)
 389  *   MUST NOT abort connection upon receipt of soft Destination
 390  *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 391  *     Problems. (doesn't)
 392  *   SHOULD report soft Destination Unreachables etc. to the
 393  *     application. (does)
 394  *   SHOULD abort connection upon receipt of hard Destination Unreachable
 395  *     messages (2, 3, 4). (does)
 396  * 
 397  * Remote Address Validation (4.2.3.10)
 398  *   MUST reject as an error OPEN for invalid remote IP address. (does)
 399  *   MUST ignore SYN with invalid source address. (does)
 400  *   MUST silently discard incoming SYN for broadcast/multicast
 401  *     address. (does) 
 402  * 
 403  * Asynchronous Reports (4.2.4.1)
 404  * MUST provide mechanism for reporting soft errors to application
 405  *     layer. (does)
 406  * 
 407  * Type of Service (4.2.4.2)
 408  *   MUST allow application layer to set Type of Service. (does IP_TOS)
 409  * 
 410  * (Whew. -- MS 950903)
 411  **/
 412 
 413 #include <linux/config.h>
 414 #include <linux/types.h>
 415 #include <linux/fcntl.h>
 416 
 417 #include <net/icmp.h>
 418 #include <net/tcp.h>
 419 
 420 #include <asm/segment.h>
 421 
 422 unsigned long seq_offset;
 423 struct tcp_mib  tcp_statistics;
 424 
 425 static void tcp_close(struct sock *sk, unsigned long timeout);
 426 
 427 /*
 428  *      The less said about this the better, but it works and will do for 1.2  (and 1.4 ;))
 429  */
 430 
 431 struct wait_queue *master_select_wakeup;
 432 
 433 /*
 434  *      Find someone to 'accept'. Must be called with
 435  *      the socket locked or with interrupts disabled
 436  */ 
 437 
 438 static struct sk_buff *tcp_find_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 439 {
 440         struct sk_buff *p=skb_peek(&s->receive_queue);
 441         if(p==NULL)
 442                 return NULL;
 443         do
 444         {
 445                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 446                         return p;
 447                 p=p->next;
 448         }
 449         while(p!=(struct sk_buff *)&s->receive_queue);
 450         return NULL;
 451 }
 452 
 453 /*
 454  *      Remove a completed connection and return it. This is used by
 455  *      tcp_accept() to get connections from the queue.
 456  */
 457 
 458 static struct sk_buff *tcp_dequeue_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 459 {
 460         struct sk_buff *skb;
 461         unsigned long flags;
 462         save_flags(flags);
 463         cli(); 
 464         skb=tcp_find_established(s);
 465         if(skb!=NULL)
 466                 skb_unlink(skb);        /* Take it off the queue */
 467         restore_flags(flags);
 468         return skb;
 469 }
 470 
 471 /* 
 472  *      This routine closes sockets which have been at least partially
 473  *      opened, but not yet accepted. Currently it is only called by
 474  *      tcp_close, and timeout mirrors the value there. 
 475  */
 476 
 477 static void tcp_close_pending (struct sock *sk) 
     /* [previous][next][first][last][top][bottom][index][help] */
 478 {
 479         struct sk_buff *skb;
 480 
 481         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) 
 482         {
 483                 skb->sk->dead=1;
 484                 tcp_close(skb->sk, 0);
 485                 kfree_skb(skb, FREE_READ);
 486         }
 487         return;
 488 }
 489 
 490 /*
 491  *      Enter the time wait state. 
 492  */
 493 
 494 void tcp_time_wait(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 495 {
 496         tcp_set_state(sk,TCP_TIME_WAIT);
 497         sk->shutdown = SHUTDOWN_MASK;
 498         if (!sk->dead)
 499                 sk->state_change(sk);
 500         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 501 }
 502 
 503 
 504 /*
 505  * This routine is called by the ICMP module when it gets some
 506  * sort of error condition.  If err < 0 then the socket should
 507  * be closed and the error returned to the user.  If err > 0
 508  * it's just the icmp type << 8 | icmp code.  After adjustment
 509  * header points to the first 8 bytes of the tcp header.  We need
 510  * to find the appropriate port.
 511  */
 512 
 513 void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
     /* [previous][next][first][last][top][bottom][index][help] */
 514         __u32 saddr, struct inet_protocol *protocol)
 515 {
 516         struct tcphdr *th = (struct tcphdr *)header;
 517         struct sock *sk;
 518         
 519         /*
 520          *      This one is _WRONG_. FIXME urgently.
 521          */
 522 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY     
 523         struct iphdr *iph=(struct iphdr *)(header-sizeof(struct iphdr));
 524 #endif  
 525         th =(struct tcphdr *)header;
 526         sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
 527 
 528         if (sk == NULL) 
 529                 return;
 530   
 531         if (type == ICMP_SOURCE_QUENCH) 
 532         {
 533                 /*
 534                  * FIXME:
 535                  * For now we will just trigger a linear backoff.
 536                  * The slow start code should cause a real backoff here.
 537                  */
 538                 if (sk->cong_window > 4)
 539                         sk->cong_window--;
 540                 return;
 541         }
 542         
 543         if (type == ICMP_PARAMETERPROB)
 544         {
 545                 sk->err=EPROTO;
 546                 sk->error_report(sk);
 547         }
 548 
 549 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
 550         if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
 551         {
 552                 struct rtable * rt;
 553                 /*
 554                  * Ugly trick to pass MTU to protocol layer.
 555                  * Really we should add argument "info" to error handler.
 556                  */
 557                 unsigned short new_mtu = ntohs(iph->id);
 558 
 559                 if ((rt = sk->ip_route_cache) != NULL)
 560                         if (rt->rt_mtu > new_mtu)
 561                                 rt->rt_mtu = new_mtu;
 562 
 563                 if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr)
 564                         && new_mtu > sizeof(struct iphdr)+sizeof(struct tcphdr))
 565                         sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
 566 
 567                 return;
 568         }
 569 #endif
 570 
 571         /*
 572          * If we've already connected we will keep trying
 573          * until we time out, or the user gives up.
 574          */
 575 
 576         if (code < 13)
 577         {       
 578                 if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
 579                 {
 580                         sk->err = icmp_err_convert[code].errno;
 581                         if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) 
 582                         {
 583                                 tcp_statistics.TcpAttemptFails++;
 584                                 tcp_set_state(sk,TCP_CLOSE);
 585                                 sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
 586                         }
 587                 }
 588                 else    /* Only an error on timeout */
 589                         sk->err_soft = icmp_err_convert[code].errno;
 590         }
 591 }
 592 
 593 
 594 /*
 595  *      Walk down the receive queue counting readable data until we hit the end or we find a gap
 596  *      in the received data queue (ie a frame missing that needs sending to us). Not
 597  *      sorting using two queues as data arrives makes life so much harder.
 598  */
 599 
static int tcp_readable(struct sock *sk)
{
        unsigned long counted;          /* Sequence number we have counted up to */
        unsigned long amount;           /* Readable bytes accumulated so far */
        struct sk_buff *skb;
        int sum;                        /* Readable bytes in the current skb */
        unsigned long flags;

        if(sk && sk->debug)
                printk("tcp_readable: %p - ",sk);

        /* The receive queue is appended to at interrupt time, so the
         * walk must run with interrupts disabled. */
        save_flags(flags);
        cli();
        if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
        {
                restore_flags(flags);
                if(sk && sk->debug) 
                        printk("empty\n");
                return(0);
        }
  
        counted = sk->copied_seq;       /* Where we are at the moment */
        amount = 0;
  
        /* 
         *      Do until a push or until we are out of data. 
         */
         
        do 
        {
                if (before(counted, skb->seq))          /* Found a hole so stops here */
                        break;
                sum = skb->len - (counted - skb->seq);  /* Length - header but start from where we are up to (avoid overlaps) */
                if (skb->h.th->syn)
                        sum++;                          /* SYN occupies a sequence number but no data byte */
                if (sum > 0) 
                {                                       /* Add it up, move on */
                        amount += sum;
                        if (skb->h.th->syn) 
                                amount--;               /* ...so don't report the SYN as readable */
                        counted += sum;
                }
                /*
                 * Don't count urg data ... but do it in the right place!
                 * Consider: "old_data (ptr is here) URG PUSH data"
                 * The old code would stop at the first push because
                 * it counted the urg (amount==1) and then does amount--
                 * *after* the loop.  This means tcp_readable() always
                 * returned zero if any URG PUSH was in the queue, even
                 * though there was normal data available. If we subtract
                 * the urg data right here, we even get it to work for more
                 * than one URG PUSH skb without normal data.
                 * This means that select() finally works now with urg data
                 * in the queue.  Note that rlogin was never affected
                 * because it doesn't use select(); it uses two processes
                 * and a blocking read().  And the queue scan in tcp_read()
                 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
                 */
                if (skb->h.th->urg)
                        amount--;       /* don't count urg data */
                if (amount && skb->h.th->psh) break;    /* PSH with data counted: stop here */
                skb = skb->next;
        }
        while(skb != (struct sk_buff *)&sk->receive_queue);

        restore_flags(flags);
        if(sk->debug)
                printk("got %lu bytes.\n",amount);
        return(amount);
}
 670 
 671 /*
 672  * LISTEN is a special case for select..
 673  */
 674 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
 675 {
 676         if (sel_type == SEL_IN) {
 677                 int retval;
 678 
 679                 lock_sock(sk);
 680                 retval = (tcp_find_established(sk) != NULL);
 681                 release_sock(sk);
 682                 if (!retval)
 683                         select_wait(&master_select_wakeup,wait);
 684                 return retval;
 685         }
 686         return 0;
 687 }
 688 
 689 
 690 /*
 691  *      Wait for a TCP event.
 692  *
 693  *      Note that we don't need to lock the socket, as the upper select layers
 694  *      take care of normal races (between the test and the event) and we don't
 695  *      go look at any of the socket buffers directly.
 696  */
 697 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
 698 {
 699         if (sk->state == TCP_LISTEN)
 700                 return tcp_listen_select(sk, sel_type, wait);
 701 
 702         switch(sel_type) {
 703         case SEL_IN:
 704                 if (sk->err)
 705                         return 1;
 706                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
 707                         break;
 708 
 709                 if (sk->shutdown & RCV_SHUTDOWN)
 710                         return 1;
 711                         
 712                 if (sk->acked_seq == sk->copied_seq)
 713                         break;
 714 
 715                 if (sk->urg_seq != sk->copied_seq ||
 716                     sk->acked_seq != sk->copied_seq+1 ||
 717                     sk->urginline || !sk->urg_data)
 718                         return 1;
 719                 break;
 720 
 721         case SEL_OUT:
 722                 if (sk->err)
 723                         return 1;
 724                 if (sk->shutdown & SEND_SHUTDOWN) 
 725                         return 0;
 726                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
 727                         break;
 728                 /*
 729                  * This is now right thanks to a small fix
 730                  * by Matt Dillon.
 731                  */
 732 
 733                 if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
 734                         break;
 735                 return 1;
 736 
 737         case SEL_EX:
 738                 if (sk->urg_data)
 739                         return 1;
 740                 break;
 741         }
 742         select_wait(sk->sleep, wait);
 743         return 0;
 744 }
 745 
 746 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
     /* [previous][next][first][last][top][bottom][index][help] */
 747 {
 748         int err;
 749         switch(cmd) 
 750         {
 751 
 752                 case TIOCINQ:
 753 #ifdef FIXME    /* FIXME: */
 754                 case FIONREAD:
 755 #endif
 756                 {
 757                         unsigned long amount;
 758 
 759                         if (sk->state == TCP_LISTEN) 
 760                                 return(-EINVAL);
 761 
 762                         lock_sock(sk);
 763                         amount = tcp_readable(sk);
 764                         release_sock(sk);
 765                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
 766                         if(err)
 767                                 return err;
 768                         put_user(amount, (int *)arg);
 769                         return(0);
 770                 }
 771                 case SIOCATMARK:
 772                 {
 773                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
 774 
 775                         err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
 776                         if (err)
 777                                 return err;
 778                         put_user(answ,(int *) arg);
 779                         return(0);
 780                 }
 781                 case TIOCOUTQ:
 782                 {
 783                         unsigned long amount;
 784 
 785                         if (sk->state == TCP_LISTEN) return(-EINVAL);
 786                         amount = sock_wspace(sk);
 787                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
 788                         if(err)
 789                                 return err;
 790                         put_user(amount, (int *)arg);
 791                         return(0);
 792                 }
 793                 default:
 794                         return(-EINVAL);
 795         }
 796 }
 797 
 798 
 799 /*
 800  *      This routine computes a TCP checksum. 
 801  *
 802  *      Modified January 1995 from a go-faster DOS routine by
 803  *      Jorge Cwik <jorge@laser.satlink.net>
 804  */
 805 #undef DEBUG_TCP_CHECK
void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
                unsigned long daddr, int len, struct sk_buff *skb)
{
#ifdef DEBUG_TCP_CHECK
        u16 check;
#endif
        /* Zero the checksum field first so it doesn't pollute the sum,
         * then fold the TCP header into skb->csum (which already holds
         * the partial checksum of the payload) and add the pseudo-header. */
        th->check = 0;
        th->check = tcp_check(th, len, saddr, daddr,
                csum_partial((char *)th,sizeof(*th),skb->csum));

#ifdef DEBUG_TCP_CHECK
        /* Debug-only cross-check: recompute the checksum over the whole
         * segment from scratch and compare with the incremental result. */
        check = th->check;
        th->check = 0;
        th->check = tcp_check(th, len, saddr, daddr,
                csum_partial((char *)th,len,0));
        if (check != th->check) {
                static int count = 0;
                /* Rate-limit the diagnostics to the first few mismatches. */
                if (++count < 10) {
                        printk("Checksum %x (%x) from %p\n", th->check, check,
                                (&th)[-1]);
                        printk("TCP=<off:%d a:%d s:%d f:%d>\n", th->doff*4, th->ack, th->syn, th->fin);
                }
        }
#endif
}
 831 
 832 
 833 /* 
 834  *      This routine builds a generic TCP header. 
 835  */
 836  
 837 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
     /* [previous][next][first][last][top][bottom][index][help] */
 838 {
 839 
 840         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
 841         th->seq = htonl(sk->write_seq);
 842         th->psh =(push == 0) ? 1 : 0;
 843         sk->ack_backlog = 0;
 844         sk->bytes_rcv = 0;
 845         sk->ack_timed = 0;
 846         th->ack_seq = htonl(sk->acked_seq);
 847         sk->window = tcp_select_window(sk);
 848         th->window = htons(sk->window);
 849 
 850         return(sizeof(*th));
 851 }
 852 
 853 /*
 854  *      Wait for a socket to get into the connected state
 855  */
static void wait_for_tcp_connect(struct sock * sk)
{
        /* Drop the socket lock before sleeping, or the softirq side could
         * never deliver the event that wakes us. */
        release_sock(sk);
        /* Interrupts off between the state test and the sleep, so the
         * wakeup cannot slip in between and be lost. */
        cli();                  
        if (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
        {
                interruptible_sleep_on(sk->sleep);      
        }
        sti();
        /* Re-acquire the lock; the caller re-checks the state itself. */
        lock_sock(sk);
}
 867 
 868 /*
 869  *      Wait for more memory for a socket
 870  */
static void wait_for_tcp_memory(struct sock * sk)
{
        /* Drop the socket lock first so incoming ACKs can free write
         * memory and wake us. */
        release_sock(sk);
        /* Interrupts off between the test and the sleep to avoid losing
         * the wakeup (classic sleep/wakeup race). */
        cli();
        /* Only sleep while the send buffer is more than half full, the
         * connection can still carry data, and no error is pending. */
        if (sk->wmem_alloc*2 > sk->sndbuf &&
            (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
                && sk->err == 0) 
        {
                sk->socket->flags &= ~SO_NOSPACE;
                interruptible_sleep_on(sk->sleep);
        }
        sti();
        lock_sock(sk);
}
 885 
 886 
 887 /*
 888  *      This routine copies from a user buffer into a socket,
 889  *      and starts the transmit system.
 890  */
 891 
static int do_tcp_sendmsg(struct sock *sk, struct msghdr *msg,
          int len, int nonblock, int flags)
{
        int copied = 0;         /* Bytes consumed from the user so far */
        int copy;               /* Bytes to place in the current segment */
        int tmp;
        int seglen;             /* Bytes left in the current iovec entry */
        int iovct=0;            /* Index into the iovec array */
        struct sk_buff *skb;
        struct sk_buff *send_tmp;       /* Non-NULL: queue as a partial packet */
        struct proto *prot;
        struct device *dev = NULL;
        unsigned char *from;    /* User-space read cursor */
        
        /*
         *      Ok commence sending
         */
        
        while(iovct<msg->msg_iovlen)
        {
                seglen=msg->msg_iov[iovct].iov_len;
                from=msg->msg_iov[iovct++].iov_base;
                prot = sk->prot;
                while(seglen > 0) 
                {
                        /*
                         * Stop on errors.  Report partial progress first;
                         * the error is left for the next call to pick up.
                         */
                        if (sk->err) 
                        {
                                if (copied) 
                                        return copied;
                                return sock_error(sk);
                        }

                        /*
                         *      Make sure that we are established. 
                         */
                        if (sk->shutdown & SEND_SHUTDOWN) 
                        {
                                if (copied)
                                        return copied;
                                return -EPIPE;
                        }

                        /* 
                         *      Wait for a connection to finish.
                         */
                        while (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
                        {
                                if (copied)
                                        return copied;

                                if (sk->err) 
                                        return sock_error(sk);
        
                                /* Not even connecting: the pipe is broken. */
                                if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
                                {
                                        if (sk->keepopen)
                                                send_sig(SIGPIPE, current, 0);
                                        return -EPIPE;
                                }
        
                                if (nonblock)
                                        return -EAGAIN;

                                if (current->signal & ~current->blocked)
                                        return -ERESTARTSYS;
        
                                wait_for_tcp_connect(sk);
                        }
        
                /*
                 * The following code can result in copy <= if sk->mss is ever
                 * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
                 * sk->mtu is constant once SYN processing is finished.  I.e. we
                 * had better not get here until we've seen his SYN and at least one
                 * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
                 * But ESTABLISHED should guarantee that.  sk->max_window is by definition
                 * non-decreasing.  Note that any ioctl to set user_mss must be done
                 * before the exchange of SYN's.  If the initial ack from the other
                 * end has a window of 0, max_window and thus mss will both be 0.
                 */
        
                /* 
                 *      Now we need to check if we have a half built packet. 
                 */
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
                /*
                 *      FIXME:  I'm almost sure that this fragment is BUG,
                 *              but it works... I do not know why 8) --ANK
                 *
                 *      Really, we should rebuild all the queues...
                 *      It's difficult. Temprorary hack is to send all
                 *      queued segments with allowed fragmentation.
                 */
                {
                        /* Path MTU shrank: flush the partial packet before
                         * adopting the smaller segment size. */
                        int new_mss = min(sk->mtu, sk->max_window);
                        if (new_mss < sk->mss)
                        {
                                tcp_send_partial(sk);
                                sk->mss = new_mss;
                        }
                }
#endif
        
                        /* Top up an existing half-built segment first. */
                        if ((skb = tcp_dequeue_partial(sk)) != NULL) 
                        {
                                int tcp_size;   /* Payload bytes already in the skb */

                                tcp_size = skb->tail - (unsigned char *)(skb->h.th + 1);
        
                                /* Add more stuff to the end of skb->len */
                                if (!(flags & MSG_OOB)) 
                                {
                                        copy = min(sk->mss - tcp_size, seglen);
                                        if (copy <= 0) 
                                        {
                                                printk("TCP: **bug**: \"copy\" <= 0\n");
                                                return -EFAULT;
                                        }
                                        tcp_size += copy;
                                        memcpy_fromfs(skb_put(skb,copy), from, copy);
                                        /* Recompute the payload checksum over the
                                         * whole (old + new) data. */
                                        skb->csum = csum_partial(skb->tail - tcp_size, tcp_size, 0);
                                        from += copy;
                                        copied += copy;
                                        len -= copy;
                                        sk->write_seq += copy;
                                        seglen -= copy;
                                }
                                /* Send now if full, urgent, or nothing is in
                                 * flight; otherwise keep it half-built. */
                                if (tcp_size >= sk->mss || (flags & MSG_OOB) || !sk->packets_out)
                                        tcp_send_skb(sk, skb);
                                else
                                        tcp_enqueue_partial(skb, sk);
                                continue;
                        }

                /*
                 * We also need to worry about the window.
                 * If window < 1/2 the maximum window we've seen from this
                 *   host, don't use it.  This is sender side
                 *   silly window prevention, as specified in RFC1122.
                 *   (Note that this is different than earlier versions of
                 *   SWS prevention, e.g. RFC813.).  What we actually do is 
                 *   use the whole MSS.  Since the results in the right
                 *   edge of the packet being outside the window, it will
                 *   be queued for later rather than sent.
                 */

                        copy = sk->window_seq - sk->write_seq;
                        if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
                                copy = sk->mss;
                        if (copy > seglen)
                                copy = seglen;

                /*
                 *      We should really check the window here also. 
                 */
                 
                        send_tmp = NULL;
                        /* Sub-MSS write with data in flight: allocate a
                         * full-sized buffer and queue it as a partial packet
                         * so later writes can fill it (Nagle-style). */
                        if (copy < sk->mss && !(flags & MSG_OOB) && sk->packets_out) 
                        {
                                skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
                                send_tmp = skb;
                        } 
                        else 
                        {
                                skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
                        }
        
                        /*
                         *      If we didn't get any memory, we need to sleep. 
                         */
        
                        if (skb == NULL) 
                        {
                                sk->socket->flags |= SO_NOSPACE;
                                if (nonblock) 
                                {
                                        if (copied) 
                                                return copied;
                                        return -EAGAIN;
                                }

                                if (current->signal & ~current->blocked)
                                {
                                        if (copied)
                                                return copied;
                                        return -ERESTARTSYS;
                                }

                                /* Block until ACKs free send memory, then retry. */
                                wait_for_tcp_memory(sk);
                                continue;
                        }

                        skb->sk = sk;
                        skb->free = 0;
                        skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
        
                        /*
                         * FIXME: we need to optimize this.
                         * Perhaps some hints here would be good.
                         */
                
                        /* Build the IP header and pick the output device. */
                        tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
                                 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
                        if (tmp < 0 ) 
                        {
                                sock_wfree(sk, skb);
                                if (copied) 
                                        return(copied);
                                return(tmp);
                        }
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
                        /* Don't-fragment, so path MTU discovery can work. */
                        skb->ip_hdr->frag_off |= htons(IP_DF);
#endif
                        skb->dev = dev;
                        skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
                        /* seglen-copy == 0 means this drains the iovec: set PSH. */
                        tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
                        if (tmp < 0) 
                        {
                                sock_wfree(sk, skb);
                                if (copied) 
                                        return(copied);
                                return(tmp);
                        }
        
                        if (flags & MSG_OOB) 
                        {
                                skb->h.th->urg = 1;
                                skb->h.th->urg_ptr = ntohs(copy);
                        }

                        /* Copy the user data in and checksum it in one pass. */
                        skb->csum = csum_partial_copy_fromuser(from,
                                skb_put(skb,copy), copy, 0);
                
                        from += copy;
                        copied += copy;
                        len -= copy;
                        seglen -= copy;
                        skb->free = 0;
                        sk->write_seq += copy;
                
                        /* Partial packet: hold it back for coalescing. */
                        if (send_tmp != NULL) 
                        {
                                tcp_enqueue_partial(send_tmp, sk);
                                continue;
                        }
                        tcp_send_skb(sk, skb);
                }
        }
        sk->err = 0;

        return copied;
}
1147 
1148 
1149 static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
     /* [previous][next][first][last][top][bottom][index][help] */
1150           int len, int nonblock, int flags)
1151 {
1152         int retval = -EINVAL;
1153 
1154         /*
1155          *      Do sanity checking for sendmsg/sendto/send
1156          */
1157          
1158         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1159                 goto out;
1160         if (msg->msg_name) {
1161                 struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
1162 
1163                 if (msg->msg_namelen < sizeof(*addr))
1164                         goto out;
1165                 if (addr->sin_family && addr->sin_family != AF_INET) 
1166                         goto out;
1167                 retval = -ENOTCONN;
1168                 if(sk->state == TCP_CLOSE)
1169                         goto out;
1170                 retval = -EISCONN;
1171                 if (addr->sin_port != sk->dummy_th.dest) 
1172                         goto out;
1173                 if (addr->sin_addr.s_addr != sk->daddr) 
1174                         goto out;
1175         }
1176 
1177         lock_sock(sk);
1178         retval = do_tcp_sendmsg(sk, msg, len, nonblock, flags);
1179 
1180 /*
1181  *      Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
1182  *      interactive fast network servers. It's meant to be on and
1183  *      it really improves the throughput though not the echo time
1184  *      on my slow slip link - Alan
1185  *
1186  *      If not nagling we can send on the before case too..
1187  */
1188 
1189         if (sk->partial) {
1190                 if (!sk->packets_out ||
1191                     (sk->nonagle && before(sk->write_seq , sk->window_seq))) {
1192                         tcp_send_partial(sk);
1193                 }
1194         }
1195 
1196         release_sock(sk);
1197 
1198 out:
1199         return retval;
1200 }
1201         
1202 
1203 /*
1204  *      Send an ack if one is backlogged at this point. Ought to merge
1205  *      this with tcp_send_ack().
1206  *      This is called for delayed acks also.
1207  */
1208  
void tcp_read_wakeup(struct sock *sk)
{
        int tmp;
        struct device *dev = NULL;
        struct tcphdr *t1;
        struct sk_buff *buff;

        /* Nothing owed: no ack to send. */
        if (!sk->ack_backlog) 
                return;

        /*
         * If we're closed, don't send an ack, or we'll get a RST
         * from the closed destination.
         */
        if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
                return; 

        /*
         * FIXME: we need to put code here to prevent this routine from
         * being called.  Being called once in a while is ok, so only check
         * if this is the second time in a row.
         */

        /*
         * We need to grab some memory, and put together an ack,
         * and then put it into the queue to be sent.
         */

        buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
        if (buff == NULL) 
        {
                /* Try again real soon. */
                tcp_reset_xmit_timer(sk, TIME_WRITE, HZ);
                return;
        }

        buff->sk = sk;
        buff->localroute = sk->localroute;
        buff->csum = 0;         /* Pure ack: no payload to checksum */
        
        /*
         *      Put in the IP header and routing stuff. 
         */

        tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
                               IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
        if (tmp < 0) 
        {
                /* No route: release the buffer and give up quietly. */
                buff->free = 1;
                sock_wfree(sk, buff);
                return;
        }

        t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

        /* Clone the template header, then fill in the live fields. */
        memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
        t1->seq = htonl(sk->sent_seq);

        /* The ack we are about to send pays off the backlog. */
        sk->ack_backlog = 0;
        sk->bytes_rcv = 0;

        sk->window = tcp_select_window(sk);
        t1->window = htons(sk->window);
        t1->ack_seq = htonl(sk->acked_seq);
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), buff);
        sk->prot->queue_xmit(sk, dev, buff, 1);
        tcp_statistics.TcpOutSegs++;
}
1278 
1279 
1280 /*
1281  *      Handle reading urgent data. BSD has very simple semantics for
1282  *      this, no blocking and very strange errors 8)
1283  */
1284  
static int tcp_recv_urg(struct sock * sk, int nonblock,
             struct msghdr *msg, int len, int flags, int *addr_len)
{
        /*
         *      No URG data to read: either it's delivered inline, never
         *      arrived, or has already been consumed.
         */
        if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
                return -EINVAL; /* Yes this is right ! */
                
        if (sk->err) 
                return sock_error(sk);
                
        if (sk->state == TCP_CLOSE || sk->done) 
        {
                /* First read after close returns EOF (0); subsequent
                 * reads report the dead connection. */
                if (!sk->done) 
                {
                        sk->done = 1;
                        return 0;
                }
                return -ENOTCONN;
        }

        if (sk->shutdown & RCV_SHUTDOWN) 
        {
                sk->done = 1;
                return 0;
        }
        lock_sock(sk);
        if (sk->urg_data & URG_VALID) 
        {
                /* The single OOB byte lives in the low bits of urg_data. */
                char c = sk->urg_data;
                if (!(flags & MSG_PEEK))
                        sk->urg_data = URG_READ;        /* Consume it */
                memcpy_toiovec(msg->msg_iov, &c, 1);
                if(msg->msg_name)
                {
                        struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
                        sin->sin_family=AF_INET;
                        sin->sin_addr.s_addr=sk->daddr;
                        sin->sin_port=sk->dummy_th.dest;
                }
                if(addr_len)
                        *addr_len=sizeof(struct sockaddr_in);
                release_sock(sk);
                return 1;
        }
        release_sock(sk);
        
        /*
         * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
         * the available implementations agree in this case:
         * this call should never block, independent of the
         * blocking state of the socket.
         * Mike <pall@rz.uni-karlsruhe.de>
         */
        return -EAGAIN;
}
1342 
1343 /*
1344  *      Release a skb if it is no longer needed. This routine
1345  *      must be called with interrupts disabled or with the
1346  *      socket locked so that the sk_buff queue operation is ok.
1347  */
1348  
1349 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
     /* [previous][next][first][last][top][bottom][index][help] */
1350 {
1351         sk->ack_backlog++;
1352         skb->sk = sk;
1353         __skb_unlink(skb, &sk->receive_queue);
1354         kfree_skb(skb, FREE_READ);
1355 }
1356 
1357 /*
1358  *      FIXME:
1359  *      This routine frees used buffers.
1360  *      It should consider sending an ACK to let the
1361  *      other end know we now have a bigger window.
1362  */
1363 
1364 static void cleanup_rbuf(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1365 {
1366         struct sk_buff *skb;
1367         unsigned long rspace;
1368 
1369         /*
1370          * NOTE! The socket must be locked, so that we don't get
1371          * a messed-up receive queue.
1372          */
1373         while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
1374                 if (!skb->used || skb->users)
1375                         break;
1376                 tcp_eat_skb(sk, skb);
1377         }
1378 
1379         /*
1380          *      FIXME:
1381          *      At this point we should send an ack if the difference
1382          *      in the window, and the amount of space is bigger than
1383          *      TCP_WINDOW_DIFF.
1384          */
1385 
1386         rspace=sock_rspace(sk);
1387         if(sk->debug)
1388                 printk("sk->rspace = %lu\n", rspace);
1389         /*
1390          * This area has caused the most trouble.  The current strategy
1391          * is to simply do nothing if the other end has room to send at
1392          * least 3 full packets, because the ack from those will auto-
1393          * matically update the window.  If the other end doesn't think
1394          * we have much space left, but we have room for at least 1 more
1395          * complete packet than it thinks we do, we will send an ack
1396          * immediately.  Otherwise we will wait up to .5 seconds in case
1397          * the user reads some more.
1398          */
1399 
1400         /*
1401          * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
1402          * if the other end is offering a window smaller than the agreed on MSS
1403          * (called sk->mtu here).  In theory there's no connection between send
1404          * and receive, and so no reason to think that they're going to send
1405          * small packets.  For the moment I'm using the hack of reducing the mss
1406          * only on the send side, so I'm putting mtu here.
1407          */
1408 
1409         if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
1410         {
1411                 /* Send an ack right now. */
1412                 tcp_read_wakeup(sk);
1413         } 
1414         else 
1415         {
1416                 /* Force it to send an ack soon. */
1417                 int was_active = del_timer(&sk->retransmit_timer);
1418                 if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires) 
1419                 {
1420                         tcp_reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
1421                 } 
1422                 else
1423                         add_timer(&sk->retransmit_timer);
1424         }
1425 } 
1426 
1427 
1428 /*
1429  *      This routine copies from a sock struct into the user buffer. 
1430  */
1431  
/*
 *	Copy received TCP data from the socket's receive queue into the
 *	user's iovec.  Returns the number of bytes copied, 0 at EOF, or a
 *	negative errno.  Blocks unless `nonblock` is set; MSG_PEEK leaves
 *	the data (and sk->copied_seq) untouched; MSG_OOB is diverted to
 *	tcp_recv_urg().
 */
static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
	int len, int nonblock, int flags, int *addr_len)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;		/* bytes handed to the user so far; doubles as return value */
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/* 
	 *	This error should be checked: can't read from a listener. 
	 */
	 
	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially. 
	 */
	 
	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be 
	 *	inline and thus not flush cached variables otherwise).
	 *	A PEEK advances a private copy, so copied_seq is untouched.
	 */
	 
	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	lock_sock(sk);
	while (len > 0) 
	{
		struct sk_buff * skb;
		u32 offset;
	
		/*
		 * Are we at urgent data? Stop if we have read anything.
		 */
		 
		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.  The task state is set *before*
		 *	scanning the queue so a wakeup arriving between the
		 *	scan and schedule() below is not lost.
		 */
		 
		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do 
		{
			if (!skb)
				break;
			/* Hole in sequence space: nothing contiguous yet. */
			if (before(*seq, skb->seq))
				break;
			offset = *seq - skb->seq;
			if (skb->h.th->syn)
				offset--;	/* the SYN occupies one sequence number */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed; cleanup_rbuf() may free it */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* No more contiguous data: return whatever we already have. */
		if (copied)
			break;

		if (sk->err) 
		{
			copied = sock_error(sk);
			break;
		}

		if (sk->state == TCP_CLOSE) 
		{
			/* First read on a closed socket reports EOF (0);
			   subsequent reads report -ENOTCONN. */
			if (!sk->done) 
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN) 
		{
			sk->done = 1;
			break;
		}
			
		if (nonblock) 
		{
			copied = -EAGAIN;
			break;
		}

		/* Push out any owed ACK, then sleep until data arrives. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		lock_sock(sk);

		if (current->signal & ~current->blocked) 
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are 
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */
		
		skb->users++;
		
		/*
		 *	Ok so how much can we use ? 
		 */
		 
		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here? 
		 */
		
		if (sk->urg_data) 
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used) 
			{
				if (!urg_offset) 
				{
					/* Urgent byte is next: skip over it
					   unless OOB data is delivered inline. */
					if (!sk->urginline) 
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					/* Read only up to (not including) the urgent byte. */
					used = urg_offset;
			}
		}
		
		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */
		 
		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */
		 
		memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		
		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */
		 
		skb->users --;
		
		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		if (used + offset < skb->len)
			continue;
		
		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		if (!skb->users)
			tcp_eat_skb(sk, skb);		
		continue;

	found_fin_ok:
		++*seq;		/* the FIN consumes one sequence number */
		if (flags & MSG_PEEK)
			break;
			
		/*
		 *	All is done: no more data will arrive.
		 */
		 
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	
	/* Report the peer's address if the caller supplied a name buffer. */
	if(copied>0 && msg->msg_name)
	{
		struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
		sin->sin_family=AF_INET;
		sin->sin_addr.s_addr=sk->daddr;
		sin->sin_port=sk->dummy_th.dest;
	}
	if(addr_len)
		*addr_len=sizeof(struct sockaddr_in);
		
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
1669 
1670 
1671 
1672 /*
1673  *      State processing on a close. This implements the state shift for
1674  *      sending our FIN frame. Note that we only send a FIN for some 
1675  *      states. A shutdown() may have already sent the FIN, or we may be
1676  *      closed.
1677  */
1678  
1679 static int tcp_close_state(struct sock *sk, int dead)
     /* [previous][next][first][last][top][bottom][index][help] */
1680 {
1681         int ns=TCP_CLOSE;
1682         int send_fin=0;
1683         switch(sk->state)
1684         {
1685                 case TCP_SYN_SENT:      /* No SYN back, no FIN needed */
1686                         break;
1687                 case TCP_SYN_RECV:
1688                 case TCP_ESTABLISHED:   /* Closedown begin */
1689                         ns=TCP_FIN_WAIT1;
1690                         send_fin=1;
1691                         break;
1692                 case TCP_FIN_WAIT1:     /* Already closing, or FIN sent: no change */
1693                 case TCP_FIN_WAIT2:
1694                 case TCP_CLOSING:
1695                         ns=sk->state;
1696                         break;
1697                 case TCP_CLOSE:
1698                 case TCP_LISTEN:
1699                         break;
1700                 case TCP_CLOSE_WAIT:    /* They have FIN'd us. We send our FIN and
1701                                            wait only for the ACK */
1702                         ns=TCP_LAST_ACK;
1703                         send_fin=1;
1704         }
1705         
1706         tcp_set_state(sk,ns);
1707                 
1708         /*
1709          *      This is a (useful) BSD violating of the RFC. There is a
1710          *      problem with TCP as specified in that the other end could
1711          *      keep a socket open forever with no application left this end.
1712          *      We use a 3 minute timeout (about the same as BSD) then kill
1713          *      our end. If they send after that then tough - BUT: long enough
1714          *      that we won't make the old 4*rto = almost no time - whoops
1715          *      reset mistake.
1716          */
1717         if(dead && ns==TCP_FIN_WAIT2)
1718         {
1719                 int timer_active=del_timer(&sk->timer);
1720                 if(timer_active)
1721                         add_timer(&sk->timer);
1722                 else
1723                         tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
1724         }
1725         
1726         return send_fin;
1727 }
1728 
1729 /*
1730  *      Shutdown the sending side of a connection. Much like close except
1731  *      that we don't receive shut down or set sk->dead=1.
1732  */
1733 
1734 void tcp_shutdown(struct sock *sk, int how)
     /* [previous][next][first][last][top][bottom][index][help] */
1735 {
1736         /*
1737          *      We need to grab some memory, and put together a FIN,
1738          *      and then put it into the queue to be sent.
1739          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1740          */
1741 
1742         if (!(how & SEND_SHUTDOWN)) 
1743                 return;
1744          
1745         /*
1746          *      If we've already sent a FIN, or it's a closed state
1747          */
1748          
1749         if (sk->state == TCP_FIN_WAIT1 ||
1750             sk->state == TCP_FIN_WAIT2 ||
1751             sk->state == TCP_CLOSING ||
1752             sk->state == TCP_LAST_ACK ||
1753             sk->state == TCP_TIME_WAIT || 
1754             sk->state == TCP_CLOSE ||
1755             sk->state == TCP_LISTEN
1756           )
1757         {
1758                 return;
1759         }
1760         lock_sock(sk);
1761 
1762         /*
1763          * flag that the sender has shutdown
1764          */
1765 
1766         sk->shutdown |= SEND_SHUTDOWN;
1767 
1768         /*
1769          *  Clear out any half completed packets. 
1770          */
1771 
1772         if (sk->partial)
1773                 tcp_send_partial(sk);
1774                 
1775         /*
1776          *      FIN if needed
1777          */
1778          
1779         if (tcp_close_state(sk,0))
1780                 tcp_send_fin(sk);
1781                 
1782         release_sock(sk);
1783 }
1784 
1785 
1786 /*
1787  *      Return 1 if we still have things to send in our buffers.
1788  */
1789  
1790 static inline int closing(struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1791 {
1792         switch (sk->state) {
1793                 case TCP_FIN_WAIT1:
1794                 case TCP_CLOSING:
1795                 case TCP_LAST_ACK:
1796                         return 1;
1797         }
1798         return 0;
1799 }
1800 
1801 
/*
 *	Close a TCP socket.  `timeout`, when non-zero, is an absolute
 *	jiffies value: we wait (interruptibly) until our FIN is ACKed or
 *	that deadline passes before marking the socket dead.
 */
static void tcp_close(struct sock *sk, unsigned long timeout)
{
	struct sk_buff *skb;

	/*
	 * We need to grab some memory, and put together a FIN, 
	 * and then put it into the queue to be sent.
	 */
	
	lock_sock(sk);
	
	tcp_cache_zap();	/* presumably drops the demux cache entry - confirm */
	if(sk->state == TCP_LISTEN)
	{
		/* Special case: a listener just drops its pending connections. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}
	
	/* NOTE(review): keepopen presumably keeps the timers ticking so the
	   dead socket is eventually destroyed - confirm. */
	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;	/* both directions now shut */

	if (!sk->dead) 
		sk->state_change(sk);	/* wake anyone sleeping on this socket */

	/*
	 *  We need to flush the recv. buffs.  We do this only on the
	 *  descriptor close, not protocol-sourced closes, because the
	 *  reader process may not have drained the data yet!
	 */
		 
	while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
		kfree_skb(skb, FREE_READ);

	/*
	 *	Get rid off any half-completed packets. 
	 */

	if (sk->partial) 
		tcp_send_partial(sk);
		
	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */
	 
	if (tcp_close_state(sk,1)==1)
	{
		tcp_send_fin(sk);
	}

	if (timeout) {
		cli();		/* guard the sleep/wakeup race on sk->sleep */
		release_sock(sk);
		current->timeout = timeout;
		/* Sleep until the FIN is ACKed, the deadline passes, or a
		   signal arrives. */
		while(closing(sk) && current->timeout)
		{
			interruptible_sleep_on(sk->sleep);
			if (current->signal & ~current->blocked) 
			{
				break;
			}
		}
		current->timeout=0;
		lock_sock(sk);
		sti();
	}

	/*
	 * This will destroy it. The timers will take care of actually
	 * free'ing up the memory.
	 */
	sk->dead = 1;
	tcp_cache_zap();	/* Kill the cache again. */
	release_sock(sk);
}
1880 
1881 
1882 /*
1883  *      This will accept the next outstanding connection. 
1884  */
1885  
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 * We need to make sure that this socket is listening,
	 * and that it has something pending.  Errors are reported
	 * through sk->err with a NULL return, not as return codes.
	 */

	if (sk->state != TCP_LISTEN) 
	{
		sk->err = EINVAL;
		return(NULL); 
	}

	/* Avoid the race: with interrupts off, a connection completing
	   between the dequeue attempt and the sleep cannot be missed. */
	cli();
	lock_sock(sk);

	while((skb = tcp_dequeue_established(sk)) == NULL) 
	{
		if (flags & O_NONBLOCK) 
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Drop the lock while we sleep waiting for a connection. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked) 
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		lock_sock(sk);
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk: the skb that queued
	 *	the established connection carries the new socket. 
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
1939 
1940 /*
1941  *      This will initiate an outgoing connection. 
1942  */
1943  
/*
 *	Initiate an outgoing connection: validate the address, pick the
 *	initial sequence number and MSS, build and transmit the SYN, and
 *	move the socket to SYN_SENT.  Returns 0 or a negative errno.
 */
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE) 
		return(-EISCONN);

	/*
	 *	Don't allow a double connect.
	 */
		
	if(sk->daddr)
		return -EINVAL;
	
	/* 8 bytes covers sin_family + sin_port + sin_addr - all we read. */
	if (addr_len < 8) 
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET) 
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */
	
	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();
		  
	/*
	 *	Don't want a TCP connection going to a broadcast address 
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
		return -ENETUNREACH;
  
	/* Set up the connection identity and initial sequence numbers. */
	lock_sock(sk);
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;	/* nothing ACKed yet */
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	release_sock(sk);

	/* Socket is unlocked across this allocation, which may sleep. */
	buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL) 
	{
		return(-ENOMEM);
	}
	lock_sock(sk);
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;
	

	/*
	 *	Put in the IP header and routing stuff.
	 */
	 
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0) 
	{
		sock_wfree(sk, buff);
		release_sock(sk);
		return(-ENETUNREACH);
	}
	/* Not bound to a local address yet: adopt the route's source. */
	if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
		sk->saddr = rt->rt_src;
	sk->rcv_saddr = sk->saddr;

	t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));

	/* Build the SYN from the template header. */
	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	buff->seq = sk->write_seq++;
	t1->seq = htonl(buff->seq);
	sk->sent_seq = sk->write_seq;
	buff->end_seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->syn = 1;
	t1->doff = 6;	/* 24 bytes: header plus the 4-byte MSS option below */
	/* use 512 or whatever user asked for */
	
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* MSS choice: user override, else route MTU, else 576 default. */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if (rt)
		sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else 
		sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 *	but not bigger than device MTU 
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */
		
	sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

#ifdef CONFIG_SKIP
	
	/*
	 *	SKIP devices set their MTU to 65535. This is so they can take packets
	 *	unfragmented to security process then fragment. They could lie to the
	 *	TCP layer about a suitable MTU, but its easier to let skip sort it out
	 *	simply because the final package we want unfragmented is going to be
	 *
	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */
	 
	if(skip_pick_mtu!=NULL)		/* If SKIP is loaded.. */
		sk->mtu=skip_pick_mtu(sk->mtu,dev);
#endif
	
	/*
	 *	Put in the TCP options to say MTU. 
	 */

	ptr = skb_put(buff,4);
	ptr[0] = 2;			/* option kind: MSS */
	ptr[1] = 4;			/* option length */
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	buff->csum = csum_partial(ptr, 4, 0);
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, buff);

	/*
	 *	This must go first otherwise a really quick response will get reset. 
	 */

	tcp_cache_zap();
	tcp_set_state(sk,TCP_SYN_SENT);
	/* Initial RTO from the route if it carries one, else the default. */
	if(rt&&rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->retransmit_timer.function=&tcp_retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer  */
	sk->retransmits = 0;				/* Now works the right way instead of a hacked 
											initial setting */

	sk->prot->queue_xmit(sk, dev, buff, 0);  
	/* NOTE(review): the xmit timer was already reset just above; this
	   second reset looks redundant - confirm before removing. */
	tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;
  
	release_sock(sk);
	return(0);
}
2106 
2107 /*
2108  *      Socket option code for TCP. 
2109  */
2110   
2111 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
2112 {
2113         int val,err;
2114 
2115         if(level!=SOL_TCP)
2116                 return ip_setsockopt(sk,level,optname,optval,optlen);
2117 
2118         if (optval == NULL) 
2119                 return(-EINVAL);
2120 
2121         err=verify_area(VERIFY_READ, optval, sizeof(int));
2122         if(err)
2123                 return err;
2124         
2125         val = get_user((int *)optval);
2126 
2127         switch(optname)
2128         {
2129                 case TCP_MAXSEG:
2130 /*
2131  * values greater than interface MTU won't take effect.  however at
2132  * the point when this call is done we typically don't yet know
2133  * which interface is going to be used
2134  */
2135                         if(val<1||val>MAX_WINDOW)
2136                                 return -EINVAL;
2137                         sk->user_mss=val;
2138                         return 0;
2139                 case TCP_NODELAY:
2140                         sk->nonagle=(val==0)?0:1;
2141                         return 0;
2142                 default:
2143                         return(-ENOPROTOOPT);
2144         }
2145 }
2146 
2147 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
2148 {
2149         int val,err;
2150 
2151         if(level!=SOL_TCP)
2152                 return ip_getsockopt(sk,level,optname,optval,optlen);
2153                         
2154         switch(optname)
2155         {
2156                 case TCP_MAXSEG:
2157                         val=sk->user_mss;
2158                         break;
2159                 case TCP_NODELAY:
2160                         val=sk->nonagle;
2161                         break;
2162                 default:
2163                         return(-ENOPROTOOPT);
2164         }
2165         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
2166         if(err)
2167                 return err;
2168         put_user(sizeof(int),(int *) optlen);
2169 
2170         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
2171         if(err)
2172                 return err;
2173         put_user(val,(int *)optval);
2174 
2175         return(0);
2176 }       
2177 
2178 
/*
 *	TCP's protocol operations vector, hooked into the AF_INET layer.
 *	NOTE(review): the slot labels below are inferred from the callback
 *	names; initializer order must match the struct proto declaration
 *	(not visible here) - verify before relying on them.
 */
struct proto tcp_prot = {
	tcp_close,		/* close */
	ip_build_header,	/* build_header */
	tcp_connect,		/* connect */
	tcp_accept,		/* accept */
	ip_queue_xmit,		/* queue_xmit */
	tcp_retransmit,		/* retransmit */
	tcp_write_wakeup,	/* write_wakeup */
	tcp_read_wakeup,	/* read_wakeup */
	tcp_rcv,		/* rcv */
	tcp_select,		/* select */
	tcp_ioctl,		/* ioctl */
	NULL,			/* init - none needed */
	tcp_shutdown,		/* shutdown */
	tcp_setsockopt,		/* setsockopt */
	tcp_getsockopt,		/* getsockopt */
	tcp_sendmsg,		/* sendmsg */
	tcp_recvmsg,		/* recvmsg */
	NULL,			/* No special bind() */
	128,			/* presumably max header space - confirm */
	0,
	"TCP",			/* protocol name */
	0, 0,
	{NULL,}
};

/* [previous][next][first][last][top][bottom][index][help] */