root/net/ipv4/tcp.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. tcp_cache_zap
  2. min
  3. tcp_set_state
  4. tcp_select_window
  5. tcp_raise_window
  6. tcp_find_established
  7. tcp_dequeue_established
  8. tcp_close_pending
  9. tcp_time_wait
  10. tcp_do_retransmit
  11. reset_xmit_timer
  12. tcp_retransmit_time
  13. tcp_retransmit
  14. tcp_write_timeout
  15. retransmit_timer
  16. tcp_err
  17. tcp_readable
  18. tcp_listen_select
  19. tcp_select
  20. tcp_ioctl
  21. tcp_check
  22. tcp_send_check
  23. tcp_send_skb
  24. tcp_dequeue_partial
  25. tcp_send_partial
  26. tcp_enqueue_partial
  27. tcp_send_ack
  28. tcp_build_header
  29. tcp_sendmsg
  30. tcp_read_wakeup
  31. cleanup_rbuf
  32. tcp_recv_urg
  33. tcp_recvmsg
  34. tcp_close_state
  35. tcp_send_fin
  36. tcp_shutdown
  37. tcp_reset
  38. tcp_options
  39. default_mask
  40. tcp_init_seq
  41. tcp_conn_request
  42. tcp_close
  43. tcp_write_xmit
  44. tcp_ack
  45. tcp_fin
  46. tcp_data
  47. tcp_check_urg
  48. tcp_urg
  49. tcp_accept
  50. tcp_connect
  51. bad_tcp_sequence
  52. tcp_sequence
  53. tcp_std_reset
  54. get_tcp_sock
  55. tcp_rcv
  56. tcp_write_wakeup
  57. tcp_send_probe0
  58. tcp_setsockopt
  59. tcp_getsockopt

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:       
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect 
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed where wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It 
  33  *                                      wakes people on errors. select 
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_reset() fixed to work for 
  37  *                                      everything not just packets for 
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had 
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames. 
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst 
  46  *                                      receive otherwise odd bits of prattle 
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug. 
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list 
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential 
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the 
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries. 
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks, 
  69  *                                      so the kernel can layer network 
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised 
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer 
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing 
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties remove from the
 100  *                                      TCP code (Be very nice to this man if 
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics. 
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle select() after URG properly in 
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg() 
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in 
 110  *                                      tcp_readable(), select() after URG 
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the 
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to 
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in selecting before an 
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since 
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on 
 137  *                                      the RFC's for other useful protocol 
 138  *                                      references see: Comer, KA9Q NOS, and 
 139  *                                      for a reference on the difference 
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC 
 147  *                                      and using multiple timers for sanity. 
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       Select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and 
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if stat is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but its a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *              Alan Cox        :       Block double connect().
 180  *              Alan Cox        :       Small hooks for enSKIP.
 181  *              Alexey Kuznetsov:       Path MTU discovery.
 182  *              Alan Cox        :       Support soft errors.
  183  *              Alan Cox        :       Fix MTU discovery pathological case
 184  *                                      when the remote claims no mtu!
 185  *              Marc Tamsky     :       TCP_CLOSE fix.
 186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in 
 187  *                                      window but wrong (fixes NT lpd problems)
 188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189  *              Joerg Reuter    :       No modification of locked buffers in 
 190  *                                      tcp_do_retransmit()
 191  *
 192  * To Fix:
 193  *              Fast path the code. Two things here - fix the window calculation
 194  *              so it doesn't iterate over the queue, also spot packets with no funny
 195  *              options arriving in order and process directly.
 196  *
 197  *              Rewrite output state machine to use a single queue.
 198  *              Speed up input assembly algorithm.
 199  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 200  *              could do with it working on IPv4
 201  *              User settable/learned rtt/max window/mtu
 202  *
 203  *              Change the fundamental structure to a single send queue maintained
 204  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 205  *              active routes too]). Cut the queue off in tcp_retransmit/
 206  *              tcp_transmit.
 207  *              Change the receive queue to assemble as it goes. This lets us
 208  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 209  *              tcp_data/tcp_read as well as the window shrink crud.
 210  *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 211  *              tcp_queue_skb seem obvious routines to extract.
 212  *      
 213  *              This program is free software; you can redistribute it and/or
 214  *              modify it under the terms of the GNU General Public License
 215  *              as published by the Free Software Foundation; either version
 216  *              2 of the License, or(at your option) any later version.
 217  *
 218  * Description of States:
 219  *
 220  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 221  *
 222  *      TCP_SYN_RECV            received a connection request, sent ack,
 223  *                              waiting for final ack in three-way handshake.
 224  *
 225  *      TCP_ESTABLISHED         connection established
 226  *
 227  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 228  *                              transmission of remaining buffered data
 229  *
 230  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 231  *                              to shutdown
 232  *
 233  *      TCP_CLOSING             both sides have shutdown but we still have
 234  *                              data we have to finish sending
 235  *
 236  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 237  *                              closed, can only be entered from FIN_WAIT2
 238  *                              or CLOSING.  Required because the other end
 239  *                              may not have gotten our last ACK causing it
 240  *                              to retransmit the data packet (which we ignore)
 241  *
 242  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 243  *                              us to finish writing our data and to shutdown
 244  *                              (we have to close() to move on to LAST_ACK)
 245  *
  246  *      TCP_LAST_ACK            our side has shutdown after remote has
 247  *                              shutdown.  There may still be data in our
 248  *                              buffer that we have to finish sending
 249  *              
 250  *      TCP_CLOSE               socket is finished
 251  */
 252 
 253 /*
 254  * RFC1122 status:
 255  * NOTE: I'm not going to be doing comments in the code for this one except
 256  * for violations and the like.  tcp.c is just too big... If I say something
 257  * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 258  * with Alan. -- MS 950903
 259  * 
 260  * Use of PSH (4.2.2.2)
 261  *   MAY aggregate data sent without the PSH flag. (does)
 262  *   MAY queue data received without the PSH flag. (does)
 263  *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 264  *   MAY implement PSH on send calls. (doesn't, thus:)
 265  *     MUST NOT buffer data indefinitely (doesn't [1 second])
 266  *     MUST set PSH on last segment (does)
 267  *   MAY pass received PSH to application layer (doesn't)
 268  *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 269  * 
 270  * Window Size (4.2.2.3, 4.2.2.16)
 271  *   MUST treat window size as an unsigned number (does)
 272  *   SHOULD treat window size as a 32-bit number (does not)
 273  *   MUST NOT shrink window once it is offered (does not normally)
 274  *   
 275  * Urgent Pointer (4.2.2.4)
 276  * **MUST point urgent pointer to last byte of urgent data (not right
 277  *     after). (doesn't, to be like BSD)
 278  *   MUST inform application layer asynchronously of incoming urgent
 279  *     data. (does)
 280  *   MUST provide application with means of determining the amount of
 281  *     urgent data pending. (does)
 282  * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 283  *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 284  *      [Follows BSD 1 byte of urgent data]
 285  * 
 286  * TCP Options (4.2.2.5)
 287  *   MUST be able to receive TCP options in any segment. (does)
 288  *   MUST ignore unsupported options (does)
 289  *   
 290  * Maximum Segment Size Option (4.2.2.6)
 291  *   MUST implement both sending and receiving MSS. (does)
 292  *   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 293  *     it always). (does, even when MSS == 536, which is legal)
 294  *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 295  *   MUST calculate "effective send MSS" correctly:
 296  *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 297  *     (does - but allows operator override)
 298  *  
 299  * TCP Checksum (4.2.2.7)
 300  *   MUST generate and check TCP checksum. (does)
 301  * 
 302  * Initial Sequence Number Selection (4.2.2.8)
 303  *   MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 304  *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 305  *     necessary for 10Mbps networks - and harder than BSD to spoof!)
 306  * 
 307  * Simultaneous Open Attempts (4.2.2.10)
 308  *   MUST support simultaneous open attempts (does)
 309  * 
 310  * Recovery from Old Duplicate SYN (4.2.2.11)
 311  *   MUST keep track of active vs. passive open (does)
 312  * 
 313  * RST segment (4.2.2.12)
 314  *   SHOULD allow an RST segment to contain data (does, but doesn't do
 315  *     anything with it, which is standard)
 316  * 
 317  * Closing a Connection (4.2.2.13)
  318  *   MUST inform application of whether connection was closed by RST or
 319  *     normal close. (does)
 320  *   MAY allow "half-duplex" close (treat connection as closed for the
 321  *     local app, even before handshake is done). (does)
 322  *   MUST linger in TIME_WAIT for 2 * MSL (does)
 323  * 
 324  * Retransmission Timeout (4.2.2.15)
 325  *   MUST implement Jacobson's slow start and congestion avoidance
 326  *     stuff. (does) 
 327  * 
 328  * Probing Zero Windows (4.2.2.17)
 329  *   MUST support probing of zero windows. (does)
 330  *   MAY keep offered window closed indefinitely. (does)
 331  *   MUST allow remote window to stay closed indefinitely. (does)
 332  * 
 333  * Passive Open Calls (4.2.2.18)
 334  *   MUST NOT let new passive open affect other connections. (doesn't)
 335  *   MUST support passive opens (LISTENs) concurrently. (does)
 336  *   
 337  * Time to Live (4.2.2.19)
 338  *   MUST make TCP TTL configurable. (does - IP_TTL option)
 339  * 
 340  * Event Processing (4.2.2.20)
 341  *   SHOULD queue out-of-order segments. (does)
 342  *   MUST aggregate ACK segments whenever possible. (does but badly)
 343  *   
 344  * Retransmission Timeout Calculation (4.2.3.1)
 345  *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 346  *     calculation. (does, or at least explains them in the comments 8*b)
 347  *  SHOULD initialize RTO to 0 and RTT to 3. (does)
 348  * 
 349  * When to Send an ACK Segment (4.2.3.2)
 350  *   SHOULD implement delayed ACK. (does)
 351  *   MUST keep ACK delay < 0.5 sec. (does)
 352  * 
 353  * When to Send a Window Update (4.2.3.3)
 354  *   MUST implement receiver-side SWS. (does)
 355  *   
 356  * When to Send Data (4.2.3.4)
 357  *   MUST implement sender-side SWS. (does)
 358  *   SHOULD implement Nagle algorithm. (does)
 359  * 
 360  * TCP Connection Failures (4.2.3.5)
 361  *  MUST handle excessive retransmissions "properly" (see the RFC). (does)
 362  *   SHOULD inform application layer of soft errors. (does)
 363  *   
 364  * TCP Keep-Alives (4.2.3.6)
 365  *   MAY provide keep-alives. (does)
 366  *   MUST make keep-alives configurable on a per-connection basis. (does)
 367  *   MUST default to no keep-alives. (does)
 368  * **MUST make keep-alive interval configurable. (doesn't)
 369  * **MUST make default keep-alive interval > 2 hours. (doesn't)
 370  *   MUST NOT interpret failure to ACK keep-alive packet as dead
 371  *     connection. (doesn't)
 372  *   SHOULD send keep-alive with no data. (does)
 373  * 
 374  * TCP Multihoming (4.2.3.7)
 375  *   MUST get source address from IP layer before sending first
 376  *     SYN. (does)
 377  *   MUST use same local address for all segments of a connection. (does)
 378  * 
 379  * IP Options (4.2.3.8)
 380  *   MUST ignore unsupported IP options. (does)
 381  *   MAY support Time Stamp and Record Route. (does)
 382  *   MUST allow application to specify a source route. (does)
  383  *   MUST allow received Source Route option to set route for all future
 384  *     segments on this connection. (does not (security issues))
 385  * 
 386  * ICMP messages (4.2.3.9)
 387  *   MUST act on ICMP errors. (does)
 388  *   MUST slow transmission upon receipt of a Source Quench. (does)
 389  *   MUST NOT abort connection upon receipt of soft Destination
 390  *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 391  *     Problems. (doesn't)
 392  *   SHOULD report soft Destination Unreachables etc. to the
 393  *     application. (does)
 394  *   SHOULD abort connection upon receipt of hard Destination Unreachable
 395  *     messages (2, 3, 4). (does)
 396  * 
 397  * Remote Address Validation (4.2.3.10)
 398  *   MUST reject as an error OPEN for invalid remote IP address. (does)
 399  *   MUST ignore SYN with invalid source address. (does)
 400  *   MUST silently discard incoming SYN for broadcast/multicast
 401  *     address. (does) 
 402  * 
 403  * Asynchronous Reports (4.2.4.1)
 404  * MUST provide mechanism for reporting soft errors to application
 405  *     layer. (does)
 406  * 
 407  * Type of Service (4.2.4.2)
 408  *   MUST allow application layer to set Type of Service. (does IP_TOS)
 409  * 
 410  * (Whew. -- MS 950903)
 411  **/
 412 
 413 #include <linux/types.h>
 414 #include <linux/sched.h>
 415 #include <linux/mm.h>
 416 #include <linux/time.h>
 417 #include <linux/string.h>
 418 #include <linux/config.h>
 419 #include <linux/socket.h>
 420 #include <linux/sockios.h>
 421 #include <linux/termios.h>
 422 #include <linux/in.h>
 423 #include <linux/fcntl.h>
 424 #include <linux/inet.h>
 425 #include <linux/netdevice.h>
 426 #include <net/snmp.h>
 427 #include <net/ip.h>
 428 #include <net/protocol.h>
 429 #include <net/icmp.h>
 430 #include <net/tcp.h>
 431 #include <net/arp.h>
 432 #include <linux/skbuff.h>
 433 #include <net/sock.h>
 434 #include <net/route.h>
 435 #include <linux/errno.h>
 436 #include <linux/timer.h>
 437 #include <asm/system.h>
 438 #include <asm/segment.h>
 439 #include <linux/mm.h>
 440 #include <net/checksum.h>
 441 
 442 /*
 443  *      The MSL timer is the 'normal' timer.
 444  */
 445  
 446 #define reset_msl_timer(x,y,z)  reset_timer(x,y,z)
 447 
 448 #define SEQ_TICK 3
 449 unsigned long seq_offset;
 450 struct tcp_mib  tcp_statistics;
 451 
 452 /*
 453  *      Cached last hit socket
 454  */
 455  
 456 volatile unsigned long  th_cache_saddr,th_cache_daddr;
 457 volatile unsigned short  th_cache_dport, th_cache_sport;
 458 volatile struct sock *th_cache_sk;
 459 
 460 void tcp_cache_zap(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 461 {
 462         unsigned long flags;
 463         save_flags(flags);
 464         cli();
 465         th_cache_saddr=0;
 466         th_cache_daddr=0;
 467         th_cache_dport=0;
 468         th_cache_sport=0;
 469         th_cache_sk=NULL;
 470         restore_flags(flags);
 471 }
 472 
 473 static void tcp_close(struct sock *sk, int timeout);
 474 static void tcp_read_wakeup(struct sock *sk);
 475 
 476 /*
 477  *      The less said about this the better, but it works and will do for 1.2  (and 1.4 ;))
 478  */
 479 
 480 static struct wait_queue *master_select_wakeup;
 481 
 482 static __inline__ int min(unsigned int a, unsigned int b)
     /* [previous][next][first][last][top][bottom][index][help] */
 483 {
 484         if (a < b) 
 485                 return(a);
 486         return(b);
 487 }
 488 
#undef STATE_TRACE		/* define this to log every TCP state transition */

#ifdef STATE_TRACE
/* Human-readable names for the TCP_* state values, indexed by state
   number; used only by the debug trace in tcp_set_state(). */
static char *statename[]={
	"Unused","Established","Syn Sent","Syn Recv",
	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
	"Close Wait","Last ACK","Listen","Closing"
};
#endif
 498 
/*
 *	Move a socket to a new TCP state, keeping the established-connection
 *	statistic (TcpCurrEstab) and the last-socket cache in step with the
 *	transition.  The counter is decremented before the state changes and
 *	incremented after, so it always reflects sockets currently in
 *	TCP_ESTABLISHED.
 */
static __inline__ void tcp_set_state(struct sock *sk, int state)
{
	if(sk->state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab--;
#ifdef STATE_TRACE
	if(sk->debug)
		printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
#endif
	/* This is a hack but it doesn't occur often and it's going to
	   be a real pain to fix nicely */

	if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
	{
		/* A passive open just completed the three-way handshake:
		   wake anyone selecting on the listening (master) socket. */
		wake_up_interruptible(&master_select_wakeup);
	}
	sk->state=state;
	if(state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab++;
	if(sk->state==TCP_CLOSE)
		tcp_cache_zap();	/* a closed socket must never be a cache hit */
}
 520 
 521 /*
 522  *      This routine picks a TCP windows for a socket based on
 523  *      the following constraints
 524  *  
 525  *      1. The window can never be shrunk once it is offered (RFC 793)
 526  *      2. We limit memory per socket
 527  */
 528 
 529 
/*
 *	Pick the receive window to advertise to the peer, honouring the
 *	RFC 793 rule that an offered window must never shrink and applying
 *	receiver-side silly window syndrome avoidance.  Commits the result:
 *	sk->window and sk->lastwin_seq are updated before returning the
 *	new window value.
 */
static __inline__ unsigned short tcp_select_window(struct sock *sk)
{
	long free_space = sock_rspace(sk);
	long window = 0;

	if (free_space > 1024)
		free_space &= ~0x3FF;  /* make free space a multiple of 1024 */

	if(sk->window_clamp)
		free_space = min(sk->window_clamp, free_space);

	/* 
	 * compute the actual window i.e. 
	 * old_window - received_bytes_on_that_win 
	 */

	if (sk->mss == 0)
		sk->mss = sk->mtu;	/* no MSS negotiated yet: fall back to the MTU */

	window = sk->window - (sk->acked_seq - sk->lastwin_seq);

	if ( window < 0 ) {
		/* Should be impossible: it would mean we acknowledged more
		   data than the last advertised window allowed. */
		window = 0;
		printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n", 
		       sk->window, sk->acked_seq, sk->lastwin_seq);
	}

	/*
	 * RFC 1122:
	 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
	 *  RECV.NEXT + RCV.WIN fixed until:
	 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
	 * 
	 * i.e. don't raise the right edge of the window until you can't raise
	 * it MSS bytes
	 */

	if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) )
		window += ((free_space - window) / sk->mss) * sk->mss;

	sk->window = window;
	sk->lastwin_seq = sk->acked_seq;	/* window is now counted from this point */

	return sk->window;
}
 575 
 576 /*
 577  *      This function returns the amount that we can raise the
 578  *      usable window.
 579  */
 580 
/*
 *	Compute by how much the usable window could be raised right now,
 *	using the same SWS avoidance rule as tcp_select_window(), but
 *	WITHOUT committing anything: sk->window and sk->lastwin_seq are
 *	left untouched.  Returns the raisable amount in bytes (a multiple
 *	of the MSS), or 0 if the window should stay where it is.
 */
static __inline__ unsigned short tcp_raise_window(struct sock *sk)
{
	long free_space = sock_rspace(sk);
	long window = 0;

	if (free_space > 1024)
		free_space &= ~0x3FF; /* make free space a multiple of 1024 */

	if(sk->window_clamp)
		free_space = min(sk->window_clamp, free_space);

	/* 
	 * compute the actual window i.e. 
	 * old_window - received_bytes_on_that_win 
	 */

	window = sk->window - (sk->acked_seq - sk->lastwin_seq);

	if (sk->mss == 0)
		sk->mss = sk->mtu;	/* no MSS negotiated yet: fall back to the MTU */

	if ( window < 0 ) {
		/* Should be impossible - see the matching check in
		   tcp_select_window(). */
		window = 0;
		printk(KERN_DEBUG "TRW: win < 0 w=%d 1=%u 2=%u\n", 
		       sk->window, sk->acked_seq, sk->lastwin_seq);
	}

	if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) )
		return ((free_space - window) / sk->mss) * sk->mss;

	return 0;
}
 613 
 614 /*
 615  *      Find someone to 'accept'. Must be called with
 616  *      sk->inuse=1 or cli()
 617  */ 
 618 
 619 static struct sk_buff *tcp_find_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 620 {
 621         struct sk_buff *p=skb_peek(&s->receive_queue);
 622         if(p==NULL)
 623                 return NULL;
 624         do
 625         {
 626                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 627                         return p;
 628                 p=p->next;
 629         }
 630         while(p!=(struct sk_buff *)&s->receive_queue);
 631         return NULL;
 632 }
 633 
 634 /*
 635  *      Remove a completed connection and return it. This is used by
 636  *      tcp_accept() to get connections from the queue.
 637  */
 638 
 639 static struct sk_buff *tcp_dequeue_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 640 {
 641         struct sk_buff *skb;
 642         unsigned long flags;
 643         save_flags(flags);
 644         cli(); 
 645         skb=tcp_find_established(s);
 646         if(skb!=NULL)
 647                 skb_unlink(skb);        /* Take it off the queue */
 648         restore_flags(flags);
 649         return skb;
 650 }
 651 
 652 /* 
 653  *      This routine closes sockets which have been at least partially
 654  *      opened, but not yet accepted. Currently it is only called by
 655  *      tcp_close, and timeout mirrors the value there. 
 656  */
 657 
 658 static void tcp_close_pending (struct sock *sk) 
     /* [previous][next][first][last][top][bottom][index][help] */
 659 {
 660         struct sk_buff *skb;
 661 
 662         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) 
 663         {
 664                 skb->sk->dead=1;
 665                 tcp_close(skb->sk, 0);
 666                 kfree_skb(skb, FREE_READ);
 667         }
 668         return;
 669 }
 670 
 671 /*
 672  *      Enter the time wait state. 
 673  */
 674 
 675 static void tcp_time_wait(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 676 {
 677         tcp_set_state(sk,TCP_TIME_WAIT);
 678         sk->shutdown = SHUTDOWN_MASK;
 679         if (!sk->dead)
 680                 sk->state_change(sk);
 681         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 682 }
 683 
/*
 *	A socket has timed out on its send queue and wants to do a
 *	little retransmitting. Currently this means TCP.
 *
 *	Walks sk->send_head, rebuilding each queued segment's route,
 *	MAC header, IP id, ack and window fields before handing it back
 *	to the device.  If 'all' is zero only the first segment is
 *	resent; otherwise we stop after sk->cong_window segments or at
 *	the first device-locked buffer.
 */

void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;		/* segments retransmitted in this call */
	struct rtable *rt;

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;	/* restamp for the RTO bookkeeping */
		
		/* dl1bke 960201 - @%$$! Hope this cures strange race conditions    */
		/*                 with AX.25 mode VC. (esp. DAMA)                  */
		/*                 if the buffer is locked we should not retransmit */
		/*                 anyway, so we don't need all the fuss to prepare */
		/*                 the buffer in this case.                         */
		/*                 (the skb_pull() changes skb->data while we may   */
		/*                 actually try to send the data. Ough. A side      */
		/*                 effect is that we'll send some unnecessary data, */
		/*                 but the alternative is disastrous...             */
		
		if (skb_device_locked(skb))
			break;

		/*
		 *	Discard the surplus MAC header
		 */
		 
		skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);

		/*
		 * In general it's OK just to use the old packet.  However we
		 * need to use the current ack and window fields.  Urg and
		 * urg_ptr could possibly stand to be updated as well, but we
		 * don't keep the necessary data.  That shouldn't be a problem,
		 * if the other end is doing the right thing.  Since we're
		 * changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)skb->data;
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = ntohs(iph->tot_len) - (iph->ihl<<2);	/* TCP header + payload */
		
		/*
		 *	Note: We ought to check for window limits here but
		 *	currently this is done (less efficiently) elsewhere.
		 */

		/*
		 *	Put a MAC header back on (may cause ARPing)
		 */
		 
		{
			/* ANK: UGLY, but the bug, that was here, should be fixed.
			 * Honour a source route's first hop when looking up the route.
			 */
			struct options *  opt = (struct options*)skb->proto_priv;
			rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute);
		}

		iph->id = htons(ip_id_count++);
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
		/* Segment no longer fits the path MTU: allow fragmentation */
		if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
			iph->frag_off &= ~htons(IP_DF);
#endif
		ip_send_check(iph);
			
		if (rt==NULL)	/* Deep poo */
		{
			/* No route any more: flag a soft error, keep the segment queued */
			if(skb->sk)
			{
				skb->sk->err_soft=ENETUNREACH;
				skb->sk->error_report(skb->sk);
			}
		}
		else
		{
			dev=rt->rt_dev;
			skb->raddr=rt->rt_gateway;
			skb->dev=dev;
			skb->arp=1;
			if (rt->rt_hh)
			{
				memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
				if (!rt->rt_hh->hh_uptodate)
				{
					skb->arp = 0;	/* cached hardware header is stale */
#if RT_CACHE_DEBUG >= 2
					printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
#endif
				}
			}
			else if (dev->hard_header)
			{
				if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
					skb->arp=0;
			}
		
			/*
			 *	This is not the right way to handle this. We have to
			 *	issue an up to date window and ack report with this 
			 *	retransmit to keep the odd buggy tcp that relies on 
			 *	the fact BSD does this happy. 
			 *	We don't however need to recalculate the entire 
			 *	checksum, so someone wanting a small problem to play
			 *	with might like to implement RFC1141/RFC1624 and speed
			 *	this up by avoiding a full checksum.
			 */
		 
			th->ack_seq = htonl(sk->acked_seq);
			sk->ack_backlog = 0;
			sk->bytes_rcv = 0;
			th->window = ntohs(tcp_select_window(sk));
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
		
			/*
			 *	If the interface is (still) up and running, kick it.
			 */
	
			if (dev->flags & IFF_UP)
			{
				/*
				 *	If the packet is still being sent by the device/protocol
				 *	below then don't retransmit. This is both needed, and good -
				 *	especially with connected mode AX.25 where it stops resends
				 *	occurring of an as yet unsent anyway frame!
				 *	We still add up the counts as the round trip time wants
				 *	adjusting.
				 */
				if (sk && !skb_device_locked(skb))
				{
					/* Remove it from any existing driver queue first! */
					skb_unlink(skb);
					/* Now queue it */
					ip_statistics.IpOutRequests++;
					dev_queue_xmit(skb, dev, sk->priority);
				}
			}
		}
		
		/*
		 *	Count retransmissions
		 */
		 
		ct++;
		sk->prot->retransmits ++;
		tcp_statistics.TcpRetransSegs++;
		

		/*
		 *	Only one retransmit requested.
		 */
	
		if (!all)
			break;

		/*
		 *	This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;	/* next segment on the retransmit queue */
	}
}
 863 
 864 /*
 865  *      Reset the retransmission timer
 866  */
 867  
 868 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
     /* [previous][next][first][last][top][bottom][index][help] */
 869 {
 870         del_timer(&sk->retransmit_timer);
 871         sk->ip_xmit_timeout = why;
 872         if((long)when < 0)
 873         {
 874                 when=3;
 875                 printk("Error: Negative timer in xmit_timer\n");
 876         }
 877         sk->retransmit_timer.expires=jiffies+when;
 878         add_timer(&sk->retransmit_timer);
 879 }
 880 
 881 /*
 882  *      This is the normal code called for timeouts.  It does the retransmission
 883  *      and then does backoff.  tcp_do_retransmit is separated out because
 884  *      tcp_ack needs to send stuff from the retransmit queue without
 885  *      initiating a backoff.
 886  */
 887 
 888 
 889 void tcp_retransmit_time(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 890 {
 891         tcp_do_retransmit(sk, all);
 892 
 893         /*
 894          * Increase the timeout each time we retransmit.  Note that
 895          * we do not increase the rtt estimate.  rto is initialized
 896          * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
 897          * that doubling rto each time is the least we can get away with.
 898          * In KA9Q, Karn uses this for the first few times, and then
 899          * goes to quadratic.  netBSD doubles, but only goes up to *64,
 900          * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
 901          * defined in the protocol as the maximum possible RTT.  I guess
 902          * we'll have to use something other than TCP to talk to the
 903          * University of Mars.
 904          *
 905          * PAWS allows us longer timeouts and large windows, so once
 906          * implemented ftp to mars will work nicely. We will have to fix
 907          * the 120 second clamps though!
 908          */
 909 
 910         sk->retransmits++;
 911         sk->prot->retransmits++;
 912         sk->backoff++;
 913         sk->rto = min(sk->rto << 1, 120*HZ);
 914         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
 915 }
 916 
 917 
 918 /*
 919  *      A timer event has trigger a tcp retransmit timeout. The
 920  *      socket xmit queue is ready and set up to send. Because
 921  *      the ack receive code keeps the queue straight we do
 922  *      nothing clever here.
 923  */
 924 
 925 static void tcp_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 926 {
 927         if (all) 
 928         {
 929                 tcp_retransmit_time(sk, all);
 930                 return;
 931         }
 932 
 933         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 934         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 935         sk->cong_count = 0;
 936 
 937         sk->cong_window = 1;
 938 
 939         /* Do the actual retransmit. */
 940         tcp_retransmit_time(sk, all);
 941 }
 942 
/*
 *	A write timeout has occurred. Process the after effects.
 *
 *	Returns 1 if the connection should keep trying; returns 0 when
 *	the socket has been moved to TCP_CLOSE - in that case the socket
 *	has also already been released via release_sock().
 */

static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout: every 8th retransmit while
	 *	established, or past TCP_RETR1 otherwise.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		ip_rt_advice(&sk->ip_route_cache, 0);
	}
	
	/*
	 *	Have we tried to SYN too many times (repent repent 8))
	 */
	 
	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
	{
		/* Prefer a saved soft error (e.g. from ICMP) over a bare timeout */
		if(sk->err_soft)
			sk->err=sk->err_soft;
		else
			sk->err=ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
		tcp_set_state(sk,TCP_CLOSE);
		/* Don't FIN, we got nothing back */
		release_sock(sk);
		return 0;
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2) 
	{
		if(sk->err_soft)
			sk->err = sk->err_soft;
		else
			sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 *	Time wait the socket 
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) 
		{
			/* Closing sides: park in TIME_WAIT rather than dropping hard */
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			release_sock(sk);
			return 0;
		}
	}
	return 1;
}
1011 
/*
 *	The TCP retransmit timer. This lacks a few small details.
 *
 *	1.	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 */

static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;	/* reason the timer was armed */

	/*
	 *	We are reset. We will send no more retransmits.
	 */
	 
	if(sk->zapped)
		return;
		
	/* 
	 *	Only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh) 
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;
	sti();

	/* Let the reader see any backlogged data before we act */
	if (sk->ack_backlog && !sk->dead) 
		sk->data_ready(sk,0);

	/* Now we need to figure out why the socket was on the timer. */

	switch (why) 
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb) 
			{
				/* Nothing queued: just flush any delayed ack */
				if (sk->ack_backlog)
					tcp_read_wakeup(sk);
				restore_flags(flags);
			} 
			else 
			{
				/*
				 *	Kicked by a delayed ack. Reset timer
				 *	correctly now
				 */
				if (jiffies < skb->when + sk->rto) 
				{
					if (sk->ack_backlog)
						tcp_read_wakeup(sk);
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission
				 */
				sk->retransmits++;
				sk->prot->retransmits++;
				sk->prot->retransmit (sk, 0);
				/* NOTE(review): tcp_write_timeout() may itself call
				 * release_sock() and return 0; we then release again
				 * below - verify the double release is safe here. */
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/* 
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				  sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
1128 
1129 /*
1130  * This routine is called by the ICMP module when it gets some
1131  * sort of error condition.  If err < 0 then the socket should
1132  * be closed and the error returned to the user.  If err > 0
1133  * it's just the icmp type << 8 | icmp code.  After adjustment
1134  * header points to the first 8 bytes of the tcp header.  We need
1135  * to find the appropriate port.
1136  */
1137 
1138 void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
     /* [previous][next][first][last][top][bottom][index][help] */
1139         __u32 saddr, struct inet_protocol *protocol)
1140 {
1141         struct tcphdr *th = (struct tcphdr *)header;
1142         struct sock *sk;
1143         
1144         /*
1145          *      This one is _WRONG_. FIXME urgently.
1146          */
1147 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY     
1148         struct iphdr *iph=(struct iphdr *)(header-sizeof(struct iphdr));
1149 #endif  
1150         th =(struct tcphdr *)header;
1151         sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
1152 
1153         if (sk == NULL) 
1154                 return;
1155   
1156         if (type == ICMP_SOURCE_QUENCH) 
1157         {
1158                 /*
1159                  * FIXME:
1160                  * For now we will just trigger a linear backoff.
1161                  * The slow start code should cause a real backoff here.
1162                  */
1163                 if (sk->cong_window > 4)
1164                         sk->cong_window--;
1165                 return;
1166         }
1167         
1168         if (type == ICMP_PARAMETERPROB)
1169         {
1170                 sk->err=EPROTO;
1171                 sk->error_report(sk);
1172         }
1173 
1174 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
1175         if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
1176         {
1177                 struct rtable * rt;
1178                 /*
1179                  * Ugly trick to pass MTU to protocol layer.
1180                  * Really we should add argument "info" to error handler.
1181                  */
1182                 unsigned short new_mtu = ntohs(iph->id);
1183 
1184                 if ((rt = sk->ip_route_cache) != NULL)
1185                         if (rt->rt_mtu > new_mtu)
1186                                 rt->rt_mtu = new_mtu;
1187 
1188                 if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr)
1189                         && new_mtu > sizeof(struct iphdr)+sizeof(struct tcphdr))
1190                         sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
1191 
1192                 return;
1193         }
1194 #endif
1195 
1196         /*
1197          * If we've already connected we will keep trying
1198          * until we time out, or the user gives up.
1199          */
1200 
1201         if (code < 13)
1202         {       
1203                 if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1204                 {
1205                         sk->err = icmp_err_convert[code].errno;
1206                         if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) 
1207                         {
1208                                 tcp_statistics.TcpAttemptFails++;
1209                                 tcp_set_state(sk,TCP_CLOSE);
1210                                 sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
1211                         }
1212                 }
1213                 else    /* Only an error on timeout */
1214                         sk->err_soft = icmp_err_convert[code].errno;
1215         }
1216 }
1217 
1218 
/*
 *	Walk down the receive queue counting readable data until we hit the end or we find a gap
 *	in the received data queue (ie a frame missing that needs sending to us). Not
 *	sorting using two queues as data arrives makes life so much harder.
 *
 *	Returns the number of bytes readable without blocking, counted
 *	from sk->copied_seq up to the first sequence hole or, once some
 *	data has been counted, the first PSH segment.
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;	/* sequence number we have counted up to */
	unsigned long amount;	/* readable byte total */
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug) 
			printk("empty\n");
		return(0);
	}
  
	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;
  
	/* 
	 *	Do until a push or until we are out of data. 
	 */
	 
	do 
	{
		if (before(counted, skb->seq))		/* Found a hole so stops here */
			break;
		sum = skb->len - (counted - skb->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;		/* a SYN occupies one sequence number */
		if (sum > 0) 
		{					/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn) 
				amount--;	/* ...but the SYN itself is not readable data */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop.  This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
1295 
1296 /*
1297  * LISTEN is a special case for select..
1298  */
1299 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
1300 {
1301         if (sel_type == SEL_IN) {
1302                 int retval;
1303 
1304                 sk->inuse = 1;
1305                 retval = (tcp_find_established(sk) != NULL);
1306                 release_sock(sk);
1307                 if (!retval)
1308                         select_wait(&master_select_wakeup,wait);
1309                 return retval;
1310         }
1311         return 0;
1312 }
1313 
1314 
/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to set "sk->inuse", as the upper select layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 *
 *	Returns 1 if the socket is ready for the requested operation,
 *	otherwise registers on the socket's wait queue and returns 0.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		if (sk->err)
			return 1;
		/* Handshake not finished: nothing readable yet */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;

		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;
			
		/* Nothing acked beyond what the user has already copied */
		if (sk->acked_seq == sk->copied_seq)
			break;

		/*
		 * Readable - unless the only pending byte is out-of-band
		 * data that will not be delivered inline.
		 */
		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->err)
			return 1;
		if (sk->shutdown & SEND_SHUTDOWN) 
			return 0;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 * This is now right thanks to a small fix
		 * by Matt Dillon.
		 *
		 * Only report writable when there is room for a full
		 * segment plus protocol headers.
		 */

		if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;
		return 1;

	case SEL_EX:
		/* Exceptional condition: urgent data pending */
		if (sk->urg_data)
			return 1;
		break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}
1370 
1371 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
     /* [previous][next][first][last][top][bottom][index][help] */
1372 {
1373         int err;
1374         switch(cmd) 
1375         {
1376 
1377                 case TIOCINQ:
1378 #ifdef FIXME    /* FIXME: */
1379                 case FIONREAD:
1380 #endif
1381                 {
1382                         unsigned long amount;
1383 
1384                         if (sk->state == TCP_LISTEN) 
1385                                 return(-EINVAL);
1386 
1387                         sk->inuse = 1;
1388                         amount = tcp_readable(sk);
1389                         release_sock(sk);
1390                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1391                         if(err)
1392                                 return err;
1393                         put_user(amount, (int *)arg);
1394                         return(0);
1395                 }
1396                 case SIOCATMARK:
1397                 {
1398                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
1399 
1400                         err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1401                         if (err)
1402                                 return err;
1403                         put_user(answ,(int *) arg);
1404                         return(0);
1405                 }
1406                 case TIOCOUTQ:
1407                 {
1408                         unsigned long amount;
1409 
1410                         if (sk->state == TCP_LISTEN) return(-EINVAL);
1411                         amount = sock_wspace(sk);
1412                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1413                         if(err)
1414                                 return err;
1415                         put_user(amount, (int *)arg);
1416                         return(0);
1417                 }
1418                 default:
1419                         return(-EINVAL);
1420         }
1421 }
1422 
1423 
1424 /*
1425  *      This routine computes a TCP checksum. 
1426  *
1427  *      Modified January 1995 from a go-faster DOS routine by
1428  *      Jorge Cwik <jorge@laser.satlink.net>
1429  */
1430  
1431 unsigned short tcp_check(struct tcphdr *th, int len,
     /* [previous][next][first][last][top][bottom][index][help] */
1432           unsigned long saddr, unsigned long daddr, unsigned long base)
1433 {     
1434         return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
1435 }
1436 
1437 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
     /* [previous][next][first][last][top][bottom][index][help] */
1438                 unsigned long daddr, int len, struct sock *sk)
1439 {
1440         th->check = 0;
1441         th->check = tcp_check(th, len, saddr, daddr,
1442                 csum_partial((char *)th,len,0));
1443         return;
1444 }
1445 
1446 /*
1447  *      This is the main buffer sending routine. We queue the buffer
1448  *      having checked it is sane seeming.
1449  */
1450  
1451 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
     /* [previous][next][first][last][top][bottom][index][help] */
1452 {
1453         int size;
1454         struct tcphdr * th = skb->h.th;
1455 
1456         /*
1457          *      length of packet (not counting length of pre-tcp headers) 
1458          */
1459          
1460         size = skb->len - ((unsigned char *) th - skb->data);
1461 
1462         /*
1463          *      Sanity check it.. 
1464          */
1465          
1466         if (size < sizeof(struct tcphdr) || size > skb->len) 
1467         {
1468                 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1469                         skb, skb->data, th, skb->len);
1470                 kfree_skb(skb, FREE_WRITE);
1471                 return;
1472         }
1473 
1474         /*
1475          *      If we have queued a header size packet.. (these crash a few
1476          *      tcp stacks if ack is not set)
1477          */
1478          
1479         if (size == sizeof(struct tcphdr)) 
1480         {
1481                 /* If it's got a syn or fin it's notionally included in the size..*/
1482                 if(!th->syn && !th->fin) 
1483                 {
1484                         printk("tcp_send_skb: attempt to queue a bogon.\n");
1485                         kfree_skb(skb,FREE_WRITE);
1486                         return;
1487                 }
1488         }
1489 
1490         /*
1491          *      Actual processing.
1492          */
1493          
1494         tcp_statistics.TcpOutSegs++;  
1495         skb->seq = ntohl(th->seq);
1496         skb->end_seq = skb->seq + size - 4*th->doff;
1497         
1498         /*
1499          *      We must queue if
1500          *
1501          *      a) The right edge of this frame exceeds the window
1502          *      b) We are retransmitting (Nagle's rule)
1503          *      c) We have too many packets 'in flight'
1504          */
1505          
1506         if (after(skb->end_seq, sk->window_seq) ||
1507             (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1508              sk->packets_out >= sk->cong_window) 
1509         {
1510                 /* checksum will be supplied by tcp_write_xmit.  So
1511                  * we shouldn't need to set it at all.  I'm being paranoid */
1512                 th->check = 0;
1513                 if (skb->next != NULL) 
1514                 {
1515                         printk("tcp_send_partial: next != NULL\n");
1516                         skb_unlink(skb);
1517                 }
1518                 skb_queue_tail(&sk->write_queue, skb);
1519                 
1520                 /*
1521                  *      If we don't fit we have to start the zero window
1522                  *      probes. This is broken - we really need to do a partial
1523                  *      send _first_ (This is what causes the Cisco and PC/TCP
1524                  *      grief).
1525                  */
1526                  
1527                 if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
1528                     sk->send_head == NULL && sk->ack_backlog == 0)
1529                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1530         } 
1531         else 
1532         {
1533                 /*
1534                  *      This is going straight out
1535                  */
1536                  
1537                 th->ack_seq = htonl(sk->acked_seq);
1538                 th->window = htons(tcp_select_window(sk));
1539 
1540                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1541 
1542                 sk->sent_seq = sk->write_seq;
1543                 
1544                 /*
1545                  *      This is mad. The tcp retransmit queue is put together
1546                  *      by the ip layer. This causes half the problems with
1547                  *      unroutable FIN's and other things.
1548                  */
1549                  
1550                 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1551                 
1552                 
1553                 sk->ack_backlog = 0;
1554                 sk->bytes_rcv = 0;
1555 
1556                 /*
1557                  *      Set for next retransmit based on expected ACK time.
1558                  *      FIXME: We set this every time which means our 
1559                  *      retransmits are really about a window behind.
1560                  */
1561 
1562                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1563         }
1564 }
1565 
1566 /*
1567  *      Locking problems lead us to a messy situation where we can have
1568  *      multiple partially complete buffers queued up. This is really bad
1569  *      as we don't want to be sending partial buffers. Fix this with
1570  *      a semaphore or similar to lock tcp_write per socket.
1571  *
1572  *      These routines are pretty self descriptive.
1573  */
1574  
1575 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1576 {
1577         struct sk_buff * skb;
1578         unsigned long flags;
1579 
1580         save_flags(flags);
1581         cli();
1582         skb = sk->partial;
1583         if (skb) {
1584                 sk->partial = NULL;
1585                 del_timer(&sk->partial_timer);
1586         }
1587         restore_flags(flags);
1588         return skb;
1589 }
1590 
1591 /*
1592  *      Empty the partial queue
1593  */
1594  
1595 static void tcp_send_partial(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1596 {
1597         struct sk_buff *skb;
1598 
1599         if (sk == NULL)
1600                 return;
1601         while ((skb = tcp_dequeue_partial(sk)) != NULL)
1602                 tcp_send_skb(sk, skb);
1603 }
1604 
1605 /*
1606  *      Queue a partial frame
1607  */
1608  
1609 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1610 {
1611         struct sk_buff * tmp;
1612         unsigned long flags;
1613 
1614         save_flags(flags);
1615         cli();
1616         tmp = sk->partial;
1617         if (tmp)
1618                 del_timer(&sk->partial_timer);
1619         sk->partial = skb;
1620         init_timer(&sk->partial_timer);
1621         /*
1622          *      Wait up to 1 second for the buffer to fill.
1623          */
1624         sk->partial_timer.expires = jiffies+HZ;
1625         sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
1626         sk->partial_timer.data = (unsigned long) sk;
1627         add_timer(&sk->partial_timer);
1628         restore_flags(flags);
1629         if (tmp)
1630                 tcp_send_skb(sk, tmp);
1631 }
1632 
1633 
1634 
1635 /*
1636  *      This routine sends an ack and also updates the window. 
1637  */
1638  
1639 static void tcp_send_ack(u32 sequence, u32 ack,
     /* [previous][next][first][last][top][bottom][index][help] */
1640              struct sock *sk,
1641              struct tcphdr *th, unsigned long daddr)
1642 {
1643         struct sk_buff *buff;
1644         struct tcphdr *t1;
1645         struct device *dev = NULL;
1646         int tmp;
1647 
1648         if(sk->zapped)
1649                 return;         /* We have been reset, we may not send again */
1650                 
1651         /*
1652          * We need to grab some memory, and put together an ack,
1653          * and then put it into the queue to be sent.
1654          */
1655 
1656         buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1657         if (buff == NULL) 
1658         {
1659                 /* 
1660                  *      Force it to send an ack. We don't have to do this
1661                  *      (ACK is unreliable) but it's much better use of 
1662                  *      bandwidth on slow links to send a spare ack than
1663                  *      resend packets. 
1664                  */
1665                  
1666                 sk->ack_backlog++;
1667                 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) 
1668                 {
1669                         reset_xmit_timer(sk, TIME_WRITE, HZ);
1670                 }
1671                 return;
1672         }
1673 
1674         /*
1675          *      Assemble a suitable TCP frame
1676          */
1677          
1678         buff->sk = sk;
1679         buff->localroute = sk->localroute;
1680 
1681         /* 
1682          *      Put in the IP header and routing stuff. 
1683          */
1684          
1685         tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1686                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1687         if (tmp < 0) 
1688         {
1689                 buff->free = 1;
1690                 sock_wfree(sk, buff);
1691                 return;
1692         }
1693         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
1694 
1695         memcpy(t1, th, sizeof(*t1));
1696 
1697         /*
1698          *      Swap the send and the receive. 
1699          */
1700          
1701         t1->dest = th->source;
1702         t1->source = th->dest;
1703         t1->seq = ntohl(sequence);
1704         t1->ack = 1;
1705         sk->window = tcp_select_window(sk);
1706         t1->window = ntohs(sk->window);
1707         t1->res1 = 0;
1708         t1->res2 = 0;
1709         t1->rst = 0;
1710         t1->urg = 0;
1711         t1->syn = 0;
1712         t1->psh = 0;
1713         t1->fin = 0;
1714         
1715         /*
1716          *      If we have nothing queued for transmit and the transmit timer
1717          *      is on we are just doing an ACK timeout and need to switch
1718          *      to a keepalive.
1719          */
1720          
1721         if (ack == sk->acked_seq) {               
1722                 sk->ack_backlog = 0;
1723                 sk->bytes_rcv = 0;
1724                 sk->ack_timed = 0;
1725 
1726                 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
1727                     && sk->ip_xmit_timeout == TIME_WRITE)       
1728                   if(sk->keepopen) 
1729                     reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1730                   else 
1731                     delete_timer(sk);                           
1732         }
1733 
1734         /*
1735          *      Fill in the packet and send it
1736          */
1737          
1738         t1->ack_seq = htonl(ack);
1739         t1->doff = sizeof(*t1)/4;
1740         tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1741         if (sk->debug)
1742                  printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1743         tcp_statistics.TcpOutSegs++;
1744         sk->prot->queue_xmit(sk, dev, buff, 1);
1745 }
1746 
1747 
1748 /* 
1749  *      This routine builds a generic TCP header. 
1750  */
1751  
1752 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
     /* [previous][next][first][last][top][bottom][index][help] */
1753 {
1754 
1755         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1756         th->seq = htonl(sk->write_seq);
1757         th->psh =(push == 0) ? 1 : 0;
1758         th->doff = sizeof(*th)/4;
1759         th->ack = 1;
1760         th->fin = 0;
1761         sk->ack_backlog = 0;
1762         sk->bytes_rcv = 0;
1763         sk->ack_timed = 0;
1764         th->ack_seq = htonl(sk->acked_seq);
1765         sk->window = tcp_select_window(sk);
1766         th->window = htons(sk->window);
1767 
1768         return(sizeof(*th));
1769 }
1770 
1771 /*
1772  *      This routine copies from a user buffer into a socket,
1773  *      and starts the transmit system.
1774  */
1775 
/*
 *	Copy user data from the message's iovec into TCP segments and
 *	start the transmit machinery.  Unless 'nonblock' is set this may
 *	sleep waiting for the connection to establish or for send memory.
 *	Returns the number of bytes queued, or a negative error code when
 *	nothing was copied before the error occurred.
 */
static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags)
{
	int copied = 0;
	int copy;
	int tmp;
	int seglen;
	int iovct=0;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	struct proto *prot;
	struct device *dev = NULL;
	unsigned char *from;
	
	/*
	 *	Do sanity checking for sendmsg/sendto/send
	 */
	 
	if (flags & ~(MSG_OOB|MSG_DONTROUTE))
		return -EINVAL;
	if (msg->msg_name)
	{
		/* A destination address is only acceptable when it matches
		 * the peer this socket is already connected to. */
		struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
		if(sk->state == TCP_CLOSE)
			return -ENOTCONN;
		if (msg->msg_namelen < sizeof(*addr))
			return -EINVAL;
		if (addr->sin_family && addr->sin_family != AF_INET) 
			return -EINVAL;
		if (addr->sin_port != sk->dummy_th.dest) 
			return -EISCONN;
		if (addr->sin_addr.s_addr != sk->daddr) 
			return -EISCONN;
	}
	
	/*
	 *	Ok commence sending
	 */
	
	while(iovct<msg->msg_iovlen)
	{
		seglen=msg->msg_iov[iovct].iov_len;
		from=msg->msg_iov[iovct++].iov_base;
		sk->inuse=1;
		prot = sk->prot;
		while(seglen > 0) 
		{
			if (sk->err) 
			{			/* Stop on an error */
				release_sock(sk);
				if (copied) 
					return(copied);
				return sock_error(sk);
			}

			/*
			 *	First thing we do is make sure that we are established. 
			 */
	
			if (sk->shutdown & SEND_SHUTDOWN) 
			{
				release_sock(sk);
				sk->err = EPIPE;
				if (copied) 
					return(copied);
				sk->err = 0;
				return(-EPIPE);
			}

			/* 
			 *	Wait for a connection to finish.
			 */
		
			while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
			{
				if (sk->err) 
				{
					release_sock(sk);
					if (copied) 
						return(copied);
					return sock_error(sk);
				}		
	
				if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
				{
					release_sock(sk);
					if (copied) 
						return(copied);
	
					if (sk->err) 
						return sock_error(sk);

					if (sk->keepopen) 
					{
						send_sig(SIGPIPE, current, 0);
					}
					return(-EPIPE);
				}
	
				if (nonblock || copied) 
				{
					release_sock(sk);
					if (copied) 
						return(copied);
					return(-EAGAIN);
				}
	
				/* Sleep until the handshake completes; cli()
				 * closes the window between the state re-check
				 * and going to sleep. */
				release_sock(sk);
				cli();
			
				if (sk->state != TCP_ESTABLISHED &&
					sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
				{
					interruptible_sleep_on(sk->sleep);	
					if (current->signal & ~current->blocked)
					{
						sti();
						if (copied) 
							return(copied);
						return(-ERESTARTSYS);
					}
				}
				sk->inuse = 1;
				sti();
			}
	
		/*
		 * The following code can result in copy <= if sk->mss is ever
		 * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
		 * sk->mtu is constant once SYN processing is finished.  I.e. we
		 * had better not get here until we've seen his SYN and at least one
		 * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
		 * But ESTABLISHED should guarantee that.  sk->max_window is by definition
		 * non-decreasing.  Note that any ioctl to set user_mss must be done
		 * before the exchange of SYN's.  If the initial ack from the other
		 * end has a window of 0, max_window and thus mss will both be 0.
		 */
	
		/* 
		 *	Now we need to check if we have a half built packet. 
		 */
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
		/*
		 *	FIXME:  I'm almost sure that this fragment is BUG,
		 *		but it works... I do not know why 8) --ANK
		 *
		 *	Really, we should rebuild all the queues...
		 *	It's difficult. Temporary hack is to send all
		 *	queued segments with allowed fragmentation.
		 */
		{
			int new_mss = min(sk->mtu, sk->max_window);
			if (new_mss < sk->mss)
			{
				tcp_send_partial(sk);
				sk->mss = new_mss;
			}
		}
#endif
	
			if ((skb = tcp_dequeue_partial(sk)) != NULL) 
			{
				int hdrlen;

				 /* IP header + TCP header */
				hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
					 + sizeof(struct tcphdr);
	
				/* Add more stuff to the end of skb->len */
				if (!(flags & MSG_OOB)) 
				{
					copy = min(sk->mss - (skb->len - hdrlen), seglen);
					if (copy <= 0) 
					{
						printk("TCP: **bug**: \"copy\" <= 0\n");
						return -EFAULT;
					}		  
					memcpy_fromfs(skb_put(skb,copy), from, copy);
					from += copy;
					copied += copy;
					len -= copy;
					sk->write_seq += copy;
					seglen -= copy;
				}
				/* Send the frame if it is now full, if this is
				 * urgent data, or if nothing is in flight;
				 * otherwise re-queue it as a partial frame. */
				if ((skb->len - hdrlen) >= sk->mss ||
					(flags & MSG_OOB) || !sk->packets_out)
					tcp_send_skb(sk, skb);
				else
					tcp_enqueue_partial(skb, sk);
				continue;
			}

		/*
		 * We also need to worry about the window.
		 * If window < 1/2 the maximum window we've seen from this
		 *   host, don't use it.  This is sender side
		 *   silly window prevention, as specified in RFC1122.
		 *   (Note that this is different than earlier versions of
		 *   SWS prevention, e.g. RFC813.).  What we actually do is 
		 *   use the whole MSS.  Since the results in the right
		 *   edge of the packet being outside the window, it will
		 *   be queued for later rather than sent.
		 */

			copy = sk->window_seq - sk->write_seq;
			if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
				copy = sk->mss;
			if (copy > seglen)
				copy = seglen;

		/*
		 *	We should really check the window here also. 
		 */
		 
			send_tmp = NULL;
			if (copy < sk->mss && !(flags & MSG_OOB) && sk->packets_out) 
			{
				/*
				 *	We will release the socket in case we sleep here. 
				 */
				release_sock(sk);
				/*
				 *	NB: following must be mtu, because mss can be increased.
				 *	mss is always <= mtu 
				 */
				skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
				sk->inuse = 1;
				send_tmp = skb;
			} 
			else 
			{
				/*
				 *	We will release the socket in case we sleep here. 
				 */
				release_sock(sk);
				skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
				sk->inuse = 1;
			}
	
			/*
			 *	If we didn't get any memory, we need to sleep. 
			 */
	
			if (skb == NULL) 
			{
				sk->socket->flags |= SO_NOSPACE;
				if (nonblock) 
				{
					release_sock(sk);
					if (copied) 
						return(copied);
					return(-EAGAIN);
				}

				/*
				 *	FIXME: here is another race condition. 
				 */

				/* Snapshot wmem_alloc; only sleep if no memory
				 * was freed between the snapshot and cli(). */
				tmp = sk->wmem_alloc;
				release_sock(sk);
				cli();
				/*
				 *	Again we will try to avoid it. 
				 */
				if (tmp <= sk->wmem_alloc &&
					  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
					&& sk->err == 0) 
				{
					sk->socket->flags &= ~SO_NOSPACE;
					interruptible_sleep_on(sk->sleep);
					if (current->signal & ~current->blocked) 
					{
						sti();
						if (copied) 
							return(copied);
						return(-ERESTARTSYS);
					}
				}
				sk->inuse = 1;
				sti();
				continue;
			}

			skb->sk = sk;
			skb->free = 0;
			skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
	
			/*
			 * FIXME: we need to optimize this.
			 * Perhaps some hints here would be good.
			 */
		
			tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
			if (tmp < 0 ) 
			{
				sock_wfree(sk, skb);
				release_sock(sk);
				if (copied) 
					return(copied);
				return(tmp);
			}
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			/* Path MTU discovery: forbid fragmentation en route. */
			skb->ip_hdr->frag_off |= htons(IP_DF);
#endif
			skb->dev = dev;
			skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
			tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
			if (tmp < 0) 
			{
				sock_wfree(sk, skb);
				release_sock(sk);
				if (copied) 
					return(copied);
				return(tmp);
			}
	
			if (flags & MSG_OOB) 
			{
				skb->h.th->urg = 1;
				/* NOTE(review): ntohs here relies on ntohs==htons
				 * on supported architectures. */
				skb->h.th->urg_ptr = ntohs(copy);
			}

			memcpy_fromfs(skb_put(skb,copy), from, copy);
		
			from += copy;
			copied += copy;
			len -= copy;
			seglen -= copy;
			skb->free = 0;
			sk->write_seq += copy;
		
			if (send_tmp != NULL) 
			{
				/* Sub-MSS frame with data in flight: hold it back
				 * (Nagle) as a partial frame. */
				tcp_enqueue_partial(send_tmp, sk);
				continue;
			}
			tcp_send_skb(sk, skb);
		}
	}
	sk->err = 0;

/*
 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *	interactive fast network servers. It's meant to be on and
 *	it really improves the throughput though not the echo time
 *	on my slow slip link - Alan
 */

/*
 *	Avoid possible race on send_tmp - c/o Johannes Stille 
 */
 
	if(sk->partial && ((!sk->packets_out) 
     /* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
2138 
2139 /*
2140  *      Send an ack if one is backlogged at this point. Ought to merge
2141  *      this with tcp_send_ack().
2142  *      This is called for delayed acks also.
2143  */
2144  
2145 static void tcp_read_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
2146 {
2147         int tmp;
2148         struct device *dev = NULL;
2149         struct tcphdr *t1;
2150         struct sk_buff *buff;
2151 
2152         if (!sk->ack_backlog) 
2153                 return;
2154 
2155         /*
2156          * If we're closed, don't send an ack, or we'll get a RST
2157          * from the closed destination.
2158          */
2159         if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
2160                 return; 
2161 
2162         /*
2163          * FIXME: we need to put code here to prevent this routine from
2164          * being called.  Being called once in a while is ok, so only check
2165          * if this is the second time in a row.
2166          */
2167 
2168         /*
2169          * We need to grab some memory, and put together an ack,
2170          * and then put it into the queue to be sent.
2171          */
2172 
2173         buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
2174         if (buff == NULL) 
2175         {
2176                 /* Try again real soon. */
2177                 reset_xmit_timer(sk, TIME_WRITE, HZ);
2178                 return;
2179         }
2180 
2181         buff->sk = sk;
2182         buff->localroute = sk->localroute;
2183         
2184         /*
2185          *      Put in the IP header and routing stuff. 
2186          */
2187 
2188         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
2189                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2190         if (tmp < 0) 
2191         {
2192                 buff->free = 1;
2193                 sock_wfree(sk, buff);
2194                 return;
2195         }
2196 
2197         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2198 
2199         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
2200         t1->seq = htonl(sk->sent_seq);
2201         t1->ack = 1;
2202         t1->res1 = 0;
2203         t1->res2 = 0;
2204         t1->rst = 0;
2205         t1->urg = 0;
2206         t1->syn = 0;
2207         t1->psh = 0;
2208 
2209 
2210         sk->ack_backlog = 0;
2211         sk->bytes_rcv = 0;
2212 
2213         sk->window = tcp_select_window(sk);
2214         t1->window = htons(sk->window);
2215         t1->ack_seq = htonl(sk->acked_seq);
2216         t1->doff = sizeof(*t1)/4;
2217         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2218         sk->prot->queue_xmit(sk, dev, buff, 1);
2219         tcp_statistics.TcpOutSegs++;
2220 }
2221 
2222 
2223 /*
2224  *      FIXME:
2225  *      This routine frees used buffers.
2226  *      It should consider sending an ACK to let the
2227  *      other end know we now have a bigger window.
2228  */
2229 
2230 static void cleanup_rbuf(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
2231 {
2232         unsigned long flags;
2233         struct sk_buff *skb;
2234         unsigned long rspace;
2235 
2236         save_flags(flags);
2237         cli();
2238 
2239         /*
2240          * See if we have anything to free up?
2241          */
2242 
2243         skb = skb_peek(&sk->receive_queue);
2244         if (!skb || !skb->used || skb->users) {
2245                 restore_flags(flags);
2246                 return;
2247         }
2248 
2249         /*
2250          *      We have to loop through all the buffer headers,
2251          *      and try to free up all the space we can.
2252          */
2253 
2254         do {
2255                 skb_unlink(skb);
2256                 skb->sk = sk;
2257                 kfree_skb(skb, FREE_READ);
2258                 skb = skb_peek(&sk->receive_queue);
2259         } while (skb && skb->used && !skb->users);
2260         restore_flags(flags);
2261 
2262         /*
2263          *      FIXME:
2264          *      At this point we should send an ack if the difference
2265          *      in the window, and the amount of space is bigger than
2266          *      TCP_WINDOW_DIFF.
2267          */
2268 
2269         rspace=sock_rspace(sk);
2270         if(sk->debug)
2271                 printk("sk->rspace = %lu\n", rspace);
2272         /*
2273          * This area has caused the most trouble.  The current strategy
2274          * is to simply do nothing if the other end has room to send at
2275          * least 3 full packets, because the ack from those will auto-
2276          * matically update the window.  If the other end doesn't think
2277          * we have much space left, but we have room for at least 1 more
2278          * complete packet than it thinks we do, we will send an ack
2279          * immediately.  Otherwise we will wait up to .5 seconds in case
2280          * the user reads some more.
2281          */
2282         sk->ack_backlog++;
2283 
2284         /*
2285          * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
2286          * if the other end is offering a window smaller than the agreed on MSS
2287          * (called sk->mtu here).  In theory there's no connection between send
2288          * and receive, and so no reason to think that they're going to send
2289          * small packets.  For the moment I'm using the hack of reducing the mss
2290          * only on the send side, so I'm putting mtu here.
2291          */
2292 
2293         if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
2294         {
2295                 /* Send an ack right now. */
2296                 tcp_read_wakeup(sk);
2297         } 
2298         else 
2299         {
2300                 /* Force it to send an ack soon. */
2301                 int was_active = del_timer(&sk->retransmit_timer);
2302                 if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires) 
2303                 {
2304                         reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
2305                 } 
2306                 else
2307                         add_timer(&sk->retransmit_timer);
2308         }
2309 } 
2310 
2311 
2312 /*
2313  *      Handle reading urgent data. BSD has very simple semantics for
2314  *      this, no blocking and very strange errors 8)
2315  */
2316  
2317 static int tcp_recv_urg(struct sock * sk, int nonblock,
     /* [previous][next][first][last][top][bottom][index][help] */
2318              struct msghdr *msg, int len, int flags, int *addr_len)
2319 {
2320         /*
2321          *      No URG data to read
2322          */
2323         if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
2324                 return -EINVAL; /* Yes this is right ! */
2325                 
2326         if (sk->err) 
2327                 return sock_error(sk);
2328                 
2329         if (sk->state == TCP_CLOSE || sk->done) 
2330         {
2331                 if (!sk->done) 
2332                 {
2333                         sk->done = 1;
2334                         return 0;
2335                 }
2336                 return -ENOTCONN;
2337         }
2338 
2339         if (sk->shutdown & RCV_SHUTDOWN) 
2340         {
2341                 sk->done = 1;
2342                 return 0;
2343         }
2344         sk->inuse = 1;
2345         if (sk->urg_data & URG_VALID) 
2346         {
2347                 char c = sk->urg_data;
2348                 if (!(flags & MSG_PEEK))
2349                         sk->urg_data = URG_READ;
2350                 memcpy_toiovec(msg->msg_iov, &c, 1);
2351                 if(msg->msg_name)
2352                 {
2353                         struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
2354                         sin->sin_family=AF_INET;
2355                         sin->sin_addr.s_addr=sk->daddr;
2356                         sin->sin_port=sk->dummy_th.dest;
2357                 }
2358                 if(addr_len)
2359                         *addr_len=sizeof(struct sockaddr_in);
2360                 release_sock(sk);
2361                 return 1;
2362         }
2363         release_sock(sk);
2364         
2365         /*
2366          * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
2367          * the available implementations agree in this case:
2368          * this call should never block, independent of the
2369          * blocking state of the socket.
2370          * Mike <pall@rz.uni-karlsruhe.de>
2371          */
2372         return -EAGAIN;
2373 }
2374 
2375 
2376 /*
2377  *      This routine copies from a sock struct into the user buffer. 
2378  */
2379  
static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
	int len, int nonblock, int flags, int *addr_len)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/* 
	 *	This error should be checked. 
	 */
	 
	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially. 
	 */
	 
	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be 
	 *	inline and thus not flush cached variables otherwise).
	 *	A peek must not advance sk->copied_seq, so it works on a
	 *	private copy instead.
	 */
	 
	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0) 
	{
		struct sk_buff * skb;
		u32 offset;
	
		/*
		 * Are we at urgent data? Stop if we have read anything.
		 */
		 
		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.  Scan the receive queue for the
		 *	segment containing sequence *seq.
		 */
		 
		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do 
		{
			if (!skb)
				break;
			/* Sequence hole before this segment: nothing contiguous. */
			if (before(*seq, skb->seq))
				break;
			offset = *seq - skb->seq;
			/* A SYN occupies a sequence number but carries no data. */
			if (skb->h.th->syn)
				offset--;
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed; cleanup_rbuf() may free it */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* Return whatever we already copied rather than sleeping. */
		if (copied)
			break;

		if (sk->err) 
		{
			copied = sock_error(sk);
			break;
		}

		if (sk->state == TCP_CLOSE) 
		{
			/* First read on a closed connection reports EOF (0). */
			if (!sk->done) 
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN) 
		{
			sk->done = 1;
			break;
		}
			
		if (nonblock) 
		{
			copied = -EAGAIN;
			break;
		}

		/* Nothing queued: ack what we read so far, then sleep for data. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked) 
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are 
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */
		
		skb->users++;
		
		/*
		 *	Ok so how much can we use ? 
		 */
		 
		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?  If so, either skip the
		 *	urgent byte (when not inline) or stop just short of it.
		 */
		
		if (sk->urg_data) 
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used) 
			{
				if (!urg_offset) 
				{
					if (!sk->urginline) 
					{
						/* Skip over the out-of-band byte. */
						++*seq;
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;
			}
		}
		
		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */
		 
		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */
		 
		memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		
		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */
		 
		skb->users --;
		
		/* Past the urgent mark: clear the urgent state. */
		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		if (used + offset < skb->len)
			continue;
		
		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;	/* eaten; cleanup_rbuf() will reclaim it */
		continue;

	found_fin_ok:
		/* The FIN itself consumes one sequence number. */
		++*seq;
		if (flags & MSG_PEEK)
			break;
			
		/*
		 *	All is done
		 */
		 
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	
	/* Report the peer's address if the caller asked for it. */
	if(copied>0 && msg->msg_name)
	{
		struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
		sin->sin_family=AF_INET;
		sin->sin_addr.s_addr=sk->daddr;
		sin->sin_port=sk->dummy_th.dest;
	}
	if(addr_len)
		*addr_len=sizeof(struct sockaddr_in);
		
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2615 
2616 
2617 
2618 /*
2619  *      State processing on a close. This implements the state shift for
2620  *      sending our FIN frame. Note that we only send a FIN for some 
2621  *      states. A shutdown() may have already sent the FIN, or we may be
2622  *      closed.
2623  */
2624  
2625 static int tcp_close_state(struct sock *sk, int dead)
     /* [previous][next][first][last][top][bottom][index][help] */
2626 {
2627         int ns=TCP_CLOSE;
2628         int send_fin=0;
2629         switch(sk->state)
2630         {
2631                 case TCP_SYN_SENT:      /* No SYN back, no FIN needed */
2632                         break;
2633                 case TCP_SYN_RECV:
2634                 case TCP_ESTABLISHED:   /* Closedown begin */
2635                         ns=TCP_FIN_WAIT1;
2636                         send_fin=1;
2637                         break;
2638                 case TCP_FIN_WAIT1:     /* Already closing, or FIN sent: no change */
2639                 case TCP_FIN_WAIT2:
2640                 case TCP_CLOSING:
2641                         ns=sk->state;
2642                         break;
2643                 case TCP_CLOSE:
2644                 case TCP_LISTEN:
2645                         break;
2646                 case TCP_CLOSE_WAIT:    /* They have FIN'd us. We send our FIN and
2647                                            wait only for the ACK */
2648                         ns=TCP_LAST_ACK;
2649                         send_fin=1;
2650         }
2651         
2652         tcp_set_state(sk,ns);
2653                 
2654         /*
2655          *      This is a (useful) BSD violating of the RFC. There is a
2656          *      problem with TCP as specified in that the other end could
2657          *      keep a socket open forever with no application left this end.
2658          *      We use a 3 minute timeout (about the same as BSD) then kill
2659          *      our end. If they send after that then tough - BUT: long enough
2660          *      that we won't make the old 4*rto = almost no time - whoops
2661          *      reset mistake.
2662          */
2663         if(dead && ns==TCP_FIN_WAIT2)
2664         {
2665                 int timer_active=del_timer(&sk->timer);
2666                 if(timer_active)
2667                         add_timer(&sk->timer);
2668                 else
2669                         reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2670         }
2671         
2672         return send_fin;
2673 }
2674 
2675 /*
2676  *      Send a fin.
2677  */
2678 
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;
		
	release_sock(sk); /* in case the malloc sleeps. */
	
	buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */
	 
	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff. 
	 */

	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0) 
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost. 
		 *	(Not good).  The FIN still consumes a sequence number,
		 *	and the close timer is started unless one is already
		 *	pending.
		 */
		 
		buff->free = 1;
		sock_wfree(sk,buff);
		sk->write_seq++;
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);	/* put back the pending timer */
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}
	
	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	/* The FIN occupies one sequence number of its own. */
	buff->seq = sk->write_seq;
	sk->write_seq++;
	buff->end_seq = sk->write_seq;
	t1->seq = htonl(buff->seq);
	t1->ack = 1;
	t1->ack_seq = htonl(sk->acked_seq);
	t1->window = htons(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 * If there is data in the write queue, the fin must be appended to
	 * the write queue.
	 */
	
	if (skb_peek(&sk->write_queue) != NULL) 
	{
		buff->free = 0;
		if (buff->next != NULL) 
		{
			/* NOTE(review): should not happen - a fresh buffer ought
			   never to be on a list already. */
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	} 
	else 
	{
		/* Queue empty: transmit at once and arm the retransmit timer. */
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2775 
2776 /*
2777  *      Shutdown the sending side of a connection. Much like close except
2778  *      that we don't receive shut down or set sk->dead=1.
2779  */
2780 
2781 void tcp_shutdown(struct sock *sk, int how)
     /* [previous][next][first][last][top][bottom][index][help] */
2782 {
2783         /*
2784          *      We need to grab some memory, and put together a FIN,
2785          *      and then put it into the queue to be sent.
2786          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2787          */
2788 
2789         if (!(how & SEND_SHUTDOWN)) 
2790                 return;
2791          
2792         /*
2793          *      If we've already sent a FIN, or it's a closed state
2794          */
2795          
2796         if (sk->state == TCP_FIN_WAIT1 ||
2797             sk->state == TCP_FIN_WAIT2 ||
2798             sk->state == TCP_CLOSING ||
2799             sk->state == TCP_LAST_ACK ||
2800             sk->state == TCP_TIME_WAIT || 
2801             sk->state == TCP_CLOSE ||
2802             sk->state == TCP_LISTEN
2803           )
2804         {
2805                 return;
2806         }
2807         sk->inuse = 1;
2808 
2809         /*
2810          * flag that the sender has shutdown
2811          */
2812 
2813         sk->shutdown |= SEND_SHUTDOWN;
2814 
2815         /*
2816          *  Clear out any half completed packets. 
2817          */
2818 
2819         if (sk->partial)
2820                 tcp_send_partial(sk);
2821                 
2822         /*
2823          *      FIN if needed
2824          */
2825          
2826         if(tcp_close_state(sk,0))
2827                 tcp_send_fin(sk);
2828                 
2829         release_sock(sk);
2830 }
2831 
2832 /*
2833  *      This routine will send an RST to the other tcp. 
2834  */
2835  
static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (Think about it).
	 */
	 
	if(th->rst)
		return;
  
	/*
	 * We need to grab some memory, and put together an RST,
	 * and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL) 
		return;

	buff->sk = NULL;	/* this segment belongs to no socket */
	buff->dev = dev;
	buff->localroute = 0;

	/*
	 *	Put in the IP header and routing stuff. 
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl,NULL);
	if (tmp < 0) 
	{
		buff->free = 1;
		sock_wfree(NULL, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive. 
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;  
	t1->window = 0;
  
	/*
	 *	Choose seq/ack for the RST: if the offending segment carried
	 *	an ACK, reuse its ack_seq as our sequence and send no ACK;
	 *	otherwise send seq 0 and acknowledge the peer's sequence
	 *	(+1 when it was a SYN, which consumes a sequence number).
	 *	Fields copied straight across are already in network order.
	 */
	if(th->ack)
	{
		t1->ack = 0;
		t1->seq = th->ack_seq;
		t1->ack_seq = 0;
	}
	else
	{
		t1->ack = 1;
		if(!th->syn)
			t1->ack_seq = th->seq;
		else
			t1->ack_seq = htonl(ntohl(th->seq)+1);
		t1->seq = 0;
	}

	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2914 
2915 
2916 /*
2917  *      Look for tcp options. Parses everything but only knows about MSS.
2918  *      This routine is always called with the packet containing the SYN.
2919  *      However it may also be called with the ack to the SYN.  So you
2920  *      can't assume this is always the SYN.  It's always called after
2921  *      we have set up sk->mtu to our own MTU.
2922  *
2923  *      We need at minimum to add PAWS support here. Possibly large windows
2924  *      as Linux gets deployed on 100Mb/sec networks.
2925  */
2926  
2927 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
2928 {
2929         unsigned char *ptr;
2930         int length=(th->doff*4)-sizeof(struct tcphdr);
2931         int mss_seen = 0;
2932     
2933         ptr = (unsigned char *)(th + 1);
2934   
2935         while(length>0)
2936         {
2937                 int opcode=*ptr++;
2938                 int opsize=*ptr++;
2939                 switch(opcode)
2940                 {
2941                         case TCPOPT_EOL:
2942                                 return;
2943                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
2944                                 length--;
2945                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
2946                                 continue;
2947                         
2948                         default:
2949                                 if(opsize<=2)   /* Avoid silly options looping forever */
2950                                         return;
2951                                 switch(opcode)
2952                                 {
2953                                         case TCPOPT_MSS:
2954                                                 if(opsize==4 && th->syn)
2955                                                 {
2956                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2957                                                         mss_seen = 1;
2958                                                 }
2959                                                 break;
2960                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2961                                 }
2962                                 ptr+=opsize-2;
2963                                 length-=opsize;
2964                 }
2965         }
2966         if (th->syn) 
2967         {
2968                 if (! mss_seen)
2969                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2970         }
2971 #ifdef CONFIG_INET_PCTCP
2972         sk->mss = min(sk->max_window >> 1, sk->mtu);
2973 #else    
2974         sk->mss = min(sk->max_window, sk->mtu);
2975         sk->max_unacked = 2 * sk->mss;
2976 #endif  
2977 }
2978 
/*
 *	Return the classful default netmask (network byte order) for the
 *	given destination address (also network byte order).
 *	Class D/E addresses fall through to the class C mask, as before.
 */
static inline unsigned long default_mask(unsigned long dst)
{
        unsigned long host = ntohl(dst);
        unsigned long mask;

        if (IN_CLASSA(host))
                mask = IN_CLASSA_NET;
        else if (IN_CLASSB(host))
                mask = IN_CLASSB_NET;
        else
                mask = IN_CLASSC_NET;

        return htonl(mask);
}
2988 
2989 /*
2990  *      Default sequence number picking algorithm.
2991  *      As close as possible to RFC 793, which
2992  *      suggests using a 250kHz clock.
2993  *      Further reading shows this assumes 2MB/s networks.
2994  *      For 10MB/s ethernet, a 1MHz clock is appropriate.
2995  *      That's funny, Linux has one built in!  Use it!
2996  */
2997 
2998 extern inline u32 tcp_init_seq(void)
     /* [previous][next][first][last][top][bottom][index][help] */
2999 {
3000         struct timeval tv;
3001         do_gettimeofday(&tv);
3002         return tv.tv_usec+tv.tv_sec*1000000;
3003 }
3004 
3005 /*
3006  *      This routine handles a connection request.
3007  *      It should make sure we haven't already responded.
3008  *      Because of the way BSD works, we have to send a syn/ack now.
3009  *      This also means it will be harder to close a socket which is
3010  *      listening.
3011  */
3012  
3013 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
     /* [previous][next][first][last][top][bottom][index][help] */
3014                  unsigned long daddr, unsigned long saddr,
3015                  struct options *opt, struct device *dev, u32 seq)
3016 {
3017         struct sk_buff *buff;
3018         struct tcphdr *t1;
3019         unsigned char *ptr;
3020         struct sock *newsk;
3021         struct tcphdr *th;
3022         struct device *ndev=NULL;
3023         int tmp;
3024         struct rtable *rt;
3025   
3026         th = skb->h.th;
3027 
3028         /* If the socket is dead, don't accept the connection. */
3029         if (!sk->dead) 
3030         {
3031                 sk->data_ready(sk,0);
3032         }
3033         else 
3034         {
3035                 if(sk->debug)
3036                         printk("Reset on %p: Connect on dead socket.\n",sk);
3037                 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
3038                 tcp_statistics.TcpAttemptFails++;
3039                 kfree_skb(skb, FREE_READ);
3040                 return;
3041         }
3042 
3043         /*
3044          * Make sure we can accept more.  This will prevent a
3045          * flurry of syns from eating up all our memory.
3046          */
3047 
3048         if (sk->ack_backlog >= sk->max_ack_backlog) 
3049         {
3050                 tcp_statistics.TcpAttemptFails++;
3051                 kfree_skb(skb, FREE_READ);
3052                 return;
3053         }
3054 
3055         /*
3056          * We need to build a new sock struct.
3057          * It is sort of bad to have a socket without an inode attached
3058          * to it, but the wake_up's will just wake up the listening socket,
3059          * and if the listening socket is destroyed before this is taken
3060          * off of the queue, this will take care of it.
3061          */
3062 
3063         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
3064         if (newsk == NULL) 
3065         {
3066                 /* just ignore the syn.  It will get retransmitted. */
3067                 tcp_statistics.TcpAttemptFails++;
3068                 kfree_skb(skb, FREE_READ);
3069                 return;
3070         }
3071 
3072         memcpy(newsk, sk, sizeof(*newsk));
3073         newsk->opt = NULL;
3074         newsk->ip_route_cache  = NULL;
3075         if (opt && opt->optlen) {
3076           sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
3077           if (!sk->opt) {
3078                 kfree_s(newsk, sizeof(struct sock));
3079                 tcp_statistics.TcpAttemptFails++;
3080                 kfree_skb(skb, FREE_READ);
3081                 return;
3082           }
3083           if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {
3084                 kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
3085                 kfree_s(newsk, sizeof(struct sock));
3086                 tcp_statistics.TcpAttemptFails++;
3087                 kfree_skb(skb, FREE_READ);
3088                 return;
3089           }
3090         }
3091         skb_queue_head_init(&newsk->write_queue);
3092         skb_queue_head_init(&newsk->receive_queue);
3093         newsk->send_head = NULL;
3094         newsk->send_tail = NULL;
3095         skb_queue_head_init(&newsk->back_log);
3096         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
3097         newsk->rto = TCP_TIMEOUT_INIT;
3098         newsk->mdev = 0;
3099         newsk->max_window = 0;
3100         newsk->cong_window = 1;
3101         newsk->cong_count = 0;
3102         newsk->ssthresh = 0;
3103         newsk->backoff = 0;
3104         newsk->blog = 0;
3105         newsk->intr = 0;
3106         newsk->proc = 0;
3107         newsk->done = 0;
3108         newsk->partial = NULL;
3109         newsk->pair = NULL;
3110         newsk->wmem_alloc = 0;
3111         newsk->rmem_alloc = 0;
3112         newsk->localroute = sk->localroute;
3113 
3114         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3115 
3116         newsk->err = 0;
3117         newsk->shutdown = 0;
3118         newsk->ack_backlog = 0;
3119         newsk->acked_seq = skb->seq+1;
3120         newsk->lastwin_seq = skb->seq+1;
3121         newsk->delay_acks = 1;
3122         newsk->copied_seq = skb->seq+1;
3123         newsk->fin_seq = skb->seq;
3124         newsk->state = TCP_SYN_RECV;
3125         newsk->timeout = 0;
3126         newsk->ip_xmit_timeout = 0;
3127         newsk->write_seq = seq; 
3128         newsk->window_seq = newsk->write_seq;
3129         newsk->rcv_ack_seq = newsk->write_seq;
3130         newsk->urg_data = 0;
3131         newsk->retransmits = 0;
3132         newsk->linger=0;
3133         newsk->destroy = 0;
3134         init_timer(&newsk->timer);
3135         newsk->timer.data = (unsigned long)newsk;
3136         newsk->timer.function = &net_timer;
3137         init_timer(&newsk->retransmit_timer);
3138         newsk->retransmit_timer.data = (unsigned long)newsk;
3139         newsk->retransmit_timer.function=&retransmit_timer;
3140         newsk->dummy_th.source = skb->h.th->dest;
3141         newsk->dummy_th.dest = skb->h.th->source;
3142         
3143         /*
3144          *      Swap these two, they are from our point of view. 
3145          */
3146          
3147         newsk->daddr = saddr;
3148         newsk->saddr = daddr;
3149         newsk->rcv_saddr = daddr;
3150 
3151         put_sock(newsk->num,newsk);
3152         newsk->dummy_th.res1 = 0;
3153         newsk->dummy_th.doff = 6;
3154         newsk->dummy_th.fin = 0;
3155         newsk->dummy_th.syn = 0;
3156         newsk->dummy_th.rst = 0;        
3157         newsk->dummy_th.psh = 0;
3158         newsk->dummy_th.ack = 0;
3159         newsk->dummy_th.urg = 0;
3160         newsk->dummy_th.res2 = 0;
3161         newsk->acked_seq = skb->seq + 1;
3162         newsk->copied_seq = skb->seq + 1;
3163         newsk->socket = NULL;
3164 
3165         /*
3166          *      Grab the ttl and tos values and use them 
3167          */
3168 
3169         newsk->ip_ttl=sk->ip_ttl;
3170         newsk->ip_tos=skb->ip_hdr->tos;
3171 
3172         /*
3173          *      Use 512 or whatever user asked for 
3174          */
3175 
3176         /*
3177          *      Note use of sk->user_mss, since user has no direct access to newsk 
3178          */
3179 
3180         rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
3181         newsk->ip_route_cache = rt;
3182         
3183         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3184                 newsk->window_clamp = rt->rt_window;
3185         else
3186                 newsk->window_clamp = 0;
3187                 
3188         if (sk->user_mss)
3189                 newsk->mtu = sk->user_mss;
3190         else if (rt)
3191                 newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
3192         else 
3193                 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
3194 
3195         /*
3196          *      But not bigger than device MTU 
3197          */
3198 
3199         newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
3200 
3201 #ifdef CONFIG_SKIP
3202         
3203         /*
3204          *      SKIP devices set their MTU to 65535. This is so they can take packets
3205          *      unfragmented to security process then fragment. They could lie to the
3206          *      TCP layer about a suitable MTU, but its easier to let skip sort it out
3207          *      simply because the final package we want unfragmented is going to be
3208          *
3209          *      [IPHDR][IPSP][Security data][Modified TCP data][Security data]
3210          */
3211          
3212         if(skip_pick_mtu!=NULL)         /* If SKIP is loaded.. */
3213                 sk->mtu=skip_pick_mtu(sk->mtu,dev);
3214 #endif
3215         /*
3216          *      This will min with what arrived in the packet 
3217          */
3218 
3219         tcp_options(newsk,skb->h.th);
3220         
3221         tcp_cache_zap();
3222 
3223         buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3224         if (buff == NULL) 
3225         {
3226                 sk->err = ENOMEM;
3227                 newsk->dead = 1;
3228                 newsk->state = TCP_CLOSE;
3229                 /* And this will destroy it */
3230                 release_sock(newsk);
3231                 kfree_skb(skb, FREE_READ);
3232                 tcp_statistics.TcpAttemptFails++;
3233                 return;
3234         }
3235   
3236         buff->sk = newsk;
3237         buff->localroute = newsk->localroute;
3238 
3239         /*
3240          *      Put in the IP header and routing stuff. 
3241          */
3242 
3243         tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3244                                IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
3245 
3246         /*
3247          *      Something went wrong. 
3248          */
3249 
3250         if (tmp < 0) 
3251         {
3252                 sk->err = tmp;
3253                 buff->free = 1;
3254                 kfree_skb(buff,FREE_WRITE);
3255                 newsk->dead = 1;
3256                 newsk->state = TCP_CLOSE;
3257                 release_sock(newsk);
3258                 skb->sk = sk;
3259                 kfree_skb(skb, FREE_READ);
3260                 tcp_statistics.TcpAttemptFails++;
3261                 return;
3262         }
3263 
3264         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
3265   
3266         memcpy(t1, skb->h.th, sizeof(*t1));
3267         buff->seq = newsk->write_seq++;
3268         buff->end_seq = newsk->write_seq;
3269         /*
3270          *      Swap the send and the receive. 
3271          */
3272         t1->dest = skb->h.th->source;
3273         t1->source = newsk->dummy_th.source;
3274         t1->seq = ntohl(buff->seq);
3275         t1->ack = 1;
3276         newsk->sent_seq = newsk->write_seq;
3277         t1->window = ntohs(tcp_select_window(newsk));
3278         t1->res1 = 0;
3279         t1->res2 = 0;
3280         t1->rst = 0;
3281         t1->urg = 0;
3282         t1->psh = 0;
3283         t1->syn = 1;
3284         t1->ack_seq = htonl(newsk->acked_seq);
3285         t1->doff = sizeof(*t1)/4+1;
3286         ptr = skb_put(buff,4);
3287         ptr[0] = 2;
3288         ptr[1] = 4;
3289         ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3290         ptr[3] =(newsk->mtu) & 0xff;
3291 
3292         tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3293         newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3294         reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3295         skb->sk = newsk;
3296 
3297         /*
3298          *      Charge the sock_buff to newsk. 
3299          */
3300          
3301         sk->rmem_alloc -= skb->truesize;
3302         newsk->rmem_alloc += skb->truesize;
3303         
3304         skb_queue_tail(&sk->receive_queue,skb);
3305         sk->ack_backlog++;
3306         release_sock(newsk);
3307         tcp_statistics.TcpOutSegs++;
3308 }
3309 
3310 
3311 static void tcp_close(struct sock *sk, int timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
3312 {
3313         /*
3314          * We need to grab some memory, and put together a FIN, 
3315          * and then put it into the queue to be sent.
3316          */
3317         
3318         sk->inuse = 1;
3319         
3320         if(th_cache_sk==sk)
3321                 tcp_cache_zap();
3322         if(sk->state == TCP_LISTEN)
3323         {
3324                 /* Special case */
3325                 tcp_set_state(sk, TCP_CLOSE);
3326                 tcp_close_pending(sk);
3327                 release_sock(sk);
3328                 return;
3329         }
3330         
3331         sk->keepopen = 1;
3332         sk->shutdown = SHUTDOWN_MASK;
3333 
3334         if (!sk->dead) 
3335                 sk->state_change(sk);
3336 
3337         if (timeout == 0) 
3338         {
3339                 struct sk_buff *skb;
3340                 
3341                 /*
3342                  *  We need to flush the recv. buffs.  We do this only on the
3343                  *  descriptor close, not protocol-sourced closes, because the
3344                  *  reader process may not have drained the data yet!
3345                  */
3346                  
3347                 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3348                         kfree_skb(skb, FREE_READ);
3349                 /*
3350                  *      Get rid off any half-completed packets. 
3351                  */
3352 
3353                 if (sk->partial) 
3354                         tcp_send_partial(sk);
3355         }
3356 
3357                 
3358         /*
3359          *      Timeout is not the same thing - however the code likes
3360          *      to send both the same way (sigh).
3361          */
3362          
3363         if(timeout)
3364         {
3365                 tcp_set_state(sk, TCP_CLOSE);   /* Dead */
3366         }
3367         else
3368         {
3369                 if(tcp_close_state(sk,1)==1)
3370                 {
3371                         tcp_send_fin(sk);
3372                 }
3373         }
3374         release_sock(sk);
3375 }
3376 
3377 
3378 /*
3379  *      This routine takes stuff off of the write queue,
3380  *      and puts it in the xmit queue. This happens as incoming acks
3381  *      open up the remote window for us.
3382  */
3383  
static void tcp_write_xmit(struct sock *sk)
{
        struct sk_buff *skb;

        /*
         *      The bytes will have to remain here. In time closedown will
         *      empty the write queue and all will be happy 
         */

        if(sk->zapped)
                return;

        /*
         *      Anything on the transmit queue that fits the window can
         *      be added providing we are not
         *
         *      a) retransmitting (Nagle's rule)
         *      b) exceeding our congestion window.
         *
         *      Note the &&-ordering: the skb_peek() NULL check must come
         *      first, since the remaining clauses dereference skb.
         */
         
        while((skb = skb_peek(&sk->write_queue)) != NULL &&
                before(skb->end_seq, sk->window_seq + 1) &&
                (sk->retransmits == 0 ||
                 sk->ip_xmit_timeout != TIME_WRITE ||
                 before(skb->end_seq, sk->rcv_ack_seq + 1))
                && sk->packets_out < sk->cong_window) 
        {
                IS_SKB(skb);
                skb_unlink(skb);
                
                /*
                 *      See if we really need to send the packet.  Anything
                 *      up to rcv_ack_seq is already acknowledged.
                 */
                 
                if (before(skb->end_seq, sk->rcv_ack_seq +1)) 
                {
                        /*
                         *      This is acked data. We can discard it. This 
                         *      cannot currently occur.
                         */
                         
                        sk->retransmits = 0;
                        kfree_skb(skb, FREE_WRITE);
                        if (!sk->dead) 
                                sk->write_space(sk);
                } 
                else
                {
                        struct tcphdr *th;
                        struct iphdr *iph;
                        int size;
/*
 * put in the ack seq and window at this point rather than earlier,
 * in order to keep them monotonic.  We really want to avoid taking
 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
 * Ack and window will in general have changed since this packet was put
 * on the write queue.
 */
                        iph = skb->ip_hdr;
                        th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
                        size = skb->len - (((unsigned char *) th) - skb->data);
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
                        /*
                         *      Frame exceeds the discovered path MTU: clear
                         *      the DF bit so routers may fragment it, and
                         *      recompute the IP checksum we just invalidated.
                         */
                        if (size > sk->mtu - sizeof(struct iphdr))
                        {
                                iph->frag_off &= ~htons(IP_DF);
                                ip_send_check(iph);
                        }
#endif
                        
                        th->ack_seq = htonl(sk->acked_seq);
                        th->window = htons(tcp_select_window(sk));

                        /* Header fields changed above: redo TCP checksum. */
                        tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

                        sk->sent_seq = skb->end_seq;
                        
                        /*
                         *      IP manages our queue for some crazy reason
                         */
                         
                        sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
                        
                        /* This segment carries the ack: delayed-ack state resets. */
                        sk->ack_backlog = 0;
                        sk->bytes_rcv = 0;

                        /*
                         *      Again we slide the timer wrongly
                         */
                         
                        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
                }
        }
}
3478 
3479 
3480 /*
3481  *      This routine deals with incoming acks, but not outgoing ones.
3482  */
3483 
3484 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
     /* [previous][next][first][last][top][bottom][index][help] */
3485 {
3486         u32 ack;
3487         int flag = 0;
3488 
3489         /* 
3490          * 1 - there was data in packet as well as ack or new data is sent or 
3491          *     in shutdown state
3492          * 2 - data from retransmit queue was acked and removed
3493          * 4 - window shrunk or data from retransmit queue was acked and removed
3494          */
3495 
3496         if(sk->zapped)
3497                 return(1);      /* Dead, cant ack any more so why bother */
3498 
3499         /*
3500          *      Have we discovered a larger window
3501          */
3502          
3503         ack = ntohl(th->ack_seq);
3504 
3505         if (ntohs(th->window) > sk->max_window) 
3506         {
3507                 sk->max_window = ntohs(th->window);
3508 #ifdef CONFIG_INET_PCTCP
3509                 /* Hack because we don't send partial packets to non SWS
3510                    handling hosts */
3511                 sk->mss = min(sk->max_window>>1, sk->mtu);
3512 #else
3513                 sk->mss = min(sk->max_window, sk->mtu);
3514 #endif  
3515         }
3516 
3517         /*
3518          *      We have dropped back to keepalive timeouts. Thus we have
3519          *      no retransmits pending.
3520          */
3521          
3522         if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3523                 sk->retransmits = 0;
3524 
3525         /*
3526          *      If the ack is newer than sent or older than previous acks
3527          *      then we can probably ignore it.
3528          */
3529          
3530         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
3531         {
3532                 if(sk->debug)
3533                         printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3534                         
3535                 /*
3536                  *      Keepalive processing.
3537                  */
3538                  
3539                 if (after(ack, sk->sent_seq)) 
3540                 {
3541                         return(0);
3542                 }
3543                 
3544                 /*
3545                  *      Restart the keepalive timer.
3546                  */
3547                  
3548                 if (sk->keepopen) 
3549                 {
3550                         if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3551                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3552                 }
3553                 return(1);
3554         }
3555 
3556         /*
3557          *      If there is data set flag 1
3558          */
3559          
3560         if (len != th->doff*4) 
3561                 flag |= 1;
3562 
3563         /*
3564          *      See if our window has been shrunk. 
3565          */
3566 
3567         if (after(sk->window_seq, ack+ntohs(th->window))) 
3568         {
3569                 /*
3570                  * We may need to move packets from the send queue
3571                  * to the write queue, if the window has been shrunk on us.
3572                  * The RFC says you are not allowed to shrink your window
3573                  * like this, but if the other end does, you must be able
3574                  * to deal with it.
3575                  */
3576                 struct sk_buff *skb;
3577                 struct sk_buff *skb2;
3578                 struct sk_buff *wskb = NULL;
3579         
3580                 skb2 = sk->send_head;
3581                 sk->send_head = NULL;
3582                 sk->send_tail = NULL;
3583         
3584                 /*
3585                  *      This is an artifact of a flawed concept. We want one
3586                  *      queue and a smarter send routine when we send all.
3587                  */
3588         
3589                 flag |= 4;      /* Window changed */
3590         
3591                 sk->window_seq = ack + ntohs(th->window);
3592                 cli();
3593                 while (skb2 != NULL) 
3594                 {
3595                         skb = skb2;
3596                         skb2 = skb->link3;
3597                         skb->link3 = NULL;
3598                         if (after(skb->end_seq, sk->window_seq)) 
3599                         {
3600                                 if (sk->packets_out > 0) 
3601                                         sk->packets_out--;
3602                                 /* We may need to remove this from the dev send list. */
3603                                 if (skb->next != NULL) 
3604                                 {
3605                                         skb_unlink(skb);                                
3606                                 }
3607                                 /* Now add it to the write_queue. */
3608                                 if (wskb == NULL)
3609                                         skb_queue_head(&sk->write_queue,skb);
3610                                 else
3611                                         skb_append(wskb,skb);
3612                                 wskb = skb;
3613                         } 
3614                         else 
3615                         {
3616                                 if (sk->send_head == NULL) 
3617                                 {
3618                                         sk->send_head = skb;
3619                                         sk->send_tail = skb;
3620                                 }
3621                                 else
3622                                 {
3623                                         sk->send_tail->link3 = skb;
3624                                         sk->send_tail = skb;
3625                                 }
3626                                 skb->link3 = NULL;
3627                         }
3628                 }
3629                 sti();
3630         }
3631 
3632         /*
3633          *      Pipe has emptied
3634          */
3635          
3636         if (sk->send_tail == NULL || sk->send_head == NULL) 
3637         {
3638                 sk->send_head = NULL;
3639                 sk->send_tail = NULL;
3640                 sk->packets_out= 0;
3641         }
3642 
3643         /*
3644          *      Update the right hand window edge of the host
3645          */
3646          
3647         sk->window_seq = ack + ntohs(th->window);
3648 
3649         /*
3650          *      We don't want too many packets out there. 
3651          */
3652          
3653         if (sk->ip_xmit_timeout == TIME_WRITE && 
3654                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
3655         {
3656                 /* 
3657                  * This is Jacobson's slow start and congestion avoidance. 
3658                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
3659                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
3660                  * counter and increment it once every cwnd times.  It's possible
3661                  * that this should be done only if sk->retransmits == 0.  I'm
3662                  * interpreting "new data is acked" as including data that has
3663                  * been retransmitted but is just now being acked.
3664                  */
3665                 if (sk->cong_window < sk->ssthresh)  
3666                         /* 
3667                          *      In "safe" area, increase
3668                          */
3669                         sk->cong_window++;
3670                 else 
3671                 {
3672                         /*
3673                          *      In dangerous area, increase slowly.  In theory this is
3674                          *      sk->cong_window += 1 / sk->cong_window
3675                          */
3676                         if (sk->cong_count >= sk->cong_window) 
3677                         {
3678                                 sk->cong_window++;
3679                                 sk->cong_count = 0;
3680                         }
3681                         else 
3682                                 sk->cong_count++;
3683                 }
3684         }
3685 
3686         /*
3687          *      Remember the highest ack received.
3688          */
3689          
3690         sk->rcv_ack_seq = ack;
3691         
3692         /*
3693          *      We passed data and got it acked, remove any soft error
3694          *      log. Something worked...
3695          */
3696          
3697         sk->err_soft = 0;
3698 
3699         /*
3700          *      If this ack opens up a zero window, clear backoff.  It was
3701          *      being used to time the probes, and is probably far higher than
3702          *      it needs to be for normal retransmission.
3703          */
3704 
3705         if (sk->ip_xmit_timeout == TIME_PROBE0) 
3706         {
3707                 sk->retransmits = 0;    /* Our probe was answered */
3708                 
3709                 /*
3710                  *      Was it a usable window open ?
3711                  */
3712                  
3713                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
3714                     ! before (sk->window_seq, sk->write_queue.next->end_seq)) 
3715                 {
3716                         sk->backoff = 0;
3717                         
3718                         /*
3719                          *      Recompute rto from rtt.  this eliminates any backoff.
3720                          */
3721 
3722                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3723                         if (sk->rto > 120*HZ)
3724                                 sk->rto = 120*HZ;
3725                         if (sk->rto < HZ/5)     /* Was 1*HZ, then 1 - turns out we must allow about
3726                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3727                                                    .2 of a second is going to need huge windows (SIGH) */
3728                         sk->rto = HZ/5;
3729                 }
3730         }
3731 
3732         /* 
3733          *      See if we can take anything off of the retransmit queue.
3734          */
3735    
3736         while(sk->send_head != NULL) 
3737         {
3738                 /* Check for a bug. */
3739                 if (sk->send_head->link3 &&
3740                     after(sk->send_head->end_seq, sk->send_head->link3->end_seq)) 
3741                         printk("INET: tcp.c: *** bug send_list out of order.\n");
3742                         
3743                 /*
3744                  *      If our packet is before the ack sequence we can
3745                  *      discard it as it's confirmed to have arrived the other end.
3746                  */
3747                  
3748                 if (before(sk->send_head->end_seq, ack+1)) 
3749                 {
3750                         struct sk_buff *oskb;   
3751                         if (sk->retransmits) 
3752                         {       
3753                                 /*
3754                                  *      We were retransmitting.  don't count this in RTT est 
3755                                  */
3756                                 flag |= 2;
3757 
3758                                 /*
3759                                  * even though we've gotten an ack, we're still
3760                                  * retransmitting as long as we're sending from
3761                                  * the retransmit queue.  Keeping retransmits non-zero
3762                                  * prevents us from getting new data interspersed with
3763                                  * retransmissions.
3764                                  */
3765 
3766                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
3767                                         sk->retransmits = 1;
3768                                 else
3769                                         sk->retransmits = 0;
3770                         }
3771                         /*
3772                          * Note that we only reset backoff and rto in the
3773                          * rtt recomputation code.  And that doesn't happen
3774                          * if there were retransmissions in effect.  So the
3775                          * first new packet after the retransmissions is
3776                          * sent with the backoff still in effect.  Not until
3777                          * we get an ack from a non-retransmitted packet do
3778                          * we reset the backoff and rto.  This allows us to deal
3779                          * with a situation where the network delay has increased
3780                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3781                          */
3782 
3783                         /*
3784                          *      We have one less packet out there. 
3785                          */
3786                          
3787                         if (sk->packets_out > 0) 
3788                                 sk->packets_out --;
3789                         /* 
3790                          *      Wake up the process, it can probably write more. 
3791                          */
3792                         if (!sk->dead) 
3793                                 sk->write_space(sk);
3794                         oskb = sk->send_head;
3795 
3796                         if (!(flag&2))  /* Not retransmitting */
3797                         {
3798                                 long m;
3799         
3800                                 /*
3801                                  *      The following amusing code comes from Jacobson's
3802                                  *      article in SIGCOMM '88.  Note that rtt and mdev
3803                                  *      are scaled versions of rtt and mean deviation.
3804                                  *      This is designed to be as fast as possible 
3805                                  *      m stands for "measurement".
3806                                  */
3807         
3808                                 m = jiffies - oskb->when;  /* RTT */
3809                                 if(m<=0)
3810                                         m=1;            /* IS THIS RIGHT FOR <0 ??? */
3811                                 m -= (sk->rtt >> 3);    /* m is now error in rtt est */
3812                                 sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
3813                                 if (m < 0)
3814                                         m = -m;         /* m is now abs(error) */
3815                                 m -= (sk->mdev >> 2);   /* similar update on mdev */
3816                                 sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
3817         
3818                                 /*
3819                                  *      Now update timeout.  Note that this removes any backoff.
3820                                  */
3821                          
3822                                 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3823                                 if (sk->rto > 120*HZ)
3824                                         sk->rto = 120*HZ;
3825                                 if (sk->rto < HZ/5)     /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3826                                         sk->rto = HZ/5;
3827                                 sk->backoff = 0;
3828                         }
3829                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
3830                                            In this case as we just set it up */
3831                         cli();
3832                         oskb = sk->send_head;
3833                         IS_SKB(oskb);
3834                         sk->send_head = oskb->link3;
3835                         if (sk->send_head == NULL) 
3836                         {
3837                                 sk->send_tail = NULL;
3838                         }
3839 
3840                 /*
3841                  *      We may need to remove this from the dev send list. 
3842                  */
3843 
3844                         if (oskb->next)
3845                                 skb_unlink(oskb);
3846                         sti();
3847                         kfree_skb(oskb, FREE_WRITE); /* write. */
3848                         if (!sk->dead) 
3849                                 sk->write_space(sk);
3850                 }
3851                 else
3852                 {
3853                         break;
3854                 }
3855         }
3856 
3857         /*
3858          * XXX someone ought to look at this too.. at the moment, if skb_peek()
3859  * returns non-NULL, we completely ignore the timer stuff in the else
3860          * clause.  We ought to organize the code so that else clause can
3861          * (should) be executed regardless, possibly moving the PROBE timer
3862          * reset over.  The skb_peek() thing should only move stuff to the
3863          * write queue, NOT also manage the timer functions.
3864          */
3865 
3866         /*
3867          * Maybe we can take some stuff off of the write queue,
3868          * and put it onto the xmit queue.
3869          */
3870         if (skb_peek(&sk->write_queue) != NULL) 
3871         {
3872                 if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
3873                         (sk->retransmits == 0 || 
3874                          sk->ip_xmit_timeout != TIME_WRITE ||
3875                          before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
3876                         && sk->packets_out < sk->cong_window) 
3877                 {
3878                         /*
3879                          *      Add more data to the send queue.
3880                          */
3881                         flag |= 1;
3882                         tcp_write_xmit(sk);
3883                 }
3884                 else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
3885                         sk->send_head == NULL &&
3886                         sk->ack_backlog == 0 &&
3887                         sk->state != TCP_TIME_WAIT) 
3888                 {
3889                         /*
3890                          *      Data to queue but no room.
3891                          */
3892                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3893                 }               
3894         }
3895         else
3896         {
3897                 /*
3898                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3899                  * from TCP_CLOSE we don't do anything
3900                  *
3901                  * from anything else, if there is write data (or fin) pending,
3902                  * we use a TIME_WRITE timeout, else if keepalive we reset to
3903                  * a KEEPALIVE timeout, else we delete the timer.
3904                  *
3905                  * We do not set flag for nominal write data, otherwise we may
3906                  * force a state where we start to write itsy bitsy tidbits
3907                  * of data.
3908                  */
3909 
3910                 switch(sk->state) {
3911                 case TCP_TIME_WAIT:
3912                         /*
3913                          * keep us in TIME_WAIT until we stop getting packets,
3914                          * reset the timeout.
3915                          */
3916                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3917                         break;
3918                 case TCP_CLOSE:
3919                         /*
3920                          * don't touch the timer.
3921                          */
3922                         break;
3923                 default:
3924                         /*
3925                          *      Must check send_head, write_queue, and ack_backlog
3926                          *      to determine which timeout to use.
3927                          */
3928                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3929                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3930                         } else if (sk->keepopen) {
3931                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3932                         } else {
3933                                 del_timer(&sk->retransmit_timer);
3934                                 sk->ip_xmit_timeout = 0;
3935                         }
3936                         break;
3937                 }
3938         }
3939 
3940         /*
3941          *      We have nothing queued but space to send. Send any partial
3942          *      packets immediately (end of Nagle rule application).
3943          */
3944          
3945         if (sk->packets_out == 0 && sk->partial != NULL &&
3946                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
3947         {
3948                 flag |= 1;
3949                 tcp_send_partial(sk);
3950         }
3951 
3952         /*
3953          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
3954          * we are now waiting for an acknowledge to our FIN.  The other end is
3955          * already in TIME_WAIT.
3956          *
3957          * Move to TCP_CLOSE on success.
3958          */
3959 
3960         if (sk->state == TCP_LAST_ACK) 
3961         {
3962                 if (!sk->dead)
3963                         sk->state_change(sk);
3964                 if(sk->debug)
3965                         printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3966                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3967                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
3968                 {
3969                         flag |= 1;
3970                         sk->shutdown = SHUTDOWN_MASK;
3971                         tcp_set_state(sk,TCP_CLOSE);
3972                         return 1;
3973                 }
3974         }
3975 
3976         /*
3977          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
3978          *
3979          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
3980          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3981          */
3982 
3983         if (sk->state == TCP_FIN_WAIT1) 
3984         {
3985 
3986                 if (!sk->dead) 
3987                         sk->state_change(sk);
3988                 if (sk->rcv_ack_seq == sk->write_seq) 
3989                 {
3990                         flag |= 1;
3991                         sk->shutdown |= SEND_SHUTDOWN;
3992                         tcp_set_state(sk, TCP_FIN_WAIT2);
3993                 }
3994         }
3995 
3996         /*
3997          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
3998          *
3999          *      Move to TIME_WAIT
4000          */
4001 
4002         if (sk->state == TCP_CLOSING) 
4003         {
4004 
4005                 if (!sk->dead) 
4006                         sk->state_change(sk);
4007                 if (sk->rcv_ack_seq == sk->write_seq) 
4008                 {
4009                         flag |= 1;
4010                         tcp_time_wait(sk);
4011                 }
4012         }
4013         
4014         /*
4015          *      Final ack of a three way shake 
4016          */
4017          
4018         if(sk->state==TCP_SYN_RECV)
4019         {
4020                 tcp_set_state(sk, TCP_ESTABLISHED);
4021                 tcp_options(sk,th);
4022                 sk->dummy_th.dest=th->source;
4023                 sk->copied_seq = sk->acked_seq;
4024                 if(!sk->dead)
4025                         sk->state_change(sk);
4026                 if(sk->max_window==0)
4027                 {
4028                         sk->max_window=32;      /* Sanity check */
4029                         sk->mss=min(sk->max_window,sk->mtu);
4030                 }
4031         }
4032         
4033         /*
4034          * I make no guarantees about the first clause in the following
4035          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
4036          * what conditions "!flag" would be true.  However I think the rest
4037          * of the conditions would prevent that from causing any
4038          * unnecessary retransmission. 
4039          *   Clearly if the first packet has expired it should be 
4040          * retransmitted.  The other alternative, "flag&2 && retransmits", is
4041          * harder to explain:  You have to look carefully at how and when the
4042          * timer is set and with what timeout.  The most recent transmission always
4043          * sets the timer.  So in general if the most recent thing has timed
4044          * out, everything before it has as well.  So we want to go ahead and
4045          * retransmit some more.  If we didn't explicitly test for this
4046          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
4047          * would not be true.  If you look at the pattern of timing, you can
4048          * show that rto is increased fast enough that the next packet would
4049          * almost never be retransmitted immediately.  Then you'd end up
4050          * waiting for a timeout to send each packet on the retransmission
4051          * queue.  With my implementation of the Karn sampling algorithm,
4052          * the timeout would double each time.  The net result is that it would
4053          * take a hideous amount of time to recover from a single dropped packet.
4054          * It's possible that there should also be a test for TIME_WRITE, but
4055          * I think as long as "send_head != NULL" and "retransmit" is on, we've
4056          * got to be in real retransmission mode.
4057          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
4058          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
4059          * As long as no further losses occur, this seems reasonable.
4060          */
4061         
4062         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
4063                (((flag&2) && sk->retransmits) ||
4064                (sk->send_head->when + sk->rto < jiffies))) 
4065         {
4066                 if(sk->send_head->when + sk->rto < jiffies)
4067                         tcp_retransmit(sk,0);   
4068                 else
4069                 {
4070                         tcp_do_retransmit(sk, 1);
4071                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4072                 }
4073         }
4074 
4075         return(1);
4076 }
4077 
4078 
4079 /*
4080  *      Process the FIN bit. This now behaves as it is supposed to work
4081  *      and the FIN takes effect when it is validly part of sequence
4082  *      space. Not before when we get holes.
4083  *
4084  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
4085  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
4086  *      TIME-WAIT)
4087  *
4088  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
4089  *      close and we go into CLOSING (and later onto TIME-WAIT)
4090  *
4091  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
4092  *
4093  */
4094  
4095 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
4096 {
4097         sk->fin_seq = skb->end_seq;
4098 
4099         if (!sk->dead) 
4100         {
4101                 sk->state_change(sk);
4102                 sock_wake_async(sk->socket, 1);
4103         }
4104 
4105         switch(sk->state) 
4106         {
4107                 case TCP_SYN_RECV:
4108                 case TCP_SYN_SENT:
4109                 case TCP_ESTABLISHED:
4110                         /*
4111                          * move to CLOSE_WAIT, tcp_data() already handled
4112                          * sending the ack.
4113                          */
4114                         tcp_set_state(sk,TCP_CLOSE_WAIT);
4115                         if (th->rst)
4116                                 sk->shutdown = SHUTDOWN_MASK;
4117                         break;
4118 
4119                 case TCP_CLOSE_WAIT:
4120                 case TCP_CLOSING:
4121                         /*
4122                          * received a retransmission of the FIN, do
4123                          * nothing.
4124                          */
4125                         break;
4126                 case TCP_TIME_WAIT:
4127                         /*
4128                          * received a retransmission of the FIN,
4129                          * restart the TIME_WAIT timer.
4130                          */
4131                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4132                         return(0);
4133                 case TCP_FIN_WAIT1:
4134                         /*
4135                          * This case occurs when a simultaneous close
4136                          * happens, we must ack the received FIN and
4137                          * enter the CLOSING state.
4138                          *
4139                          * This causes a WRITE timeout, which will either
4140                          * move on to TIME_WAIT when we timeout, or resend
4141                          * the FIN properly (maybe we get rid of that annoying
4142                          * FIN lost hang). The TIME_WRITE code is already correct
4143                          * for handling this timeout.
4144                          */
4145 
4146                         if(sk->ip_xmit_timeout != TIME_WRITE)
4147                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4148                         tcp_set_state(sk,TCP_CLOSING);
4149                         break;
4150                 case TCP_FIN_WAIT2:
4151                         /*
4152                          * received a FIN -- send ACK and enter TIME_WAIT
4153                          */
4154                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4155                         sk->shutdown|=SHUTDOWN_MASK;
4156                         tcp_set_state(sk,TCP_TIME_WAIT);
4157                         break;
4158                 case TCP_CLOSE:
4159                         /*
4160                          * already in CLOSE
4161                          */
4162                         break;
4163                 default:
4164                         tcp_set_state(sk,TCP_LAST_ACK);
4165         
4166                         /* Start the timers. */
4167                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4168                         return(0);
4169         }
4170 
4171         return(0);
4172 }
4173 
4174 
4175 
4176 /*
4177  *      This routine handles the data.  If there is room in the buffer,
4178  *      it will have already been moved into it.  If there is no
4179  *      room, then we will just have to discard the packet.
4180  */
4181 
4182 extern /* __inline__ */ int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /* [previous][next][first][last][top][bottom][index][help] */
4183          unsigned long saddr, unsigned short len)
4184 {
4185         struct sk_buff *skb1, *skb2;
4186         struct tcphdr *th;
4187         int dup_dumped=0;
4188         u32 new_seq, shut_seq;
4189 
4190         th = skb->h.th;
4191         skb_pull(skb,th->doff*4);
4192         skb_trim(skb,len-(th->doff*4));
4193 
4194         /*
4195          *      The bytes in the receive read/assembly queue has increased. Needed for the
4196          *      low memory discard algorithm 
4197          */
4198            
4199         sk->bytes_rcv += skb->len;
4200         
4201         if (skb->len == 0 && !th->fin) 
4202         {
4203                 /* 
4204                  *      Don't want to keep passing ack's back and forth. 
4205                  *      (someone sent us dataless, boring frame)
4206                  */
4207                 if (!th->ack)
4208                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4209                 kfree_skb(skb, FREE_READ);
4210                 return(0);
4211         }
4212         
4213         /*
4214          *      We no longer have anyone receiving data on this connection.
4215          */
4216 
4217 #ifndef TCP_DONT_RST_SHUTDOWN            
4218 
4219         if(sk->shutdown & RCV_SHUTDOWN)
4220         {
4221                 /*
4222                  *      FIXME: BSD has some magic to avoid sending resets to
4223                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
4224                  *      BSD stacks still have broken keepalives so we want to
4225                  *      cope with it.
4226                  */
4227 
4228                 if(skb->len)    /* We don't care if it's just an ack or
4229                                    a keepalive/window probe */
4230                 {
4231                         new_seq = skb->seq + skb->len + th->syn;        /* Right edge of _data_ part of frame */
4232                         
4233                         /* Do this the way 4.4BSD treats it. Not what I'd
4234                            regard as the meaning of the spec but it's what BSD
4235                            does and clearly they know everything 8) */
4236 
4237                         /*
4238                          *      This is valid because of two things
4239                          *
4240                          *      a) The way tcp_data behaves at the bottom.
4241                          *      b) A fin takes effect when read not when received.
4242                          */
4243                          
4244                         shut_seq = sk->acked_seq+1;     /* Last byte */
4245                         
4246                         if(after(new_seq,shut_seq))
4247                         {
4248                                 if(sk->debug)
4249                                         printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4250                                                 sk, new_seq, shut_seq, sk->blog);
4251                                 if(sk->dead)
4252                                 {
4253                                         sk->acked_seq = new_seq + th->fin;
4254                                         tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4255                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4256                                         tcp_statistics.TcpEstabResets++;
4257                                         sk->err = EPIPE;
4258                                         sk->error_report(sk);
4259                                         sk->shutdown = SHUTDOWN_MASK;
4260                                         tcp_set_state(sk,TCP_CLOSE);
4261                                         kfree_skb(skb, FREE_READ);
4262                                         return 0;
4263                                 }
4264                         }
4265                 }
4266         }
4267 
4268 #endif
4269 
4270         /*
4271          *      Now we have to walk the chain, and figure out where this one
4272          *      goes into it.  This is set up so that the last packet we received
4273          *      will be the first one we look at, that way if everything comes
4274          *      in order, there will be no performance loss, and if they come
4275          *      out of order we will be able to fit things in nicely.
4276          *
4277          *      [AC: This is wrong. We should assume in order first and then walk
4278          *       forwards from the first hole based upon real traffic patterns.]
4279          *      
4280          */
4281 
4282         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
4283         {
4284                 skb_queue_head(&sk->receive_queue,skb);
4285                 skb1= NULL;
4286         } 
4287         else
4288         {
4289                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
4290                 {
4291                         if(sk->debug)
4292                         {
4293                                 printk("skb1=%p :", skb1);
4294                                 printk("skb1->seq = %d: ", skb1->seq);
4295                                 printk("skb->seq = %d\n",skb->seq);
4296                                 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4297                                                 sk->acked_seq);
4298                         }
4299                         
4300                         /*
4301                          *      Optimisation: Duplicate frame or extension of previous frame from
4302                          *      same sequence point (lost ack case).
4303                          *      The frame contains duplicate data or replaces a previous frame
4304                          *      discard the previous frame (safe as sk->inuse is set) and put
4305                          *      the new one in its place.
4306                          */
4307                          
4308                         if (skb->seq==skb1->seq && skb->len>=skb1->len)
4309                         {
4310                                 skb_append(skb1,skb);
4311                                 skb_unlink(skb1);
4312                                 kfree_skb(skb1,FREE_READ);
4313                                 dup_dumped=1;
4314                                 skb1=NULL;
4315                                 break;
4316                         }
4317                         
4318                         /*
4319                          *      Found where it fits
4320                          */
4321                          
4322                         if (after(skb->seq+1, skb1->seq))
4323                         {
4324                                 skb_append(skb1,skb);
4325                                 break;
4326                         }
4327                         
4328                         /*
4329                          *      See if we've hit the start. If so insert.
4330                          */
4331                         if (skb1 == skb_peek(&sk->receive_queue))
4332                         {
4333                                 skb_queue_head(&sk->receive_queue, skb);
4334                                 break;
4335                         }
4336                 }
4337         }
4338 
4339         /*
4340          *      Figure out what the ack value for this frame is
4341          */
4342          
4343         if (before(sk->acked_seq, sk->copied_seq)) 
4344         {
4345                 printk("*** tcp.c:tcp_data bug acked < copied\n");
4346                 sk->acked_seq = sk->copied_seq;
4347         }
4348 
4349         /*
4350          *      Now figure out if we can ack anything. This is very messy because we really want two
4351          *      receive queues, a completed and an assembly queue. We also want only one transmit
4352          *      queue.
4353          */
4354 
4355         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1)) 
4356         {
4357                 if (before(skb->seq, sk->acked_seq+1)) 
4358                 {
4359 
4360                         if (after(skb->end_seq, sk->acked_seq)) 
4361                                 sk->acked_seq = skb->end_seq;
4362 
4363                         skb->acked = 1;
4364 
4365                         /*
4366                          *      When we ack the fin, we do the FIN 
4367                          *      processing.
4368                          */
4369 
4370                         if (skb->h.th->fin) 
4371                         {
4372                                 tcp_fin(skb,sk,skb->h.th);
4373                         }
4374           
4375                         for(skb2 = skb->next;
4376                             skb2 != (struct sk_buff *)&sk->receive_queue;
4377                             skb2 = skb2->next) 
4378                         {
4379                                 if (before(skb2->seq, sk->acked_seq+1)) 
4380                                 {
4381                                         if (after(skb2->end_seq, sk->acked_seq))
4382                                                 sk->acked_seq = skb2->end_seq;
4383 
4384                                         skb2->acked = 1;
4385                                         /*
4386                                          *      When we ack the fin, we do
4387                                          *      the fin handling.
4388                                          */
4389                                         if (skb2->h.th->fin) 
4390                                         {
4391                                                 tcp_fin(skb,sk,skb->h.th);
4392                                         }
4393 
4394                                         /*
4395                                          *      Force an immediate ack.
4396                                          */
4397                                          
4398                                         sk->ack_backlog = sk->max_ack_backlog;
4399                                 }
4400                                 else
4401                                 {
4402                                         break;
4403                                 }
4404                         }
4405 
4406                         /*
4407                          *      This also takes care of updating the window.
4408                          *      This if statement needs to be simplified.
4409                          *
4410                          *      rules for delaying an ack:
4411                          *      - delay time <= 0.5 HZ
4412                          *      - we don't have a window update to send
4413                          *      - must send at least every 2 full sized packets
4414                          */
4415                         if (!sk->delay_acks ||
4416                             sk->ack_backlog >= sk->max_ack_backlog || 
4417                             sk->bytes_rcv > sk->max_unacked || th->fin ||
4418                             sk->ato > HZ/2 ||
4419                             tcp_raise_window(sk)) {
4420         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4421                         }
4422                         else 
4423                         {
4424                                 sk->ack_backlog++;
4425                                 
4426                                 if(sk->debug)                           
4427                                         printk("Ack queued.\n");
4428                                 reset_xmit_timer(sk, TIME_WRITE, sk->ato);
4429                         }
4430                 }
4431         }
4432 
4433         /*
4434          *      If we've missed a packet, send an ack.
4435          *      Also start a timer to send another.
4436          */
4437          
4438         if (!skb->acked) 
4439         {
4440         
4441         /*
4442          *      This is important.  If we don't have much room left,
4443          *      we need to throw out a few packets so we have a good
4444          *      window.  Note that mtu is used, not mss, because mss is really
4445          *      for the send side.  He could be sending us stuff as large as mtu.
4446          */
4447                  
4448                 while (sock_rspace(sk) < sk->mtu) 
4449                 {
4450                         skb1 = skb_peek(&sk->receive_queue);
4451                         if (skb1 == NULL) 
4452                         {
4453                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4454                                 break;
4455                         }
4456 
4457                         /*
4458                          *      Don't throw out something that has been acked. 
4459                          */
4460                  
4461                         if (skb1->acked) 
4462                         {
4463                                 break;
4464                         }
4465                 
4466                         skb_unlink(skb1);
4467                         kfree_skb(skb1, FREE_READ);
4468                 }
4469                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4470                 sk->ack_backlog++;
4471                 reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, 0.5 * HZ));
4472         }
4473         else
4474         {
4475                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4476         }
4477 
4478         /*
4479          *      Now tell the user we may have some data. 
4480          */
4481          
4482         if (!sk->dead) 
4483         {
4484                 if(sk->debug)
4485                         printk("Data wakeup.\n");
4486                 sk->data_ready(sk,0);
4487         } 
4488         return(0);
4489 }
4490 
4491 
4492 /*
4493  *      This routine is only called when we have urgent data
 *      signalled. It's the 'slow' part of tcp_urg. It could be
4495  *      moved inline now as tcp_urg is only called from one
4496  *      place. We handle URGent data wrong. We have to - as
4497  *      BSD still doesn't use the correction from RFC961.
4498  */
4499  
4500 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* [previous][next][first][last][top][bottom][index][help] */
4501 {
4502         u32 ptr = ntohs(th->urg_ptr);
4503 
4504         if (ptr)
4505                 ptr--;
4506         ptr += ntohl(th->seq);
4507 
4508         /* ignore urgent data that we've already seen and read */
4509         if (after(sk->copied_seq, ptr))
4510                 return;
4511 
4512         /* do we already have a newer (or duplicate) urgent pointer? */
4513         if (sk->urg_data && !after(ptr, sk->urg_seq))
4514                 return;
4515 
4516         /* tell the world about our new urgent pointer */
4517         if (sk->proc != 0) {
4518                 if (sk->proc > 0) {
4519                         kill_proc(sk->proc, SIGURG, 1);
4520                 } else {
4521                         kill_pg(-sk->proc, SIGURG, 1);
4522                 }
4523         }
4524         sk->urg_data = URG_NOTYET;
4525         sk->urg_seq = ptr;
4526 }
4527 
4528 /*
4529  *      This is the 'fast' part of urgent handling.
4530  */
4531  
4532 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
     /* [previous][next][first][last][top][bottom][index][help] */
4533         unsigned long saddr, unsigned long len)
4534 {
4535         u32 ptr;
4536 
4537         /*
4538          *      Check if we get a new urgent pointer - normally not 
4539          */
4540          
4541         if (th->urg)
4542                 tcp_check_urg(sk,th);
4543 
4544         /*
4545          *      Do we wait for any urgent data? - normally not
4546          */
4547          
4548         if (sk->urg_data != URG_NOTYET)
4549                 return 0;
4550 
4551         /*
4552          *      Is the urgent pointer pointing into this packet? 
4553          */
4554          
4555         ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
4556         if (ptr >= len)
4557                 return 0;
4558 
4559         /*
4560          *      Ok, got the correct packet, update info 
4561          */
4562          
4563         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4564         if (!sk->dead)
4565                 sk->data_ready(sk,0);
4566         return 0;
4567 }
4568 
4569 /*
4570  *      This will accept the next outstanding connection. 
4571  */
4572  
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;
  
  /*
   * We need to make sure that this socket is listening,
   * and that it has something pending.
   */

	if (sk->state != TCP_LISTEN) 
	{
		sk->err = EINVAL;
		return(NULL); 
	}

	/* Avoid the race. */
	/* Interrupts off so the bottom half cannot queue/steal a
	   connection between the dequeue test and going to sleep. */
	cli();
	sk->inuse = 1;

	/* Each queued skb's sk points at an established child socket. */
	while((skb = tcp_dequeue_established(sk)) == NULL) 
	{
		if (flags & O_NONBLOCK) 
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Drop the lock so the bottom half can complete new
		   connections while we sleep. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked) 
		{
			/* NOTE(review): the lock was released before the
			   sleep and inuse was not re-taken, so returning
			   without release_sock() here looks intentional -
			   confirm. */
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		/* Re-take the lock before probing the queue again. */
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk. 
	 */

	newsk = skb->sk;

	/* The skb only carried the child socket pointer; free it and
	   account for one fewer pending connection. */
	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4626 
4627 
4628 /*
4629  *      This will initiate an outgoing connection. 
4630  */
4631  
4632 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
     /* [previous][next][first][last][top][bottom][index][help] */
4633 {
4634         struct sk_buff *buff;
4635         struct device *dev=NULL;
4636         unsigned char *ptr;
4637         int tmp;
4638         int atype;
4639         struct tcphdr *t1;
4640         struct rtable *rt;
4641 
4642         if (sk->state != TCP_CLOSE) 
4643                 return(-EISCONN);
4644 
4645         /*
4646          *      Don't allow a double connect.
4647          */
4648                 
4649         if(sk->daddr)
4650                 return -EINVAL;
4651         
4652         if (addr_len < 8) 
4653                 return(-EINVAL);
4654 
4655         if (usin->sin_family && usin->sin_family != AF_INET) 
4656                 return(-EAFNOSUPPORT);
4657 
4658         /*
4659          *      connect() to INADDR_ANY means loopback (BSD'ism).
4660          */
4661         
4662         if(usin->sin_addr.s_addr==INADDR_ANY)
4663                 usin->sin_addr.s_addr=ip_my_addr();
4664                   
4665         /*
4666          *      Don't want a TCP connection going to a broadcast address 
4667          */
4668 
4669         if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
4670                 return -ENETUNREACH;
4671   
4672         sk->inuse = 1;
4673         sk->daddr = usin->sin_addr.s_addr;
4674         sk->write_seq = tcp_init_seq();
4675         sk->window_seq = sk->write_seq;
4676         sk->rcv_ack_seq = sk->write_seq -1;
4677         sk->err = 0;
4678         sk->dummy_th.dest = usin->sin_port;
4679         release_sock(sk);
4680 
4681         buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4682         if (buff == NULL) 
4683         {
4684                 return(-ENOMEM);
4685         }
4686         sk->inuse = 1;
4687         buff->sk = sk;
4688         buff->free = 0;
4689         buff->localroute = sk->localroute;
4690         
4691 
4692         /*
4693          *      Put in the IP header and routing stuff.
4694          */
4695          
4696         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4697                 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
4698         if (tmp < 0) 
4699         {
4700                 sock_wfree(sk, buff);
4701                 release_sock(sk);
4702                 return(-ENETUNREACH);
4703         }
4704         if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
4705                 sk->saddr = rt->rt_src;
4706         sk->rcv_saddr = sk->saddr;
4707 
4708         t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
4709 
4710         memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4711         buff->seq = sk->write_seq++;
4712         t1->seq = htonl(buff->seq);
4713         sk->sent_seq = sk->write_seq;
4714         buff->end_seq = sk->write_seq;
4715         t1->ack = 0;
4716         t1->window = 2;
4717         t1->res1=0;
4718         t1->res2=0;
4719         t1->rst = 0;
4720         t1->urg = 0;
4721         t1->psh = 0;
4722         t1->syn = 1;
4723         t1->urg_ptr = 0;
4724         t1->doff = 6;
4725         /* use 512 or whatever user asked for */
4726         
4727         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4728                 sk->window_clamp=rt->rt_window;
4729         else
4730                 sk->window_clamp=0;
4731 
4732         if (sk->user_mss)
4733                 sk->mtu = sk->user_mss;
4734         else if (rt)
4735                 sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
4736         else 
4737                 sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
4738 
4739         /*
4740          *      but not bigger than device MTU 
4741          */
4742 
4743         if(sk->mtu <32)
4744                 sk->mtu = 32;   /* Sanity limit */
4745                 
4746         sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
4747 
4748 #ifdef CONFIG_SKIP
4749         
4750         /*
4751          *      SKIP devices set their MTU to 65535. This is so they can take packets
4752          *      unfragmented to security process then fragment. They could lie to the
4753          *      TCP layer about a suitable MTU, but its easier to let skip sort it out
4754          *      simply because the final package we want unfragmented is going to be
4755          *
4756          *      [IPHDR][IPSP][Security data][Modified TCP data][Security data]
4757          */
4758          
4759         if(skip_pick_mtu!=NULL)         /* If SKIP is loaded.. */
4760                 sk->mtu=skip_pick_mtu(sk->mtu,dev);
4761 #endif
4762         
4763         /*
4764          *      Put in the TCP options to say MTU. 
4765          */
4766 
4767         ptr = skb_put(buff,4);
4768         ptr[0] = 2;
4769         ptr[1] = 4;
4770         ptr[2] = (sk->mtu) >> 8;
4771         ptr[3] = (sk->mtu) & 0xff;
4772         tcp_send_check(t1, sk->saddr, sk->daddr,
4773                   sizeof(struct tcphdr) + 4, sk);
4774 
4775         /*
4776          *      This must go first otherwise a really quick response will get reset. 
4777          */
4778 
4779         tcp_cache_zap();
4780         tcp_set_state(sk,TCP_SYN_SENT);
4781         if(rt&&rt->rt_flags&RTF_IRTT)
4782                 sk->rto = rt->rt_irtt;
4783         else
4784                 sk->rto = TCP_TIMEOUT_INIT;
4785         sk->retransmit_timer.function=&retransmit_timer;
4786         sk->retransmit_timer.data = (unsigned long)sk;
4787         reset_xmit_timer(sk, TIME_WRITE, sk->rto);      /* Timer for repeating the SYN until an answer  */
4788         sk->retransmits = 0;                            /* Now works the right way instead of a hacked 
4789                                                                                         initial setting */
4790 
4791         sk->prot->queue_xmit(sk, dev, buff, 0);  
4792         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4793         tcp_statistics.TcpActiveOpens++;
4794         tcp_statistics.TcpOutSegs++;
4795   
4796         release_sock(sk);
4797         return(0);
4798 }
4799 
4800 /*
 * React to an out-of-window TCP sequence number in an incoming packet
4802  */
4803 static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /* [previous][next][first][last][top][bottom][index][help] */
4804              struct options *opt, unsigned long saddr, struct device *dev)
4805 {
4806         if (th->rst)
4807                 return;
4808 
4809         /*
4810          *      Send a reset if we get something not ours and we are
4811          *      unsynchronized. Note: We don't do anything to our end. We
4812          *      are just killing the bogus remote connection then we will
4813          *      connect again and it will work (with luck).
4814          */
4815          
4816         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
4817         {
4818                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4819                 return;
4820         }
4821 
4822         /* Try to resync things. */
4823         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4824         return;
4825 }
4826 
4827 /*
 *      This function checks to see if the tcp header is actually acceptable. 
4829  */
4830  
4831 extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
     /* [previous][next][first][last][top][bottom][index][help] */
4832 {
4833         /* does the packet contain any unseen data AND */
4834         /* does the packet start before the window? */
4835         return  after(end_seq+1, sk->acked_seq) &&
4836                 before(seq, sk->acked_seq + sk->window + 1);
4837 }
4838 
4839 /*
4840  *      When we get a reset we do this.
4841  */
4842 
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Map the state at reset time onto the errno the user sees. */
	switch (sk->state) {
	case TCP_SYN_SENT:
		sk->err = ECONNREFUSED;
		break;
	case TCP_CLOSE_WAIT:
		sk->err = EPIPE;
		break;
	default:
		sk->err = ECONNRESET;
		break;
	}
#ifdef TCP_DO_RFC1337		
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{	
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else	
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif	
	/* Wake anyone waiting on the socket, drop the segment, unlock. */
	if (!sk->dead) 
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
4870 
4871 /*
4872  *      Find the socket, using the last hit cache if applicable.
4873  */
4874 static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
     /* [previous][next][first][last][top][bottom][index][help] */
4875 {
4876         struct sock * sk;
4877 
4878         sk = (struct sock *) th_cache_sk;
4879         if (saddr != th_cache_saddr || daddr != th_cache_daddr ||
4880             sport != th_cache_sport || dport != th_cache_dport) {
4881                 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
4882                 if (sk) {
4883                         th_cache_saddr=saddr;
4884                         th_cache_daddr=daddr;
4885                         th_cache_dport=dport;
4886                         th_cache_sport=sport;
4887                         th_cache_sk=sk;
4888                 }
4889         }
4890         return sk;
4891 }
4892 
4893 
4894 /*
4895  *      A TCP packet has arrived.
4896  *              skb->h.raw is the TCP header.
4897  */
4898  
4899 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /* [previous][next][first][last][top][bottom][index][help] */
4900         __u32 daddr, unsigned short len,
4901         __u32 saddr, int redo, struct inet_protocol * protocol)
4902 {
4903         struct tcphdr *th;
4904         struct sock *sk;
4905         int syn_ok=0;
4906 
4907         /*
4908          * "redo" is 1 if we have already seen this skb but couldn't
4909          * use it at that time (the socket was locked).  In that case
4910          * we have already done a lot of the work (looked up the socket
4911          * etc).
4912          */
4913         th = skb->h.th;
4914         sk = skb->sk;
4915         if (!redo) {
4916                 tcp_statistics.TcpInSegs++;
4917                 if (skb->pkt_type!=PACKET_HOST)
4918                 {
4919                         kfree_skb(skb,FREE_READ);
4920                         return(0);
4921                 }
4922                 /*
4923                  *      Pull up the IP header.
4924                  */
4925                 skb_pull(skb, skb->h.raw-skb->data);
4926                 /*
4927                  *      Try to use the device checksum if provided.
4928                  */
4929                 if (
4930                         ((skb->ip_summed == CHECKSUM_HW) && tcp_check(th, len, saddr, daddr, skb->csum ))||
4931                         ((skb->ip_summed == CHECKSUM_NONE) && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
4932                     /* skip if CHECKSUM_UNNECESSARY */
4933                     )
4934                 {
4935                         skb->sk = NULL;
4936                         kfree_skb(skb,FREE_READ);
4937                         /*
4938                          *      We don't release the socket because it was
4939                          *      never marked in use.
4940                          */
4941                         return(0);
4942                 }
4943                 sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
4944                 if (!sk)
4945                         goto no_tcp_socket;
4946                 skb->sk = sk;
4947                 skb->seq = ntohl(th->seq);
4948                 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
4949                 skb->ack_seq = ntohl(th->ack_seq);
4950 
4951                 skb->acked = 0;
4952                 skb->used = 0;
4953                 skb->free = 0;
4954                 skb->saddr = daddr;
4955                 skb->daddr = saddr;
4956         
4957                 /* We may need to add it to the backlog here. */
4958                 cli();
4959                 if (sk->inuse) 
4960                 {
4961                         skb_queue_tail(&sk->back_log, skb);
4962                         sti();
4963                         return(0);
4964                 }
4965                 sk->inuse = 1;
4966                 sti();
4967         }
4968 
4969         /*
4970          *      If this socket has got a reset it's to all intents and purposes 
4971          *      really dead. Count closed sockets as dead.
4972          *
4973          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4974          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
4975          *      exist so should cause resets as if the port was unreachable.
4976          */
4977 
4978         if (sk->zapped || sk->state==TCP_CLOSE)
4979                 goto no_tcp_socket;
4980 
4981         if (!sk->prot) 
4982         {
4983                 printk("IMPOSSIBLE 3\n");
4984                 return(0);
4985         }
4986 
4987 
4988         /*
4989          *      Charge the memory to the socket. 
4990          */
4991          
4992         skb->sk=sk;
4993         sk->rmem_alloc += skb->truesize;
4994 
4995         /*
4996          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4997          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4998          *      compatibility. We also set up variables more thoroughly [Karn notes in the
4999          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
5000          */
5001 
5002         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
5003         {
5004         
5005                 /*
5006                  *      Now deal with unusual cases.
5007                  */
5008          
5009                 if(sk->state==TCP_LISTEN)
5010                 {
5011                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
5012                                 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
5013 
5014                         /*
5015                          *      We don't care for RST, and non SYN are absorbed (old segments)
5016                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
5017                          *      netmask on a running connection it can go broadcast. Even Sun's have
5018                          *      this problem so I'm ignoring it 
5019                          */
5020                            
5021                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
5022                         {
5023                                 kfree_skb(skb, FREE_READ);
5024                                 release_sock(sk);
5025                                 return 0;
5026                         }
5027                 
5028                         /*      
5029                          *      Guess we need to make a new socket up 
5030                          */
5031                 
5032                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
5033                 
5034                         /*
5035                          *      Now we have several options: In theory there is nothing else
5036                          *      in the frame. KA9Q has an option to send data with the syn,
5037                          *      BSD accepts data with the syn up to the [to be] advertised window
5038                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
5039                          *      it, that fits the spec precisely and avoids incompatibilities. It
5040                          *      would be nice in future to drop through and process the data.
5041                          */
5042                          
5043                         release_sock(sk);
5044                         return 0;
5045                 }
5046         
5047                 /* retransmitted SYN? */
5048                 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
5049                 {
5050                         kfree_skb(skb, FREE_READ);
5051                         release_sock(sk);
5052                         return 0;
5053                 }
5054                 
5055                 /*
5056                  *      SYN sent means we have to look for a suitable ack and either reset
5057                  *      for bad matches or go to connected 
5058                  */
5059            
5060                 if(sk->state==TCP_SYN_SENT)
5061                 {
5062                         /* Crossed SYN or previous junk segment */
5063                         if(th->ack)
5064                         {
5065                                 /* We got an ack, but it's not a good ack */
5066                                 if(!tcp_ack(sk,th,saddr,len))
5067                                 {
5068                                         /* Reset the ack - its an ack from a 
5069                                            different connection  [ th->rst is checked in tcp_reset()] */
5070                                         tcp_statistics.TcpAttemptFails++;
5071                                         tcp_reset(daddr, saddr, th,
5072                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
5073                                         kfree_skb(skb, FREE_READ);
5074                                         release_sock(sk);
5075                                         return(0);
5076                                 }
5077                                 if(th->rst)
5078                                         return tcp_std_reset(sk,skb);
5079                                 if(!th->syn)
5080                                 {
5081                                         /* A valid ack from a different connection
5082                                            start. Shouldn't happen but cover it */
5083                                         tcp_statistics.TcpAttemptFails++;
5084                                         tcp_reset(daddr, saddr, th,
5085                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
5086                                         kfree_skb(skb, FREE_READ);
5087                                         release_sock(sk);
5088                                         return 0;
5089                                 }
5090                                 /*
5091                                  *      Ok.. it's good. Set up sequence numbers and
5092                                  *      move to established.
5093                                  */
5094                                 syn_ok=1;       /* Don't reset this connection for the syn */
5095                                 sk->acked_seq = skb->seq+1;
5096                                 sk->lastwin_seq = skb->seq+1;
5097                                 sk->fin_seq = skb->seq;
5098                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
5099                                 tcp_set_state(sk, TCP_ESTABLISHED);
5100                                 tcp_options(sk,th);
5101                                 sk->dummy_th.dest=th->source;
5102                                 sk->copied_seq = sk->acked_seq;
5103                                 if(!sk->dead)
5104                                 {
5105                                         sk->state_change(sk);
5106                                         sock_wake_async(sk->socket, 0);
5107                                 }
5108                                 if(sk->max_window==0)
5109                                 {
5110                                         sk->max_window = 32;
5111                                         sk->mss = min(sk->max_window, sk->mtu);
5112                                 }
5113                         }
5114                         else
5115                         {
5116                                 /* See if SYN's cross. Drop if boring */
5117                                 if(th->syn && !th->rst)
5118                                 {
5119                                         /* Crossed SYN's are fine - but talking to
5120                                            yourself is right out... */
5121                                         if(sk->saddr==saddr && sk->daddr==daddr &&
5122                                                 sk->dummy_th.source==th->source &&
5123                                                 sk->dummy_th.dest==th->dest)
5124                                         {
5125                                                 tcp_statistics.TcpAttemptFails++;
5126                                                 return tcp_std_reset(sk,skb);
5127                                         }
5128                                         tcp_set_state(sk,TCP_SYN_RECV);
5129                                         
5130                                         /*
5131                                          *      FIXME:
5132                                          *      Must send SYN|ACK here
5133                                          */
5134                                 }               
5135                                 /* Discard junk segment */
5136                                 kfree_skb(skb, FREE_READ);
5137                                 release_sock(sk);
5138                                 return 0;
5139                         }
5140                         /*
5141                          *      SYN_RECV with data maybe.. drop through
5142                          */
5143                         goto rfc_step6;
5144                 }
5145 
5146         /*
5147          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
5148          *      a more complex suggestion for fixing these reuse issues in RFC1644
5149          *      but not yet ready for general use. Also see RFC1379.
5150          */
5151         
5152 #define BSD_TIME_WAIT
5153 #ifdef BSD_TIME_WAIT
5154                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
5155                         after(skb->seq, sk->acked_seq) && !th->rst)
5156                 {
5157                         u32 seq = sk->write_seq;
5158                         if(sk->debug)
5159                                 printk("Doing a BSD time wait\n");
5160                         tcp_statistics.TcpEstabResets++;           
5161                         sk->rmem_alloc -= skb->truesize;
5162                         skb->sk = NULL;
5163                         sk->err=ECONNRESET;
5164                         tcp_set_state(sk, TCP_CLOSE);
5165                         sk->shutdown = SHUTDOWN_MASK;
5166                         release_sock(sk);
5167                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
5168                         if (sk && sk->state==TCP_LISTEN)
5169                         {
5170                                 sk->inuse=1;
5171                                 skb->sk = sk;
5172                                 sk->rmem_alloc += skb->truesize;
5173                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
5174                                 release_sock(sk);
5175                                 return 0;
5176                         }
5177                         kfree_skb(skb, FREE_READ);
5178                         return 0;
5179                 }
5180 #endif  
5181         }
5182 
5183         /*
5184          *      We are now in normal data flow (see the step list in the RFC)
5185          *      Note most of these are inline now. I'll inline the lot when
5186          *      I have time to test it hard and look at what gcc outputs 
5187          */
5188         
5189         if (!tcp_sequence(sk, skb->seq, skb->end_seq))
5190         {
5191                 bad_tcp_sequence(sk, th, len, opt, saddr, dev);
5192                 kfree_skb(skb, FREE_READ);
5193                 release_sock(sk);
5194                 return 0;
5195         }
5196 
5197         if(th->rst)
5198                 return tcp_std_reset(sk,skb);
5199         
5200         /*
5201          *      !syn_ok is effectively the state test in RFC793.
5202          */
5203          
5204         if(th->syn && !syn_ok)
5205         {
5206                 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
5207                 return tcp_std_reset(sk,skb);   
5208         }
5209 
5210 
5211         /*
5212          *      Delayed ACK time estimator.
5213          */
5214         
5215         if (sk->lrcvtime == 0) 
5216         {
5217                 sk->lrcvtime = jiffies;
5218                 sk->ato = HZ/3;
5219         }
5220         else 
5221         {
5222                 int m;
5223                 
5224                 m = jiffies - sk->lrcvtime;
5225 
5226                 sk->lrcvtime = jiffies;
5227 
5228                 if (m <= 0)
5229                         m = 1;
5230 
5231                 if (m > (sk->rtt >> 3)) 
5232                 {
5233                         sk->ato = sk->rtt >> 3;
5234                         /*
5235                          * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
5236                          */
5237                 }
5238                 else 
5239                 {
5240                         sk->ato = (sk->ato >> 1) + m;
5241                         /*
5242                          * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
5243                          */
5244                 }
5245         }
5246           
5247         /*
5248          *      Process the ACK
5249          */
5250          
5251 
5252         if(th->ack && !tcp_ack(sk,th,saddr,len))
5253         {
5254                 /*
5255                  *      Our three way handshake failed.
5256                  */
5257                  
5258                 if(sk->state==TCP_SYN_RECV)
5259                 {
5260                         tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
5261                 }
5262                 kfree_skb(skb, FREE_READ);
5263                 release_sock(sk);
5264                 return 0;
5265         }
5266         
5267 rfc_step6:              /* I'll clean this up later */
5268 
5269         /*
5270          *      If the accepted buffer put us over our queue size we
5271          *      now drop it (we must process the ack first to avoid
5272          *      deadlock cases).
5273          */
5274          
5275         if (sk->rmem_alloc  >= sk->rcvbuf) 
5276         {
5277                 kfree_skb(skb, FREE_READ);
5278                 release_sock(sk);
5279                 return(0);
5280         }
5281 
5282 
5283         /*
5284          *      Process urgent data
5285          */
5286                 
5287         if(tcp_urg(sk, th, saddr, len))
5288         {
5289                 kfree_skb(skb, FREE_READ);
5290                 release_sock(sk);
5291                 return 0;
5292         }
5293         
5294         /*
5295          *      Process the encapsulated data
5296          */
5297         
5298         if(tcp_data(skb,sk, saddr, len))
5299         {
5300                 kfree_skb(skb, FREE_READ);
5301                 release_sock(sk);
5302                 return 0;
5303         }
5304 
5305         /*
5306          *      And done
5307          */     
5308         
5309         release_sock(sk);
5310         return 0;
5311 
5312 no_tcp_socket:
5313         /*
5314          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
5315          */
5316         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
5317         skb->sk = NULL;
5318         /*
5319          *      Discard frame
5320          */
5321         kfree_skb(skb, FREE_READ);
5322         return 0;
5323 }
5324 
5325 /*
5326  *      This routine sends a packet with an out of date sequence
5327  *      number. It assumes the other end will try to ack it.
5328  */
5329 
/*
 *	Send a packet carrying an out-of-date sequence number, so that the
 *	peer is provoked into replying with an ACK (used for zero-window
 *	probing and keepalive-style wakeups).  Two cases:
 *
 *	1. The window has opened (sent_seq < window_seq) and data is queued:
 *	   clone the head of the write queue, trimmed to the usable window,
 *	   and transmit that — this defeats sender-side SWS deadlock.
 *	2. Otherwise: send a bare ACK whose sequence number is one behind
 *	   sent_seq, which the peer must answer with its current window.
 */
static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff,*skb;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states.  If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED && 
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 && 
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	) 
	{
		return;
	}
	if ( before(sk->sent_seq, sk->window_seq) && 
	    (skb=skb_peek(&sk->write_queue)))
	{
		/*
		 * We are probing the opening of a window
		 * but the window size is != 0
		 * must have been a result of SWS avoidance ( sender )
		 */
	    
		struct iphdr *iph;
		struct tcphdr *th;
		struct tcphdr *nth;
		unsigned long win_size;
#if 0
		unsigned long ow_size;
#endif
		void * tcp_data_start;
	
		/*
		 *	How many bytes can we send ?
		 */
		 
		win_size = sk->window_seq - sk->sent_seq;

		/*
		 *	Recover the buffer pointers of the queued frame
		 *	(it already carries complete IP and TCP headers).
		 */
		 
		iph = (struct iphdr *)skb->ip_hdr;
		th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));

		/*
		 *	Grab the data for a temporary frame: room for the
		 *	window's worth of data, both headers, and link-layer
		 *	headroom.  GFP_ATOMIC because we may be on a timer.
		 */
		 
		buff = sock_wmalloc(sk, win_size + th->doff * 4 + 
				     (iph->ihl << 2) +
				     sk->prot->max_header + 15, 
				     1, GFP_ATOMIC);
		if ( buff == NULL )
			return;

		/* 
		 *	If we strip the packet on the write queue we must
		 *	be ready to retransmit this one 
		 */
	    
		buff->free = /*0*/1;

		buff->sk = sk;
		buff->localroute = sk->localroute;
		
		/*
		 *	Put headers on the new packet
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					 IPPROTO_TCP, sk->opt, buff->truesize,
					 sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
		if (tmp < 0) 
		{
			sock_wfree(sk, buff);
			return;
		}
		
		/*
		 *	Move the TCP header over from the queued frame
		 */

		buff->dev = dev;

		nth = (struct tcphdr *) skb_put(buff,th->doff*4);

		memcpy(nth, th, th->doff * 4);
		
		/*
		 *	Correct the new header: current ack/window, and the
		 *	checksum is recomputed below.
		 */
		 
		nth->ack = 1; 
		nth->ack_seq = htonl(sk->acked_seq);
		nth->window = htons(tcp_select_window(sk));
		nth->check = 0;

		/*
		 *	Find the first data byte.
		 */
		 
		tcp_data_start = (char *) th + (th->doff << 2);

		/*
		 *	Add it to our new buffer
		 */
		 
		memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
		
		/*
		 *	Remember our right edge sequence number.
		 */
		 
		buff->end_seq = sk->sent_seq + win_size;
		sk->sent_seq = buff->end_seq;		/* Hack: advance sent_seq past the probe data */

		/*
		 *	If the urgent pointer of the queued frame fell inside
		 *	the part we trimmed off, the copy must not claim
		 *	urgent data it no longer carries.
		 */
		if(th->urg && ntohs(th->urg_ptr) < win_size)
			nth->urg = 0;

		/*
		 *	Checksum the split buffer
		 */
		 
		tcp_send_check(nth, sk->saddr, sk->daddr, 
			   nth->doff * 4 + win_size , sk);
	}
	else
	{	
		/* No usable window (or nothing queued): send a bare ACK. */
		buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
		if (buff == NULL) 
			return;

		buff->free = 1;
		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put in the IP header and routing stuff. 
		 */
		 
		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
		if (tmp < 0) 
		{
			sock_wfree(sk, buff);
			return;
		}

		t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
		memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

		/*
		 *	Use a previous sequence.
		 *	This should cause the other end to send an ack.
		 */
	 
		t1->seq = htonl(sk->sent_seq-1);
		t1->ack = 1; 
		t1->res1= 0;
		t1->res2= 0;
		t1->rst = 0;
		t1->urg = 0;
		t1->psh = 0;
		t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
		t1->syn = 0;
		t1->ack_seq = htonl(sk->acked_seq);
		t1->window = htons(tcp_select_window(sk));
		t1->doff = sizeof(*t1)/4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	}		

	/*
	 *	Send it.
	 */
	
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
5520 
5521 /*
5522  *      A window probe timeout has occurred.
5523  */
5524 
5525 void tcp_send_probe0(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
5526 {
5527         if (sk->zapped)
5528                 return;         /* After a valid reset we can send no more */
5529 
5530         tcp_write_wakeup(sk);
5531 
5532         sk->backoff++;
5533         sk->rto = min(sk->rto << 1, 120*HZ);
5534         sk->retransmits++;
5535         sk->prot->retransmits ++;
5536         reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5537 }
5538 
5539 /*
5540  *      Socket option code for TCP. 
5541  */
5542   
5543 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5544 {
5545         int val,err;
5546 
5547         if(level!=SOL_TCP)
5548                 return ip_setsockopt(sk,level,optname,optval,optlen);
5549 
5550         if (optval == NULL) 
5551                 return(-EINVAL);
5552 
5553         err=verify_area(VERIFY_READ, optval, sizeof(int));
5554         if(err)
5555                 return err;
5556         
5557         val = get_user((int *)optval);
5558 
5559         switch(optname)
5560         {
5561                 case TCP_MAXSEG:
5562 /*
5563  * values greater than interface MTU won't take effect.  however at
5564  * the point when this call is done we typically don't yet know
5565  * which interface is going to be used
5566  */
5567                         if(val<1||val>MAX_WINDOW)
5568                                 return -EINVAL;
5569                         sk->user_mss=val;
5570                         return 0;
5571                 case TCP_NODELAY:
5572                         sk->nonagle=(val==0)?0:1;
5573                         return 0;
5574                 default:
5575                         return(-ENOPROTOOPT);
5576         }
5577 }
5578 
5579 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5580 {
5581         int val,err;
5582 
5583         if(level!=SOL_TCP)
5584                 return ip_getsockopt(sk,level,optname,optval,optlen);
5585                         
5586         switch(optname)
5587         {
5588                 case TCP_MAXSEG:
5589                         val=sk->user_mss;
5590                         break;
5591                 case TCP_NODELAY:
5592                         val=sk->nonagle;
5593                         break;
5594                 default:
5595                         return(-ENOPROTOOPT);
5596         }
5597         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5598         if(err)
5599                 return err;
5600         put_user(sizeof(int),(int *) optlen);
5601 
5602         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5603         if(err)
5604                 return err;
5605         put_user(val,(int *)optval);
5606 
5607         return(0);
5608 }       
5609 
5610 
/*
 *	TCP's entry in the protocol switch.  Positional initializer for
 *	struct proto; slot names below are inferred from the handlers —
 *	NOTE(review): confirm against the struct proto declaration.
 */
struct proto tcp_prot = {
	tcp_close,		/* close */
	ip_build_header,	/* build_header */
	tcp_connect,		/* connect */
	tcp_accept,		/* accept */
	ip_queue_xmit,		/* queue_xmit */
	tcp_retransmit,		/* retransmit */
	tcp_write_wakeup,	/* write_wakeup */
	tcp_read_wakeup,	/* read_wakeup */
	tcp_rcv,		/* rcv */
	tcp_select,		/* select */
	tcp_ioctl,		/* ioctl */
	NULL,			/* no init hook */
	tcp_shutdown,		/* shutdown */
	tcp_setsockopt,		/* setsockopt */
	tcp_getsockopt,		/* getsockopt */
	tcp_sendmsg,		/* sendmsg */
	tcp_recvmsg,		/* recvmsg */
	NULL,			/* No special bind() */
	128,			/* presumably max_header reservation — verify */
	0,			/* counter, zero at boot — verify slot name */
	"TCP",			/* protocol name */
	0, 0,			/* usage counters, zero at boot — verify */
	{NULL,}			/* per-protocol socket array/hash — verify */
};

/* [previous][next][first][last][top][bottom][index][help] */