1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while 26 * sk->inuse=1 and was trying to connect 27 * (tcp_err()). 28 * Alan Cox : All icmp error handling was broken 29 * pointers passed where wrong and the 30 * socket was looked up backwards. Nobody 31 * tested any icmp error code obviously. 32 * Alan Cox : tcp_err() now handled properly. It 33 * wakes people on errors. select 34 * behaves and the icmp error race 35 * has gone by moving it into sock.c 36 * Alan Cox : tcp_reset() fixed to work for 37 * everything not just packets for 38 * unknown sockets. 39 * Alan Cox : tcp option processing. 40 * Alan Cox : Reset tweaked (still not 100%) [Had 41 * syn rule wrong] 42 * Herp Rosmanith : More reset fixes 43 * Alan Cox : No longer acks invalid rst frames. 44 * Acking any kind of RST is right out. 45 * Alan Cox : Sets an ignore me flag on an rst 46 * receive otherwise odd bits of prattle 47 * escape still 48 * Alan Cox : Fixed another acking RST frame bug. 49 * Should stop LAN workplace lockups. 
50 * Alan Cox : Some tidyups using the new skb list 51 * facilities 52 * Alan Cox : sk->keepopen now seems to work 53 * Alan Cox : Pulls options out correctly on accepts 54 * Alan Cox : Fixed assorted sk->rqueue->next errors 55 * Alan Cox : PSH doesn't end a TCP read. Switched a 56 * bit to skb ops. 57 * Alan Cox : Tidied tcp_data to avoid a potential 58 * nasty. 59 * Alan Cox : Added some better commenting, as the 60 * tcp is hard to follow 61 * Alan Cox : Removed incorrect check for 20 * psh 62 * Michael O'Reilly : ack < copied bug fix. 63 * Johannes Stille : Misc tcp fixes (not all in yet). 64 * Alan Cox : FIN with no memory -> CRASH 65 * Alan Cox : Added socket option proto entries. 66 * Also added awareness of them to accept. 67 * Alan Cox : Added TCP options (SOL_TCP) 68 * Alan Cox : Switched wakeup calls to callbacks, 69 * so the kernel can layer network 70 * sockets. 71 * Alan Cox : Use ip_tos/ip_ttl settings. 72 * Alan Cox : Handle FIN (more) properly (we hope). 73 * Alan Cox : RST frames sent on unsynchronised 74 * state ack error. 75 * Alan Cox : Put in missing check for SYN bit. 76 * Alan Cox : Added tcp_select_window() aka NET2E 77 * window non shrink trick. 78 * Alan Cox : Added a couple of small NET2E timer 79 * fixes 80 * Charles Hedrick : TCP fixes 81 * Toomas Tamm : TCP window fixes 82 * Alan Cox : Small URG fix to rlogin ^C ack fight 83 * Charles Hedrick : Rewrote most of it to actually work 84 * Linus : Rewrote tcp_read() and URG handling 85 * completely 86 * Gerhard Koerting: Fixed some missing timer handling 87 * Matthew Dillon : Reworked TCP machine states as per RFC 88 * Gerhard Koerting: PC/TCP workarounds 89 * Adam Caldwell : Assorted timer/timing errors 90 * Matthew Dillon : Fixed another RST bug 91 * Alan Cox : Move to kernel side addressing changes. 92 * Alan Cox : Beginning work on TCP fastpathing 93 * (not yet usable) 94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
95 * Alan Cox : TCP fast path debugging 96 * Alan Cox : Window clamping 97 * Michael Riepe : Bug in tcp_check() 98 * Matt Dillon : More TCP improvements and RST bug fixes 99 * Matt Dillon : Yet more small nasties remove from the 100 * TCP code (Be very nice to this man if 101 * tcp finally works 100%) 8) 102 * Alan Cox : BSD accept semantics. 103 * Alan Cox : Reset on closedown bug. 104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 105 * Michael Pall : Handle select() after URG properly in 106 * all cases. 107 * Michael Pall : Undo the last fix in tcp_read_urg() 108 * (multi URG PUSH broke rlogin). 109 * Michael Pall : Fix the multi URG PUSH problem in 110 * tcp_readable(), select() after URG 111 * works now. 112 * Michael Pall : recv(...,MSG_OOB) never blocks in the 113 * BSD api. 114 * Alan Cox : Changed the semantics of sk->socket to 115 * fix a race and a signal problem with 116 * accept() and async I/O. 117 * Alan Cox : Relaxed the rules on tcp_sendto(). 118 * Yury Shevchuk : Really fixed accept() blocking problem. 119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 120 * clients/servers which listen in on 121 * fixed ports. 122 * Alan Cox : Cleaned the above up and shrank it to 123 * a sensible code size. 124 * Alan Cox : Self connect lockup fix. 125 * Alan Cox : No connect to multicast. 126 * Ross Biro : Close unaccepted children on master 127 * socket close. 128 * Alan Cox : Reset tracing code. 129 * Alan Cox : Spurious resets on shutdown. 130 * Alan Cox : Giant 15 minute/60 second timer error 131 * Alan Cox : Small whoops in selecting before an 132 * accept. 133 * Alan Cox : Kept the state trace facility since 134 * it's handy for debugging. 135 * Alan Cox : More reset handler fixes. 
136 * Alan Cox : Started rewriting the code based on 137 * the RFC's for other useful protocol 138 * references see: Comer, KA9Q NOS, and 139 * for a reference on the difference 140 * between specifications and how BSD 141 * works see the 4.4lite source. 142 * A.N.Kuznetsov : Don't time wait on completion of tidy 143 * close. 144 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 145 * Linus Torvalds : Fixed BSD port reuse to work first syn 146 * Alan Cox : Reimplemented timers as per the RFC 147 * and using multiple timers for sanity. 148 * Alan Cox : Small bug fixes, and a lot of new 149 * comments. 150 * Alan Cox : Fixed dual reader crash by locking 151 * the buffers (much like datagram.c) 152 * Alan Cox : Fixed stuck sockets in probe. A probe 153 * now gets fed up of retrying without 154 * (even a no space) answer. 155 * Alan Cox : Extracted closing code better 156 * Alan Cox : Fixed the closing state machine to 157 * resemble the RFC. 158 * Alan Cox : More 'per spec' fixes. 159 * Jorge Cwik : Even faster checksumming. 160 * Alan Cox : tcp_data() doesn't ack illegal PSH 161 * only frames. At least one pc tcp stack 162 * generates them. 163 * Alan Cox : Cache last socket. 164 * Alan Cox : Per route irtt. 165 * Matt Day : Select() match BSD precisely on error 166 * Alan Cox : New buffers 167 * Marc Tamsky : Various sk->prot->retransmits and 168 * sk->retransmits misupdating fixed. 169 * Fixed tcp_write_timeout: stuck close, 170 * and TCP syn retries gets used now. 171 * Mark Yarvis : In tcp_read_wakeup(), don't send an 172 * ack if stat is TCP_CLOSED. 173 * Alan Cox : Look up device on a retransmit - routes may 174 * change. Doesn't yet cope with MSS shrink right 175 * but its a start! 176 * Marc Tamsky : Closing in closing fixes. 177 * Mike Shaver : RFC1122 verifications. 178 * Alan Cox : rcv_saddr errors. 179 * Alan Cox : Block double connect(). 180 * Alan Cox : Small hooks for enSKIP. 181 * Alexey Kuznetsov: Path MTU discovery. 
182 * Alan Cox : Support soft errors. 183 * Alan Cox : Fix MTU discovery pathalogical case 184 * when the remote claims no mtu! 185 * Marc Tamsky : TCP_CLOSE fix. 186 * Colin (G3TNE) : Send a reset on syn ack replies in 187 * window but wrong (fixes NT lpd problems) 188 * Pedro Roque : Better TCP window handling, delayed ack. 189 * Joerg Reuter : No modification of locked buffers in 190 * tcp_do_retransmit() 191 * 192 * To Fix: 193 * Fast path the code. Two things here - fix the window calculation 194 * so it doesn't iterate over the queue, also spot packets with no funny 195 * options arriving in order and process directly. 196 * 197 * Rewrite output state machine to use a single queue. 198 * Speed up input assembly algorithm. 199 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 200 * could do with it working on IPv4 201 * User settable/learned rtt/max window/mtu 202 * 203 * Change the fundamental structure to a single send queue maintained 204 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 205 * active routes too]). Cut the queue off in tcp_retransmit/ 206 * tcp_transmit. 207 * Change the receive queue to assemble as it goes. This lets us 208 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 209 * tcp_data/tcp_read as well as the window shrink crud. 210 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 211 * tcp_queue_skb seem obvious routines to extract. 212 * 213 * This program is free software; you can redistribute it and/or 214 * modify it under the terms of the GNU General Public License 215 * as published by the Free Software Foundation; either version 216 * 2 of the License, or(at your option) any later version. 217 * 218 * Description of States: 219 * 220 * TCP_SYN_SENT sent a connection request, waiting for ack 221 * 222 * TCP_SYN_RECV received a connection request, sent ack, 223 * waiting for final ack in three-way handshake. 
224 * 225 * TCP_ESTABLISHED connection established 226 * 227 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 228 * transmission of remaining buffered data 229 * 230 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 231 * to shutdown 232 * 233 * TCP_CLOSING both sides have shutdown but we still have 234 * data we have to finish sending 235 * 236 * TCP_TIME_WAIT timeout to catch resent junk before entering 237 * closed, can only be entered from FIN_WAIT2 238 * or CLOSING. Required because the other end 239 * may not have gotten our last ACK causing it 240 * to retransmit the data packet (which we ignore) 241 * 242 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 243 * us to finish writing our data and to shutdown 244 * (we have to close() to move on to LAST_ACK) 245 * 246 * TCP_LAST_ACK out side has shutdown after remote has 247 * shutdown. There may still be data in our 248 * buffer that we have to finish sending 249 * 250 * TCP_CLOSE socket is finished 251 */ 252
253 /* 254 * RFC1122 status: 255 * NOTE: I'm not going to be doing comments in the code for this one except 256 * for violations and the like. tcp.c is just too big... If I say something 257 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out 258 * with Alan. -- MS 950903 259 * 260 * Use of PSH (4.2.2.2) 261 * MAY aggregate data sent without the PSH flag. (does) 262 * MAY queue data received without the PSH flag. (does) 263 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) 264 * MAY implement PSH on send calls. (doesn't, thus:) 265 * MUST NOT buffer data indefinitely (doesn't [1 second]) 266 * MUST set PSH on last segment (does) 267 * MAY pass received PSH to application layer (doesn't) 268 * SHOULD send maximum-sized segment whenever possible. (almost always does) 269 * 270 * Window Size (4.2.2.3, 4.2.2.16) 271 * MUST treat window size as an unsigned number (does) 272 * SHOULD treat window size as a 32-bit number (does not) 273 * MUST NOT shrink window once it is offered (does not normally) 274 * 275 * Urgent Pointer (4.2.2.4) 276 * **MUST point urgent pointer to last byte of urgent data (not right 277 * after). (doesn't, to be like BSD) 278 * MUST inform application layer asynchronously of incoming urgent 279 * data. (does) 280 * MUST provide application with means of determining the amount of 281 * urgent data pending. (does) 282 * **MUST support urgent data sequence of arbitrary length. (doesn't, but 283 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) 284 * [Follows BSD 1 byte of urgent data] 285 * 286 * TCP Options (4.2.2.5) 287 * MUST be able to receive TCP options in any segment. (does) 288 * MUST ignore unsupported options (does) 289 * 290 * Maximum Segment Size Option (4.2.2.6) 291 * MUST implement both sending and receiving MSS. (does) 292 * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send 293 * it always). 
(does, even when MSS == 536, which is legal) 294 * MUST assume MSS == 536 if no MSS received at connection setup (does) 295 * MUST calculate "effective send MSS" correctly: 296 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) 297 * (does - but allows operator override) 298 * 299 * TCP Checksum (4.2.2.7) 300 * MUST generate and check TCP checksum. (does) 301 * 302 * Initial Sequence Number Selection (4.2.2.8) 303 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's 304 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is 305 * necessary for 10Mbps networks - and harder than BSD to spoof!) 306 * 307 * Simultaneous Open Attempts (4.2.2.10) 308 * MUST support simultaneous open attempts (does) 309 * 310 * Recovery from Old Duplicate SYN (4.2.2.11) 311 * MUST keep track of active vs. passive open (does) 312 * 313 * RST segment (4.2.2.12) 314 * SHOULD allow an RST segment to contain data (does, but doesn't do 315 * anything with it, which is standard) 316 * 317 * Closing a Connection (4.2.2.13) 318 * MUST inform application of whether connectin was closed by RST or 319 * normal close. (does) 320 * MAY allow "half-duplex" close (treat connection as closed for the 321 * local app, even before handshake is done). (does) 322 * MUST linger in TIME_WAIT for 2 * MSL (does) 323 * 324 * Retransmission Timeout (4.2.2.15) 325 * MUST implement Jacobson's slow start and congestion avoidance 326 * stuff. (does) 327 * 328 * Probing Zero Windows (4.2.2.17) 329 * MUST support probing of zero windows. (does) 330 * MAY keep offered window closed indefinitely. (does) 331 * MUST allow remote window to stay closed indefinitely. (does) 332 * 333 * Passive Open Calls (4.2.2.18) 334 * MUST NOT let new passive open affect other connections. (doesn't) 335 * MUST support passive opens (LISTENs) concurrently. (does) 336 * 337 * Time to Live (4.2.2.19) 338 * MUST make TCP TTL configurable. 
(does - IP_TTL option) 339 * 340 * Event Processing (4.2.2.20) 341 * SHOULD queue out-of-order segments. (does) 342 * MUST aggregate ACK segments whenever possible. (does but badly) 343 * 344 * Retransmission Timeout Calculation (4.2.3.1) 345 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO 346 * calculation. (does, or at least explains them in the comments 8*b) 347 * SHOULD initialize RTO to 0 and RTT to 3. (does) 348 * 349 * When to Send an ACK Segment (4.2.3.2) 350 * SHOULD implement delayed ACK. (does) 351 * MUST keep ACK delay < 0.5 sec. (does) 352 * 353 * When to Send a Window Update (4.2.3.3) 354 * MUST implement receiver-side SWS. (does) 355 * 356 * When to Send Data (4.2.3.4) 357 * MUST implement sender-side SWS. (does) 358 * SHOULD implement Nagle algorithm. (does) 359 * 360 * TCP Connection Failures (4.2.3.5) 361 * MUST handle excessive retransmissions "properly" (see the RFC). (does) 362 * SHOULD inform application layer of soft errors. (does) 363 * 364 * TCP Keep-Alives (4.2.3.6) 365 * MAY provide keep-alives. (does) 366 * MUST make keep-alives configurable on a per-connection basis. (does) 367 * MUST default to no keep-alives. (does) 368 * **MUST make keep-alive interval configurable. (doesn't) 369 * **MUST make default keep-alive interval > 2 hours. (doesn't) 370 * MUST NOT interpret failure to ACK keep-alive packet as dead 371 * connection. (doesn't) 372 * SHOULD send keep-alive with no data. (does) 373 * 374 * TCP Multihoming (4.2.3.7) 375 * MUST get source address from IP layer before sending first 376 * SYN. (does) 377 * MUST use same local address for all segments of a connection. (does) 378 * 379 * IP Options (4.2.3.8) 380 * MUST ignore unsupported IP options. (does) 381 * MAY support Time Stamp and Record Route. (does) 382 * MUST allow application to specify a source route. (does) 383 * MUST allow receieved Source Route option to set route for all future 384 * segments on this connection. 
(does not (security issues)) 385 * 386 * ICMP messages (4.2.3.9) 387 * MUST act on ICMP errors. (does) 388 * MUST slow transmission upon receipt of a Source Quench. (does) 389 * MUST NOT abort connection upon receipt of soft Destination 390 * Unreachables (0, 1, 5), Time Exceededs and Parameter 391 * Problems. (doesn't) 392 * SHOULD report soft Destination Unreachables etc. to the 393 * application. (does) 394 * SHOULD abort connection upon receipt of hard Destination Unreachable 395 * messages (2, 3, 4). (does) 396 * 397 * Remote Address Validation (4.2.3.10) 398 * MUST reject as an error OPEN for invalid remote IP address. (does) 399 * MUST ignore SYN with invalid source address. (does) 400 * MUST silently discard incoming SYN for broadcast/multicast 401 * address. (does) 402 * 403 * Asynchronous Reports (4.2.4.1) 404 * MUST provide mechanism for reporting soft errors to application 405 * layer. (does) 406 * 407 * Type of Service (4.2.4.2) 408 * MUST allow application layer to set Type of Service. (does IP_TOS) 409 * 410 * (Whew. -- MS 950903) 411 **/ 412
413 #include <linux/types.h>
414 #include <linux/sched.h>
415 #include <linux/mm.h>
416 #include <linux/time.h>
417 #include <linux/string.h>
418 #include <linux/config.h>
419 #include <linux/socket.h>
420 #include <linux/sockios.h>
421 #include <linux/termios.h>
422 #include <linux/in.h>
423 #include <linux/fcntl.h>
424 #include <linux/inet.h>
425 #include <linux/netdevice.h>
426 #include <net/snmp.h>
427 #include <net/ip.h>
428 #include <net/protocol.h>
429 #include <net/icmp.h>
430 #include <net/tcp.h>
431 #include <net/arp.h>
432 #include <linux/skbuff.h>
433 #include <net/sock.h>
434 #include <net/route.h>
435 #include <linux/errno.h>
436 #include <linux/timer.h>
437 #include <asm/system.h>
438 #include <asm/segment.h>
439 #include <linux/mm.h>
440 #include <net/checksum.h>
441
442 /* 443 * The MSL timer is the 'normal' timer. 444 */ 445
446 #definereset_msl_timer(x,y,z) reset_timer(x,y,z)
447
448 #define SEQ_TICK 3
449 unsignedlongseq_offset;
450 structtcp_mibtcp_statistics;
451
452 /* 453 * Cached last hit socket 454 */ 455
456 volatileunsignedlongth_cache_saddr,th_cache_daddr;
457 volatileunsignedshortth_cache_dport, th_cache_sport;
458 volatilestructsock *th_cache_sk;
459
460 voidtcp_cache_zap(void)
/* */ 461 { 462 unsignedlongflags;
463 save_flags(flags);
464 cli();
465 th_cache_saddr=0;
466 th_cache_daddr=0;
467 th_cache_dport=0;
468 th_cache_sport=0;
469 th_cache_sk=NULL;
470 restore_flags(flags);
471 } 472
473 staticvoidtcp_close(structsock *sk, inttimeout);
474 staticvoidtcp_read_wakeup(structsock *sk);
475
476 /* 477 * The less said about this the better, but it works and will do for 1.2 (and 1.4 ;)) 478 */ 479
480 staticstructwait_queue *master_select_wakeup;
481
482 static__inline__intmin(unsignedinta, unsignedintb)
/* */ 483 { 484 if (a < b)
485 return(a);
486 return(b);
487 } 488
489 #undefSTATE_TRACE 490
491 #ifdefSTATE_TRACE 492 staticchar *statename[]={ 493 "Unused","Established","Syn Sent","Syn Recv",
494 "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
495 "Close Wait","Last ACK","Listen","Closing"
496 };
497 #endif 498
499 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 500 { 501 if(sk->state==TCP_ESTABLISHED)
502 tcp_statistics.TcpCurrEstab--;
503 #ifdefSTATE_TRACE 504 if(sk->debug)
505 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
506 #endif 507 /* This is a hack but it doesn't occur often and it's going to 508 be a real to fix nicely */ 509
510 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
511 { 512 wake_up_interruptible(&master_select_wakeup);
513 } 514 sk->state=state;
515 if(state==TCP_ESTABLISHED)
516 tcp_statistics.TcpCurrEstab++;
517 if(sk->state==TCP_CLOSE)
518 tcp_cache_zap();
519 } 520
521 /* 522 * This routine picks a TCP windows for a socket based on 523 * the following constraints 524 * 525 * 1. The window can never be shrunk once it is offered (RFC 793) 526 * 2. We limit memory per socket 527 */ 528
529
530 static__inline__unsignedshorttcp_select_window(structsock *sk)
/* */ 531 { 532 longfree_space = sock_rspace(sk);
533 longwindow = 0;
534
535 if (free_space > 1024)
536 free_space &= ~0x3FF; /* make free space a multiple of 1024 */ 537
538 if(sk->window_clamp)
539 free_space = min(sk->window_clamp, free_space);
540
541 /* 542 * compute the actual window i.e. 543 * old_window - received_bytes_on_that_win 544 */ 545
546 if (sk->mss == 0)
547 sk->mss = sk->mtu;
548
549 window = sk->window - (sk->acked_seq - sk->lastwin_seq);
550
551 if ( window < 0 ) { 552 window = 0;
553 printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n",
554 sk->window, sk->acked_seq, sk->lastwin_seq);
555 } 556
557 /* 558 * RFC 1122: 559 * "the suggested [SWS] avoidance algoritm for the receiver is to keep 560 * RECV.NEXT + RCV.WIN fixed until: 561 * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" 562 * 563 * i.e. don't raise the right edge of the window until you can't raise 564 * it MSS bytes 565 */ 566
567 if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) )
568 window += ((free_space - window) / sk->mss) * sk->mss;
569
570 sk->window = window;
571 sk->lastwin_seq = sk->acked_seq;
572
573 returnsk->window;
574 } 575
576 /* 577 * This function returns the amount that we can raise the 578 * usable window. 579 */ 580
581 static__inline__unsignedshorttcp_raise_window(structsock *sk)
/* */ 582 { 583 longfree_space = sock_rspace(sk);
584 longwindow = 0;
585
586 if (free_space > 1024)
587 free_space &= ~0x3FF; /* make free space a multiple of 1024 */ 588
589 if(sk->window_clamp)
590 free_space = min(sk->window_clamp, free_space);
591
592 /* 593 * compute the actual window i.e. 594 * old_window - received_bytes_on_that_win 595 */ 596
597 window = sk->window - (sk->acked_seq - sk->lastwin_seq);
598
599 if (sk->mss == 0)
600 sk->mss = sk->mtu;
601
602 if ( window < 0 ) { 603 window = 0;
604 printk(KERN_DEBUG "TRW: win < 0 w=%d 1=%u 2=%u\n",
605 sk->window, sk->acked_seq, sk->lastwin_seq);
606 } 607
608 if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) )
609 return ((free_space - window) / sk->mss) * sk->mss;
610
611 return 0;
612 } 613
614 /* 615 * Find someone to 'accept'. Must be called with 616 * sk->inuse=1 or cli() 617 */ 618
619 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 620 { 621 structsk_buff *p=skb_peek(&s->receive_queue);
622 if(p==NULL)
623 returnNULL;
624 do 625 { 626 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
627 returnp;
628 p=p->next;
629 } 630 while(p!=(structsk_buff *)&s->receive_queue);
631 returnNULL;
632 } 633
634 /* 635 * Remove a completed connection and return it. This is used by 636 * tcp_accept() to get connections from the queue. 637 */ 638
639 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 640 { 641 structsk_buff *skb;
642 unsignedlongflags;
643 save_flags(flags);
644 cli();
645 skb=tcp_find_established(s);
646 if(skb!=NULL)
647 skb_unlink(skb); /* Take it off the queue */ 648 restore_flags(flags);
649 returnskb;
650 } 651
652 /* 653 * This routine closes sockets which have been at least partially 654 * opened, but not yet accepted. Currently it is only called by 655 * tcp_close, and timeout mirrors the value there. 656 */ 657
658 staticvoidtcp_close_pending (structsock *sk)
/* */ 659 { 660 structsk_buff *skb;
661
662 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
663 { 664 skb->sk->dead=1;
665 tcp_close(skb->sk, 0);
666 kfree_skb(skb, FREE_READ);
667 } 668 return;
669 } 670
671 /* 672 * Enter the time wait state. 673 */ 674
675 staticvoidtcp_time_wait(structsock *sk)
/* */ 676 { 677 tcp_set_state(sk,TCP_TIME_WAIT);
678 sk->shutdown = SHUTDOWN_MASK;
679 if (!sk->dead)
680 sk->state_change(sk);
681 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
682 } 683
684 /* 685 * A socket has timed out on its send queue and wants to do a 686 * little retransmitting. Currently this means TCP. 687 */ 688
689 voidtcp_do_retransmit(structsock *sk, intall)
/* */ 690 { 691 structsk_buff * skb;
692 structproto *prot;
693 structdevice *dev;
694 intct=0;
695 structrtable *rt;
696
697 prot = sk->prot;
698 skb = sk->send_head;
699
700 while (skb != NULL)
701 { 702 structtcphdr *th;
703 structiphdr *iph;
704 intsize;
705
706 dev = skb->dev;
707 IS_SKB(skb);
708 skb->when = jiffies;
709
710 /* dl1bke 960201 - @%$$! Hope this cures strange race conditions */ 711 /* with AX.25 mode VC. (esp. DAMA) */ 712 /* if the buffer is locked we should not retransmit */ 713 /* anyway, so we don't need all the fuss to prepare */ 714 /* the buffer in this case. */ 715 /* (the skb_pull() changes skb->data while we may */ 716 /* actually try to send the data. Ough. A side */ 717 /* effect is that we'll send some unnecessary data, */ 718 /* but the alternative is desastrous... */ 719
720 if (skb_device_locked(skb))
721 break;
722
723 /* 724 * Discard the surplus MAC header 725 */ 726
727 skb_pull(skb,((unsignedchar *)skb->ip_hdr)-skb->data);
728
729 /* 730 * In general it's OK just to use the old packet. However we 731 * need to use the current ack and window fields. Urg and 732 * urg_ptr could possibly stand to be updated as well, but we 733 * don't keep the necessary data. That shouldn't be a problem, 734 * if the other end is doing the right thing. Since we're 735 * changing the packet, we have to issue a new IP identifier. 736 */ 737
738 iph = (structiphdr *)skb->data;
739 th = (structtcphdr *)(((char *)iph) + (iph->ihl << 2));
740 size = ntohs(iph->tot_len) - (iph->ihl<<2);
741
742 /* 743 * Note: We ought to check for window limits here but 744 * currently this is done (less efficiently) elsewhere. 745 */ 746
747 /* 748 * Put a MAC header back on (may cause ARPing) 749 */ 750
751 { 752 /* ANK: UGLY, but the bug, that was here, should be fixed. 753 */ 754 structoptions * opt = (structoptions*)skb->proto_priv;
755 rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute);
756 } 757
758 iph->id = htons(ip_id_count++);
759 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 760 if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
761 iph->frag_off &= ~htons(IP_DF);
762 #endif 763 ip_send_check(iph);
764
765 if (rt==NULL) /* Deep poo */ 766 { 767 if(skb->sk)
768 { 769 skb->sk->err_soft=ENETUNREACH;
770 skb->sk->error_report(skb->sk);
771 } 772 } 773 else 774 { 775 dev=rt->rt_dev;
776 skb->raddr=rt->rt_gateway;
777 skb->dev=dev;
778 skb->arp=1;
779 if (rt->rt_hh)
780 { 781 memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
782 if (!rt->rt_hh->hh_uptodate)
783 { 784 skb->arp = 0;
785 #ifRT_CACHE_DEBUG >= 2
786 printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
787 #endif 788 } 789 } 790 elseif (dev->hard_header)
791 { 792 if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
793 skb->arp=0;
794 } 795
796 /* 797 * This is not the right way to handle this. We have to 798 * issue an up to date window and ack report with this 799 * retransmit to keep the odd buggy tcp that relies on 800 * the fact BSD does this happy. 801 * We don't however need to recalculate the entire 802 * checksum, so someone wanting a small problem to play 803 * with might like to implement RFC1141/RFC1624 and speed 804 * this up by avoiding a full checksum. 805 */ 806
807 th->ack_seq = htonl(sk->acked_seq);
808 sk->ack_backlog = 0;
809 sk->bytes_rcv = 0;
810 th->window = ntohs(tcp_select_window(sk));
811 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
812
813 /* 814 * If the interface is (still) up and running, kick it. 815 */ 816
817 if (dev->flags & IFF_UP)
818 { 819 /* 820 * If the packet is still being sent by the device/protocol 821 * below then don't retransmit. This is both needed, and good - 822 * especially with connected mode AX.25 where it stops resends 823 * occurring of an as yet unsent anyway frame! 824 * We still add up the counts as the round trip time wants 825 * adjusting. 826 */ 827 if (sk && !skb_device_locked(skb))
828 { 829 /* Remove it from any existing driver queue first! */ 830 skb_unlink(skb);
831 /* Now queue it */ 832 ip_statistics.IpOutRequests++;
833 dev_queue_xmit(skb, dev, sk->priority);
834 } 835 } 836 } 837
838 /* 839 * Count retransmissions 840 */ 841
842 ct++;
843 sk->prot->retransmits ++;
844 tcp_statistics.TcpRetransSegs++;
845
846
847 /* 848 * Only one retransmit requested. 849 */ 850
851 if (!all)
852 break;
853
854 /* 855 * This should cut it off before we send too many packets. 856 */ 857
858 if (ct >= sk->cong_window)
859 break;
860 skb = skb->link3;
861 } 862 } 863
864 /* 865 * Reset the retransmission timer 866 */ 867
868 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 869 { 870 del_timer(&sk->retransmit_timer);
871 sk->ip_xmit_timeout = why;
872 if((long)when < 0)
873 { 874 when=3;
875 printk("Error: Negative timer in xmit_timer\n");
876 } 877 sk->retransmit_timer.expires=jiffies+when;
878 add_timer(&sk->retransmit_timer);
879 } 880
881 /* 882 * This is the normal code called for timeouts. It does the retransmission 883 * and then does backoff. tcp_do_retransmit is separated out because 884 * tcp_ack needs to send stuff from the retransmit queue without 885 * initiating a backoff. 886 */ 887
888
889 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 890 { 891 tcp_do_retransmit(sk, all);
892
893 /* 894 * Increase the timeout each time we retransmit. Note that 895 * we do not increase the rtt estimate. rto is initialized 896 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 897 * that doubling rto each time is the least we can get away with. 898 * In KA9Q, Karn uses this for the first few times, and then 899 * goes to quadratic. netBSD doubles, but only goes up to *64, 900 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 901 * defined in the protocol as the maximum possible RTT. I guess 902 * we'll have to use something other than TCP to talk to the 903 * University of Mars. 904 * 905 * PAWS allows us longer timeouts and large windows, so once 906 * implemented ftp to mars will work nicely. We will have to fix 907 * the 120 second clamps though! 908 */ 909
910 sk->retransmits++;
911 sk->prot->retransmits++;
912 sk->backoff++;
913 sk->rto = min(sk->rto << 1, 120*HZ);
914 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
915 } 916
917
918 /* 919 * A timer event has trigger a tcp retransmit timeout. The 920 * socket xmit queue is ready and set up to send. Because 921 * the ack receive code keeps the queue straight we do 922 * nothing clever here. 923 */ 924
925 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 926 { 927 if (all)
928 { 929 tcp_retransmit_time(sk, all);
930 return;
931 } 932
933 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 934 /* sk->ssthresh in theory can be zero. I guess that's OK */ 935 sk->cong_count = 0;
936
937 sk->cong_window = 1;
938
939 /* Do the actual retransmit. */ 940 tcp_retransmit_time(sk, all);
941 } 942
943 /* 944 * A write timeout has occurred. Process the after effects. 945 */ 946
947 staticinttcp_write_timeout(structsock *sk)
/* */ 948 { 949 /* 950 * Look for a 'soft' timeout. 951 */ 952 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
953 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
954 { 955 /* 956 * Attempt to recover if arp has changed (unlikely!) or 957 * a route has shifted (not supported prior to 1.3). 958 */ 959 ip_rt_advice(&sk->ip_route_cache, 0);
960 } 961
962 /* 963 * Have we tried to SYN too many times (repent repent 8)) 964 */ 965
966 if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
967 { 968 if(sk->err_soft)
969 sk->err=sk->err_soft;
970 else 971 sk->err=ETIMEDOUT;
972 sk->error_report(sk);
973 del_timer(&sk->retransmit_timer);
974 tcp_statistics.TcpAttemptFails++; /* Is this right ??? - FIXME - */ 975 tcp_set_state(sk,TCP_CLOSE);
976 /* Don't FIN, we got nothing back */ 977 release_sock(sk);
978 return 0;
979 } 980 /* 981 * Has it gone just too far ? 982 */ 983 if (sk->retransmits > TCP_RETR2)
984 { 985 if(sk->err_soft)
986 sk->err = sk->err_soft;
987 else 988 sk->err = ETIMEDOUT;
989 sk->error_report(sk);
990 del_timer(&sk->retransmit_timer);
991 /* 992 * Time wait the socket 993 */ 994 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
995 { 996 tcp_set_state(sk,TCP_TIME_WAIT);
997 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
998 } 999 else1000 {1001 /*1002 * Clean up time.1003 */1004 tcp_set_state(sk, TCP_CLOSE);
1005 release_sock(sk);
1006 return 0;
1007 }1008 }1009 return 1;
1010 }1011
/*
 *	The TCP retransmit timer. This lacks a few small details.
 *
 *	1.	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 *
 *	'data' is the struct sock pointer cast to unsigned long (set when
 *	the timer was armed). The reason for the timer firing is read from
 *	sk->ip_xmit_timeout.
 */

static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;

	/*
	 *	We are reset. We will send no more retransmits.
	 */

	if (sk->zapped)
		return;

	/*
	 *	Only process if socket is not in use: if the socket is locked
	 *	or we are inside the bottom half, back off and retry in one
	 *	second rather than block.
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	/* Take the socket lock ourselves before re-enabling interrupts. */
	sk->inuse = 1;
	sti();

	/* Flush any pending ack backlog to the reader first. */
	if (sk->ack_backlog && !sk->dead)
		sk->data_ready(sk,0);

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing: send a zero window probe and account the timeout. */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing unacked: this was just a delayed-ack kick. */
				if (sk->ack_backlog)
					tcp_read_wakeup(sk);
				restore_flags(flags);
			}
			else
			{
				/*
				 * Kicked by a delayed ack. Reset timer
				 * correctly now: the head of the queue has not
				 * actually timed out yet, so re-arm for the
				 * remaining time.
				 */
				if (jiffies < skb->when + sk->rto)
				{
					if (sk->ack_backlog)
						tcp_read_wakeup(sk);
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission: count it, resend, and
				 *	check whether we have now failed hard.
				 */
				sk->retransmits++;
				sk->prot->retransmits++;
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
1129 /*1130 * This routine is called by the ICMP module when it gets some1131 * sort of error condition. If err < 0 then the socket should1132 * be closed and the error returned to the user. If err > 01133 * it's just the icmp type << 8 | icmp code. After adjustment1134 * header points to the first 8 bytes of the tcp header. We need1135 * to find the appropriate port.1136 */1137
1138 voidtcp_err(inttype, intcode, unsignedchar *header, __u32daddr,
/* */1139 __u32saddr, structinet_protocol *protocol)
1140 {1141 structtcphdr *th = (structtcphdr *)header;
1142 structsock *sk;
1143
1144 /*1145 * This one is _WRONG_. FIXME urgently.1146 */1147 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1148 structiphdr *iph=(structiphdr *)(header-sizeof(structiphdr));
1149 #endif1150 th =(structtcphdr *)header;
1151 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
1152
1153 if (sk == NULL)
1154 return;
1155
1156 if (type == ICMP_SOURCE_QUENCH)
1157 {1158 /*1159 * FIXME:1160 * For now we will just trigger a linear backoff.1161 * The slow start code should cause a real backoff here.1162 */1163 if (sk->cong_window > 4)
1164 sk->cong_window--;
1165 return;
1166 }1167
1168 if (type == ICMP_PARAMETERPROB)
1169 {1170 sk->err=EPROTO;
1171 sk->error_report(sk);
1172 }1173
1174 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1175 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
1176 {1177 structrtable * rt;
1178 /*1179 * Ugly trick to pass MTU to protocol layer.1180 * Really we should add argument "info" to error handler.1181 */1182 unsignedshortnew_mtu = ntohs(iph->id);
1183
1184 if ((rt = sk->ip_route_cache) != NULL)
1185 if (rt->rt_mtu > new_mtu)
1186 rt->rt_mtu = new_mtu;
1187
1188 if (sk->mtu > new_mtu - sizeof(structiphdr) - sizeof(structtcphdr)
1189 && new_mtu > sizeof(structiphdr)+sizeof(structtcphdr))
1190 sk->mtu = new_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
1191
1192 return;
1193 }1194 #endif1195
1196 /*1197 * If we've already connected we will keep trying1198 * until we time out, or the user gives up.1199 */1200
1201 if (code < 13)
1202 {1203 if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1204 {1205 sk->err = icmp_err_convert[code].errno;
1206 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1207 {1208 tcp_statistics.TcpAttemptFails++;
1209 tcp_set_state(sk,TCP_CLOSE);
1210 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */1211 }1212 }1213 else/* Only an error on timeout */1214 sk->err_soft = icmp_err_convert[code].errno;
1215 }1216 }1217
1218
/*
 *	Walk down the receive queue counting readable data until we hit the end or we find a gap
 *	in the received data queue (ie a frame missing that needs sending to us). Not
 *	sorting using two queues as data arrives makes life so much harder.
 *
 *	Returns the number of bytes a read() could currently deliver
 *	(0 if the queue is empty, the socket is NULL, or only out-of-order
 *	data is queued). Runs with interrupts disabled while scanning.
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;
	unsigned long amount;
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->seq))		/* Found a hole so stops here */
			break;
		sum = skb->len - (counted - skb->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;		/* SYN occupies one sequence number but no data byte */
		if (sum > 0)
		{					/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;	/* ...but don't report the SYN as readable data */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop. This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;	/* PSH with data pending: report what we have */
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
1296 /*1297 * LISTEN is a special case for select..1298 */1299 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */1300 {1301 if (sel_type == SEL_IN) {1302 intretval;
1303
1304 sk->inuse = 1;
1305 retval = (tcp_find_established(sk) != NULL);
1306 release_sock(sk);
1307 if (!retval)
1308 select_wait(&master_select_wakeup,wait);
1309 returnretval;
1310 }1311 return 0;
1312 }1313
1314
1315 /*1316 * Wait for a TCP event.1317 *1318 * Note that we don't need to set "sk->inuse", as the upper select layers1319 * take care of normal races (between the test and the event) and we don't1320 * go look at any of the socket buffers directly.1321 */1322 staticinttcp_select(structsock *sk, intsel_type, select_table *wait)
/* */1323 {1324 if (sk->state == TCP_LISTEN)
1325 returntcp_listen_select(sk, sel_type, wait);
1326
1327 switch(sel_type) {1328 caseSEL_IN:
1329 if (sk->err)
1330 return 1;
1331 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1332 break;
1333
1334 if (sk->shutdown & RCV_SHUTDOWN)
1335 return 1;
1336
1337 if (sk->acked_seq == sk->copied_seq)
1338 break;
1339
1340 if (sk->urg_seq != sk->copied_seq ||
1341 sk->acked_seq != sk->copied_seq+1 ||
1342 sk->urginline || !sk->urg_data)
1343 return 1;
1344 break;
1345
1346 caseSEL_OUT:
1347 if (sk->err)
1348 return 1;
1349 if (sk->shutdown & SEND_SHUTDOWN)
1350 return 0;
1351 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1352 break;
1353 /*1354 * This is now right thanks to a small fix1355 * by Matt Dillon.1356 */1357
1358 if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
1359 break;
1360 return 1;
1361
1362 caseSEL_EX:
1363 if (sk->urg_data)
1364 return 1;
1365 break;
1366 }1367 select_wait(sk->sleep, wait);
1368 return 0;
1369 }1370
1371 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */1372 {1373 interr;
1374 switch(cmd)
1375 {1376
1377 caseTIOCINQ:
1378 #ifdef FIXME /* FIXME: */1379 caseFIONREAD:
1380 #endif1381 {1382 unsignedlongamount;
1383
1384 if (sk->state == TCP_LISTEN)
1385 return(-EINVAL);
1386
1387 sk->inuse = 1;
1388 amount = tcp_readable(sk);
1389 release_sock(sk);
1390 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1391 if(err)
1392 returnerr;
1393 put_user(amount, (int *)arg);
1394 return(0);
1395 }1396 caseSIOCATMARK:
1397 {1398 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
1399
1400 err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1401 if (err)
1402 returnerr;
1403 put_user(answ,(int *) arg);
1404 return(0);
1405 }1406 caseTIOCOUTQ:
1407 {1408 unsignedlongamount;
1409
1410 if (sk->state == TCP_LISTEN) return(-EINVAL);
1411 amount = sock_wspace(sk);
1412 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1413 if(err)
1414 returnerr;
1415 put_user(amount, (int *)arg);
1416 return(0);
1417 }1418 default:
1419 return(-EINVAL);
1420 }1421 }1422
1423
1424 /*1425 * This routine computes a TCP checksum. 1426 *1427 * Modified January 1995 from a go-faster DOS routine by1428 * Jorge Cwik <jorge@laser.satlink.net>1429 */1430
1431 unsignedshorttcp_check(structtcphdr *th, intlen,
/* */1432 unsignedlongsaddr, unsignedlongdaddr, unsignedlongbase)
1433 {1434 returncsum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
1435 }1436
1437 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1438 unsignedlongdaddr, intlen, structsock *sk)
1439 {1440 th->check = 0;
1441 th->check = tcp_check(th, len, saddr, daddr,
1442 csum_partial((char *)th,len,0));
1443 return;
1444 }1445
/*
 *	This is the main buffer sending routine. We queue the buffer
 *	having checked it is sane seeming. A frame is either transmitted
 *	immediately or placed on the socket's write queue for later,
 *	depending on window, retransmit state and packets in flight.
 */

static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing: record the sequence span this frame covers
	 *	(data bytes only, header excluded).
	 */

	tcp_statistics.TcpOutSegs++;
	skb->seq = ntohl(th->seq);
	skb->end_seq = skb->seq + size - 4*th->doff;

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */

	if (after(skb->end_seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit. So
		 * we shouldn't need to set it at all. I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (This is what causes the Cisco and PC/TCP
		 *	grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out: piggyback the current
		 *	ack and window, checksum, and hand to the IP layer.
		 */

		th->ack_seq = htonl(sk->acked_seq);
		th->window = htons(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/* The piggybacked ack clears any ack debt. */
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1566 /*1567 * Locking problems lead us to a messy situation where we can have1568 * multiple partially complete buffers queued up. This is really bad1569 * as we don't want to be sending partial buffers. Fix this with1570 * a semaphore or similar to lock tcp_write per socket.1571 *1572 * These routines are pretty self descriptive.1573 */1574
1575 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1576 {1577 structsk_buff * skb;
1578 unsignedlongflags;
1579
1580 save_flags(flags);
1581 cli();
1582 skb = sk->partial;
1583 if (skb) {1584 sk->partial = NULL;
1585 del_timer(&sk->partial_timer);
1586 }1587 restore_flags(flags);
1588 returnskb;
1589 }1590
1591 /*1592 * Empty the partial queue1593 */1594
1595 staticvoidtcp_send_partial(structsock *sk)
/* */1596 {1597 structsk_buff *skb;
1598
1599 if (sk == NULL)
1600 return;
1601 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1602 tcp_send_skb(sk, skb);
1603 }1604
1605 /*1606 * Queue a partial frame1607 */1608
1609 voidtcp_enqueue_partial(structsk_buff * skb, structsock * sk)
/* */1610 {1611 structsk_buff * tmp;
1612 unsignedlongflags;
1613
1614 save_flags(flags);
1615 cli();
1616 tmp = sk->partial;
1617 if (tmp)
1618 del_timer(&sk->partial_timer);
1619 sk->partial = skb;
1620 init_timer(&sk->partial_timer);
1621 /*1622 * Wait up to 1 second for the buffer to fill.1623 */1624 sk->partial_timer.expires = jiffies+HZ;
1625 sk->partial_timer.function = (void (*)(unsignedlong)) tcp_send_partial;
1626 sk->partial_timer.data = (unsignedlong) sk;
1627 add_timer(&sk->partial_timer);
1628 restore_flags(flags);
1629 if (tmp)
1630 tcp_send_skb(sk, tmp);
1631 }1632
1633
1634
1635 /*1636 * This routine sends an ack and also updates the window. 1637 */1638
1639 staticvoidtcp_send_ack(u32sequence, u32ack,
/* */1640 structsock *sk,
1641 structtcphdr *th, unsignedlongdaddr)
1642 {1643 structsk_buff *buff;
1644 structtcphdr *t1;
1645 structdevice *dev = NULL;
1646 inttmp;
1647
1648 if(sk->zapped)
1649 return; /* We have been reset, we may not send again */1650
1651 /*1652 * We need to grab some memory, and put together an ack,1653 * and then put it into the queue to be sent.1654 */1655
1656 buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1657 if (buff == NULL)
1658 {1659 /* 1660 * Force it to send an ack. We don't have to do this1661 * (ACK is unreliable) but it's much better use of 1662 * bandwidth on slow links to send a spare ack than1663 * resend packets. 1664 */1665
1666 sk->ack_backlog++;
1667 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
1668 {1669 reset_xmit_timer(sk, TIME_WRITE, HZ);
1670 }1671 return;
1672 }1673
1674 /*1675 * Assemble a suitable TCP frame1676 */1677
1678 buff->sk = sk;
1679 buff->localroute = sk->localroute;
1680
1681 /* 1682 * Put in the IP header and routing stuff. 1683 */1684
1685 tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1686 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1687 if (tmp < 0)
1688 {1689 buff->free = 1;
1690 sock_wfree(sk, buff);
1691 return;
1692 }1693 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
1694
1695 memcpy(t1, th, sizeof(*t1));
1696
1697 /*1698 * Swap the send and the receive. 1699 */1700
1701 t1->dest = th->source;
1702 t1->source = th->dest;
1703 t1->seq = ntohl(sequence);
1704 t1->ack = 1;
1705 sk->window = tcp_select_window(sk);
1706 t1->window = ntohs(sk->window);
1707 t1->res1 = 0;
1708 t1->res2 = 0;
1709 t1->rst = 0;
1710 t1->urg = 0;
1711 t1->syn = 0;
1712 t1->psh = 0;
1713 t1->fin = 0;
1714
1715 /*1716 * If we have nothing queued for transmit and the transmit timer1717 * is on we are just doing an ACK timeout and need to switch1718 * to a keepalive.1719 */1720
1721 if (ack == sk->acked_seq) {1722 sk->ack_backlog = 0;
1723 sk->bytes_rcv = 0;
1724 sk->ack_timed = 0;
1725
1726 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL1727 && sk->ip_xmit_timeout == TIME_WRITE)
1728 if(sk->keepopen)
1729 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1730 else1731 delete_timer(sk);
1732 }1733
1734 /*1735 * Fill in the packet and send it1736 */1737
1738 t1->ack_seq = htonl(ack);
1739 t1->doff = sizeof(*t1)/4;
1740 tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1741 if (sk->debug)
1742 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1743 tcp_statistics.TcpOutSegs++;
1744 sk->prot->queue_xmit(sk, dev, buff, 1);
1745 }1746
1747
1748 /* 1749 * This routine builds a generic TCP header. 1750 */1751
1752 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1753 {1754
1755 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1756 th->seq = htonl(sk->write_seq);
1757 th->psh =(push == 0) ? 1 : 0;
1758 th->doff = sizeof(*th)/4;
1759 th->ack = 1;
1760 th->fin = 0;
1761 sk->ack_backlog = 0;
1762 sk->bytes_rcv = 0;
1763 sk->ack_timed = 0;
1764 th->ack_seq = htonl(sk->acked_seq);
1765 sk->window = tcp_select_window(sk);
1766 th->window = htons(sk->window);
1767
1768 return(sizeof(*th));
1769 }1770
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	Walks the iovec, waiting for the connection to establish and for
 *	buffer memory as required. Returns the number of bytes queued,
 *	or a negative errno if nothing was copied before the failure.
 */

static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags)
{
	int copied = 0;		/* bytes accepted so far; once non-zero we return it instead of an error */
	int copy;		/* bytes to place in the current segment */
	int tmp;
	int seglen;
	int iovct=0;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;	/* non-NULL when the skb was sized for a partial frame */
	struct proto *prot;
	struct device *dev = NULL;
	unsigned char *from;

	/*
	 *	Do sanity checking for sendmsg/sendto/send
	 */

	if (flags & ~(MSG_OOB|MSG_DONTROUTE))
		return -EINVAL;
	if (msg->msg_name)
	{
		/* An address may be supplied but must match the connected peer. */
		struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
		if(sk->state == TCP_CLOSE)
			return -ENOTCONN;
		if (msg->msg_namelen < sizeof(*addr))
			return -EINVAL;
		if (addr->sin_family && addr->sin_family != AF_INET)
			return -EINVAL;
		if (addr->sin_port != sk->dummy_th.dest)
			return -EISCONN;
		if (addr->sin_addr.s_addr != sk->daddr)
			return -EISCONN;
	}

	/*
	 *	Ok commence sending: one pass per iovec element.
	 */

	while(iovct<msg->msg_iovlen)
	{
		seglen=msg->msg_iov[iovct].iov_len;
		from=msg->msg_iov[iovct++].iov_base;
		sk->inuse=1;
		prot = sk->prot;
		while(seglen > 0)
		{
			if (sk->err)
			{			/* Stop on an error */
				release_sock(sk);
				if (copied)
					return(copied);
				return sock_error(sk);
			}

			/*
			 *	First thing we do is make sure that we are established.
			 */

			if (sk->shutdown & SEND_SHUTDOWN)
			{
				release_sock(sk);
				sk->err = EPIPE;
				if (copied)
					return(copied);
				sk->err = 0;
				return(-EPIPE);
			}

			/*
			 *	Wait for a connection to finish.
			 */

			while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
			{
				if (sk->err)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return sock_error(sk);
				}

				/* Any state other than the two SYN states means the connection is dead. */
				if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
				{
					release_sock(sk);
					if (copied)
						return(copied);

					if (sk->err)
						return sock_error(sk);

					if (sk->keepopen)
					{
						send_sig(SIGPIPE, current, 0);
					}
					return(-EPIPE);
				}

				if (nonblock || copied)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return(-EAGAIN);
				}

				release_sock(sk);
				cli();

				/* Re-test with interrupts off before sleeping (avoids a lost wakeup). */
				if (sk->state != TCP_ESTABLISHED &&
				    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
				{
					interruptible_sleep_on(sk->sleep);
					if (current->signal & ~current->blocked)
					{
						sti();
						if (copied)
							return(copied);
						return(-ERESTARTSYS);
					}
				}
				sk->inuse = 1;
				sti();
			}

			/*
			 *	The following code can result in copy <= if sk->mss is ever
			 *	decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
			 *	sk->mtu is constant once SYN processing is finished.  I.e. we
			 *	had better not get here until we've seen his SYN and at least one
			 *	valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
			 *	But ESTABLISHED should guarantee that.  sk->max_window is by definition
			 *	non-decreasing.  Note that any ioctl to set user_mss must be done
			 *	before the exchange of SYN's.  If the initial ack from the other
			 *	end has a window of 0, max_window and thus mss will both be 0.
			 */

			/*
			 *	Now we need to check if we have a half built packet.
			 */
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			/*
			 *	FIXME: I'm almost sure that this fragment is BUG,
			 *	but it works... I do not know why 8) --ANK
			 *
			 *	Really, we should rebuild all the queues...
			 *	It's difficult. Temprorary hack is to send all
			 *	queued segments with allowed fragmentation.
			 */
			{
				int new_mss = min(sk->mtu, sk->max_window);
				if (new_mss < sk->mss)
				{
					tcp_send_partial(sk);
					sk->mss = new_mss;
				}
			}
#endif

			if ((skb = tcp_dequeue_partial(sk)) != NULL)
			{
				int hdrlen;

				/* IP header + TCP header */
				hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
					+ sizeof(struct tcphdr);

				/* Add more stuff to the end of skb->len */
				if (!(flags & MSG_OOB))
				{
					copy = min(sk->mss - (skb->len - hdrlen), seglen);
					if (copy <= 0)
					{
						printk("TCP: **bug**: \"copy\" <= 0\n");
						return -EFAULT;
					}
					memcpy_fromfs(skb_put(skb,copy), from, copy);
					from += copy;
					copied += copy;
					len -= copy;
					sk->write_seq += copy;
					seglen -= copy;
				}
				/* Full segment, OOB, or idle link: send now; otherwise keep it partial. */
				if ((skb->len - hdrlen) >= sk->mss ||
				    (flags & MSG_OOB) || !sk->packets_out)
					tcp_send_skb(sk, skb);
				else
					tcp_enqueue_partial(skb, sk);
				continue;
			}

			/*
			 *	We also need to worry about the window.
			 *	If window < 1/2 the maximum window we've seen from this
			 *	host, don't use it.  This is sender side
			 *	silly window prevention, as specified in RFC1122.
			 *	(Note that this is different than earlier versions of
			 *	SWS prevention, e.g. RFC813.).  What we actually do is
			 *	use the whole MSS.  Since the results in the right
			 *	edge of the packet being outside the window, it will
			 *	be queued for later rather than sent.
			 */

			copy = sk->window_seq - sk->write_seq;
			if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
				copy = sk->mss;
			if (copy > seglen)
				copy = seglen;

			/*
			 *	We should really check the window here also.
			 */

			send_tmp = NULL;
			if (copy < sk->mss && !(flags & MSG_OOB))
			{
				/*
				 *	We will release the socket in case we sleep here.
				 */
				release_sock(sk);
				/*
				 *	NB: following must be mtu, because mss can be increased.
				 *	mss is always <= mtu
				 */
				skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
				sk->inuse = 1;
				send_tmp = skb;
			}
			else
			{
				/*
				 *	We will release the socket in case we sleep here.
				 */
				release_sock(sk);
				skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
				sk->inuse = 1;
			}

			/*
			 *	If we didn't get any memory, we need to sleep.
			 */

			if (skb == NULL)
			{
				sk->socket->flags |= SO_NOSPACE;
				if (nonblock)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return(-EAGAIN);
				}

				/*
				 *	FIXME: here is another race condition.
				 */

				tmp = sk->wmem_alloc;
				release_sock(sk);
				cli();
				/*
				 *	Again we will try to avoid it: only sleep if no
				 *	memory was freed while we released the socket.
				 */
				if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				  && sk->err == 0)
				{
					sk->socket->flags &= ~SO_NOSPACE;
					interruptible_sleep_on(sk->sleep);
					if (current->signal & ~current->blocked)
					{
						sti();
						if (copied)
							return(copied);
						return(-ERESTARTSYS);
					}
				}
				sk->inuse = 1;
				sti();
				continue;
			}

			skb->sk = sk;
			skb->free = 0;
			skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

			/*
			 *	FIXME: we need to optimize this.
			 *	Perhaps some hints here would be good.
			 */

			tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
			if (tmp < 0 )
			{
				sock_wfree(sk, skb);
				release_sock(sk);
				if (copied)
					return(copied);
				return(tmp);
			}
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			/* Path MTU discovery: mark the datagram Don't Fragment. */
			skb->ip_hdr->frag_off |= htons(IP_DF);
#endif
			skb->dev = dev;
			skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
			tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
			if (tmp < 0)
			{
				sock_wfree(sk, skb);
				release_sock(sk);
				if (copied)
					return(copied);
				return(tmp);
			}

			if (flags & MSG_OOB)
			{
				skb->h.th->urg = 1;
				skb->h.th->urg_ptr = ntohs(copy);
			}

			memcpy_fromfs(skb_put(skb,copy), from, copy);

			from += copy;
			copied += copy;
			len -= copy;
			seglen -= copy;
			skb->free = 0;
			sk->write_seq += copy;

			/* Undersized frame with data in flight: hold it as a partial. */
			if (send_tmp != NULL && sk->packets_out)
			{
				tcp_enqueue_partial(send_tmp, sk);
				continue;
			}
			tcp_send_skb(sk, skb);
		}
	}
	sk->err = 0;

	/*
	 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
	 *	interactive fast network servers. It's meant to be on and
	 *	it really improves the throughput though not the echo time
	 *	on my slow slip link - Alan
	 */

	/*
	 *	Avoid possible race on send_tmp - c/o Johannes Stille
	 */

	if(sk->partial && ((!sk->packets_out)
	/* If not nagling we can send on the before case too.. */
	  || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
2139 /*2140 * Send an ack if one is backlogged at this point. Ought to merge2141 * this with tcp_send_ack().2142 * This is called for delayed acks also.2143 */2144
2145 staticvoidtcp_read_wakeup(structsock *sk)
/* */2146 {2147 inttmp;
2148 structdevice *dev = NULL;
2149 structtcphdr *t1;
2150 structsk_buff *buff;
2151
2152 if (!sk->ack_backlog)
2153 return;
2154
2155 /*2156 * If we're closed, don't send an ack, or we'll get a RST2157 * from the closed destination.2158 */2159 if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
2160 return;
2161
2162 /*2163 * FIXME: we need to put code here to prevent this routine from2164 * being called. Being called once in a while is ok, so only check2165 * if this is the second time in a row.2166 */2167
2168 /*2169 * We need to grab some memory, and put together an ack,2170 * and then put it into the queue to be sent.2171 */2172
2173 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
2174 if (buff == NULL)
2175 {2176 /* Try again real soon. */2177 reset_xmit_timer(sk, TIME_WRITE, HZ);
2178 return;
2179 }2180
2181 buff->sk = sk;
2182 buff->localroute = sk->localroute;
2183
2184 /*2185 * Put in the IP header and routing stuff. 2186 */2187
2188 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
2189 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2190 if (tmp < 0)
2191 {2192 buff->free = 1;
2193 sock_wfree(sk, buff);
2194 return;
2195 }2196
2197 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2198
2199 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
2200 t1->seq = htonl(sk->sent_seq);
2201 t1->ack = 1;
2202 t1->res1 = 0;
2203 t1->res2 = 0;
2204 t1->rst = 0;
2205 t1->urg = 0;
2206 t1->syn = 0;
2207 t1->psh = 0;
2208
2209
2210 sk->ack_backlog = 0;
2211 sk->bytes_rcv = 0;
2212
2213 sk->window = tcp_select_window(sk);
2214 t1->window = htons(sk->window);
2215 t1->ack_seq = htonl(sk->acked_seq);
2216 t1->doff = sizeof(*t1)/4;
2217 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2218 sk->prot->queue_xmit(sk, dev, buff, 1);
2219 tcp_statistics.TcpOutSegs++;
2220 }2221
2222
/*
 *	FIXME:
 *	This routine frees used buffers.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window.
 *
 *	Frees every fully-consumed skb at the head of the receive queue,
 *	then, if receive space changed, either acks immediately or arms
 *	a short delayed-ack timer so the peer learns about the new window.
 */

static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	/* Remember the receive space before freeing, to detect a change below. */
	left = sock_rspace(sk);

	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can. Stop at the first
	 *	skb that is still unread or in use.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sock_rspace(sk),
			left);
	if ((rspace=sock_rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
		/*
		 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
		 * if the other end is offering a window smaller than the agreed on MSS
		 * (called sk->mtu here).  In theory there's no connection between send
		 * and receive, and so no reason to think that they're going to send
		 * small packets.  For the moment I'm using the hack of reducing the mss
		 * only on the send side, so I'm putting mtu here.
		 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon, but don't shorten an
			 * already-pending earlier expiry. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);
		}
	}
}
2312
2313 /*2314 * Handle reading urgent data. BSD has very simple semantics for2315 * this, no blocking and very strange errors 8)2316 */2317
2318 staticinttcp_recv_urg(structsock * sk, intnonblock,
/* */2319 structmsghdr *msg, intlen, intflags, int *addr_len)
2320 {2321 /*2322 * No URG data to read2323 */2324 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
2325 return -EINVAL; /* Yes this is right ! */2326
2327 if (sk->err)
2328 returnsock_error(sk);
2329
2330 if (sk->state == TCP_CLOSE || sk->done)
2331 {2332 if (!sk->done)
2333 {2334 sk->done = 1;
2335 return 0;
2336 }2337 return -ENOTCONN;
2338 }2339
2340 if (sk->shutdown & RCV_SHUTDOWN)
2341 {2342 sk->done = 1;
2343 return 0;
2344 }2345 sk->inuse = 1;
2346 if (sk->urg_data & URG_VALID)
2347 {2348 charc = sk->urg_data;
2349 if (!(flags & MSG_PEEK))
2350 sk->urg_data = URG_READ;
2351 memcpy_toiovec(msg->msg_iov, &c, 1);
2352 if(msg->msg_name)
2353 {2354 structsockaddr_in *sin=(structsockaddr_in *)msg->msg_name;
2355 sin->sin_family=AF_INET;
2356 sin->sin_addr.s_addr=sk->daddr;
2357 sin->sin_port=sk->dummy_th.dest;
2358 }2359 if(addr_len)
2360 *addr_len=sizeof(structsockaddr_in);
2361 release_sock(sk);
2362 return 1;
2363 }2364 release_sock(sk);
2365
2366 /*2367 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and2368 * the available implementations agree in this case:2369 * this call should never block, independent of the2370 * blocking state of the socket.2371 * Mike <pall@rz.uni-karlsruhe.de>2372 */2373 return -EAGAIN;
2374 }2375
2376
2377 /*2378 * This routine copies from a sock struct into the user buffer. 2379 */2380
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	`seq` normally points at sk->copied_seq; for MSG_PEEK it points at
 *	a local copy instead, so peeking never advances the official read
 *	pointer.  It is volatile because multiple readers can race here
 *	and the copy to user space may sleep (see found_ok_skb below).
 *
 *	Returns the number of bytes copied, or a negative errno.
 */
static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags, int *addr_len)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */
	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */
	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 */
	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 * Are we at urgent data? Stop if we have read anything.
		 */
		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.  Set the task state before the
		 *	scan so a wakeup between scan and schedule() is not
		 *	lost.
		 */
		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* Gap before this skb: nothing readable yet. */
			if (before(*seq, skb->seq))
				break;
			offset = *seq - skb->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies a sequence number but no data */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed: cleanup_rbuf may free it */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* Something was read already: return it rather than sleep. */
		if (copied)
			break;

		if (sk->err)
		{
			copied = sock_error(sk);
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;	/* first EOF report is 0 bytes */
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Nothing to read: ack freed space, then sleep for data. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */
		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */
		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?  If so, clip the read
		 *	short of it (or step over the urgent byte when it is
		 *	first and not inline).
		 */
		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					if (!sk->urginline)
					{
						++*seq;	/* skip the urgent byte */
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */
		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */
		memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */
		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;	/* urgent byte is behind us now */
		if (used + offset < skb->len)
			continue;	/* more data left in this skb */

		/*
		 *	Process the FIN.
		 */
		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;	/* drained: eligible for freeing */
		continue;

	found_fin_ok:
		++*seq;	/* FIN consumes one sequence number */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}

	/* Fill in the peer's address if the caller asked for it. */
	if(copied>0 && msg->msg_name)
	{
		struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
		sin->sin_family=AF_INET;
		sin->sin_addr.s_addr=sk->daddr;
		sin->sin_port=sk->dummy_th.dest;
	}
	if(addr_len)
		*addr_len=sizeof(struct sockaddr_in);

	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2617
2618
2619 /*2620 * State processing on a close. This implements the state shift for2621 * sending our FIN frame. Note that we only send a FIN for some 2622 * states. A shutdown() may have already sent the FIN, or we may be2623 * closed.2624 */2625
2626 staticinttcp_close_state(structsock *sk, intdead)
/* */2627 {2628 intns=TCP_CLOSE;
2629 intsend_fin=0;
2630 switch(sk->state)
2631 {2632 caseTCP_SYN_SENT: /* No SYN back, no FIN needed */2633 break;
2634 caseTCP_SYN_RECV:
2635 caseTCP_ESTABLISHED: /* Closedown begin */2636 ns=TCP_FIN_WAIT1;
2637 send_fin=1;
2638 break;
2639 caseTCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */2640 caseTCP_FIN_WAIT2:
2641 caseTCP_CLOSING:
2642 ns=sk->state;
2643 break;
2644 caseTCP_CLOSE:
2645 caseTCP_LISTEN:
2646 break;
2647 caseTCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and2648 wait only for the ACK */2649 ns=TCP_LAST_ACK;
2650 send_fin=1;
2651 }2652
2653 tcp_set_state(sk,ns);
2654
2655 /*2656 * This is a (useful) BSD violating of the RFC. There is a2657 * problem with TCP as specified in that the other end could2658 * keep a socket open forever with no application left this end.2659 * We use a 3 minute timeout (about the same as BSD) then kill2660 * our end. If they send after that then tough - BUT: long enough2661 * that we won't make the old 4*rto = almost no time - whoops2662 * reset mistake.2663 */2664 if(dead && ns==TCP_FIN_WAIT2)
2665 {2666 inttimer_active=del_timer(&sk->timer);
2667 if(timer_active)
2668 add_timer(&sk->timer);
2669 else2670 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2671 }2672
2673 returnsend_fin;
2674 }2675
2676 /*2677 * Send a fin.2678 */2679
/*
 *	Send a fin.
 *
 *	Builds the FIN segment for our side of the close.  If data is
 *	still queued for transmission the FIN is appended to the write
 *	queue (so it goes out in order); otherwise it is transmitted at
 *	once and the retransmit timer is armed.  The socket lock is
 *	dropped around the allocation so GFP_KERNEL may sleep, then
 *	re-taken via sk->inuse.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */
	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost.
		 *	(Not good).  The sequence number is still consumed so
		 *	the state machine stays consistent, and the MSL timer
		 *	is armed unless some other timer is already pending.
		 */
		buff->free = 1;
		sock_wfree(sk,buff);
		sk->write_seq++;
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	buff->seq = sk->write_seq;
	sk->write_seq++;	/* the FIN consumes one sequence number */
	buff->end_seq = sk->write_seq;
	t1->seq = htonl(buff->seq);
	t1->ack = 1;
	t1->ack_seq = htonl(sk->acked_seq);
	t1->window = htons(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 * If there is data in the write queue, the fin must be appended to
	 * the write queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			/* Should never happen: buffer is already on a list. */
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		/* Queue empty: transmit the FIN right away. */
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2777 /*2778 * Shutdown the sending side of a connection. Much like close except2779 * that we don't receive shut down or set sk->dead=1.2780 */2781
2782 voidtcp_shutdown(structsock *sk, inthow)
/* */2783 {2784 /*2785 * We need to grab some memory, and put together a FIN,2786 * and then put it into the queue to be sent.2787 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2788 */2789
2790 if (!(how & SEND_SHUTDOWN))
2791 return;
2792
2793 /*2794 * If we've already sent a FIN, or it's a closed state2795 */2796
2797 if (sk->state == TCP_FIN_WAIT1 ||
2798 sk->state == TCP_FIN_WAIT2 ||
2799 sk->state == TCP_CLOSING ||
2800 sk->state == TCP_LAST_ACK ||
2801 sk->state == TCP_TIME_WAIT ||
2802 sk->state == TCP_CLOSE ||
2803 sk->state == TCP_LISTEN2804 )
2805 {2806 return;
2807 }2808 sk->inuse = 1;
2809
2810 /*2811 * flag that the sender has shutdown2812 */2813
2814 sk->shutdown |= SEND_SHUTDOWN;
2815
2816 /*2817 * Clear out any half completed packets. 2818 */2819
2820 if (sk->partial)
2821 tcp_send_partial(sk);
2822
2823 /*2824 * FIN if needed2825 */2826
2827 if(tcp_close_state(sk,0))
2828 tcp_send_fin(sk);
2829
2830 release_sock(sk);
2831 }2832
2833 /*2834 * This routine will send an RST to the other tcp. 2835 */2836
2837 staticvoidtcp_reset(unsignedlongsaddr, unsignedlongdaddr, structtcphdr *th,
/* */2838 structproto *prot, structoptions *opt, structdevice *dev, inttos, intttl)
2839 {2840 structsk_buff *buff;
2841 structtcphdr *t1;
2842 inttmp;
2843 structdevice *ndev=NULL;
2844
2845 /*2846 * Cannot reset a reset (Think about it).2847 */2848
2849 if(th->rst)
2850 return;
2851
2852 /*2853 * We need to grab some memory, and put together an RST,2854 * and then put it into the queue to be sent.2855 */2856
2857 buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2858 if (buff == NULL)
2859 return;
2860
2861 buff->sk = NULL;
2862 buff->dev = dev;
2863 buff->localroute = 0;
2864
2865 /*2866 * Put in the IP header and routing stuff. 2867 */2868
2869 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2870 sizeof(structtcphdr),tos,ttl,NULL);
2871 if (tmp < 0)
2872 {2873 buff->free = 1;
2874 sock_wfree(NULL, buff);
2875 return;
2876 }2877
2878 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2879 memcpy(t1, th, sizeof(*t1));
2880
2881 /*2882 * Swap the send and the receive. 2883 */2884
2885 t1->dest = th->source;
2886 t1->source = th->dest;
2887 t1->rst = 1;
2888 t1->window = 0;
2889
2890 if(th->ack)
2891 {2892 t1->ack = 0;
2893 t1->seq = th->ack_seq;
2894 t1->ack_seq = 0;
2895 }2896 else2897 {2898 t1->ack = 1;
2899 if(!th->syn)
2900 t1->ack_seq = th->seq;
2901 else2902 t1->ack_seq = htonl(ntohl(th->seq)+1);
2903 t1->seq = 0;
2904 }2905
2906 t1->syn = 0;
2907 t1->urg = 0;
2908 t1->fin = 0;
2909 t1->psh = 0;
2910 t1->doff = sizeof(*t1)/4;
2911 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2912 prot->queue_xmit(NULL, ndev, buff, 1);
2913 tcp_statistics.TcpOutSegs++;
2914 }2915
2916
2917 /*2918 * Look for tcp options. Parses everything but only knows about MSS.2919 * This routine is always called with the packet containing the SYN.2920 * However it may also be called with the ack to the SYN. So you2921 * can't assume this is always the SYN. It's always called after2922 * we have set up sk->mtu to our own MTU.2923 *2924 * We need at minimum to add PAWS support here. Possibly large windows2925 * as Linux gets deployed on 100Mb/sec networks.2926 */2927
2928 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2929 {2930 unsignedchar *ptr;
2931 intlength=(th->doff*4)-sizeof(structtcphdr);
2932 intmss_seen = 0;
2933
2934 ptr = (unsignedchar *)(th + 1);
2935
2936 while(length>0)
2937 {2938 intopcode=*ptr++;
2939 intopsize=*ptr++;
2940 switch(opcode)
2941 {2942 caseTCPOPT_EOL:
2943 return;
2944 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2945 length--;
2946 ptr--; /* the opsize=*ptr++ above was a mistake */2947 continue;
2948
2949 default:
2950 if(opsize<=2) /* Avoid silly options looping forever */2951 return;
2952 switch(opcode)
2953 {2954 caseTCPOPT_MSS:
2955 if(opsize==4 && th->syn)
2956 {2957 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2958 mss_seen = 1;
2959 }2960 break;
2961 /* Add other options here as people feel the urge to implement stuff like large windows */2962 }2963 ptr+=opsize-2;
2964 length-=opsize;
2965 }2966 }2967 if (th->syn)
2968 {2969 if (! mss_seen)
2970 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2971 }2972 #ifdefCONFIG_INET_PCTCP2973 sk->mss = min(sk->max_window >> 1, sk->mtu);
2974 #else2975 sk->mss = min(sk->max_window, sk->mtu);
2976 sk->max_unacked = 2 * sk->mss;
2977 #endif2978 }2979
/*
 *	Classful netmask for an address given in network byte order:
 *	class A -> /8, class B -> /16, everything else -> /24.
 *	The mask is returned in network byte order too.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);
	unsigned long mask;

	if (IN_CLASSA(host))
		mask = IN_CLASSA_NET;
	else if (IN_CLASSB(host))
		mask = IN_CLASSB_NET;
	else
		mask = IN_CLASSC_NET;

	return htonl(mask);
}
2990 /*2991 * Default sequence number picking algorithm.2992 * As close as possible to RFC 793, which2993 * suggests using a 250kHz clock.2994 * Further reading shows this assumes 2MB/s networks.2995 * For 10MB/s ethernet, a 1MHz clock is appropriate.2996 * That's funny, Linux has one built in! Use it!2997 */2998
2999 externinlineu32tcp_init_seq(void)
/* */3000 {3001 structtimevaltv;
3002 do_gettimeofday(&tv);
3003 returntv.tv_usec+tv.tv_sec*1000000;
3004 }3005
3006 /*3007 * This routine handles a connection request.3008 * It should make sure we haven't already responded.3009 * Because of the way BSD works, we have to send a syn/ack now.3010 * This also means it will be harder to close a socket which is3011 * listening.3012 */3013
3014 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */3015 unsignedlongdaddr, unsignedlongsaddr,
3016 structoptions *opt, structdevice *dev, u32seq)
3017 {3018 structsk_buff *buff;
3019 structtcphdr *t1;
3020 unsignedchar *ptr;
3021 structsock *newsk;
3022 structtcphdr *th;
3023 structdevice *ndev=NULL;
3024 inttmp;
3025 structrtable *rt;
3026
3027 th = skb->h.th;
3028
3029 /* If the socket is dead, don't accept the connection. */3030 if (!sk->dead)
3031 {3032 sk->data_ready(sk,0);
3033 }3034 else3035 {3036 if(sk->debug)
3037 printk("Reset on %p: Connect on dead socket.\n",sk);
3038 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
3039 tcp_statistics.TcpAttemptFails++;
3040 kfree_skb(skb, FREE_READ);
3041 return;
3042 }3043
3044 /*3045 * Make sure we can accept more. This will prevent a3046 * flurry of syns from eating up all our memory.3047 */3048
3049 if (sk->ack_backlog >= sk->max_ack_backlog)
3050 {3051 tcp_statistics.TcpAttemptFails++;
3052 kfree_skb(skb, FREE_READ);
3053 return;
3054 }3055
3056 /*3057 * We need to build a new sock struct.3058 * It is sort of bad to have a socket without an inode attached3059 * to it, but the wake_up's will just wake up the listening socket,3060 * and if the listening socket is destroyed before this is taken3061 * off of the queue, this will take care of it.3062 */3063
3064 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
3065 if (newsk == NULL)
3066 {3067 /* just ignore the syn. It will get retransmitted. */3068 tcp_statistics.TcpAttemptFails++;
3069 kfree_skb(skb, FREE_READ);
3070 return;
3071 }3072
3073 memcpy(newsk, sk, sizeof(*newsk));
3074 newsk->opt = NULL;
3075 newsk->ip_route_cache = NULL;
3076 if (opt && opt->optlen) {3077 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
3078 if (!sk->opt) {3079 kfree_s(newsk, sizeof(structsock));
3080 tcp_statistics.TcpAttemptFails++;
3081 kfree_skb(skb, FREE_READ);
3082 return;
3083 }3084 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {3085 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
3086 kfree_s(newsk, sizeof(structsock));
3087 tcp_statistics.TcpAttemptFails++;
3088 kfree_skb(skb, FREE_READ);
3089 return;
3090 }3091 }3092 skb_queue_head_init(&newsk->write_queue);
3093 skb_queue_head_init(&newsk->receive_queue);
3094 newsk->send_head = NULL;
3095 newsk->send_tail = NULL;
3096 skb_queue_head_init(&newsk->back_log);
3097 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/3098 newsk->rto = TCP_TIMEOUT_INIT;
3099 newsk->mdev = 0;
3100 newsk->max_window = 0;
3101 newsk->cong_window = 1;
3102 newsk->cong_count = 0;
3103 newsk->ssthresh = 0;
3104 newsk->backoff = 0;
3105 newsk->blog = 0;
3106 newsk->intr = 0;
3107 newsk->proc = 0;
3108 newsk->done = 0;
3109 newsk->partial = NULL;
3110 newsk->pair = NULL;
3111 newsk->wmem_alloc = 0;
3112 newsk->rmem_alloc = 0;
3113 newsk->localroute = sk->localroute;
3114
3115 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3116
3117 newsk->err = 0;
3118 newsk->shutdown = 0;
3119 newsk->ack_backlog = 0;
3120 newsk->acked_seq = skb->seq+1;
3121 newsk->lastwin_seq = skb->seq+1;
3122 newsk->delay_acks = 1;
3123 newsk->copied_seq = skb->seq+1;
3124 newsk->fin_seq = skb->seq;
3125 newsk->state = TCP_SYN_RECV;
3126 newsk->timeout = 0;
3127 newsk->ip_xmit_timeout = 0;
3128 newsk->write_seq = seq;
3129 newsk->window_seq = newsk->write_seq;
3130 newsk->rcv_ack_seq = newsk->write_seq;
3131 newsk->urg_data = 0;
3132 newsk->retransmits = 0;
3133 newsk->linger=0;
3134 newsk->destroy = 0;
3135 init_timer(&newsk->timer);
3136 newsk->timer.data = (unsignedlong)newsk;
3137 newsk->timer.function = &net_timer;
3138 init_timer(&newsk->retransmit_timer);
3139 newsk->retransmit_timer.data = (unsignedlong)newsk;
3140 newsk->retransmit_timer.function=&retransmit_timer;
3141 newsk->dummy_th.source = skb->h.th->dest;
3142 newsk->dummy_th.dest = skb->h.th->source;
3143
3144 /*3145 * Swap these two, they are from our point of view. 3146 */3147
3148 newsk->daddr = saddr;
3149 newsk->saddr = daddr;
3150 newsk->rcv_saddr = daddr;
3151
3152 put_sock(newsk->num,newsk);
3153 newsk->dummy_th.res1 = 0;
3154 newsk->dummy_th.doff = 6;
3155 newsk->dummy_th.fin = 0;
3156 newsk->dummy_th.syn = 0;
3157 newsk->dummy_th.rst = 0;
3158 newsk->dummy_th.psh = 0;
3159 newsk->dummy_th.ack = 0;
3160 newsk->dummy_th.urg = 0;
3161 newsk->dummy_th.res2 = 0;
3162 newsk->acked_seq = skb->seq + 1;
3163 newsk->copied_seq = skb->seq + 1;
3164 newsk->socket = NULL;
3165
3166 /*3167 * Grab the ttl and tos values and use them 3168 */3169
3170 newsk->ip_ttl=sk->ip_ttl;
3171 newsk->ip_tos=skb->ip_hdr->tos;
3172
3173 /*3174 * Use 512 or whatever user asked for 3175 */3176
3177 /*3178 * Note use of sk->user_mss, since user has no direct access to newsk 3179 */3180
3181 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
3182 newsk->ip_route_cache = rt;
3183
3184 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3185 newsk->window_clamp = rt->rt_window;
3186 else3187 newsk->window_clamp = 0;
3188
3189 if (sk->user_mss)
3190 newsk->mtu = sk->user_mss;
3191 elseif (rt)
3192 newsk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
3193 else3194 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
3195
3196 /*3197 * But not bigger than device MTU 3198 */3199
3200 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
3201
3202 #ifdefCONFIG_SKIP3203
3204 /*3205 * SKIP devices set their MTU to 65535. This is so they can take packets3206 * unfragmented to security process then fragment. They could lie to the3207 * TCP layer about a suitable MTU, but its easier to let skip sort it out3208 * simply because the final package we want unfragmented is going to be3209 *3210 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]3211 */3212
3213 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */3214 sk->mtu=skip_pick_mtu(sk->mtu,dev);
3215 #endif3216 /*3217 * This will min with what arrived in the packet 3218 */3219
3220 tcp_options(newsk,skb->h.th);
3221
3222 tcp_cache_zap();
3223
3224 buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3225 if (buff == NULL)
3226 {3227 sk->err = ENOMEM;
3228 newsk->dead = 1;
3229 newsk->state = TCP_CLOSE;
3230 /* And this will destroy it */3231 release_sock(newsk);
3232 kfree_skb(skb, FREE_READ);
3233 tcp_statistics.TcpAttemptFails++;
3234 return;
3235 }3236
3237 buff->sk = newsk;
3238 buff->localroute = newsk->localroute;
3239
3240 /*3241 * Put in the IP header and routing stuff. 3242 */3243
3244 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3245 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
3246
3247 /*3248 * Something went wrong. 3249 */3250
3251 if (tmp < 0)
3252 {3253 sk->err = tmp;
3254 buff->free = 1;
3255 kfree_skb(buff,FREE_WRITE);
3256 newsk->dead = 1;
3257 newsk->state = TCP_CLOSE;
3258 release_sock(newsk);
3259 skb->sk = sk;
3260 kfree_skb(skb, FREE_READ);
3261 tcp_statistics.TcpAttemptFails++;
3262 return;
3263 }3264
3265 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
3266
3267 memcpy(t1, skb->h.th, sizeof(*t1));
3268 buff->seq = newsk->write_seq++;
3269 buff->end_seq = newsk->write_seq;
3270 /*3271 * Swap the send and the receive. 3272 */3273 t1->dest = skb->h.th->source;
3274 t1->source = newsk->dummy_th.source;
3275 t1->seq = ntohl(buff->seq);
3276 t1->ack = 1;
3277 newsk->sent_seq = newsk->write_seq;
3278 t1->window = ntohs(tcp_select_window(newsk));
3279 t1->res1 = 0;
3280 t1->res2 = 0;
3281 t1->rst = 0;
3282 t1->urg = 0;
3283 t1->psh = 0;
3284 t1->syn = 1;
3285 t1->ack_seq = htonl(newsk->acked_seq);
3286 t1->doff = sizeof(*t1)/4+1;
3287 ptr = skb_put(buff,4);
3288 ptr[0] = 2;
3289 ptr[1] = 4;
3290 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3291 ptr[3] =(newsk->mtu) & 0xff;
3292
3293 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3294 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3295 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3296 skb->sk = newsk;
3297
3298 /*3299 * Charge the sock_buff to newsk. 3300 */3301
3302 sk->rmem_alloc -= skb->truesize;
3303 newsk->rmem_alloc += skb->truesize;
3304
3305 skb_queue_tail(&sk->receive_queue,skb);
3306 sk->ack_backlog++;
3307 release_sock(newsk);
3308 tcp_statistics.TcpOutSegs++;
3309 }3310
3311
3312 staticvoidtcp_close(structsock *sk, inttimeout)
/* */3313 {3314 /*3315 * We need to grab some memory, and put together a FIN, 3316 * and then put it into the queue to be sent.3317 */3318
3319 sk->inuse = 1;
3320
3321 if(th_cache_sk==sk)
3322 tcp_cache_zap();
3323 if(sk->state == TCP_LISTEN)
3324 {3325 /* Special case */3326 tcp_set_state(sk, TCP_CLOSE);
3327 tcp_close_pending(sk);
3328 release_sock(sk);
3329 return;
3330 }3331
3332 sk->keepopen = 1;
3333 sk->shutdown = SHUTDOWN_MASK;
3334
3335 if (!sk->dead)
3336 sk->state_change(sk);
3337
3338 if (timeout == 0)
3339 {3340 structsk_buff *skb;
3341
3342 /*3343 * We need to flush the recv. buffs. We do this only on the3344 * descriptor close, not protocol-sourced closes, because the3345 * reader process may not have drained the data yet!3346 */3347
3348 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3349 kfree_skb(skb, FREE_READ);
3350 /*3351 * Get rid off any half-completed packets. 3352 */3353
3354 if (sk->partial)
3355 tcp_send_partial(sk);
3356 }3357
3358
3359 /*3360 * Timeout is not the same thing - however the code likes3361 * to send both the same way (sigh).3362 */3363
3364 if(timeout)
3365 {3366 tcp_set_state(sk, TCP_CLOSE); /* Dead */3367 }3368 else3369 {3370 if(tcp_close_state(sk,1)==1)
3371 {3372 tcp_send_fin(sk);
3373 }3374 }3375 release_sock(sk);
3376 }3377
3378
3379 /*3380 * This routine takes stuff off of the write queue,3381 * and puts it in the xmit queue. This happens as incoming acks3382 * open up the remote window for us.3383 */3384
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue. This happens as incoming acks
 *	open up the remote window for us.
 */
static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy
	 */
	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */
	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->end_seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->end_seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */
		if (before(skb->end_seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */
			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 * put in the ack seq and window at this point rather than earlier,
			 * in order to keep them monotonic.  We really want to avoid taking
			 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
			 * Ack and window will in general have changed since this packet was put
			 * on the write queue.
			 */
			iph = skb->ip_hdr;
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			/* TCP payload length = total skb length minus the
			   headers preceding the TCP header. */
			size = skb->len - (((unsigned char *) th) - skb->data);
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			/* Oversized for the path: clear DF so routers may
			   fragment, and refresh the IP checksum. */
			if (size > sk->mtu - sizeof(struct iphdr))
			{
				iph->frag_off &= ~htons(IP_DF);
				ip_send_check(iph);
			}
#endif

			th->ack_seq = htonl(sk->acked_seq);
			th->window = htons(tcp_select_window(sk));

			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->end_seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */
			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/* This segment carries our ACK: nothing pending. */
			sk->ack_backlog = 0;
			sk->bytes_rcv = 0;

			/*
			 *	Again we slide the timer wrongly
			 */
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3480
3481 /*3482 * This routine deals with incoming acks, but not outgoing ones.3483 */3484
/*
 * tcp_ack() - process the ACK information in an incoming segment.
 *
 * Handles only the acknowledgement side: window updates, congestion
 * window growth (Jacobson slow start / congestion avoidance), RTT
 * estimation, retransmit-queue trimming, zero-window probe completion,
 * and the ACK-driven state transitions (LAST_ACK, FIN_WAIT1, CLOSING,
 * SYN_RECV).  It never generates outgoing data itself beyond kicking
 * tcp_write_xmit()/tcp_send_partial().
 *
 * @sk:    socket the segment belongs to
 * @th:    TCP header of the received segment (network byte order fields)
 * @saddr: source address of the segment (unused here but kept for the
 *         common receive-path signature)
 * @len:   length of the TCP part of the segment (header + data)
 *
 * Returns 1 when the ack was processed (or the socket is dead/zapped),
 * 0 when the ack was ignored as out of range ("newer than sent").
 *
 * The local 'flag' bitmask records what happened:
 *   1 - there was data in the packet as well as ack, or new data was
 *       sent, or we are in a shutdown state
 *   2 - data from the retransmit queue was acked and removed
 *       (also abused below as "don't re-sample the RTT")
 *   4 - window shrunk, or data from retransmit queue was acked/removed
 */
extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
	u32 ack;
	int flag = 0;

	if (sk->zapped)
		return(1);	/* Dead, cant ack any more so why bother */

	/*
	 * Have we discovered a larger window than we have seen before?
	 * If so, remember it and recompute the mss we advertise/use.
	 */
	ack = ntohl(th->ack_seq);

	if (ntohs(th->window) > sk->max_window)
	{
		sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(sk->max_window>>1, sk->mtu);
#else
		sk->mss = min(sk->max_window, sk->mtu);
#endif
	}

	/*
	 * We have dropped back to keepalive timeouts. Thus we have
	 * no retransmits pending.
	 */
	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 * If the ack is newer than anything sent, or older than previous
	 * acks, then we can probably ignore it.
	 */
	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if (sk->debug)
			printk("Ack ignored %u %u\n",ack,sk->sent_seq);

		/*
		 * Keepalive processing: an ack for data we never sent is
		 * rejected outright (return 0 so the caller can react).
		 */
		if (after(ack, sk->sent_seq))
		{
			return(0);
		}

		/*
		 * An old (duplicate) ack: restart the keepalive timer
		 * if keepalives are in use, then treat it as handled.
		 */
		if (sk->keepopen)
		{
			if (sk->ip_xmit_timeout==TIME_KEEPOPEN)
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/*
	 * If there is data in the segment (not just a header) set flag 1.
	 */
	if (len != th->doff*4)
		flag |= 1;

	/*
	 * See if our window has been shrunk.
	 */
	if (after(sk->window_seq, ack+ntohs(th->window)))
	{
		/*
		 * We may need to move packets from the send queue
		 * to the write queue, if the window has been shrunk on us.
		 * The RFC says you are not allowed to shrink your window
		 * like this, but if the other end does, you must be able
		 * to deal with it.
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		/*
		 * This is an artifact of a flawed concept. We want one
		 * queue and a smarter send routine when we send all.
		 */
		flag |= 4;	/* Window changed */

		sk->window_seq = ack + ntohs(th->window);
		cli();		/* walk the retransmit list atomically */
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->end_seq, sk->window_seq))
			{
				/* Falls outside the shrunk window: push it
				   back onto write_queue, preserving order. */
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue,skb);
				else
					skb_append(wskb,skb);
				wskb = skb;
			}
			else
			{
				/* Still inside the window: rebuild the
				   send_head/send_tail retransmit list. */
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/*
	 * Pipe has emptied: keep head/tail/packets_out consistent.
	 */
	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out= 0;
	}

	/*
	 * Update the right hand window edge of the host.
	 */
	sk->window_seq = ack + ntohs(th->window);

	/*
	 * We don't want too many packets out there.
	 */
	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
		 * counter and increment it once every cwnd times. It's possible
		 * that this should be done only if sk->retransmits == 0. I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 * In "safe" area, increase (slow start).
			 */
			sk->cong_window++;
		else
		{
			/*
			 * In dangerous area, increase slowly. In theory this is
			 * sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 * Remember the highest ack received.
	 */
	sk->rcv_ack_seq = ack;

	/*
	 * We passed data and got it acked, remove any soft error
	 * log. Something worked...
	 */
	sk->err_soft = 0;

	/*
	 * If this ack opens up a zero window, clear backoff. It was
	 * being used to time the probes, and is probably far higher than
	 * it needs to be for normal retransmission.
	 */
	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 * Was it a usable window open?
		 */
		if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
		    ! before (sk->window_seq, sk->write_queue.next->end_seq))
		{
			sk->backoff = 0;

			/*
			 * Recompute rto from rtt. This eliminates any backoff.
			 */
			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < HZ/5)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = HZ/5;
		}
	}

	/*
	 * See if we can take anything off of the retransmit queue.
	 */
	while (sk->send_head != NULL)
	{
		/* Check for a bug: the list must stay sequence-ordered. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 * If our packet is before the ack sequence we can
		 * discard it as it's confirmed to have arrived the other end.
		 */
		if (before(sk->send_head->end_seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 * We were retransmitting: don't count this in RTT est.
				 */
				flag |= 2;

				/*
				 * Even though we've gotten an ack, we're still
				 * retransmitting as long as we're sending from
				 * the retransmit queue. Keeping retransmits non-zero
				 * prevents us from getting new data interspersed with
				 * retransmissions.
				 */
				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 * Note that we only reset backoff and rto in the
			 * rtt recomputation code. And that doesn't happen
			 * if there were retransmissions in effect. So the
			 * first new packet after the retransmissions is
			 * sent with the backoff still in effect. Not until
			 * we get an ack from a non-retransmitted packet do
			 * we reset the backoff and rto. This allows us to deal
			 * with a situation where the network delay has increased
			 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 * We have one less packet out there.
			 */
			if (sk->packets_out > 0)
				sk->packets_out --;
			/*
			 * Wake up the process, it can probably write more.
			 */
			if (!sk->dead)
				sk->write_space(sk);
			oskb = sk->send_head;

			if (!(flag&2)) 	/* Not retransmitting */
			{
				long m;

				/*
				 * The following amusing code comes from Jacobson's
				 * article in SIGCOMM '88. Note that rtt and mdev
				 * are scaled versions of rtt and mean deviation.
				 * This is designed to be as fast as possible.
				 * m stands for "measurement".
				 */
				m = jiffies - oskb->when;  /* RTT */
				if (m<=0)
					m=1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);    /* m is now error in rtt est */
				sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);   /* similar update on mdev */
				sk->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */

				/*
				 * Now update timeout. Note that this removes any backoff.
				 */
				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < HZ/5)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = HZ/5;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   In this case as we just set it up */
			cli();		/* unlink the acked skb atomically */
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 * We may need to remove this from the dev send list.
			 */
			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we completely ignore the timer stuff in the else
	 * clause. We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over. The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 * Maybe we can take some stuff off of the write queue,
	 * and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 * Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
			 sk->send_head == NULL &&
			 sk->ack_backlog == 0 &&
			 sk->state != TCP_TIME_WAIT)
		{
			/*
			 * Data to queue but no room: start zero-window probing.
			 */
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 * From TIME_WAIT we stay in TIME_WAIT as long as we rx packets.
		 * From TCP_CLOSE we don't do anything.
		 *
		 * From anything else, if there is write data (or fin) pending,
		 * we use a TIME_WRITE timeout, else if keepalive we reset to
		 * a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we may
		 * force a state where we start to write itsy bitsy tidbits
		 * of data.
		 */
		switch (sk->state) {
		case TCP_TIME_WAIT:
			/*
			 * Keep us in TIME_WAIT until we stop getting packets,
			 * reset the timeout.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 * Don't touch the timer.
			 */
			break;
		default:
			/*
			 * Must check send_head, write_queue, and ack_backlog
			 * to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 * We have nothing queued but space to send. Send any partial
	 * packets immediately (end of Nagle rule application).
	 */
	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN. The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */
	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			sk->shutdown = SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_CLOSE);
			return 1;
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 * Move to FIN_WAIT2 to await a FIN from the other end. Set
	 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */
	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 * Move to TIME_WAIT.
	 */
	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 * Final ack of a three way shake: the passive open completes.
	 */
	if (sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest=th->source;
		sk->copied_seq = sk->acked_seq;
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->max_window==0)
		{
			sk->max_window=32;	/* Sanity check */
			sk->mss=min(sk->max_window,sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
	 * what conditions "!flag" would be true. However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 * Clearly if the first packet has expired it should be
	 * retransmitted. The other alternative, "flag&2 && retransmits", is
	 * harder to explain: You have to look carefully at how and when the
	 * timer is set and with what timeout. The most recent transmission always
	 * sets the timer. So in general if the most recent thing has timed
	 * out, everything before it has as well. So we want to go ahead and
	 * retransmit some more. If we didn't explicitly test for this
	 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 * would not be true. If you look at the pattern of timing, you can
	 * show that rto is increased fast enough that the next packet would
	 * almost never be retransmitted immediately. Then you'd end up
	 * waiting for a timeout to send each packet on the retransmission
	 * queue. With my implementation of the Karn sampling algorithm,
	 * the timeout would double each time. The net result is that it would
	 * take a hideous amount of time to recover from a single dropped packet.
	 * It's possible that there should also be a test for TIME_WRITE, but
	 * I think as long as "send_head != NULL" and "retransmit" is on, we've
	 * got to be in real retransmission mode.
	 * Note that tcp_do_retransmit is called with all==1. Setting cong_window
	 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 * As long as no further losses occur, this seems reasonable.
	 */
	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if (sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}
4079
4080 /*4081 * Process the FIN bit. This now behaves as it is supposed to work4082 * and the FIN takes effect when it is validly part of sequence4083 * space. Not before when we get holes.4084 *4085 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT4086 * (and thence onto LAST-ACK and finally, CLOSE, we never enter4087 * TIME-WAIT)4088 *4089 * If we are in FINWAIT-1, a received FIN indicates simultaneous4090 * close and we go into CLOSING (and later onto TIME-WAIT)4091 *4092 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.4093 *4094 */4095
/*
 * tcp_fin() - act on a FIN that has become valid in sequence space.
 *
 * @skb: the segment carrying the FIN
 * @sk:  the socket receiving it
 * @th:  the TCP header (used for the RST-with-FIN case)
 *
 * Records the FIN's sequence number, wakes any waiter, and performs
 * the per-state transition:
 *   ESTABLISHED (and SYN states)  -> CLOSE_WAIT
 *   FIN_WAIT1 (simultaneous close)-> CLOSING
 *   FIN_WAIT2                     -> TIME_WAIT
 *   TIME_WAIT (retransmitted FIN) -> restart the 2MSL timer
 * Always returns 0.  The ack for the FIN is sent by tcp_data(), not here.
 */
static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	sk->fin_seq = skb->end_seq;

	/* Wake readers/selectors; the connection state is about to change. */
	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch (sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * Move to CLOSE_WAIT; tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			/* A FIN combined with RST shuts both directions. */
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * Received a retransmission of the FIN; do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * Received a retransmission of the FIN;
			 * restart the TIME_WAIT timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens; we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */
			if (sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * Received a FIN -- send ACK and enter TIME_WAIT.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * Already in CLOSE.
			 */
			break;
		default:
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
4175
4176
4177 /*4178 * This routine handles the data. If there is room in the buffer,4179 * it will be have already been moved into it. If there is no4180 * room, then we will just have to discard the packet.4181 */4182
4183 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */4184 unsignedlongsaddr, unsignedshortlen)
4185 {4186 structsk_buff *skb1, *skb2;
4187 structtcphdr *th;
4188 intdup_dumped=0;
4189 u32new_seq, shut_seq;
4190
4191 th = skb->h.th;
4192 skb_pull(skb,th->doff*4);
4193 skb_trim(skb,len-(th->doff*4));
4194
4195 /*4196 * The bytes in the receive read/assembly queue has increased. Needed for the4197 * low memory discard algorithm 4198 */4199
4200 sk->bytes_rcv += skb->len;
4201
4202 if (skb->len == 0 && !th->fin)
4203 {4204 /* 4205 * Don't want to keep passing ack's back and forth. 4206 * (someone sent us dataless, boring frame)4207 */4208 if (!th->ack)
4209 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4210 kfree_skb(skb, FREE_READ);
4211 return(0);
4212 }4213
4214 /*4215 * We no longer have anyone receiving data on this connection.4216 */4217
4218 #ifndef TCP_DONT_RST_SHUTDOWN
4219
4220 if(sk->shutdown & RCV_SHUTDOWN)
4221 {4222 /*4223 * FIXME: BSD has some magic to avoid sending resets to4224 * broken 4.2 BSD keepalives. Much to my surprise a few non4225 * BSD stacks still have broken keepalives so we want to4226 * cope with it.4227 */4228
4229 if(skb->len) /* We don't care if it's just an ack or4230 a keepalive/window probe */4231 {4232 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */4233
4234 /* Do this the way 4.4BSD treats it. Not what I'd4235 regard as the meaning of the spec but it's what BSD4236 does and clearly they know everything 8) */4237
4238 /*4239 * This is valid because of two things4240 *4241 * a) The way tcp_data behaves at the bottom.4242 * b) A fin takes effect when read not when received.4243 */4244
4245 shut_seq = sk->acked_seq+1; /* Last byte */4246
4247 if(after(new_seq,shut_seq))
4248 {4249 if(sk->debug)
4250 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4251 sk, new_seq, shut_seq, sk->blog);
4252 if(sk->dead)
4253 {4254 sk->acked_seq = new_seq + th->fin;
4255 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4256 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4257 tcp_statistics.TcpEstabResets++;
4258 sk->err = EPIPE;
4259 sk->error_report(sk);
4260 sk->shutdown = SHUTDOWN_MASK;
4261 tcp_set_state(sk,TCP_CLOSE);
4262 kfree_skb(skb, FREE_READ);
4263 return 0;
4264 }4265 }4266 }4267 }4268
4269 #endif4270
4271 /*4272 * Now we have to walk the chain, and figure out where this one4273 * goes into it. This is set up so that the last packet we received4274 * will be the first one we look at, that way if everything comes4275 * in order, there will be no performance loss, and if they come4276 * out of order we will be able to fit things in nicely.4277 *4278 * [AC: This is wrong. We should assume in order first and then walk4279 * forwards from the first hole based upon real traffic patterns.]4280 * 4281 */4282
4283 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */4284 {4285 skb_queue_head(&sk->receive_queue,skb);
4286 skb1= NULL;
4287 }4288 else4289 {4290 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
4291 {4292 if(sk->debug)
4293 {4294 printk("skb1=%p :", skb1);
4295 printk("skb1->seq = %d: ", skb1->seq);
4296 printk("skb->seq = %d\n",skb->seq);
4297 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4298 sk->acked_seq);
4299 }4300
4301 /*4302 * Optimisation: Duplicate frame or extension of previous frame from4303 * same sequence point (lost ack case).4304 * The frame contains duplicate data or replaces a previous frame4305 * discard the previous frame (safe as sk->inuse is set) and put4306 * the new one in its place.4307 */4308
4309 if (skb->seq==skb1->seq && skb->len>=skb1->len)
4310 {4311 skb_append(skb1,skb);
4312 skb_unlink(skb1);
4313 kfree_skb(skb1,FREE_READ);
4314 dup_dumped=1;
4315 skb1=NULL;
4316 break;
4317 }4318
4319 /*4320 * Found where it fits4321 */4322
4323 if (after(skb->seq+1, skb1->seq))
4324 {4325 skb_append(skb1,skb);
4326 break;
4327 }4328
4329 /*4330 * See if we've hit the start. If so insert.4331 */4332 if (skb1 == skb_peek(&sk->receive_queue))
4333 {4334 skb_queue_head(&sk->receive_queue, skb);
4335 break;
4336 }4337 }4338 }4339
4340 /*4341 * Figure out what the ack value for this frame is4342 */4343
4344 if (before(sk->acked_seq, sk->copied_seq))
4345 {4346 printk("*** tcp.c:tcp_data bug acked < copied\n");
4347 sk->acked_seq = sk->copied_seq;
4348 }4349
4350 /*4351 * Now figure out if we can ack anything. This is very messy because we really want two4352 * receive queues, a completed and an assembly queue. We also want only one transmit4353 * queue.4354 */4355
4356 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1))
4357 {4358 if (before(skb->seq, sk->acked_seq+1))
4359 {4360
4361 if (after(skb->end_seq, sk->acked_seq))
4362 sk->acked_seq = skb->end_seq;
4363
4364 skb->acked = 1;
4365
4366 /*4367 * When we ack the fin, we do the FIN 4368 * processing.4369 */4370
4371 if (skb->h.th->fin)
4372 {4373 tcp_fin(skb,sk,skb->h.th);
4374 }4375
4376 for(skb2 = skb->next;
4377 skb2 != (structsk_buff *)&sk->receive_queue;
4378 skb2 = skb2->next)
4379 {4380 if (before(skb2->seq, sk->acked_seq+1))
4381 {4382 if (after(skb2->end_seq, sk->acked_seq))
4383 sk->acked_seq = skb2->end_seq;
4384
4385 skb2->acked = 1;
4386 /*4387 * When we ack the fin, we do4388 * the fin handling.4389 */4390 if (skb2->h.th->fin)
4391 {4392 tcp_fin(skb,sk,skb->h.th);
4393 }4394
4395 /*4396 * Force an immediate ack.4397 */4398
4399 sk->ack_backlog = sk->max_ack_backlog;
4400 }4401 else4402 {4403 break;
4404 }4405 }4406
4407 /*4408 * This also takes care of updating the window.4409 * This if statement needs to be simplified.4410 *4411 * rules for delaying an ack:4412 * - delay time <= 0.5 HZ4413 * - we don't have a window update to send4414 * - must send at least every 2 full sized packets4415 */4416 if (!sk->delay_acks ||
4417 sk->ack_backlog >= sk->max_ack_backlog ||
4418 sk->bytes_rcv > sk->max_unacked || th->fin ||
4419 sk->ato > HZ/2 ||
4420 tcp_raise_window(sk)) {4421 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4422 }4423 else4424 {4425 sk->ack_backlog++;
4426
4427 if(sk->debug)
4428 printk("Ack queued.\n");
4429 reset_xmit_timer(sk, TIME_WRITE, sk->ato);
4430 }4431 }4432 }4433
4434 /*4435 * If we've missed a packet, send an ack.4436 * Also start a timer to send another.4437 */4438
4439 if (!skb->acked)
4440 {4441
4442 /*4443 * This is important. If we don't have much room left,4444 * we need to throw out a few packets so we have a good4445 * window. Note that mtu is used, not mss, because mss is really4446 * for the send side. He could be sending us stuff as large as mtu.4447 */4448
4449 while (sock_rspace(sk) < sk->mtu)
4450 {4451 skb1 = skb_peek(&sk->receive_queue);
4452 if (skb1 == NULL)
4453 {4454 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4455 break;
4456 }4457
4458 /*4459 * Don't throw out something that has been acked. 4460 */4461
4462 if (skb1->acked)
4463 {4464 break;
4465 }4466
4467 skb_unlink(skb1);
4468 kfree_skb(skb1, FREE_READ);
4469 }4470 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4471 sk->ack_backlog++;
4472 reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, 0.5 * HZ));
4473 }4474 else4475 {4476 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4477 }4478
4479 /*4480 * Now tell the user we may have some data. 4481 */4482
4483 if (!sk->dead)
4484 {4485 if(sk->debug)
4486 printk("Data wakeup.\n");
4487 sk->data_ready(sk,0);
4488 }4489 return(0);
4490 }4491
4492
4493 /*4494 * This routine is only called when we have urgent data4495 * signalled. Its the 'slow' part of tcp_urg. It could be4496 * moved inline now as tcp_urg is only called from one4497 * place. We handle URGent data wrong. We have to - as4498 * BSD still doesn't use the correction from RFC961.4499 */4500
4501 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4502 {4503 u32ptr = ntohs(th->urg_ptr);
4504
4505 if (ptr)
4506 ptr--;
4507 ptr += ntohl(th->seq);
4508
4509 /* ignore urgent data that we've already seen and read */4510 if (after(sk->copied_seq, ptr))
4511 return;
4512
4513 /* do we already have a newer (or duplicate) urgent pointer? */4514 if (sk->urg_data && !after(ptr, sk->urg_seq))
4515 return;
4516
4517 /* tell the world about our new urgent pointer */4518 if (sk->proc != 0) {4519 if (sk->proc > 0) {4520 kill_proc(sk->proc, SIGURG, 1);
4521 }else{4522 kill_pg(-sk->proc, SIGURG, 1);
4523 }4524 }4525 sk->urg_data = URG_NOTYET;
4526 sk->urg_seq = ptr;
4527 }4528
4529 /*4530 * This is the 'fast' part of urgent handling.4531 */4532
4533 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4534 unsignedlongsaddr, unsignedlonglen)
4535 {4536 u32ptr;
4537
4538 /*4539 * Check if we get a new urgent pointer - normally not 4540 */4541
4542 if (th->urg)
4543 tcp_check_urg(sk,th);
4544
4545 /*4546 * Do we wait for any urgent data? - normally not4547 */4548
4549 if (sk->urg_data != URG_NOTYET)
4550 return 0;
4551
4552 /*4553 * Is the urgent pointer pointing into this packet? 4554 */4555
4556 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
4557 if (ptr >= len)
4558 return 0;
4559
4560 /*4561 * Ok, got the correct packet, update info 4562 */4563
4564 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4565 if (!sk->dead)
4566 sk->data_ready(sk,0);
4567 return 0;
4568 }4569
4570 /*4571 * This will accept the next outstanding connection. 4572 */4573
/*
 * tcp_accept() - dequeue the next established connection on a listener.
 *
 * @sk:    the listening socket
 * @flags: open flags; O_NONBLOCK makes this fail with EAGAIN instead of
 *         sleeping
 *
 * Returns the newly established child socket, or NULL with sk->err set
 * (EINVAL if not listening, EAGAIN for a non-blocking miss, ERESTARTSYS
 * on signal).  The cli()/sti() bracket plus sk->inuse guard against the
 * race between checking the accept queue and sleeping on it.
 */
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 * We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race: disable interrupts before testing the queue. */
	cli();
	sk->inuse = 1;

	while ((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Drop the socket lock while we sleep so the bottom half
		   can queue incoming connections and wake us. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 * Now all we need to do is return skb->sk: the child socket rides
	 * on the queued skb.
	 */
	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4628
4629 /*4630 * This will initiate an outgoing connection. 4631 */4632
/*
 * tcp_connect() - initiate an outgoing (active) TCP connection.
 *
 * @usin:     destination address/port (AF_INET); INADDR_ANY is mapped
 *            to the local address (BSD'ism)
 * @addr_len: length of *usin; must be at least 8 bytes
 *
 * Builds and transmits the initial SYN (with an MSS option), moves the
 * socket to SYN_SENT, and arms the retransmit timer.  Returns 0 on
 * success or a negative errno (-EISCONN, -EINVAL, -EAFNOSUPPORT,
 * -ENETUNREACH, -ENOMEM).
 */
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
		return(-EISCONN);

	/*
	 * Don't allow a double connect.
	 */
	if (sk->daddr)
		return -EINVAL;

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 * connect() to INADDR_ANY means loopback (BSD'ism).
	 */
	if (usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 * Don't want a TCP connection going to a broadcast address.
	 */
	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	/* Drop the lock around the (possibly sleeping) allocation. */
	release_sock(sk);

	buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;


	/*
	 * Put in the IP header and routing stuff.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		sock_wfree(sk, buff);
		release_sock(sk);
		return(-ENETUNREACH);
	}
	/* Fill in the source address from the route if unbound. */
	if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
		sk->saddr = rt->rt_src;
	sk->rcv_saddr = sk->saddr;

	t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));

	/* Build the SYN: template header, then flag/field fixups. */
	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	buff->seq = sk->write_seq++;
	t1->seq = htonl(buff->seq);
	sk->sent_seq = sk->write_seq;
	buff->end_seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;	/* 24 bytes: header plus the 4-byte MSS option */
	/* use 512 or whatever user asked for */

	if (rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* Pick the MSS: user override, then route MTU, then the 576 default. */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if (rt)
		sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else
		sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 * But not bigger than device MTU.
	 */
	if (sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

#ifdef CONFIG_SKIP

	/*
	 * SKIP devices set their MTU to 65535. This is so they can take packets
	 * unfragmented to security process then fragment. They could lie to the
	 * TCP layer about a suitable MTU, but its easier to let skip sort it out
	 * simply because the final package we want unfragmented is going to be
	 *
	 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */
	if (skip_pick_mtu!=NULL)	/* If SKIP is loaded.. */
		sk->mtu=skip_pick_mtu(sk->mtu,dev);
#endif

	/*
	 * Put in the TCP options to say MTU (kind 2, length 4, MSS value).
	 */
	ptr = skb_put(buff,4);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 * This must go first otherwise a really quick response will get reset.
	 */
	tcp_cache_zap();
	tcp_set_state(sk,TCP_SYN_SENT);
	if (rt&&rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = 0;	/* Now works the right way instead of a hacked
				   initial setting */

	sk->prot->queue_xmit(sk, dev, buff, 0);
	/* NOTE(review): this second timer reset looks redundant with the one
	   a few lines above — harmless, but worth confirming/removing. */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4801
4802 /*4803 * This functions checks to see if the tcp header is actually acceptable. 4804 */4805
4806 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4807 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4808 {4809 u32next_seq;
4810
4811 next_seq = len - 4*th->doff;
4812 if (th->fin)
4813 next_seq++;
4814 /* if we have a zero window, we can't have any data in the packet.. */4815 if (next_seq && !sk->window)
4816 gotoignore_it;
4817 next_seq += ntohl(th->seq);
4818
4819 /*4820 * This isn't quite right. sk->acked_seq could be more recent4821 * than sk->window. This is however close enough. We will accept4822 * slightly more packets than we should, but it should not cause4823 * problems unless someone is trying to forge packets.4824 */4825
4826 /* have we already seen all of this packet? */4827 if (!after(next_seq+1, sk->acked_seq))
4828 gotoignore_it;
4829 /* or does it start beyond the window? */4830 if (!before(ntohl(th->seq), sk->acked_seq + sk->window + 1))
4831 gotoignore_it;
4832
4833 /* ok, at least part of this packet would seem interesting.. */4834 return 1;
4835
4836 ignore_it:
4837 if (th->rst)
4838 return 0;
4839
4840 /*4841 * Send a reset if we get something not ours and we are4842 * unsynchronized. Note: We don't do anything to our end. We4843 * are just killing the bogus remote connection then we will4844 * connect again and it will work (with luck).4845 */4846
4847 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4848 {4849 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4850 return 1;
4851 }4852
4853 /* Try to resync things. */4854 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4855 return 0;
4856 }4857
/*
 *	Standard processing for an incoming RST: mark the socket dead,
 *	pick an error code appropriate to the state the reset hit us in,
 *	move to CLOSE and wake any sleeper.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Translate the connection state into the error the user will see. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	/* Wake anyone sleeping on this socket. */
	if (!sk->dead)
		sk->state_change(sk);

	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 *
 *	Entry point called from the IP layer (and re-entered with redo!=0
 *	when a backlogged frame is replayed).  Demultiplexes to a socket,
 *	verifies the checksum, then runs the RFC793/RFC1122 segment
 *	processing state machine.  Returns 0 in all cases.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	tcp_statistics.TcpInSegs++;

	/* Only frames addressed to this host are processed. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket, using the last hit cache if applicable.
	 */

	if(!redo && saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
	{
		sk=(struct sock *)th_cache_sk;
		/*
		 *	We think this is causing the bug so
		 *	(cross-check the cache against a real lookup and
		 *	report any mismatch)
		 */
		if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
			printk("Cache mismatch on TCP.\n");
	}
	else
	{
		sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
		/* Refresh the one-entry demux cache with this 4-tuple. */
		th_cache_saddr=saddr;
		th_cache_daddr=daddr;
		th_cache_dport=th->dest;
		th_cache_sport=th->source;
		th_cache_sk=sk;
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)
	{
		/*
		 *	Pull up the IP header.
		 */
		skb_pull(skb, skb->h.raw-skb->data);

		/*
		 *	Try to use the device checksum if provided.
		 */
		if (
			(skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
			(!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
		)
		{
			/* Bad checksum: drop silently. */
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}

		/* Cache host-order sequence numbers; end_seq counts SYN and FIN. */
		skb->seq = ntohl(th->seq);
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset
			 *	(checked in tcp_reset)
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 *	Discard frame
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		/* Stored from our point of view: saddr is OUR address. */
		skb->saddr = daddr;
		skb->daddr = saddr;

		/*
		 *	We may need to add it to the backlog here.
		 *	cli/sti guard the inuse test-and-set against interrupts.
		 */
		cli();
		if (sk->inuse)
		{
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Backlog replay: socket may have died since queuing. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		/*
		 *	"Can't happen" consistency check.
		 *	NOTE(review): this path returns while sk->inuse is still
		 *	set and the skb is neither freed nor charged - looks like
		 *	a lock/buffer leak if it ever fired; confirm before fixing.
		 */
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */

	skb->sk=sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	This basically follows the flow suggested by RFC793, with the
	 *	corrections in RFC1122. We don't implement precedence and we
	 *	process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly
	 *	[Karn notes in the KA9Q code the RFC793 incoming segment rules
	 *	don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			/* These use the socket TOS.. might want to be the received TOS */
			if(th->ack)
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed
			 *	(old segments). Broadcast/multicast SYN isn't
			 *	allowed. Note - bug if you change the netmask on
			 *	a running connection it can go broadcast. Even
			 *	Sun's have this problem so I'm ignoring it
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is
			 *	nothing else in the frame. KA9Q has an option to
			 *	send data with the syn, BSD accepts data with the
			 *	syn up to the [to be] advertised window and
			 *	Solaris 2.1 gives you a protocol error. For now
			 *	we just ignore it, that fits the spec precisely
			 *	and avoids incompatibilities. It would be nice in
			 *	future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and
		 *	either reset for bad matches or go to connected.
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/*
					 *	Reset the ack - its an ack from a
					 *	different connection  [ th->rst is
					 *	checked in tcp_reset() ]
					 */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/*
					 *	A valid ack from a different
					 *	connection start. Shouldn't
					 *	happen but cover it.
					 */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}

				/*
				 *	Ok.. it's good. Set up sequence numbers
				 *	and move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq = skb->seq+1;
				sk->lastwin_seq = skb->seq+1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				/* Peer advertised no window yet: pick a minimal one. */
				if(sk->max_window==0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/*
					 *	Crossed SYN's are fine - but
					 *	talking to yourself is right out...
					 */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port.
	 *	There is a more complex suggestion for fixing these reuse
	 *	issues in RFC1644 but not yet ready for general use. Also see
	 *	RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		/* New SYN beyond our old data on a dead TIME_WAIT socket:
		   recycle the port for a fresh connection. */
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			/* Uncharge the skb from the dying socket. */
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			/* Re-demux: a listener may now own the port. */
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				/* seq+128000 keeps the new ISN clear of the old data. */
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC).
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs.
	 */

	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}


	/*
	 *	Delayed ACK time estimator.
	 */

	if (sk->lrcvtime == 0)
	{
		/* First segment: seed the inter-arrival estimate. */
		sk->lrcvtime = jiffies;
		sk->ato = HZ/3;
	}
	else
	{
		int m;

		m = jiffies - sk->lrcvtime;

		sk->lrcvtime = jiffies;

		if (m <= 0)
			m = 1;

		/* Clamp the ack timeout to an eighth of the round trip time. */
		if (m > (sk->rtt >> 3))
		{
			sk->ato = sk->rtt >> 3;
			/*
			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
			 */
		}
		else
		{
			sk->ato = (sk->ato >> 1) + m;
			/*
			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
			 */
		}
	}

	/*
	 *	Process the ACK
	 */

	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	if (sk->rmem_alloc >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}


	/*
	 *	Process urgent data
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;
}
/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it.
 *
 *	Used as a zero-window probe: either retransmit a window-sized
 *	slice of the first queued segment (when the peer has opened a
 *	partial window) or send a 1-byte-less-than-current sequence ACK
 *	to provoke a reply.
 */

static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff,*skb;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states. If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	)
	{
		return;
	}

	if ( before(sk->sent_seq, sk->window_seq) &&
	    (skb=skb_peek(&sk->write_queue)))
	{
		/*
		 *	We are probing the opening of a window
		 *	but the window size is != 0
		 *	must have been a result SWS advoidance ( sender )
		 */

		struct iphdr *iph;
		struct tcphdr *th;
		struct tcphdr *nth;
		unsigned long win_size;
#if 0
		unsigned long ow_size;
#endif
		void * tcp_data_start;

		/*
		 *	How many bytes can we send ?
		 */

		win_size = sk->window_seq - sk->sent_seq;

		/*
		 *	Recover the buffer pointers of the queued frame.
		 */

		iph = (struct iphdr *)skb->ip_hdr;
		th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));

		/*
		 *	Grab the data for a temporary frame
		 */

		buff = sock_wmalloc(sk, win_size + th->doff * 4 +
				     (iph->ihl << 2) +
				     sk->prot->max_header + 15,
				     1, GFP_ATOMIC);
		if ( buff == NULL )
			return;

		/*
		 *	If we strip the packet on the write queue we must
		 *	be ready to retransmit this one
		 */

		buff->free = /*0*/1;

		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put headers on the new packet
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, buff->truesize,
				sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		/*
		 *	Move the TCP header over
		 */

		buff->dev = dev;

		nth = (struct tcphdr *) skb_put(buff,th->doff*4);

		memcpy(nth, th, th->doff * 4);

		/*
		 *	Correct the new header
		 */

		nth->ack = 1;
		nth->ack_seq = htonl(sk->acked_seq);
		nth->window = htons(tcp_select_window(sk));
		nth->check = 0;

		/*
		 *	Find the first data byte.
		 */

		tcp_data_start = (char *) th + (th->doff << 2);

		/*
		 *	Add it to our new buffer
		 */

		memcpy(skb_put(buff,win_size), tcp_data_start, win_size);

		/*
		 *	Remember our right edge sequence number.
		 */

		buff->end_seq = sk->sent_seq + win_size;
		sk->sent_seq = buff->end_seq;		/* Hack */

		/* The urgent pointer fell outside the copied slice: clear URG. */
		if(th->urg && ntohs(th->urg_ptr) < win_size)
			nth->urg = 0;

		/*
		 *	Checksum the split buffer
		 */

		tcp_send_check(nth, sk->saddr, sk->daddr,
			       nth->doff * 4 + win_size , sk);
	}
	else
	{
		/* No window to probe into: send a bare out-of-date ACK. */
		buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
		if (buff == NULL)
			return;

		buff->free = 1;
		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put in the IP header and routing stuff.
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
		memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

		/*
		 *	Use a previous sequence.
		 *	This should cause the other end to send an ack.
		 */

		t1->seq = htonl(sk->sent_seq-1);
		t1->ack = 1;
		t1->res1= 0;
		t1->res2= 0;
		t1->rst = 0;
		t1->urg = 0;
		t1->psh = 0;
		t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
		t1->syn = 0;
		t1->ack_seq = htonl(sk->acked_seq);
		t1->window = htons(tcp_select_window(sk));
		t1->doff = sizeof(*t1)/4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	}

	/*
	 *	Send it.
	 */

	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
5545 /*5546 * A window probe timeout has occurred.5547 */5548
5549 voidtcp_send_probe0(structsock *sk)
/* */5550 {5551 if (sk->zapped)
5552 return; /* After a valid reset we can send no more */5553
5554 tcp_write_wakeup(sk);
5555
5556 sk->backoff++;
5557 sk->rto = min(sk->rto << 1, 120*HZ);
5558 sk->retransmits++;
5559 sk->prot->retransmits ++;
5560 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5561 }5562
5563 /*5564 * Socket option code for TCP. 5565 */5566
5567 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5568 {5569 intval,err;
5570
5571 if(level!=SOL_TCP)
5572 returnip_setsockopt(sk,level,optname,optval,optlen);
5573
5574 if (optval == NULL)
5575 return(-EINVAL);
5576
5577 err=verify_area(VERIFY_READ, optval, sizeof(int));
5578 if(err)
5579 returnerr;
5580
5581 val = get_user((int *)optval);
5582
5583 switch(optname)
5584 {5585 caseTCP_MAXSEG:
5586 /*5587 * values greater than interface MTU won't take effect. however at5588 * the point when this call is done we typically don't yet know5589 * which interface is going to be used5590 */5591 if(val<1||val>MAX_WINDOW)
5592 return -EINVAL;
5593 sk->user_mss=val;
5594 return 0;
5595 caseTCP_NODELAY:
5596 sk->nonagle=(val==0)?0:1;
5597 return 0;
5598 default:
5599 return(-ENOPROTOOPT);
5600 }5601 }5602
5603 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5604 {5605 intval,err;
5606
5607 if(level!=SOL_TCP)
5608 returnip_getsockopt(sk,level,optname,optval,optlen);
5609
5610 switch(optname)
5611 {5612 caseTCP_MAXSEG:
5613 val=sk->user_mss;
5614 break;
5615 caseTCP_NODELAY:
5616 val=sk->nonagle;
5617 break;
5618 default:
5619 return(-ENOPROTOOPT);
5620 }5621 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5622 if(err)
5623 returnerr;
5624 put_user(sizeof(int),(int *) optlen);
5625
5626 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5627 if(err)
5628 returnerr;
5629 put_user(val,(int *)optval);
5630
5631 return(0);
5632 }5633
5634
/*
 *	The TCP protocol operations vector hooked into the generic INET
 *	socket layer.  This is a positional initializer; the slot meanings
 *	below are inferred from the initialiser names and must follow the
 *	struct proto declaration (not visible in this file) -- confirm
 *	against the header before reordering anything.
 */
struct proto tcp_prot = {
	tcp_close,
	ip_build_header,
	tcp_connect,
	tcp_accept,
	ip_queue_xmit,
	tcp_retransmit,
	tcp_write_wakeup,
	tcp_read_wakeup,
	tcp_rcv,
	tcp_select,
	tcp_ioctl,
	NULL,			/* presumably an init hook - unused here; TODO confirm */
	tcp_shutdown,
	tcp_setsockopt,
	tcp_getsockopt,
	tcp_sendmsg,
	tcp_recvmsg,
	NULL,			/* No special bind() */
	128,			/* presumably max_header reservation; TODO confirm */
	0,
	"TCP",			/* protocol name as shown in statistics/logs */
	0, 0,
	{NULL,}
};