1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while 26 * sk->inuse=1 and was trying to connect 27 * (tcp_err()). 28 * Alan Cox : All icmp error handling was broken 29 * pointers passed where wrong and the 30 * socket was looked up backwards. Nobody 31 * tested any icmp error code obviously. 32 * Alan Cox : tcp_err() now handled properly. It 33 * wakes people on errors. select 34 * behaves and the icmp error race 35 * has gone by moving it into sock.c 36 * Alan Cox : tcp_reset() fixed to work for 37 * everything not just packets for 38 * unknown sockets. 39 * Alan Cox : tcp option processing. 40 * Alan Cox : Reset tweaked (still not 100%) [Had 41 * syn rule wrong] 42 * Herp Rosmanith : More reset fixes 43 * Alan Cox : No longer acks invalid rst frames. 44 * Acking any kind of RST is right out. 45 * Alan Cox : Sets an ignore me flag on an rst 46 * receive otherwise odd bits of prattle 47 * escape still 48 * Alan Cox : Fixed another acking RST frame bug. 49 * Should stop LAN workplace lockups. 
50 * Alan Cox : Some tidyups using the new skb list 51 * facilities 52 * Alan Cox : sk->keepopen now seems to work 53 * Alan Cox : Pulls options out correctly on accepts 54 * Alan Cox : Fixed assorted sk->rqueue->next errors 55 * Alan Cox : PSH doesn't end a TCP read. Switched a 56 * bit to skb ops. 57 * Alan Cox : Tidied tcp_data to avoid a potential 58 * nasty. 59 * Alan Cox : Added some better commenting, as the 60 * tcp is hard to follow 61 * Alan Cox : Removed incorrect check for 20 * psh 62 * Michael O'Reilly : ack < copied bug fix. 63 * Johannes Stille : Misc tcp fixes (not all in yet). 64 * Alan Cox : FIN with no memory -> CRASH 65 * Alan Cox : Added socket option proto entries. 66 * Also added awareness of them to accept. 67 * Alan Cox : Added TCP options (SOL_TCP) 68 * Alan Cox : Switched wakeup calls to callbacks, 69 * so the kernel can layer network 70 * sockets. 71 * Alan Cox : Use ip_tos/ip_ttl settings. 72 * Alan Cox : Handle FIN (more) properly (we hope). 73 * Alan Cox : RST frames sent on unsynchronised 74 * state ack error. 75 * Alan Cox : Put in missing check for SYN bit. 76 * Alan Cox : Added tcp_select_window() aka NET2E 77 * window non shrink trick. 78 * Alan Cox : Added a couple of small NET2E timer 79 * fixes 80 * Charles Hedrick : TCP fixes 81 * Toomas Tamm : TCP window fixes 82 * Alan Cox : Small URG fix to rlogin ^C ack fight 83 * Charles Hedrick : Rewrote most of it to actually work 84 * Linus : Rewrote tcp_read() and URG handling 85 * completely 86 * Gerhard Koerting: Fixed some missing timer handling 87 * Matthew Dillon : Reworked TCP machine states as per RFC 88 * Gerhard Koerting: PC/TCP workarounds 89 * Adam Caldwell : Assorted timer/timing errors 90 * Matthew Dillon : Fixed another RST bug 91 * Alan Cox : Move to kernel side addressing changes. 92 * Alan Cox : Beginning work on TCP fastpathing 93 * (not yet usable) 94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
95 * Alan Cox : TCP fast path debugging 96 * Alan Cox : Window clamping 97 * Michael Riepe : Bug in tcp_check() 98 * Matt Dillon : More TCP improvements and RST bug fixes 99 * Matt Dillon : Yet more small nasties remove from the 100 * TCP code (Be very nice to this man if 101 * tcp finally works 100%) 8) 102 * Alan Cox : BSD accept semantics. 103 * Alan Cox : Reset on closedown bug. 104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 105 * Michael Pall : Handle select() after URG properly in 106 * all cases. 107 * Michael Pall : Undo the last fix in tcp_read_urg() 108 * (multi URG PUSH broke rlogin). 109 * Michael Pall : Fix the multi URG PUSH problem in 110 * tcp_readable(), select() after URG 111 * works now. 112 * Michael Pall : recv(...,MSG_OOB) never blocks in the 113 * BSD api. 114 * Alan Cox : Changed the semantics of sk->socket to 115 * fix a race and a signal problem with 116 * accept() and async I/O. 117 * Alan Cox : Relaxed the rules on tcp_sendto(). 118 * Yury Shevchuk : Really fixed accept() blocking problem. 119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 120 * clients/servers which listen in on 121 * fixed ports. 122 * Alan Cox : Cleaned the above up and shrank it to 123 * a sensible code size. 124 * Alan Cox : Self connect lockup fix. 125 * Alan Cox : No connect to multicast. 126 * Ross Biro : Close unaccepted children on master 127 * socket close. 128 * Alan Cox : Reset tracing code. 129 * Alan Cox : Spurious resets on shutdown. 130 * Alan Cox : Giant 15 minute/60 second timer error 131 * Alan Cox : Small whoops in selecting before an 132 * accept. 133 * Alan Cox : Kept the state trace facility since 134 * it's handy for debugging. 135 * Alan Cox : More reset handler fixes. 
136 * Alan Cox : Started rewriting the code based on 137 * the RFC's for other useful protocol 138 * references see: Comer, KA9Q NOS, and 139 * for a reference on the difference 140 * between specifications and how BSD 141 * works see the 4.4lite source. 142 * A.N.Kuznetsov : Don't time wait on completion of tidy 143 * close. 144 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 145 * Linus Torvalds : Fixed BSD port reuse to work first syn 146 * Alan Cox : Reimplemented timers as per the RFC 147 * and using multiple timers for sanity. 148 * Alan Cox : Small bug fixes, and a lot of new 149 * comments. 150 * Alan Cox : Fixed dual reader crash by locking 151 * the buffers (much like datagram.c) 152 * Alan Cox : Fixed stuck sockets in probe. A probe 153 * now gets fed up of retrying without 154 * (even a no space) answer. 155 * Alan Cox : Extracted closing code better 156 * Alan Cox : Fixed the closing state machine to 157 * resemble the RFC. 158 * Alan Cox : More 'per spec' fixes. 159 * Jorge Cwik : Even faster checksumming. 160 * Alan Cox : tcp_data() doesn't ack illegal PSH 161 * only frames. At least one pc tcp stack 162 * generates them. 163 * Alan Cox : Cache last socket. 164 * Alan Cox : Per route irtt. 165 * Matt Day : Select() match BSD precisely on error 166 * Alan Cox : New buffers 167 * Marc Tamsky : Various sk->prot->retransmits and 168 * sk->retransmits misupdating fixed. 169 * Fixed tcp_write_timeout: stuck close, 170 * and TCP syn retries gets used now. 171 * Mark Yarvis : In tcp_read_wakeup(), don't send an 172 * ack if stat is TCP_CLOSED. 173 * Alan Cox : Look up device on a retransmit - routes may 174 * change. Doesn't yet cope with MSS shrink right 175 * but its a start! 176 * Marc Tamsky : Closing in closing fixes. 177 * Mike Shaver : RFC1122 verifications. 178 * Alan Cox : rcv_saddr errors. 179 * Alan Cox : Block double connect(). 180 * Alan Cox : Small hooks for enSKIP. 181 * Alexey Kuznetsov: Path MTU discovery. 
182 * Alan Cox : Support soft errors. 183 * 184 * 185 * To Fix: 186 * Fast path the code. Two things here - fix the window calculation 187 * so it doesn't iterate over the queue, also spot packets with no funny 188 * options arriving in order and process directly. 189 * 190 * Rewrite output state machine to use a single queue and do low window 191 * situations as per the spec (RFC 1122) 192 * Speed up input assembly algorithm. 193 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 194 * could do with it working on IPv4 195 * User settable/learned rtt/max window/mtu 196 * Fix the window handling to use PR's new code. 197 * 198 * Change the fundamental structure to a single send queue maintained 199 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 200 * active routes too]). Cut the queue off in tcp_retransmit/ 201 * tcp_transmit. 202 * Change the receive queue to assemble as it goes. This lets us 203 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 204 * tcp_data/tcp_read as well as the window shrink crud. 205 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 206 * tcp_queue_skb seem obvious routines to extract. 207 * 208 * This program is free software; you can redistribute it and/or 209 * modify it under the terms of the GNU General Public License 210 * as published by the Free Software Foundation; either version 211 * 2 of the License, or(at your option) any later version. 212 * 213 * Description of States: 214 * 215 * TCP_SYN_SENT sent a connection request, waiting for ack 216 * 217 * TCP_SYN_RECV received a connection request, sent ack, 218 * waiting for final ack in three-way handshake. 
219 * 220 * TCP_ESTABLISHED connection established 221 * 222 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 223 * transmission of remaining buffered data 224 * 225 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 226 * to shutdown 227 * 228 * TCP_CLOSING both sides have shutdown but we still have 229 * data we have to finish sending 230 * 231 * TCP_TIME_WAIT timeout to catch resent junk before entering 232 * closed, can only be entered from FIN_WAIT2 233 * or CLOSING. Required because the other end 234 * may not have gotten our last ACK causing it 235 * to retransmit the data packet (which we ignore) 236 * 237 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 238 * us to finish writing our data and to shutdown 239 * (we have to close() to move on to LAST_ACK) 240 * 241 * TCP_LAST_ACK out side has shutdown after remote has 242 * shutdown. There may still be data in our 243 * buffer that we have to finish sending 244 * 245 * TCP_CLOSE socket is finished 246 */ 247
248 /* 249 * RFC1122 status: 250 * NOTE: I'm not going to be doing comments in the code for this one except 251 * for violations and the like. tcp.c is just too big... If I say something 252 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out 253 * with Alan. -- MS 950903 254 * 255 * Use of PSH (4.2.2.2) 256 * MAY aggregate data sent without the PSH flag. (does) 257 * MAY queue data recieved without the PSH flag. (does) 258 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) 259 * MAY implement PSH on send calls. (doesn't, thus:) 260 * MUST NOT buffer data indefinitely (doesn't [1 second]) 261 * MUST set PSH on last segment (does) 262 * MAY pass received PSH to application layer (doesn't) 263 * SHOULD send maximum-sized segment whenever possible. (almost always does) 264 * 265 * Window Size (4.2.2.3, 4.2.2.16) 266 * MUST treat window size as an unsigned number (does) 267 * SHOULD treat window size as a 32-bit number (does not) 268 * MUST NOT shrink window once it is offered (does not normally) 269 * 270 * Urgent Pointer (4.2.2.4) 271 * **MUST point urgent pointer to last byte of urgent data (not right 272 * after). (doesn't, to be like BSD) 273 * MUST inform application layer asynchronously of incoming urgent 274 * data. (does) 275 * MUST provide application with means of determining the amount of 276 * urgent data pending. (does) 277 * **MUST support urgent data sequence of arbitrary length. (doesn't, but 278 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) 279 * [Follows BSD 1 byte of urgent data] 280 * 281 * TCP Options (4.2.2.5) 282 * MUST be able to recieve TCP options in any segment. (does) 283 * MUST ignore unsupported options (does) 284 * 285 * Maximum Segment Size Option (4.2.2.6) 286 * MUST implement both sending and receiving MSS. (does) 287 * SHOULD send an MSS with every SYN where recieve MSS != 536 (MAY send 288 * it always). 
(does, even when MSS == 536, which is legal) 289 * MUST assume MSS == 536 if no MSS received at connection setup (does) 290 * MUST calculate "effective send MSS" correctly: 291 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) 292 * (does - but allows operator override) 293 * 294 * TCP Checksum (4.2.2.7) 295 * MUST generate and check TCP checksum. (does) 296 * 297 * Initial Sequence Number Selection (4.2.2.8) 298 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's 299 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is 300 * necessary for 10Mbps networks - and harder than BSD to spoof!) 301 * 302 * Simultaneous Open Attempts (4.2.2.10) 303 * MUST support simultaneous open attempts (does) 304 * 305 * Recovery from Old Duplicate SYN (4.2.2.11) 306 * MUST keep track of active vs. passive open (does) 307 * 308 * RST segment (4.2.2.12) 309 * SHOULD allow an RST segment to contain data (does, but doesn't do 310 * anything with it, which is standard) 311 * 312 * Closing a Connection (4.2.2.13) 313 * MUST inform application of whether connectin was closed by RST or 314 * normal close. (does) 315 * MAY allow "half-duplex" close (treat connection as closed for the 316 * local app, even before handshake is done). (does) 317 * MUST linger in TIME_WAIT for 2 * MSL (does) 318 * 319 * Retransmission Timeout (4.2.2.15) 320 * MUST implement Jacobson's slow start and congestion avoidance 321 * stuff. (does) 322 * 323 * Probing Zero Windows (4.2.2.17) 324 * MUST support probing of zero windows. (does) 325 * MAY keep offered window closed indefinitely. (does) 326 * MUST allow remote window to stay closed indefinitely. (does) 327 * 328 * Passive Open Calls (4.2.2.18) 329 * MUST NOT let new passive open affect other connections. (doesn't) 330 * MUST support passive opens (LISTENs) concurrently. (does) 331 * 332 * Time to Live (4.2.2.19) 333 * MUST make TCP TTL configurable. 
(does - IP_TTL option) 334 * 335 * Event Processing (4.2.2.20) 336 * SHOULD queue out-of-order segments. (does) 337 * MUST aggregate ACK segments whenever possible. (does but badly) 338 * 339 * Retransmission Timeout Calculation (4.2.3.1) 340 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO 341 * calculation. (does, or at least explains them in the comments 8*b) 342 * SHOULD initialize RTO to 0 and RTT to 3. (does) 343 * 344 * When to Send an ACK Segment (4.2.3.2) 345 * SHOULD implement delayed ACK. (does not) 346 * MUST keep ACK delay < 0.5 sec. (N/A) 347 * 348 * When to Send a Window Update (4.2.3.3) 349 * MUST implement receiver-side SWS. (does) 350 * 351 * When to Send Data (4.2.3.4) 352 * MUST implement sender-side SWS. (does) 353 * SHOULD implement Nagle algorithm. (does) 354 * 355 * TCP Connection Failures (4.2.3.5) 356 * MUST handle excessive retransmissions "properly" (see the RFC). (does) 357 * SHOULD inform application layer of soft errors. (does) 358 * 359 * TCP Keep-Alives (4.2.3.6) 360 * MAY provide keep-alives. (does) 361 * MUST make keep-alives configurable on a per-connection basis. (does) 362 * MUST default to no keep-alives. (does) 363 * **MUST make keep-alive interval configurable. (doesn't) 364 * **MUST make default keep-alive interval > 2 hours. (doesn't) 365 * MUST NOT interpret failure to ACK keep-alive packet as dead 366 * connection. (doesn't) 367 * SHOULD send keep-alive with no data. (does) 368 * 369 * TCP Multihoming (4.2.3.7) 370 * MUST get source address from IP layer before sending first 371 * SYN. (does) 372 * MUST use same local address for all segments of a connection. (does) 373 * 374 * IP Options (4.2.3.8) 375 * MUST ignore unsupported IP options. (does) 376 * MAY support Time Stamp and Record Route. (does) 377 * MUST allow application to specify a source route. (does) 378 * MUST allow receieved Source Route option to set route for all future 379 * segments on this connection. 
(does not (security issues)) 380 * 381 * ICMP messages (4.2.3.9) 382 * MUST act on ICMP errors. (does) 383 * MUST slow transmission upon receipt of a Source Quench. (does) 384 * MUST NOT abort connection upon receipt of soft Destination 385 * Unreachables (0, 1, 5), Time Exceededs and Parameter 386 * Problems. (doesn't) 387 * SHOULD report soft Destination Unreachables etc. to the 388 * application. (does) 389 * SHOULD abort connection upon receipt of hard Destination Unreachable 390 * messages (2, 3, 4). (does) 391 * 392 * Remote Address Validation (4.2.3.10) 393 * MUST reject as an error OPEN for invalid remote IP address. (does) 394 * MUST ignore SYN with invalid source address. (does) 395 * MUST silently discard incoming SYN for broadcast/multicast 396 * address. (does) 397 * 398 * Asynchronous Reports (4.2.4.1) 399 * **MUST provide mechanism for reporting soft errors to application 400 * layer. (doesn't) 401 * 402 * Type of Service (4.2.4.2) 403 * MUST allow application layer to set Type of Service. (does IP_TOS) 404 * 405 * (Whew. -- MS 950903) 406 **/ 407
408 #include <linux/types.h>
409 #include <linux/sched.h>
410 #include <linux/mm.h>
411 #include <linux/time.h>
412 #include <linux/string.h>
413 #include <linux/config.h>
414 #include <linux/socket.h>
415 #include <linux/sockios.h>
416 #include <linux/termios.h>
417 #include <linux/in.h>
418 #include <linux/fcntl.h>
419 #include <linux/inet.h>
420 #include <linux/netdevice.h>
421 #include <net/snmp.h>
422 #include <net/ip.h>
423 #include <net/protocol.h>
424 #include <net/icmp.h>
425 #include <net/tcp.h>
426 #include <net/arp.h>
427 #include <linux/skbuff.h>
428 #include <net/sock.h>
429 #include <net/route.h>
430 #include <linux/errno.h>
431 #include <linux/timer.h>
432 #include <asm/system.h>
433 #include <asm/segment.h>
434 #include <linux/mm.h>
435 #include <net/checksum.h>
436
437 /* 438 * The MSL timer is the 'normal' timer. 439 */ 440
441 #definereset_msl_timer(x,y,z) reset_timer(x,y,z)
442
443 #define SEQ_TICK 3
444 unsignedlongseq_offset;
445 structtcp_mibtcp_statistics;
446
447 /* 448 * Cached last hit socket 449 */ 450
451 volatileunsignedlongth_cache_saddr,th_cache_daddr;
452 volatileunsignedshortth_cache_dport, th_cache_sport;
453 volatilestructsock *th_cache_sk;
454
455 voidtcp_cache_zap(void)
/* */ 456 { 457 unsignedlongflags;
458 save_flags(flags);
459 cli();
460 th_cache_saddr=0;
461 th_cache_daddr=0;
462 th_cache_dport=0;
463 th_cache_sport=0;
464 th_cache_sk=NULL;
465 restore_flags(flags);
466 } 467
static void tcp_close(struct sock *sk, int timeout);

/*
 *	The less said about this the better, but it works and will do for 1.2
 */

static struct wait_queue *master_select_wakeup;
/*
 *	Return the smaller of two unsigned quantities.
 */
static __inline__ int min(unsigned int a, unsigned int b)
{
	if (a < b)
		return a;
	return b;
}
/* Define STATE_TRACE to log TCP state transitions (debug aid). */
#undef STATE_TRACE

#ifdef STATE_TRACE
static char *statename[] = {
	"Unused", "Established", "Syn Sent", "Syn Recv",
	"Fin Wait 1", "Fin Wait 2", "Time Wait", "Close",
	"Close Wait", "Last ACK", "Listen", "Closing"
};
#endif
494 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 495 { 496 if(sk->state==TCP_ESTABLISHED)
497 tcp_statistics.TcpCurrEstab--;
498 #ifdefSTATE_TRACE 499 if(sk->debug)
500 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
501 #endif 502 /* This is a hack but it doesn't occur often and it's going to 503 be a real to fix nicely */ 504
505 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
506 { 507 wake_up_interruptible(&master_select_wakeup);
508 } 509 sk->state=state;
510 if(state==TCP_ESTABLISHED)
511 tcp_statistics.TcpCurrEstab++;
512 if(sk->state==TCP_CLOSE)
513 tcp_cache_zap();
514 } 515
516 /* 517 * This routine picks a TCP windows for a socket based on 518 * the following constraints 519 * 520 * 1. The window can never be shrunk once it is offered (RFC 793) 521 * 2. We limit memory per socket 522 * 523 * For now we use NET2E3's heuristic of offering half the memory 524 * we have handy. All is not as bad as this seems however because 525 * of two things. Firstly we will bin packets even within the window 526 * in order to get the data we are waiting for into the memory limit. 527 * Secondly we bin common duplicate forms at receive time 528 * Better heuristics welcome 529 */ 530
531 inttcp_select_window(structsock *sk)
/* */ 532 { 533 intnew_window = sock_rspace(sk);
534
535 if(sk->window_clamp)
536 new_window=min(sk->window_clamp,new_window);
537 /* 538 * Two things are going on here. First, we don't ever offer a 539 * window less than min(sk->mss, MAX_WINDOW/2). This is the 540 * receiver side of SWS as specified in RFC1122. 541 * Second, we always give them at least the window they 542 * had before, in order to avoid retracting window. This 543 * is technically allowed, but RFC1122 advises against it and 544 * in practice it causes trouble. 545 * 546 * Fixme: This doesn't correctly handle the case where 547 * new_window > sk->window but not by enough to allow for the 548 * shift in sequence space. 549 */ 550 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
551 return(sk->window);
552 return(new_window);
553 } 554
555 /* 556 * Find someone to 'accept'. Must be called with 557 * sk->inuse=1 or cli() 558 */ 559
560 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 561 { 562 structsk_buff *p=skb_peek(&s->receive_queue);
563 if(p==NULL)
564 returnNULL;
565 do 566 { 567 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
568 returnp;
569 p=p->next;
570 } 571 while(p!=(structsk_buff *)&s->receive_queue);
572 returnNULL;
573 } 574
575 /* 576 * Remove a completed connection and return it. This is used by 577 * tcp_accept() to get connections from the queue. 578 */ 579
580 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 581 { 582 structsk_buff *skb;
583 unsignedlongflags;
584 save_flags(flags);
585 cli();
586 skb=tcp_find_established(s);
587 if(skb!=NULL)
588 skb_unlink(skb); /* Take it off the queue */ 589 restore_flags(flags);
590 returnskb;
591 } 592
593 /* 594 * This routine closes sockets which have been at least partially 595 * opened, but not yet accepted. Currently it is only called by 596 * tcp_close, and timeout mirrors the value there. 597 */ 598
599 staticvoidtcp_close_pending (structsock *sk)
/* */ 600 { 601 structsk_buff *skb;
602
603 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
604 { 605 skb->sk->dead=1;
606 tcp_close(skb->sk, 0);
607 kfree_skb(skb, FREE_READ);
608 } 609 return;
610 } 611
612 /* 613 * Enter the time wait state. 614 */ 615
616 staticvoidtcp_time_wait(structsock *sk)
/* */ 617 { 618 tcp_set_state(sk,TCP_TIME_WAIT);
619 sk->shutdown = SHUTDOWN_MASK;
620 if (!sk->dead)
621 sk->state_change(sk);
622 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
623 } 624
625 /* 626 * A socket has timed out on its send queue and wants to do a 627 * little retransmitting. Currently this means TCP. 628 */ 629
630 voidtcp_do_retransmit(structsock *sk, intall)
/* */ 631 { 632 structsk_buff * skb;
633 structproto *prot;
634 structdevice *dev;
635 intct=0;
636 structrtable *rt;
637
638 prot = sk->prot;
639 skb = sk->send_head;
640
641 while (skb != NULL)
642 { 643 structtcphdr *th;
644 structiphdr *iph;
645 intsize;
646
647 dev = skb->dev;
648 IS_SKB(skb);
649 skb->when = jiffies;
650
651 /* 652 * Discard the surplus MAC header 653 */ 654
655 skb_pull(skb,((unsignedchar *)skb->ip_hdr)-skb->data);
656
657 /* 658 * In general it's OK just to use the old packet. However we 659 * need to use the current ack and window fields. Urg and 660 * urg_ptr could possibly stand to be updated as well, but we 661 * don't keep the necessary data. That shouldn't be a problem, 662 * if the other end is doing the right thing. Since we're 663 * changing the packet, we have to issue a new IP identifier. 664 */ 665
666 iph = (structiphdr *)skb->data;
667 th = (structtcphdr *)(((char *)iph) + (iph->ihl << 2));
668 size = ntohs(iph->tot_len) - (iph->ihl<<2);
669
670 /* 671 * Note: We ought to check for window limits here but 672 * currently this is done (less efficiently) elsewhere. 673 */ 674
675 /* 676 * Put a MAC header back on (may cause ARPing) 677 */ 678
679 { 680 /* ANK: UGLY, but the bug, that was here, should be fixed. 681 */ 682 structoptions * opt = (structoptions*)skb->proto_priv;
683 rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute);
684 } 685
686 iph->id = htons(ip_id_count++);
687 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 688 if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
689 iph->frag_off &= ~htons(IP_DF);
690 #endif 691 ip_send_check(iph);
692
693 if (rt==NULL) /* Deep poo */ 694 { 695 if(skb->sk)
696 { 697 skb->sk->err_soft=ENETUNREACH;
698 skb->sk->error_report(skb->sk);
699 } 700 } 701 else 702 { 703 dev=rt->rt_dev;
704 skb->raddr=rt->rt_gateway;
705 skb->dev=dev;
706 skb->arp=1;
707 if (rt->rt_hh)
708 { 709 memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
710 if (!rt->rt_hh->hh_uptodate)
711 { 712 skb->arp = 0;
713 #ifRT_CACHE_DEBUG >= 2
714 printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
715 #endif 716 } 717 } 718 elseif (dev->hard_header)
719 { 720 if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
721 skb->arp=0;
722 } 723
724 /* 725 * This is not the right way to handle this. We have to 726 * issue an up to date window and ack report with this 727 * retransmit to keep the odd buggy tcp that relies on 728 * the fact BSD does this happy. 729 * We don't however need to recalculate the entire 730 * checksum, so someone wanting a small problem to play 731 * with might like to implement RFC1141/RFC1624 and speed 732 * this up by avoiding a full checksum. 733 */ 734
735 th->ack_seq = htonl(sk->acked_seq);
736 th->window = ntohs(tcp_select_window(sk));
737 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
738
739 /* 740 * If the interface is (still) up and running, kick it. 741 */ 742
743 if (dev->flags & IFF_UP)
744 { 745 /* 746 * If the packet is still being sent by the device/protocol 747 * below then don't retransmit. This is both needed, and good - 748 * especially with connected mode AX.25 where it stops resends 749 * occurring of an as yet unsent anyway frame! 750 * We still add up the counts as the round trip time wants 751 * adjusting. 752 */ 753 if (sk && !skb_device_locked(skb))
754 { 755 /* Remove it from any existing driver queue first! */ 756 skb_unlink(skb);
757 /* Now queue it */ 758 ip_statistics.IpOutRequests++;
759 dev_queue_xmit(skb, dev, sk->priority);
760 } 761 } 762 } 763
764 /* 765 * Count retransmissions 766 */ 767
768 ct++;
769 sk->prot->retransmits ++;
770 tcp_statistics.TcpRetransSegs++;
771
772
773 /* 774 * Only one retransmit requested. 775 */ 776
777 if (!all)
778 break;
779
780 /* 781 * This should cut it off before we send too many packets. 782 */ 783
784 if (ct >= sk->cong_window)
785 break;
786 skb = skb->link3;
787 } 788 } 789
790 /* 791 * Reset the retransmission timer 792 */ 793
794 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 795 { 796 del_timer(&sk->retransmit_timer);
797 sk->ip_xmit_timeout = why;
798 if((int)when < 0)
799 { 800 when=3;
801 printk("Error: Negative timer in xmit_timer\n");
802 } 803 sk->retransmit_timer.expires=jiffies+when;
804 add_timer(&sk->retransmit_timer);
805 } 806
807 /* 808 * This is the normal code called for timeouts. It does the retransmission 809 * and then does backoff. tcp_do_retransmit is separated out because 810 * tcp_ack needs to send stuff from the retransmit queue without 811 * initiating a backoff. 812 */ 813
814
815 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 816 { 817 tcp_do_retransmit(sk, all);
818
819 /* 820 * Increase the timeout each time we retransmit. Note that 821 * we do not increase the rtt estimate. rto is initialized 822 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 823 * that doubling rto each time is the least we can get away with. 824 * In KA9Q, Karn uses this for the first few times, and then 825 * goes to quadratic. netBSD doubles, but only goes up to *64, 826 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 827 * defined in the protocol as the maximum possible RTT. I guess 828 * we'll have to use something other than TCP to talk to the 829 * University of Mars. 830 * 831 * PAWS allows us longer timeouts and large windows, so once 832 * implemented ftp to mars will work nicely. We will have to fix 833 * the 120 second clamps though! 834 */ 835
836 sk->retransmits++;
837 sk->prot->retransmits++;
838 sk->backoff++;
839 sk->rto = min(sk->rto << 1, 120*HZ);
840 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
841 } 842
843
844 /* 845 * A timer event has trigger a tcp retransmit timeout. The 846 * socket xmit queue is ready and set up to send. Because 847 * the ack receive code keeps the queue straight we do 848 * nothing clever here. 849 */ 850
851 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 852 { 853 if (all)
854 { 855 tcp_retransmit_time(sk, all);
856 return;
857 } 858
859 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 860 /* sk->ssthresh in theory can be zero. I guess that's OK */ 861 sk->cong_count = 0;
862
863 sk->cong_window = 1;
864
865 /* Do the actual retransmit. */ 866 tcp_retransmit_time(sk, all);
867 } 868
869 /* 870 * A write timeout has occurred. Process the after effects. 871 */ 872
873 staticinttcp_write_timeout(structsock *sk)
/* */ 874 { 875 /* 876 * Look for a 'soft' timeout. 877 */ 878 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
879 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
880 { 881 /* 882 * Attempt to recover if arp has changed (unlikely!) or 883 * a route has shifted (not supported prior to 1.3). 884 */ 885 ip_rt_advice(&sk->ip_route_cache, 0);
886 } 887
888 /* 889 * Have we tried to SYN too many times (repent repent 8)) 890 */ 891
892 if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
893 { 894 if(sk->err_soft)
895 sk->err=sk->err_soft;
896 else 897 sk->err=ETIMEDOUT;
898 sk->error_report(sk);
899 del_timer(&sk->retransmit_timer);
900 tcp_statistics.TcpAttemptFails++; /* Is this right ??? - FIXME - */ 901 tcp_set_state(sk,TCP_CLOSE);
902 /* Don't FIN, we got nothing back */ 903 release_sock(sk);
904 return 0;
905 } 906 /* 907 * Has it gone just too far ? 908 */ 909 if (sk->retransmits > TCP_RETR2)
910 { 911 if(sk->err_soft)
912 sk->err = sk->err_soft;
913 else 914 sk->err = ETIMEDOUT;
915 sk->error_report(sk);
916 del_timer(&sk->retransmit_timer);
917 /* 918 * Time wait the socket 919 */ 920 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
921 { 922 tcp_set_state(sk,TCP_TIME_WAIT);
923 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
924 } 925 else 926 { 927 /* 928 * Clean up time. 929 */ 930 tcp_set_state(sk, TCP_CLOSE);
931 release_sock(sk);
932 return 0;
933 } 934 } 935 return 1;
936 } 937
/*
 *	The TCP retransmit timer. This lacks a few small details.
 *
 *	1. 	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 */

/*
 * Timer callback (sk->retransmit_timer). 'data' is the struct sock
 * this timer belongs to; sk->ip_xmit_timeout records WHY the timer
 * was armed (TIME_PROBE0 / TIME_WRITE / TIME_KEEPOPEN) and selects
 * the action below.
 */
static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;

	/*
	 *	only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	/* Lock the socket against the bottom half while we work on it. */
	sk->inuse = 1;
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			/* send_head is shared with interrupt context. */
			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				restore_flags(flags);
			}
			else
			{
				/*
				 *	Kicked by a delayed ack. Reset timer
				 *	correctly now
				 */
				if (jiffies < skb->when + sk->rto)
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission
				 */
				sk->retransmits++;
				sk->prot->retransmits++;
				sk->prot->retransmit (sk, 0);
				/* Give up if we've retried too long (may close sk). */
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
1049 /*1050 * This routine is called by the ICMP module when it gets some1051 * sort of error condition. If err < 0 then the socket should1052 * be closed and the error returned to the user. If err > 01053 * it's just the icmp type << 8 | icmp code. After adjustment1054 * header points to the first 8 bytes of the tcp header. We need1055 * to find the appropriate port.1056 */1057
1058 voidtcp_err(inttype, intcode, unsignedchar *header, __u32daddr,
/* */1059 __u32saddr, structinet_protocol *protocol)
1060 {1061 structtcphdr *th = (structtcphdr *)header;
1062 structsock *sk;
1063
1064 /*1065 * This one is _WRONG_. FIXME urgently.1066 */1067 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1068 structiphdr *iph=(structiphdr *)(header-sizeof(structiphdr));
1069 #endif1070 th =(structtcphdr *)header;
1071 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
1072
1073 if (sk == NULL)
1074 return;
1075
1076 if (type == ICMP_SOURCE_QUENCH)
1077 {1078 /*1079 * FIXME:1080 * For now we will just trigger a linear backoff.1081 * The slow start code should cause a real backoff here.1082 */1083 if (sk->cong_window > 4)
1084 sk->cong_window--;
1085 return;
1086 }1087
1088 if (type == ICMP_PARAMETERPROB)
1089 {1090 sk->err=EPROTO;
1091 sk->error_report(sk);
1092 }1093
1094 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1095 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
1096 {1097 structrtable * rt;
1098 /*1099 * Ugly trick to pass MTU to protocol layer.1100 * Really we should add argument "info" to error handler.1101 */1102 unsignedshortnew_mtu = ntohs(iph->id);
1103
1104 if ((rt = sk->ip_route_cache) != NULL)
1105 if (rt->rt_mtu > new_mtu)
1106 rt->rt_mtu = new_mtu;
1107
1108 if (sk->mtu > new_mtu - sizeof(structiphdr) - sizeof(structtcphdr))
1109 sk->mtu = new_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
1110
1111 return;
1112 }1113 #endif1114
1115 /*1116 * If we've already connected we will keep trying1117 * until we time out, or the user gives up.1118 */1119
1120 if (code < 13)
1121 {1122 if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1123 {1124 sk->err = icmp_err_convert[code].errno;
1125 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1126 {1127 tcp_statistics.TcpAttemptFails++;
1128 tcp_set_state(sk,TCP_CLOSE);
1129 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */1130 }1131 }1132 else/* Only an error on timeout */1133 sk->err_soft = icmp_err_convert[code].errno;
1134 }1135 }1136
1137
/*
 *	Walk down the receive queue counting readable data until we hit the end or we find a gap
 *	in the received data queue (ie a frame missing that needs sending to us). Not
 *	sorting using two queues as data arrives makes life so much harder.
 *
 *	Returns the number of in-sequence bytes available to read (0 if the
 *	queue is empty or starts with a hole).
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;
	unsigned long amount;
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	/* Walk the queue with interrupts off so the receive path cannot
	 * modify it underneath us. */
	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->seq))		/* Found a hole so stops here */
			break;
		sum = skb->len - (counted - skb->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;		/* SYN occupies one sequence number but no data byte */
		if (sum > 0)
		{					/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;	/* ...but don't report the SYN as readable data */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop. This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
1215 /*1216 * LISTEN is a special case for select..1217 */1218 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */1219 {1220 if (sel_type == SEL_IN) {1221 intretval;
1222
1223 sk->inuse = 1;
1224 retval = (tcp_find_established(sk) != NULL);
1225 release_sock(sk);
1226 if (!retval)
1227 select_wait(&master_select_wakeup,wait);
1228 returnretval;
1229 }1230 return 0;
1231 }1232
1233
/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to set "sk->inuse", as the upper select layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 *
 *	Returns 1 when the requested condition (readable / writable /
 *	exceptional) holds now, otherwise registers on the socket's wait
 *	queue and returns 0.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		/* A pending error makes the socket "readable" (the read
		 * will report the error). */
		if (sk->err)
			return 1;
		/* Connection not yet established: nothing to read. */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;

		/* Receive side shut down: a read returns EOF at once. */
		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;

		/* Nothing acked beyond what the user has already copied. */
		if (sk->acked_seq == sk->copied_seq)
			break;

		/*
		 * Readable unless the ONLY byte available is an urgent
		 * byte that will be delivered out-of-band (not inline).
		 */
		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->err)
			return 1;
		if (sk->shutdown & SEND_SHUTDOWN)
			return 0;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 * This is now right thanks to a small fix
		 * by Matt Dillon.
		 */

		/* Writable only if a worst-case segment's worth of buffer
		 * space is free. */
		if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;
		return 1;

	case SEL_EX:
		/* Exceptional condition: urgent data is pending. */
		if (sk->urg_data)
			return 1;
		break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}
1290 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */1291 {1292 interr;
1293 switch(cmd)
1294 {1295
1296 caseTIOCINQ:
1297 #ifdef FIXME /* FIXME: */1298 caseFIONREAD:
1299 #endif1300 {1301 unsignedlongamount;
1302
1303 if (sk->state == TCP_LISTEN)
1304 return(-EINVAL);
1305
1306 sk->inuse = 1;
1307 amount = tcp_readable(sk);
1308 release_sock(sk);
1309 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1310 if(err)
1311 returnerr;
1312 put_user(amount, (int *)arg);
1313 return(0);
1314 }1315 caseSIOCATMARK:
1316 {1317 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
1318
1319 err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1320 if (err)
1321 returnerr;
1322 put_user(answ,(int *) arg);
1323 return(0);
1324 }1325 caseTIOCOUTQ:
1326 {1327 unsignedlongamount;
1328
1329 if (sk->state == TCP_LISTEN) return(-EINVAL);
1330 amount = sock_wspace(sk);
1331 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1332 if(err)
1333 returnerr;
1334 put_user(amount, (int *)arg);
1335 return(0);
1336 }1337 default:
1338 return(-EINVAL);
1339 }1340 }1341
1342
/*
 *	This routine computes a TCP checksum.
 *
 *	Modified January 1995 from a go-faster DOS routine by
 *	Jorge Cwik <jorge@laser.satlink.net>
 *
 *	Folds the TCP pseudo-header (saddr, daddr, length, IPPROTO_TCP)
 *	into 'base', the partial checksum already accumulated over the
 *	TCP segment itself.
 *	NOTE(review): the 'th' parameter is unused here; presumably kept
 *	for the historical call signature — confirm before removing.
 */
unsigned short tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}
1356
1357
1358 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1359 unsignedlongdaddr, intlen, structsock *sk)
1360 {1361 th->check = 0;
1362 th->check = tcp_check(th, len, saddr, daddr,
1363 csum_partial((char *)th,len,0));
1364 return;
1365 }1366
/*
 *	This is the main buffer sending routine. We queue the buffer
 *	having checked it is sane seeming.
 *
 *	The frame either goes onto the write queue (window full,
 *	retransmitting, or congestion window exhausted) or is checksummed
 *	and transmitted immediately.
 */

static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing.
	 */

	tcp_statistics.TcpOutSegs++;
	skb->seq = ntohl(th->seq);
	/* end_seq = first sequence number NOT covered by this frame. */
	skb->end_seq = skb->seq + size - 4*th->doff;

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */

	if (after(skb->end_seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit.  So
		 * we shouldn't need to set it at all.  I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (This is what causes the Cisco and PC/TCP
		 *	grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out
		 */

		th->ack_seq = htonl(sk->acked_seq);
		th->window = htons(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1483 /*1484 * Locking problems lead us to a messy situation where we can have1485 * multiple partially complete buffers queued up. This is really bad1486 * as we don't want to be sending partial buffers. Fix this with1487 * a semaphore or similar to lock tcp_write per socket.1488 *1489 * These routines are pretty self descriptive.1490 */1491
1492 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1493 {1494 structsk_buff * skb;
1495 unsignedlongflags;
1496
1497 save_flags(flags);
1498 cli();
1499 skb = sk->partial;
1500 if (skb) {1501 sk->partial = NULL;
1502 del_timer(&sk->partial_timer);
1503 }1504 restore_flags(flags);
1505 returnskb;
1506 }1507
1508 /*1509 * Empty the partial queue1510 */1511
1512 staticvoidtcp_send_partial(structsock *sk)
/* */1513 {1514 structsk_buff *skb;
1515
1516 if (sk == NULL)
1517 return;
1518 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1519 tcp_send_skb(sk, skb);
1520 }1521
/*
 *	Queue a partial frame
 *
 *	Installs 'skb' as the socket's pending partial buffer and arms a
 *	one second flush timer. If another partial frame was already
 *	pending it is displaced and transmitted (after the new timer is
 *	in place, and outside the interrupts-off region).
 */

void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);	/* old frame's timer must not fire */
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 *	Wait up to 1 second for the buffer to fill.
	 */
	sk->partial_timer.expires = jiffies+HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	if (tmp)
		tcp_send_skb(sk, tmp);	/* send the displaced frame now */
}
1550
1551 /*1552 * This routine sends an ack and also updates the window. 1553 */1554
1555 staticvoidtcp_send_ack(u32sequence, u32ack,
/* */1556 structsock *sk,
1557 structtcphdr *th, unsignedlongdaddr)
1558 {1559 structsk_buff *buff;
1560 structtcphdr *t1;
1561 structdevice *dev = NULL;
1562 inttmp;
1563
1564 if(sk->zapped)
1565 return; /* We have been reset, we may not send again */1566
1567 /*1568 * We need to grab some memory, and put together an ack,1569 * and then put it into the queue to be sent.1570 */1571
1572 buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1573 if (buff == NULL)
1574 {1575 /* 1576 * Force it to send an ack. We don't have to do this1577 * (ACK is unreliable) but it's much better use of 1578 * bandwidth on slow links to send a spare ack than1579 * resend packets. 1580 */1581
1582 sk->ack_backlog++;
1583 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
1584 {1585 reset_xmit_timer(sk, TIME_WRITE, HZ);
1586 }1587 return;
1588 }1589
1590 /*1591 * Assemble a suitable TCP frame1592 */1593
1594 buff->sk = sk;
1595 buff->localroute = sk->localroute;
1596
1597 /* 1598 * Put in the IP header and routing stuff. 1599 */1600
1601 tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1602 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1603 if (tmp < 0)
1604 {1605 buff->free = 1;
1606 sock_wfree(sk, buff);
1607 return;
1608 }1609 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
1610
1611 memcpy(t1, th, sizeof(*t1));
1612
1613 /*1614 * Swap the send and the receive. 1615 */1616
1617 t1->dest = th->source;
1618 t1->source = th->dest;
1619 t1->seq = ntohl(sequence);
1620 t1->ack = 1;
1621 sk->window = tcp_select_window(sk);
1622 t1->window = ntohs(sk->window);
1623 t1->res1 = 0;
1624 t1->res2 = 0;
1625 t1->rst = 0;
1626 t1->urg = 0;
1627 t1->syn = 0;
1628 t1->psh = 0;
1629 t1->fin = 0;
1630
1631 /*1632 * If we have nothing queued for transmit and the transmit timer1633 * is on we are just doing an ACK timeout and need to switch1634 * to a keepalive.1635 */1636
1637 if (ack == sk->acked_seq)
1638 {1639 sk->ack_backlog = 0;
1640 sk->bytes_rcv = 0;
1641 sk->ack_timed = 0;
1642 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL1643 && sk->ip_xmit_timeout == TIME_WRITE)
1644 {1645 if(sk->keepopen) {1646 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1647 }else{1648 delete_timer(sk);
1649 }1650 }1651 }1652
1653 /*1654 * Fill in the packet and send it1655 */1656
1657 t1->ack_seq = htonl(ack);
1658 t1->doff = sizeof(*t1)/4;
1659 tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1660 if (sk->debug)
1661 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1662 tcp_statistics.TcpOutSegs++;
1663 sk->prot->queue_xmit(sk, dev, buff, 1);
1664 }1665
1666
1667 /* 1668 * This routine builds a generic TCP header. 1669 */1670
1671 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1672 {1673
1674 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1675 th->seq = htonl(sk->write_seq);
1676 th->psh =(push == 0) ? 1 : 0;
1677 th->doff = sizeof(*th)/4;
1678 th->ack = 1;
1679 th->fin = 0;
1680 sk->ack_backlog = 0;
1681 sk->bytes_rcv = 0;
1682 sk->ack_timed = 0;
1683 th->ack_seq = htonl(sk->acked_seq);
1684 sk->window = tcp_select_window(sk);
1685 th->window = htons(sk->window);
1686
1687 return(sizeof(*th));
1688 }1689
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	Returns the number of bytes queued/sent, or a negative errno.
 *	May block (unless 'nonblock') waiting for connection establishment
 *	or for send buffer memory; once anything has been copied, partial
 *	progress is returned instead of an error.
 */

static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags)
{
	int copied = 0;
	int copy;
	int tmp;
	int seglen;
	int iovct=0;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	struct proto *prot;
	struct device *dev = NULL;
	unsigned char *from;

	/*
	 *	Do sanity checking for sendmsg/sendto/send
	 */

	if (flags & ~(MSG_OOB|MSG_DONTROUTE))
		return -EINVAL;
	if (msg->msg_name)
	{
		/* An address may be supplied, but it must match the
		 * connected peer exactly. */
		struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
		if(sk->state == TCP_CLOSE)
			return -ENOTCONN;
		if (msg->msg_namelen < sizeof(*addr))
			return -EINVAL;
		if (addr->sin_family && addr->sin_family != AF_INET)
			return -EINVAL;
		if (addr->sin_port != sk->dummy_th.dest)
			return -EISCONN;
		if (addr->sin_addr.s_addr != sk->daddr)
			return -EISCONN;
	}

	/*
	 *	Ok commence sending
	 */

	while(iovct<msg->msg_iovlen)
	{
		seglen=msg->msg_iov[iovct].iov_len;
		from=msg->msg_iov[iovct++].iov_base;
		sk->inuse=1;
		prot = sk->prot;
		while(seglen > 0)
		{
			if (sk->err)
			{			/* Stop on an error */
				release_sock(sk);
				if (copied)
					return(copied);
				return sock_error(sk);
			}

			/*
			 *	First thing we do is make sure that we are established.
			 */

			if (sk->shutdown & SEND_SHUTDOWN)
			{
				release_sock(sk);
				sk->err = EPIPE;
				if (copied)
					return(copied);
				sk->err = 0;
				return(-EPIPE);
			}

			/*
			 *	Wait for a connection to finish.
			 */

			while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
			{
				if (sk->err)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return sock_error(sk);
				}

				if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
				{
					/* Connection is dead in some other state. */
					release_sock(sk);
					if (copied)
						return(copied);

					if (sk->err)
						return sock_error(sk);

					if (sk->keepopen)
					{
						send_sig(SIGPIPE, current, 0);
					}
					return(-EPIPE);
				}

				if (nonblock || copied)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return(-EAGAIN);
				}

				release_sock(sk);
				cli();

				/* Re-test under cli() to close the race between the
				 * state check and going to sleep. */
				if (sk->state != TCP_ESTABLISHED &&
				    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
				{
					interruptible_sleep_on(sk->sleep);
					if (current->signal & ~current->blocked)
					{
						sti();
						if (copied)
							return(copied);
						return(-ERESTARTSYS);
					}
				}
				sk->inuse = 1;
				sti();
			}

		/*
		 * The following code can result in copy <= if sk->mss is ever
		 * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
		 * sk->mtu is constant once SYN processing is finished.  I.e. we
		 * had better not get here until we've seen his SYN and at least one
		 * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
		 * But ESTABLISHED should guarantee that.  sk->max_window is by definition
		 * non-decreasing.  Note that any ioctl to set user_mss must be done
		 * before the exchange of SYN's.  If the initial ack from the other
		 * end has a window of 0, max_window and thus mss will both be 0.
		 */

		/*
		 *	Now we need to check if we have a half built packet.
		 */
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
		/*
		 *	FIXME:  I'm almost sure that this fragment is BUG,
		 *		but it works... I do not know why 8) --ANK
		 *
		 *	Really, we should rebuild all the queues...
		 *	It's difficult. Temprorary hack is to send all
		 *	queued segments with allowed fragmentation.
		 */
		{
			int new_mss = min(sk->mtu, sk->max_window);
			if (new_mss < sk->mss)
			{
				tcp_send_partial(sk);
				sk->mss = new_mss;
			}
		}
#endif

			if ((skb = tcp_dequeue_partial(sk)) != NULL)
			{
				int hdrlen;

				/* IP header + TCP header */
				hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
					+ sizeof(struct tcphdr);

				/* Add more stuff to the end of skb->len */
				if (!(flags & MSG_OOB))
				{
					copy = min(sk->mss - (skb->len - hdrlen), seglen);
					if (copy <= 0)
					{
						printk("TCP: **bug**: \"copy\" <= 0\n");
						return -EFAULT;
					}
					memcpy_fromfs(skb_put(skb,copy), from, copy);
					from += copy;
					copied += copy;
					len -= copy;
					sk->write_seq += copy;
					seglen -= copy;
				}
				/* Send it if it's now full, OOB, or nothing is in
				 * flight; otherwise re-queue the partial frame. */
				if ((skb->len - hdrlen) >= sk->mss ||
					(flags & MSG_OOB) || !sk->packets_out)
					tcp_send_skb(sk, skb);
				else
					tcp_enqueue_partial(skb, sk);
				continue;
			}

		/*
		 * We also need to worry about the window.
		 * If window < 1/2 the maximum window we've seen from this
		 *   host, don't use it.  This is sender side
		 *   silly window prevention, as specified in RFC1122.
		 *   (Note that this is different than earlier versions of
		 *   SWS prevention, e.g. RFC813.).  What we actually do is
		 *   use the whole MSS.  Since the results in the right
		 *   edge of the packet being outside the window, it will
		 *   be queued for later rather than sent.
		 */

			copy = sk->window_seq - sk->write_seq;
			if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
				copy = sk->mss;
			if (copy > seglen)
				copy = seglen;

		/*
		 *	We should really check the window here also.
		 */

			send_tmp = NULL;
			if (copy < sk->mss && !(flags & MSG_OOB))
			{
				/*
				 *	We will release the socket in case we sleep here.
				 */
				release_sock(sk);
				/*
				 *	NB: following must be mtu, because mss can be increased.
				 *	mss is always <= mtu
				 */
				skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
				sk->inuse = 1;
				send_tmp = skb;	/* candidate for the partial queue */
			}
			else
			{
				/*
				 *	We will release the socket in case we sleep here.
				 */
				release_sock(sk);
				skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
				sk->inuse = 1;
			}

			/*
			 *	If we didn't get any memory, we need to sleep.
			 */

			if (skb == NULL)
			{
				sk->socket->flags |= SO_NOSPACE;
				if (nonblock)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return(-EAGAIN);
				}

				/*
				 *	FIXME: here is another race condition.
				 */

				tmp = sk->wmem_alloc;
				release_sock(sk);
				cli();
				/*
				 *	Again we will try to avoid it.
				 */
				if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
				{
					sk->socket->flags &= ~SO_NOSPACE;
					interruptible_sleep_on(sk->sleep);
					if (current->signal & ~current->blocked)
					{
						sti();
						if (copied)
							return(copied);
						return(-ERESTARTSYS);
					}
				}
				sk->inuse = 1;
				sti();
				continue;
			}

			skb->sk = sk;
			skb->free = 0;
			skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

			/*
			 * FIXME: we need to optimize this.
			 * Perhaps some hints here would be good.
			 */

			tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
			if (tmp < 0 )
			{
				sock_wfree(sk, skb);
				release_sock(sk);
				if (copied)
					return(copied);
				return(tmp);
			}
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			skb->ip_hdr->frag_off |= htons(IP_DF);
#endif
			skb->dev = dev;
			skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
			tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
			if (tmp < 0)
			{
				sock_wfree(sk, skb);
				release_sock(sk);
				if (copied)
					return(copied);
				return(tmp);
			}

			if (flags & MSG_OOB)
			{
				skb->h.th->urg = 1;
				skb->h.th->urg_ptr = ntohs(copy);
			}

			memcpy_fromfs(skb_put(skb,copy), from, copy);

			from += copy;
			copied += copy;
			len -= copy;
			seglen -= copy;
			skb->free = 0;
			sk->write_seq += copy;

			/* A short frame with data in flight is held back as a
			 * partial buffer (Nagle). */
			if (send_tmp != NULL && sk->packets_out)
			{
				tcp_enqueue_partial(send_tmp, sk);
				continue;
			}
			tcp_send_skb(sk, skb);
		}
	}
	sk->err = 0;

/*
 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *	interactive fast network servers. It's meant to be on and
 *	it really improves the throughput though not the echo time
 *	on my slow slip link - Alan
 */

/*
 *	Avoid possible race on send_tmp - c/o Johannes Stille
 */

	if(sk->partial && ((!sk->packets_out)
     /* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
/*
 *	Send an ack if one is backlogged at this point. Ought to merge
 *	this with tcp_send_ack().
 *
 *	Builds an ACK-only segment from the socket's template header and
 *	transmits it. On allocation failure a short timer is armed to
 *	retry soon.
 */

static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog)
		return;

	/*
	 * If we're closed, don't send an ack, or we'll get a RST
	 * from the closed destination.
	 */
	if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
		return;

	/*
	 * FIXME: we need to put code here to prevent this routine from
	 * being called.  Being called once in a while is ok, so only check
	 * if this is the second time in a row.
	 */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	/* Start from the template header, then set the ACK fields. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = htons(sk->window);
	t1->ack_seq = htonl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2137
/*
 *	FIXME:
 *	This routine frees used buffers.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window.
 *
 *	Called after the user has consumed data: releases fully-read
 *	skbs from the receive queue and, if that freed noticeable space,
 *	schedules (or sends) a window-update ACK.
 */

static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	left = sock_rspace(sk);	/* receive space before freeing anything */

	/*
	 * We have to loop through all the buffer headers,
	 * and try to free up all the space we can.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		/* Stop at the first buffer not yet fully consumed or
		 * still referenced by a reader. */
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 * FIXME:
	 * At this point we should send an ack if the difference
	 * in the window, and the amount of space is bigger than
	 * TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sock_rspace(sk),
					    left);
	if ((rspace=sock_rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
	/*
	 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
	 * if the other end is offering a window smaller than the agreed on MSS
	 * (called sk->mtu here).  In theory there's no connection between send
	 * and receive, and so no reason to think that they're going to send
	 * small packets.  For the moment I'm using the hack of reducing the mss
	 * only on the send side, so I'm putting mtu here.
	 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);	/* put the timer back unchanged */
		}
	}
}
2227
/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 *
 *	Returns 1 if the single out-of-band byte was copied to the user,
 *	0 at EOF-like conditions, -EINVAL when there is no OOB byte to
 *	read (or it is delivered inline), -EAGAIN when the urgent byte
 *	has been signalled but not yet received.
 */

static int tcp_recv_urg(struct sock * sk, int nonblock,
	     struct msghdr *msg, int len, int flags, int *addr_len)
{
	/*
	 *	No URG data to read
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
		return sock_error(sk);

	if (sk->state == TCP_CLOSE || sk->done)
	{
		if (!sk->done)
		{
			sk->done = 1;	/* first read after close reports EOF */
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		/* The OOB byte is stored in the low bits of urg_data. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;	/* mark it consumed */
		memcpy_toiovec(msg->msg_iov, &c, 1);
		if(msg->msg_name)
		{
			struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
			sin->sin_family=AF_INET;
			sin->sin_addr.s_addr=sk->daddr;
			sin->sin_port=sk->dummy_th.dest;
		}
		if(addr_len)
			*addr_len=sizeof(struct sockaddr_in);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2291
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Walks the receive queue copying in-sequence data into msg's iovec,
 *	handling urgent data, peeking, FIN and the blocking/wakeup dance.
 *	Returns the number of bytes copied, or a negative errno.
 */

static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags, int *addr_len)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */

	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 *
	 *	A PEEK advances a private copy of copied_seq so the real
	 *	stream position is untouched.
	 */

	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 * Are we at urgent data? Stop if we have read anything.
		 */

		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.  Set the task state before the
		 *	queue scan so a wakeup between scan and schedule()
		 *	is not lost.
		 */

		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* Gap before this skb: nothing in-sequence yet. */
			if (before(*seq, skb->seq))
				break;
			offset = *seq - skb->seq;
			/* SYN occupies one sequence number but no data byte. */
			if (skb->h.th->syn)
				offset--;
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			/* Fully consumed buffer: mark it reapable. */
			if (!(flags & MSG_PEEK))
				skb->used = 1;
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* Partial success: return what we have rather than sleep. */
		if (copied)
			break;

		if (sk->err)
		{
			copied = sock_error(sk);
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				/* First read on a closed socket: EOF. */
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Ack what we consumed, drop the lock and wait for data. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */

		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */

		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?
		 */

		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					/* Skip over the urgent byte unless it
					   is delivered inline. */
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					/* Stop the copy short of the urgent byte. */
					used = urg_offset;
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */

		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */

		memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */

		skb->users --;

		/* Urgent pointer is behind us: clear the stale marker. */
		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		if (used + offset < skb->len)
			continue;

		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		/* FIN consumes one sequence number. */
		++*seq;
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */

		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}

	/* Report the peer's address when data was actually returned. */
	if(copied>0 && msg->msg_name)
	{
		struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
		sin->sin_family=AF_INET;
		sin->sin_addr.s_addr=sk->daddr;
		sin->sin_port=sk->dummy_th.dest;
	}
	if(addr_len)
		*addr_len=sizeof(struct sockaddr_in);

	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2532
2533
/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed.
 *
 *	@sk:   socket being closed
 *	@dead: non-zero when this is a full close (no application left),
 *	       enabling the FIN_WAIT2 kill timer below.
 *	Returns 1 if the caller should transmit a FIN, 0 otherwise.
 */

static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;
	int send_fin=0;
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
		/* States without a case label (e.g. TCP_LAST_ACK,
		   TCP_TIME_WAIT) keep the initial ns of TCP_CLOSE. */
	}

	tcp_set_state(sk,ns);

	/*
	 *	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 *
	 *	Only arm the kill timer if no other timer is pending.
	 */
	if(dead && ns==TCP_FIN_WAIT2)
	{
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);	/* put the running timer back */
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}

	return send_fin;
}
/*
 *	Send a fin.
 *
 *	Builds a FIN|ACK segment for sk, advancing write_seq by one for
 *	the FIN.  If there is still unsent data on the write queue the
 *	FIN is queued behind it instead of being transmitted directly.
 */

static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost.
		 *	(Not good).  Sequence space is still consumed so the
		 *	state machine stays consistent.
		 */

		buff->free = 1;
		sock_wfree(sk,buff);
		sk->write_seq++;
		/* Only arm the close timer if no timer is already running. */
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	buff->seq = sk->write_seq;
	sk->write_seq++;	/* the FIN occupies one sequence number */
	buff->end_seq = sk->write_seq;
	t1->seq = htonl(buff->seq);
	t1->ack = 1;
	t1->ack_seq = htonl(sk->acked_seq);
	t1->window = htons(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 * If there is data in the write queue, the fin must be appended to
	 * the write queue.
	 */

	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			/* Should not happen: unlink before requeueing. */
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		/* Queue empty: transmit now and start the retransmit timer. */
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2692 /*2693 * Shutdown the sending side of a connection. Much like close except2694 * that we don't receive shut down or set sk->dead=1.2695 */2696
2697 voidtcp_shutdown(structsock *sk, inthow)
/* */2698 {2699 /*2700 * We need to grab some memory, and put together a FIN,2701 * and then put it into the queue to be sent.2702 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2703 */2704
2705 if (!(how & SEND_SHUTDOWN))
2706 return;
2707
2708 /*2709 * If we've already sent a FIN, or it's a closed state2710 */2711
2712 if (sk->state == TCP_FIN_WAIT1 ||
2713 sk->state == TCP_FIN_WAIT2 ||
2714 sk->state == TCP_CLOSING ||
2715 sk->state == TCP_LAST_ACK ||
2716 sk->state == TCP_TIME_WAIT ||
2717 sk->state == TCP_CLOSE ||
2718 sk->state == TCP_LISTEN2719 )
2720 {2721 return;
2722 }2723 sk->inuse = 1;
2724
2725 /*2726 * flag that the sender has shutdown2727 */2728
2729 sk->shutdown |= SEND_SHUTDOWN;
2730
2731 /*2732 * Clear out any half completed packets. 2733 */2734
2735 if (sk->partial)
2736 tcp_send_partial(sk);
2737
2738 /*2739 * FIN if needed2740 */2741
2742 if(tcp_close_state(sk,0))
2743 tcp_send_fin(sk);
2744
2745 release_sock(sk);
2746 }2747
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Builds and transmits a stand-alone RST in response to segment th
 *	(received saddr->daddr), with sequence/ack fields chosen per
 *	RFC 793 depending on whether th carried an ACK.
 */

static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (Think about it).
	 */

	if(th->rst)
		return;

	/*
	 * We need to grab some memory, and put together an RST,
	 * and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
		return;		/* no memory: silently drop, peer will retry */

	buff->sk = NULL;	/* not owned by any socket */
	buff->dev = dev;
	buff->localroute = 0;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl,NULL);
	if (tmp < 0)
	{
		buff->free = 1;
		sock_wfree(NULL, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;
	t1->window = 0;

	if(th->ack)
	{
		/* RFC 793: seq taken from the offending segment's ack. */
		t1->ack = 0;
		t1->seq = th->ack_seq;
		t1->ack_seq = 0;
	}
	else
	{
		/* No ACK in the segment: ack everything it occupied
		   (SYN consumes one sequence number). */
		t1->ack = 1;
		if(!th->syn)
			t1->ack_seq = th->seq;
		else
			t1->ack_seq = htonl(ntohl(th->seq)+1);
		t1->seq = 0;
	}

	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2831
/*
 *	Look for tcp options. Parses everything but only knows about MSS.
 *	This routine is always called with the packet containing the SYN.
 *	However it may also be called with the ack to the SYN. So you
 *	can't assume this is always the SYN. It's always called after
 *	we have set up sk->mtu to our own MTU.
 *
 *	We need at minimum to add PAWS support here. Possibly large windows
 *	as Linux gets deployed on 100Mb/sec networks.
 */

static void tcp_options(struct sock *sk, struct tcphdr *th)
{
	unsigned char *ptr;
	/* Option bytes = header length beyond the fixed 20-byte header. */
	int length=(th->doff*4)-sizeof(struct tcphdr);
	int mss_seen = 0;

	ptr = (unsigned char *)(th + 1);

	/* NOTE(review): if length==1 and the last byte is neither EOL nor
	   NOP, the opsize read below touches one byte past the options —
	   confirm callers always provide padded headers. */
	while(length>0)
	{
		int opcode=*ptr++;
		int opsize=*ptr++;
		switch(opcode)
		{
			case TCPOPT_EOL:
				return;
			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
				length--;
				ptr--;		/* the opsize=*ptr++ above was a mistake */
				continue;

			default:
				if(opsize<=2)	/* Avoid silly options looping forever */
					return;
				switch(opcode)
				{
					case TCPOPT_MSS:
						/* Only honour MSS on a SYN; clamp our
						   idea of the path MSS downwards. */
						if(opsize==4 && th->syn)
						{
							sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
							mss_seen = 1;
						}
						break;
						/* Add other options here as people feel the urge to implement stuff like large windows */
				}
				ptr+=opsize-2;
				length-=opsize;
		}
	}
	if (th->syn)
	{
		if (! mss_seen)
			sk->mtu=min(sk->mtu, 536);	/* default MSS if none sent */
	}
#ifdef CONFIG_INET_PCTCP
	/* Halve for hosts without silly-window-syndrome handling. */
	sk->mss = min(sk->max_window >> 1, sk->mtu);
#else
	sk->mss = min(sk->max_window, sk->mtu);
#endif
}
/*
 *	Return the classful (A/B/C) netmask for a destination address.
 *	Input and output are both in network byte order.
 */

static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);

	if (IN_CLASSA(host))
		return htonl(IN_CLASSA_NET);

	return IN_CLASSB(host) ? htonl(IN_CLASSB_NET) : htonl(IN_CLASSC_NET);
}
2904 /*2905 * Default sequence number picking algorithm.2906 * As close as possible to RFC 793, which2907 * suggests using a 250kHz clock.2908 * Further reading shows this assumes 2MB/s networks.2909 * For 10MB/s ethernet, a 1MHz clock is appropriate.2910 * That's funny, Linux has one built in! Use it!2911 */2912
2913 externinlineu32tcp_init_seq(void)
/* */2914 {2915 structtimevaltv;
2916 do_gettimeofday(&tv);
2917 returntv.tv_usec+tv.tv_sec*1000000;
2918 }2919
2920 /*2921 * This routine handles a connection request.2922 * It should make sure we haven't already responded.2923 * Because of the way BSD works, we have to send a syn/ack now.2924 * This also means it will be harder to close a socket which is2925 * listening.2926 */2927
2928 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */2929 unsignedlongdaddr, unsignedlongsaddr,
2930 structoptions *opt, structdevice *dev, u32seq)
2931 {2932 structsk_buff *buff;
2933 structtcphdr *t1;
2934 unsignedchar *ptr;
2935 structsock *newsk;
2936 structtcphdr *th;
2937 structdevice *ndev=NULL;
2938 inttmp;
2939 structrtable *rt;
2940
2941 th = skb->h.th;
2942
2943 /* If the socket is dead, don't accept the connection. */2944 if (!sk->dead)
2945 {2946 sk->data_ready(sk,0);
2947 }2948 else2949 {2950 if(sk->debug)
2951 printk("Reset on %p: Connect on dead socket.\n",sk);
2952 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2953 tcp_statistics.TcpAttemptFails++;
2954 kfree_skb(skb, FREE_READ);
2955 return;
2956 }2957
2958 /*2959 * Make sure we can accept more. This will prevent a2960 * flurry of syns from eating up all our memory.2961 */2962
2963 if (sk->ack_backlog >= sk->max_ack_backlog)
2964 {2965 tcp_statistics.TcpAttemptFails++;
2966 kfree_skb(skb, FREE_READ);
2967 return;
2968 }2969
2970 /*2971 * We need to build a new sock struct.2972 * It is sort of bad to have a socket without an inode attached2973 * to it, but the wake_up's will just wake up the listening socket,2974 * and if the listening socket is destroyed before this is taken2975 * off of the queue, this will take care of it.2976 */2977
2978 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
2979 if (newsk == NULL)
2980 {2981 /* just ignore the syn. It will get retransmitted. */2982 tcp_statistics.TcpAttemptFails++;
2983 kfree_skb(skb, FREE_READ);
2984 return;
2985 }2986
2987 memcpy(newsk, sk, sizeof(*newsk));
2988 newsk->opt = NULL;
2989 newsk->ip_route_cache = NULL;
2990 if (opt && opt->optlen) {2991 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
2992 if (!sk->opt) {2993 kfree_s(newsk, sizeof(structsock));
2994 tcp_statistics.TcpAttemptFails++;
2995 kfree_skb(skb, FREE_READ);
2996 return;
2997 }2998 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {2999 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
3000 kfree_s(newsk, sizeof(structsock));
3001 tcp_statistics.TcpAttemptFails++;
3002 kfree_skb(skb, FREE_READ);
3003 return;
3004 }3005 }3006 skb_queue_head_init(&newsk->write_queue);
3007 skb_queue_head_init(&newsk->receive_queue);
3008 newsk->send_head = NULL;
3009 newsk->send_tail = NULL;
3010 skb_queue_head_init(&newsk->back_log);
3011 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/3012 newsk->rto = TCP_TIMEOUT_INIT;
3013 newsk->mdev = 0;
3014 newsk->max_window = 0;
3015 newsk->cong_window = 1;
3016 newsk->cong_count = 0;
3017 newsk->ssthresh = 0;
3018 newsk->backoff = 0;
3019 newsk->blog = 0;
3020 newsk->intr = 0;
3021 newsk->proc = 0;
3022 newsk->done = 0;
3023 newsk->partial = NULL;
3024 newsk->pair = NULL;
3025 newsk->wmem_alloc = 0;
3026 newsk->rmem_alloc = 0;
3027 newsk->localroute = sk->localroute;
3028
3029 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3030
3031 newsk->err = 0;
3032 newsk->shutdown = 0;
3033 newsk->ack_backlog = 0;
3034 newsk->acked_seq = skb->seq+1;
3035 newsk->copied_seq = skb->seq+1;
3036 newsk->fin_seq = skb->seq;
3037 newsk->state = TCP_SYN_RECV;
3038 newsk->timeout = 0;
3039 newsk->ip_xmit_timeout = 0;
3040 newsk->write_seq = seq;
3041 newsk->window_seq = newsk->write_seq;
3042 newsk->rcv_ack_seq = newsk->write_seq;
3043 newsk->urg_data = 0;
3044 newsk->retransmits = 0;
3045 newsk->linger=0;
3046 newsk->destroy = 0;
3047 init_timer(&newsk->timer);
3048 newsk->timer.data = (unsignedlong)newsk;
3049 newsk->timer.function = &net_timer;
3050 init_timer(&newsk->retransmit_timer);
3051 newsk->retransmit_timer.data = (unsignedlong)newsk;
3052 newsk->retransmit_timer.function=&retransmit_timer;
3053 newsk->dummy_th.source = skb->h.th->dest;
3054 newsk->dummy_th.dest = skb->h.th->source;
3055
3056 /*3057 * Swap these two, they are from our point of view. 3058 */3059
3060 newsk->daddr = saddr;
3061 newsk->saddr = daddr;
3062 newsk->rcv_saddr = daddr;
3063
3064 put_sock(newsk->num,newsk);
3065 newsk->dummy_th.res1 = 0;
3066 newsk->dummy_th.doff = 6;
3067 newsk->dummy_th.fin = 0;
3068 newsk->dummy_th.syn = 0;
3069 newsk->dummy_th.rst = 0;
3070 newsk->dummy_th.psh = 0;
3071 newsk->dummy_th.ack = 0;
3072 newsk->dummy_th.urg = 0;
3073 newsk->dummy_th.res2 = 0;
3074 newsk->acked_seq = skb->seq + 1;
3075 newsk->copied_seq = skb->seq + 1;
3076 newsk->socket = NULL;
3077
3078 /*3079 * Grab the ttl and tos values and use them 3080 */3081
3082 newsk->ip_ttl=sk->ip_ttl;
3083 newsk->ip_tos=skb->ip_hdr->tos;
3084
3085 /*3086 * Use 512 or whatever user asked for 3087 */3088
3089 /*3090 * Note use of sk->user_mss, since user has no direct access to newsk 3091 */3092
3093 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
3094 newsk->ip_route_cache = rt;
3095
3096 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3097 newsk->window_clamp = rt->rt_window;
3098 else3099 newsk->window_clamp = 0;
3100
3101 if (sk->user_mss)
3102 newsk->mtu = sk->user_mss;
3103 elseif (rt)
3104 newsk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
3105 else3106 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
3107
3108 /*3109 * But not bigger than device MTU 3110 */3111
3112 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
3113
3114 #ifdefCONFIG_SKIP3115
3116 /*3117 * SKIP devices set their MTU to 65535. This is so they can take packets3118 * unfragmented to security process then fragment. They could lie to the3119 * TCP layer about a suitable MTU, but its easier to let skip sort it out3120 * simply because the final package we want unfragmented is going to be3121 *3122 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]3123 */3124
3125 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */3126 sk->mtu=skip_pick_mtu(sk->mtu,dev);
3127 #endif3128 /*3129 * This will min with what arrived in the packet 3130 */3131
3132 tcp_options(newsk,skb->h.th);
3133
3134 tcp_cache_zap();
3135
3136 buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3137 if (buff == NULL)
3138 {3139 sk->err = ENOMEM;
3140 newsk->dead = 1;
3141 newsk->state = TCP_CLOSE;
3142 /* And this will destroy it */3143 release_sock(newsk);
3144 kfree_skb(skb, FREE_READ);
3145 tcp_statistics.TcpAttemptFails++;
3146 return;
3147 }3148
3149 buff->sk = newsk;
3150 buff->localroute = newsk->localroute;
3151
3152 /*3153 * Put in the IP header and routing stuff. 3154 */3155
3156 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3157 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
3158
3159 /*3160 * Something went wrong. 3161 */3162
3163 if (tmp < 0)
3164 {3165 sk->err = tmp;
3166 buff->free = 1;
3167 kfree_skb(buff,FREE_WRITE);
3168 newsk->dead = 1;
3169 newsk->state = TCP_CLOSE;
3170 release_sock(newsk);
3171 skb->sk = sk;
3172 kfree_skb(skb, FREE_READ);
3173 tcp_statistics.TcpAttemptFails++;
3174 return;
3175 }3176
3177 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
3178
3179 memcpy(t1, skb->h.th, sizeof(*t1));
3180 buff->seq = newsk->write_seq++;
3181 buff->end_seq = newsk->write_seq;
3182 /*3183 * Swap the send and the receive. 3184 */3185 t1->dest = skb->h.th->source;
3186 t1->source = newsk->dummy_th.source;
3187 t1->seq = ntohl(buff->seq);
3188 t1->ack = 1;
3189 newsk->window = tcp_select_window(newsk);
3190 newsk->sent_seq = newsk->write_seq;
3191 t1->window = ntohs(newsk->window);
3192 t1->res1 = 0;
3193 t1->res2 = 0;
3194 t1->rst = 0;
3195 t1->urg = 0;
3196 t1->psh = 0;
3197 t1->syn = 1;
3198 t1->ack_seq = htonl(newsk->acked_seq);
3199 t1->doff = sizeof(*t1)/4+1;
3200 ptr = skb_put(buff,4);
3201 ptr[0] = 2;
3202 ptr[1] = 4;
3203 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3204 ptr[3] =(newsk->mtu) & 0xff;
3205
3206 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3207 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3208 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3209 skb->sk = newsk;
3210
3211 /*3212 * Charge the sock_buff to newsk. 3213 */3214
3215 sk->rmem_alloc -= skb->truesize;
3216 newsk->rmem_alloc += skb->truesize;
3217
3218 skb_queue_tail(&sk->receive_queue,skb);
3219 sk->ack_backlog++;
3220 release_sock(newsk);
3221 tcp_statistics.TcpOutSegs++;
3222 }3223
3224
/*
 *	Close a TCP socket.  timeout != 0 means an abortive/timed-out
 *	close (go straight to CLOSE); timeout == 0 is a normal descriptor
 *	close where we flush the receive queue and FIN if appropriate.
 */

static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 * We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */

	sk->inuse = 1;

	/* Drop the header-prediction cache if it points at us. */
	if(th_cache_sk==sk)
		tcp_cache_zap();
	if(sk->state == TCP_LISTEN)
	{
		/* Special case */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);	/* discard queued embryonic connections */
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;	/* both directions are now shut */

	if (!sk->dead)
		sk->state_change(sk);

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 *  We need to flush the recv. buffs.  We do this only on the
		 *  descriptor close, not protocol-sourced closes, because the
		 *  reader process may not have drained the data yet!
		 */

		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *	Get rid off any half-completed packets.
		 */

		if (sk->partial)
			tcp_send_partial(sk);
	}


	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */

	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* Shift state; send a FIN if the new state requires one. */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
3291
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue. This happens as incoming acks
 *	open up the remote window for us.
 */

static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->end_seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->end_seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->end_seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 * put in the ack seq and window at this point rather than earlier,
			 * in order to keep them monotonic. We really want to avoid taking
			 * back window allocations. That's legal, but RFC1122 says it's frowned on.
			 * Ack and window will in general have changed since this packet was put
			 * on the write queue.
			 */
			iph = skb->ip_hdr;
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			size = skb->len - (((unsigned char *) th) - skb->data);
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			/* Oversized for the current path MTU estimate: allow
			   fragmentation and refresh the IP checksum. */
			if (size > sk->mtu - sizeof(struct iphdr))
			{
				iph->frag_off &= ~htons(IP_DF);
				ip_send_check(iph);
			}
#endif

			th->ack_seq = htonl(sk->acked_seq);
			th->window = htons(tcp_select_window(sk));

			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->end_seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3389
3390 /*3391 * This routine deals with incoming acks, but not outgoing ones.3392 */3393
3394 extern__inline__inttcp_ack(structsock *sk, structtcphdr *th, unsignedlongsaddr, intlen)
/* */3395 {3396 u32ack;
3397 intflag = 0;
3398
3399 /* 3400 * 1 - there was data in packet as well as ack or new data is sent or 3401 * in shutdown state3402 * 2 - data from retransmit queue was acked and removed3403 * 4 - window shrunk or data from retransmit queue was acked and removed3404 */3405
3406 if(sk->zapped)
3407 return(1); /* Dead, cant ack any more so why bother */3408
3409 /*3410 * Have we discovered a larger window3411 */3412
3413 ack = ntohl(th->ack_seq);
3414
3415 if (ntohs(th->window) > sk->max_window)
3416 {3417 sk->max_window = ntohs(th->window);
3418 #ifdefCONFIG_INET_PCTCP3419 /* Hack because we don't send partial packets to non SWS3420 handling hosts */3421 sk->mss = min(sk->max_window>>1, sk->mtu);
3422 #else3423 sk->mss = min(sk->max_window, sk->mtu);
3424 #endif3425 }3426
3427 /*3428 * We have dropped back to keepalive timeouts. Thus we have3429 * no retransmits pending.3430 */3431
3432 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3433 sk->retransmits = 0;
3434
3435 /*3436 * If the ack is newer than sent or older than previous acks3437 * then we can probably ignore it.3438 */3439
3440 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3441 {3442 if(sk->debug)
3443 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3444
3445 /*3446 * Keepalive processing.3447 */3448
3449 if (after(ack, sk->sent_seq))
3450 {3451 return(0);
3452 }3453
3454 /*3455 * Restart the keepalive timer.3456 */3457
3458 if (sk->keepopen)
3459 {3460 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3461 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3462 }3463 return(1);
3464 }3465
3466 /*3467 * If there is data set flag 13468 */3469
3470 if (len != th->doff*4)
3471 flag |= 1;
3472
3473 /*3474 * See if our window has been shrunk. 3475 */3476
3477 if (after(sk->window_seq, ack+ntohs(th->window)))
3478 {3479 /*3480 * We may need to move packets from the send queue3481 * to the write queue, if the window has been shrunk on us.3482 * The RFC says you are not allowed to shrink your window3483 * like this, but if the other end does, you must be able3484 * to deal with it.3485 */3486 structsk_buff *skb;
3487 structsk_buff *skb2;
3488 structsk_buff *wskb = NULL;
3489
3490 skb2 = sk->send_head;
3491 sk->send_head = NULL;
3492 sk->send_tail = NULL;
3493
3494 /*3495 * This is an artifact of a flawed concept. We want one3496 * queue and a smarter send routine when we send all.3497 */3498
3499 flag |= 4; /* Window changed */3500
3501 sk->window_seq = ack + ntohs(th->window);
3502 cli();
3503 while (skb2 != NULL)
3504 {3505 skb = skb2;
3506 skb2 = skb->link3;
3507 skb->link3 = NULL;
3508 if (after(skb->end_seq, sk->window_seq))
3509 {3510 if (sk->packets_out > 0)
3511 sk->packets_out--;
3512 /* We may need to remove this from the dev send list. */3513 if (skb->next != NULL)
3514 {3515 skb_unlink(skb);
3516 }3517 /* Now add it to the write_queue. */3518 if (wskb == NULL)
3519 skb_queue_head(&sk->write_queue,skb);
3520 else3521 skb_append(wskb,skb);
3522 wskb = skb;
3523 }3524 else3525 {3526 if (sk->send_head == NULL)
3527 {3528 sk->send_head = skb;
3529 sk->send_tail = skb;
3530 }3531 else3532 {3533 sk->send_tail->link3 = skb;
3534 sk->send_tail = skb;
3535 }3536 skb->link3 = NULL;
3537 }3538 }3539 sti();
3540 }3541
3542 /*3543 * Pipe has emptied3544 */3545
3546 if (sk->send_tail == NULL || sk->send_head == NULL)
3547 {3548 sk->send_head = NULL;
3549 sk->send_tail = NULL;
3550 sk->packets_out= 0;
3551 }3552
3553 /*3554 * Update the right hand window edge of the host3555 */3556
3557 sk->window_seq = ack + ntohs(th->window);
3558
3559 /*3560 * We don't want too many packets out there. 3561 */3562
3563 if (sk->ip_xmit_timeout == TIME_WRITE &&
3564 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3565 {3566 /* 3567 * This is Jacobson's slow start and congestion avoidance. 3568 * SIGCOMM '88, p. 328. Because we keep cong_window in integral3569 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a 3570 * counter and increment it once every cwnd times. It's possible3571 * that this should be done only if sk->retransmits == 0. I'm3572 * interpreting "new data is acked" as including data that has3573 * been retransmitted but is just now being acked.3574 */3575 if (sk->cong_window < sk->ssthresh)
3576 /* 3577 * In "safe" area, increase3578 */3579 sk->cong_window++;
3580 else3581 {3582 /*3583 * In dangerous area, increase slowly. In theory this is3584 * sk->cong_window += 1 / sk->cong_window3585 */3586 if (sk->cong_count >= sk->cong_window)
3587 {3588 sk->cong_window++;
3589 sk->cong_count = 0;
3590 }3591 else3592 sk->cong_count++;
3593 }3594 }3595
3596 /*3597 * Remember the highest ack received.3598 */3599
3600 sk->rcv_ack_seq = ack;
3601
3602 /*3603 * We passed data and got it acked, remove any soft error3604 * log. Something worked...3605 */3606
3607 sk->err_soft = 0;
3608
3609 /*3610 * If this ack opens up a zero window, clear backoff. It was3611 * being used to time the probes, and is probably far higher than3612 * it needs to be for normal retransmission.3613 */3614
3615 if (sk->ip_xmit_timeout == TIME_PROBE0)
3616 {3617 sk->retransmits = 0; /* Our probe was answered */3618
3619 /*3620 * Was it a usable window open ?3621 */3622
3623 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */3624 ! before (sk->window_seq, sk->write_queue.next->end_seq))
3625 {3626 sk->backoff = 0;
3627
3628 /*3629 * Recompute rto from rtt. this eliminates any backoff.3630 */3631
3632 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3633 if (sk->rto > 120*HZ)
3634 sk->rto = 120*HZ;
3635 if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about3636 .2 of a second because of BSD delayed acks - on a 100Mb/sec link3637 .2 of a second is going to need huge windows (SIGH) */3638 sk->rto = 20;
3639 }3640 }3641
3642 /* 3643 * See if we can take anything off of the retransmit queue.3644 */3645
3646 while(sk->send_head != NULL)
3647 {3648 /* Check for a bug. */3649 if (sk->send_head->link3 &&
3650 after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
3651 printk("INET: tcp.c: *** bug send_list out of order.\n");
3652
3653 /*3654 * If our packet is before the ack sequence we can3655 * discard it as it's confirmed to have arrived the other end.3656 */3657
3658 if (before(sk->send_head->end_seq, ack+1))
3659 {3660 structsk_buff *oskb;
3661 if (sk->retransmits)
3662 {3663 /*3664 * We were retransmitting. don't count this in RTT est 3665 */3666 flag |= 2;
3667
3668 /*3669 * even though we've gotten an ack, we're still3670 * retransmitting as long as we're sending from3671 * the retransmit queue. Keeping retransmits non-zero3672 * prevents us from getting new data interspersed with3673 * retransmissions.3674 */3675
3676 if (sk->send_head->link3) /* Any more queued retransmits? */3677 sk->retransmits = 1;
3678 else3679 sk->retransmits = 0;
3680 }3681 /*3682 * Note that we only reset backoff and rto in the3683 * rtt recomputation code. And that doesn't happen3684 * if there were retransmissions in effect. So the3685 * first new packet after the retransmissions is3686 * sent with the backoff still in effect. Not until3687 * we get an ack from a non-retransmitted packet do3688 * we reset the backoff and rto. This allows us to deal3689 * with a situation where the network delay has increased3690 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)3691 */3692
3693 /*3694 * We have one less packet out there. 3695 */3696
3697 if (sk->packets_out > 0)
3698 sk->packets_out --;
3699 /* 3700 * Wake up the process, it can probably write more. 3701 */3702 if (!sk->dead)
3703 sk->write_space(sk);
3704 oskb = sk->send_head;
3705
3706 if (!(flag&2)) /* Not retransmitting */3707 {3708 longm;
3709
3710 /*3711 * The following amusing code comes from Jacobson's3712 * article in SIGCOMM '88. Note that rtt and mdev3713 * are scaled versions of rtt and mean deviation.3714 * This is designed to be as fast as possible 3715 * m stands for "measurement".3716 */3717
3718 m = jiffies - oskb->when; /* RTT */3719 if(m<=0)
3720 m=1; /* IS THIS RIGHT FOR <0 ??? */3721 m -= (sk->rtt >> 3); /* m is now error in rtt est */3722 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */3723 if (m < 0)
3724 m = -m; /* m is now abs(error) */3725 m -= (sk->mdev >> 2); /* similar update on mdev */3726 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */3727
3728 /*3729 * Now update timeout. Note that this removes any backoff.3730 */3731
3732 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3733 if (sk->rto > 120*HZ)
3734 sk->rto = 120*HZ;
3735 if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */3736 sk->rto = 20;
3737 sk->backoff = 0;
3738 }3739 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt 3740 In this case as we just set it up */3741 cli();
3742 oskb = sk->send_head;
3743 IS_SKB(oskb);
3744 sk->send_head = oskb->link3;
3745 if (sk->send_head == NULL)
3746 {3747 sk->send_tail = NULL;
3748 }3749
3750 /*3751 * We may need to remove this from the dev send list. 3752 */3753
3754 if (oskb->next)
3755 skb_unlink(oskb);
3756 sti();
3757 kfree_skb(oskb, FREE_WRITE); /* write. */3758 if (!sk->dead)
3759 sk->write_space(sk);
3760 }3761 else3762 {3763 break;
3764 }3765 }3766
3767 /*3768 * XXX someone ought to look at this too.. at the moment, if skb_peek()3769 * returns non-NULL, we complete ignore the timer stuff in the else3770 * clause. We ought to organize the code so that else clause can3771 * (should) be executed regardless, possibly moving the PROBE timer3772 * reset over. The skb_peek() thing should only move stuff to the3773 * write queue, NOT also manage the timer functions.3774 */3775
3776 /*3777 * Maybe we can take some stuff off of the write queue,3778 * and put it onto the xmit queue.3779 */3780 if (skb_peek(&sk->write_queue) != NULL)
3781 {3782 if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
3783 (sk->retransmits == 0 ||
3784 sk->ip_xmit_timeout != TIME_WRITE ||
3785 before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
3786 && sk->packets_out < sk->cong_window)
3787 {3788 /*3789 * Add more data to the send queue.3790 */3791 flag |= 1;
3792 tcp_write_xmit(sk);
3793 }3794 elseif (before(sk->window_seq, sk->write_queue.next->end_seq) &&
3795 sk->send_head == NULL &&
3796 sk->ack_backlog == 0 &&
3797 sk->state != TCP_TIME_WAIT)
3798 {3799 /*3800 * Data to queue but no room.3801 */3802 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3803 }3804 }3805 else3806 {3807 /*3808 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets3809 * from TCP_CLOSE we don't do anything3810 *3811 * from anything else, if there is write data (or fin) pending,3812 * we use a TIME_WRITE timeout, else if keepalive we reset to3813 * a KEEPALIVE timeout, else we delete the timer.3814 *3815 * We do not set flag for nominal write data, otherwise we may3816 * force a state where we start to write itsy bitsy tidbits3817 * of data.3818 */3819
3820 switch(sk->state) {3821 caseTCP_TIME_WAIT:
3822 /*3823 * keep us in TIME_WAIT until we stop getting packets,3824 * reset the timeout.3825 */3826 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3827 break;
3828 caseTCP_CLOSE:
3829 /*3830 * don't touch the timer.3831 */3832 break;
3833 default:
3834 /*3835 * Must check send_head, write_queue, and ack_backlog3836 * to determine which timeout to use.3837 */3838 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {3839 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3840 }elseif (sk->keepopen) {3841 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3842 }else{3843 del_timer(&sk->retransmit_timer);
3844 sk->ip_xmit_timeout = 0;
3845 }3846 break;
3847 }3848 }3849
3850 /*3851 * We have nothing queued but space to send. Send any partial3852 * packets immediately (end of Nagle rule application).3853 */3854
3855 if (sk->packets_out == 0 && sk->partial != NULL &&
3856 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3857 {3858 flag |= 1;
3859 tcp_send_partial(sk);
3860 }3861
3862 /*3863 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and3864 * we are now waiting for an acknowledge to our FIN. The other end is3865 * already in TIME_WAIT.3866 *3867 * Move to TCP_CLOSE on success.3868 */3869
3870 if (sk->state == TCP_LAST_ACK)
3871 {3872 if (!sk->dead)
3873 sk->state_change(sk);
3874 if(sk->debug)
3875 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3876 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3877 if (sk->rcv_ack_seq == sk->write_seq/*&& sk->acked_seq == sk->fin_seq*/)
3878 {3879 flag |= 1;
3880 tcp_set_state(sk,TCP_CLOSE);
3881 sk->shutdown = SHUTDOWN_MASK;
3882 }3883 }3884
3885 /*3886 * Incoming ACK to a FIN we sent in the case of our initiating the close.3887 *3888 * Move to FIN_WAIT2 to await a FIN from the other end. Set3889 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.3890 */3891
3892 if (sk->state == TCP_FIN_WAIT1)
3893 {3894
3895 if (!sk->dead)
3896 sk->state_change(sk);
3897 if (sk->rcv_ack_seq == sk->write_seq)
3898 {3899 flag |= 1;
3900 sk->shutdown |= SEND_SHUTDOWN;
3901 tcp_set_state(sk, TCP_FIN_WAIT2);
3902 }3903 }3904
3905 /*3906 * Incoming ACK to a FIN we sent in the case of a simultaneous close.3907 *3908 * Move to TIME_WAIT3909 */3910
3911 if (sk->state == TCP_CLOSING)
3912 {3913
3914 if (!sk->dead)
3915 sk->state_change(sk);
3916 if (sk->rcv_ack_seq == sk->write_seq)
3917 {3918 flag |= 1;
3919 tcp_time_wait(sk);
3920 }3921 }3922
3923 /*3924 * Final ack of a three way shake 3925 */3926
3927 if(sk->state==TCP_SYN_RECV)
3928 {3929 tcp_set_state(sk, TCP_ESTABLISHED);
3930 tcp_options(sk,th);
3931 sk->dummy_th.dest=th->source;
3932 sk->copied_seq = sk->acked_seq;
3933 if(!sk->dead)
3934 sk->state_change(sk);
3935 if(sk->max_window==0)
3936 {3937 sk->max_window=32; /* Sanity check */3938 sk->mss=min(sk->max_window,sk->mtu);
3939 }3940 }3941
3942 /*3943 * I make no guarantees about the first clause in the following3944 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under3945 * what conditions "!flag" would be true. However I think the rest3946 * of the conditions would prevent that from causing any3947 * unnecessary retransmission. 3948 * Clearly if the first packet has expired it should be 3949 * retransmitted. The other alternative, "flag&2 && retransmits", is3950 * harder to explain: You have to look carefully at how and when the3951 * timer is set and with what timeout. The most recent transmission always3952 * sets the timer. So in general if the most recent thing has timed3953 * out, everything before it has as well. So we want to go ahead and3954 * retransmit some more. If we didn't explicitly test for this3955 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"3956 * would not be true. If you look at the pattern of timing, you can3957 * show that rto is increased fast enough that the next packet would3958 * almost never be retransmitted immediately. Then you'd end up3959 * waiting for a timeout to send each packet on the retransmission3960 * queue. With my implementation of the Karn sampling algorithm,3961 * the timeout would double each time. The net result is that it would3962 * take a hideous amount of time to recover from a single dropped packet.3963 * It's possible that there should also be a test for TIME_WRITE, but3964 * I think as long as "send_head != NULL" and "retransmit" is on, we've3965 * got to be in real retransmission mode.3966 * Note that tcp_do_retransmit is called with all==1. Setting cong_window3967 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.3968 * As long as no further losses occur, this seems reasonable.3969 */3970
3971 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3972 (((flag&2) && sk->retransmits) ||
3973 (sk->send_head->when + sk->rto < jiffies)))
3974 {3975 if(sk->send_head->when + sk->rto < jiffies)
3976 tcp_retransmit(sk,0);
3977 else3978 {3979 tcp_do_retransmit(sk, 1);
3980 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3981 }3982 }3983
3984 return(1);
3985 }3986
3987
/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 *
 *	Returns 0 in all cases; callers ignore the value.
 */
static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/* Remember where the peer's stream ends; read-side code compares
	   against this. */
	sk->fin_seq = skb->end_seq;

	if (!sk->dead)
	{
		/* Wake anyone blocked on the socket and post SIGIO-style
		   async notification. */
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			/* A FIN carried on an RST segment shuts both
			   directions at once. */
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */
			if(sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * received a FIN -- send ACK and enter TIME_WAIT
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * already in CLOSE
			 */
			break;
		default:
			/* Any other state (e.g. LISTEN): go to LAST_ACK.
			   NOTE(review): reaching here from LISTEN looks odd —
			   presumably unreachable in practice; confirm against
			   callers. */
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
4083
4084
4085 /*4086 * This routine handles the data. If there is room in the buffer,4087 * it will be have already been moved into it. If there is no4088 * room, then we will just have to discard the packet.4089 */4090
4091 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */4092 unsignedlongsaddr, unsignedshortlen)
4093 {4094 structsk_buff *skb1, *skb2;
4095 structtcphdr *th;
4096 intdup_dumped=0;
4097 u32new_seq, shut_seq;
4098
4099 th = skb->h.th;
4100 skb_pull(skb,th->doff*4);
4101 skb_trim(skb,len-(th->doff*4));
4102
4103 /*4104 * The bytes in the receive read/assembly queue has increased. Needed for the4105 * low memory discard algorithm 4106 */4107
4108 sk->bytes_rcv += skb->len;
4109
4110 if (skb->len == 0 && !th->fin)
4111 {4112 /* 4113 * Don't want to keep passing ack's back and forth. 4114 * (someone sent us dataless, boring frame)4115 */4116 if (!th->ack)
4117 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4118 kfree_skb(skb, FREE_READ);
4119 return(0);
4120 }4121
4122 /*4123 * We no longer have anyone receiving data on this connection.4124 */4125
4126 #ifndef TCP_DONT_RST_SHUTDOWN
4127
4128 if(sk->shutdown & RCV_SHUTDOWN)
4129 {4130 /*4131 * FIXME: BSD has some magic to avoid sending resets to4132 * broken 4.2 BSD keepalives. Much to my surprise a few non4133 * BSD stacks still have broken keepalives so we want to4134 * cope with it.4135 */4136
4137 if(skb->len) /* We don't care if it's just an ack or4138 a keepalive/window probe */4139 {4140 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */4141
4142 /* Do this the way 4.4BSD treats it. Not what I'd4143 regard as the meaning of the spec but it's what BSD4144 does and clearly they know everything 8) */4145
4146 /*4147 * This is valid because of two things4148 *4149 * a) The way tcp_data behaves at the bottom.4150 * b) A fin takes effect when read not when received.4151 */4152
4153 shut_seq = sk->acked_seq+1; /* Last byte */4154
4155 if(after(new_seq,shut_seq))
4156 {4157 if(sk->debug)
4158 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4159 sk, new_seq, shut_seq, sk->blog);
4160 if(sk->dead)
4161 {4162 sk->acked_seq = new_seq + th->fin;
4163 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4164 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4165 tcp_statistics.TcpEstabResets++;
4166 tcp_set_state(sk,TCP_CLOSE);
4167 sk->err = EPIPE;
4168 sk->shutdown = SHUTDOWN_MASK;
4169 kfree_skb(skb, FREE_READ);
4170 return 0;
4171 }4172 }4173 }4174 }4175
4176 #endif4177
4178 /*4179 * Now we have to walk the chain, and figure out where this one4180 * goes into it. This is set up so that the last packet we received4181 * will be the first one we look at, that way if everything comes4182 * in order, there will be no performance loss, and if they come4183 * out of order we will be able to fit things in nicely.4184 *4185 * [AC: This is wrong. We should assume in order first and then walk4186 * forwards from the first hole based upon real traffic patterns.]4187 * 4188 */4189
4190 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */4191 {4192 skb_queue_head(&sk->receive_queue,skb);
4193 skb1= NULL;
4194 }4195 else4196 {4197 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
4198 {4199 if(sk->debug)
4200 {4201 printk("skb1=%p :", skb1);
4202 printk("skb1->seq = %d: ", skb1->seq);
4203 printk("skb->seq = %d\n",skb->seq);
4204 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4205 sk->acked_seq);
4206 }4207
4208 /*4209 * Optimisation: Duplicate frame or extension of previous frame from4210 * same sequence point (lost ack case).4211 * The frame contains duplicate data or replaces a previous frame4212 * discard the previous frame (safe as sk->inuse is set) and put4213 * the new one in its place.4214 */4215
4216 if (skb->seq==skb1->seq && skb->len>=skb1->len)
4217 {4218 skb_append(skb1,skb);
4219 skb_unlink(skb1);
4220 kfree_skb(skb1,FREE_READ);
4221 dup_dumped=1;
4222 skb1=NULL;
4223 break;
4224 }4225
4226 /*4227 * Found where it fits4228 */4229
4230 if (after(skb->seq+1, skb1->seq))
4231 {4232 skb_append(skb1,skb);
4233 break;
4234 }4235
4236 /*4237 * See if we've hit the start. If so insert.4238 */4239 if (skb1 == skb_peek(&sk->receive_queue))
4240 {4241 skb_queue_head(&sk->receive_queue, skb);
4242 break;
4243 }4244 }4245 }4246
4247 /*4248 * Figure out what the ack value for this frame is4249 */4250
4251 if (before(sk->acked_seq, sk->copied_seq))
4252 {4253 printk("*** tcp.c:tcp_data bug acked < copied\n");
4254 sk->acked_seq = sk->copied_seq;
4255 }4256
4257 /*4258 * Now figure out if we can ack anything. This is very messy because we really want two4259 * receive queues, a completed and an assembly queue. We also want only one transmit4260 * queue.4261 */4262
4263 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1))
4264 {4265 if (before(skb->seq, sk->acked_seq+1))
4266 {4267 intnewwindow;
4268
4269 if (after(skb->end_seq, sk->acked_seq))
4270 {4271 newwindow = sk->window - (skb->end_seq - sk->acked_seq);
4272 if (newwindow < 0)
4273 newwindow = 0;
4274 sk->window = newwindow;
4275 sk->acked_seq = skb->end_seq;
4276 }4277 skb->acked = 1;
4278
4279 /*4280 * When we ack the fin, we do the FIN 4281 * processing.4282 */4283
4284 if (skb->h.th->fin)
4285 {4286 tcp_fin(skb,sk,skb->h.th);
4287 }4288
4289 for(skb2 = skb->next;
4290 skb2 != (structsk_buff *)&sk->receive_queue;
4291 skb2 = skb2->next)
4292 {4293 if (before(skb2->seq, sk->acked_seq+1))
4294 {4295 if (after(skb2->end_seq, sk->acked_seq))
4296 {4297 newwindow = sk->window -
4298 (skb2->end_seq - sk->acked_seq);
4299 if (newwindow < 0)
4300 newwindow = 0;
4301 sk->window = newwindow;
4302 sk->acked_seq = skb2->end_seq;
4303 }4304 skb2->acked = 1;
4305 /*4306 * When we ack the fin, we do4307 * the fin handling.4308 */4309 if (skb2->h.th->fin)
4310 {4311 tcp_fin(skb,sk,skb->h.th);
4312 }4313
4314 /*4315 * Force an immediate ack.4316 */4317
4318 sk->ack_backlog = sk->max_ack_backlog;
4319 }4320 else4321 {4322 break;
4323 }4324 }4325
4326 /*4327 * This also takes care of updating the window.4328 * This if statement needs to be simplified.4329 */4330 if (!sk->delay_acks ||
4331 sk->ack_backlog >= sk->max_ack_backlog ||
4332 sk->bytes_rcv > sk->max_unacked || th->fin) {4333 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4334 }4335 else4336 {4337 sk->ack_backlog++;
4338 if(sk->debug)
4339 printk("Ack queued.\n");
4340 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4341 }4342 }4343 }4344
4345 /*4346 * If we've missed a packet, send an ack.4347 * Also start a timer to send another.4348 */4349
4350 if (!skb->acked)
4351 {4352
4353 /*4354 * This is important. If we don't have much room left,4355 * we need to throw out a few packets so we have a good4356 * window. Note that mtu is used, not mss, because mss is really4357 * for the send side. He could be sending us stuff as large as mtu.4358 */4359
4360 while (sock_rspace(sk) < sk->mtu)
4361 {4362 skb1 = skb_peek(&sk->receive_queue);
4363 if (skb1 == NULL)
4364 {4365 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4366 break;
4367 }4368
4369 /*4370 * Don't throw out something that has been acked. 4371 */4372
4373 if (skb1->acked)
4374 {4375 break;
4376 }4377
4378 skb_unlink(skb1);
4379 kfree_skb(skb1, FREE_READ);
4380 }4381 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4382 sk->ack_backlog++;
4383 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4384 }4385 else4386 {4387 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4388 }4389
4390 /*4391 * Now tell the user we may have some data. 4392 */4393
4394 if (!sk->dead)
4395 {4396 if(sk->debug)
4397 printk("Data wakeup.\n");
4398 sk->data_ready(sk,0);
4399 }4400 return(0);
4401 }4402
4403
4404 /*4405 * This routine is only called when we have urgent data4406 * signalled. Its the 'slow' part of tcp_urg. It could be4407 * moved inline now as tcp_urg is only called from one4408 * place. We handle URGent data wrong. We have to - as4409 * BSD still doesn't use the correction from RFC961.4410 */4411
4412 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4413 {4414 u32ptr = ntohs(th->urg_ptr);
4415
4416 if (ptr)
4417 ptr--;
4418 ptr += ntohl(th->seq);
4419
4420 /* ignore urgent data that we've already seen and read */4421 if (after(sk->copied_seq, ptr))
4422 return;
4423
4424 /* do we already have a newer (or duplicate) urgent pointer? */4425 if (sk->urg_data && !after(ptr, sk->urg_seq))
4426 return;
4427
4428 /* tell the world about our new urgent pointer */4429 if (sk->proc != 0) {4430 if (sk->proc > 0) {4431 kill_proc(sk->proc, SIGURG, 1);
4432 }else{4433 kill_pg(-sk->proc, SIGURG, 1);
4434 }4435 }4436 sk->urg_data = URG_NOTYET;
4437 sk->urg_seq = ptr;
4438 }4439
4440 /*4441 * This is the 'fast' part of urgent handling.4442 */4443
4444 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4445 unsignedlongsaddr, unsignedlonglen)
4446 {4447 u32ptr;
4448
4449 /*4450 * Check if we get a new urgent pointer - normally not 4451 */4452
4453 if (th->urg)
4454 tcp_check_urg(sk,th);
4455
4456 /*4457 * Do we wait for any urgent data? - normally not4458 */4459
4460 if (sk->urg_data != URG_NOTYET)
4461 return 0;
4462
4463 /*4464 * Is the urgent pointer pointing into this packet? 4465 */4466
4467 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
4468 if (ptr >= len)
4469 return 0;
4470
4471 /*4472 * Ok, got the correct packet, update info 4473 */4474
4475 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4476 if (!sk->dead)
4477 sk->data_ready(sk,0);
4478 return 0;
4479 }4480
/*
 *	This will accept the next outstanding connection.
 *
 *	Blocks (unless O_NONBLOCK) until an established connection is
 *	queued on the listening socket, then returns its struct sock.
 *	On failure returns NULL with sk->err set (EINVAL, EAGAIN or
 *	ERESTARTSYS).
 */
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 *	We need to make sure that this socket is listening,
	 *	and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race. */
	cli();
	sk->inuse = 1;

	/* Wait for an established connection to appear on the accept
	   queue. We drop the socket lock before sleeping and retake it
	   (sk->inuse = 1) after waking. */
	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		/* Interrupted by a signal: bail out with ERESTARTSYS.
		   NOTE(review): the socket is not re-locked on this path —
		   presumably intentional since we released it before
		   sleeping; confirm. */
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk.
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4539
/*
 *	This will initiate an outgoing connection.
 *
 *	Validates the destination, builds and transmits the SYN (with an
 *	MSS option), moves the socket to SYN_SENT and arms the
 *	retransmission timer. Returns 0 on success or a negative errno.
 */
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
		return(-EISCONN);

	/*
	 *	Don't allow a double connect.
	 */

	if(sk->daddr)
		return -EINVAL;

	/* Need at least family/port/address of sockaddr_in. */
	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	/* Pick an initial sequence number and initialise the send-side
	   sequence state from it. */
	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	release_sock(sk);

	/* May sleep (GFP_KERNEL), hence the lock was released above. */
	buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		sock_wfree(sk, buff);
		release_sock(sk);
		return(-ENETUNREACH);
	}
	/* If we were not bound to a local address, take the route's
	   source address. */
	if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
		sk->saddr = rt->rt_src;
	sk->rcv_saddr = sk->saddr;

	t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));

	/* Build the SYN: template header, then flag/field fixups.
	   doff = 6 => 24 byte header (20 + 4 bytes of MSS option). */
	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	buff->seq = sk->write_seq++;
	t1->seq = htonl(buff->seq);
	sk->sent_seq = sk->write_seq;
	buff->end_seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;
	/* use 512 or whatever user asked for */

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* MSS: user override, else route MTU, else the 576-byte default,
	   both less IP+TCP header overhead. */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if (rt)
		sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else
		sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 *	but not bigger than device MTU
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

#ifdef CONFIG_SKIP

	/*
	 *	SKIP devices set their MTU to 65535. This is so they can take packets
	 *	unfragmented to security process then fragment. They could lie to the
	 *	TCP layer about a suitable MTU, but its easier to let skip sort it out
	 *	simply because the final package we want unfragmented is going to be
	 *
	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */

	if(skip_pick_mtu!=NULL)		/* If SKIP is loaded.. */
		sk->mtu=skip_pick_mtu(sk->mtu,dev);
#endif

	/*
	 *	Put in the TCP options to say MTU. (kind 2, length 4, MSS)
	 */

	ptr = skb_put(buff,4);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will get reset.
	 */

	tcp_cache_zap();
	tcp_set_state(sk,TCP_SYN_SENT);
	/* Seed the RTO from the route's initial RTT if it has one. */
	if(rt&&rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	/* Timer for repeating the SYN until an answer */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	sk->retransmits = 0;	/* Now works the right way instead of a hacked
					initial setting */

	sk->prot->queue_xmit(sk, dev, buff, 0);
	/* NOTE(review): the xmit timer is armed a second time here —
	   appears redundant with the reset above; presumably harmless. */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4712
4713 /*4714 * This functions checks to see if the tcp header is actually acceptable. 4715 */4716
4717 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4718 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4719 {4720 u32next_seq;
4721
4722 next_seq = len - 4*th->doff;
4723 if (th->fin)
4724 next_seq++;
4725 /* if we have a zero window, we can't have any data in the packet.. */4726 if (next_seq && !sk->window)
4727 gotoignore_it;
4728 next_seq += ntohl(th->seq);
4729
4730 /*4731 * This isn't quite right. sk->acked_seq could be more recent4732 * than sk->window. This is however close enough. We will accept4733 * slightly more packets than we should, but it should not cause4734 * problems unless someone is trying to forge packets.4735 */4736
4737 /* have we already seen all of this packet? */4738 if (!after(next_seq+1, sk->acked_seq))
4739 gotoignore_it;
4740 /* or does it start beyond the window? */4741 if (!before(ntohl(th->seq), sk->acked_seq + sk->window + 1))
4742 gotoignore_it;
4743
4744 /* ok, at least part of this packet would seem interesting.. */4745 return 1;
4746
4747 ignore_it:
4748 if (th->rst)
4749 return 0;
4750
4751 /*4752 * Send a reset if we get something not ours and we are4753 * unsynchronized. Note: We don't do anything to our end. We4754 * are just killing the bogus remote connection then we will4755 * connect again and it will work (with luck).4756 */4757
4758 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4759 {4760 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4761 return 1;
4762 }4763
4764 /* Try to resync things. */4765 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4766 return 0;
4767 }4768
/*
 *	When we get a reset we do this: mark the socket zapped, pick the
 *	error callers should see, move to CLOSE (modulo RFC1337 TIME-WAIT
 *	assassination protection), wake the owner and drop the frame.
 */
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Translate the reset into the errno appropriate for our state. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]: a reset must
	 *	not tear down a TIME_WAIT socket.
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 *
 *	Main TCP receive routine.  Finds the owning socket (using a
 *	one-entry last-hit cache), validates the checksum, then walks
 *	the segment through the RFC793 arrival steps (with the RFC1122
 *	corrections noted below).  Segments for a busy socket are
 *	queued on its backlog and replayed later with redo != 0.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	tcp_statistics.TcpInSegs++;

	/* TCP is point to point: drop anything not addressed to this host. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket, using the last hit cache if applicable.
	 *	On a backlog replay (redo) we must do a full lookup.
	 */

	if(!redo && saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
	{
		sk=(struct sock *)th_cache_sk;
		/*
		 *	Paranoia check: verify the cache against a real lookup.
		 *	(Left in because a cache-related bug was suspected here.)
		 */
		if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
			printk("Cache mismatch on TCP.\n");
	}
	else
	{
		sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
		th_cache_saddr=saddr;
		th_cache_daddr=daddr;
		th_cache_dport=th->dest;
		th_cache_sport=th->source;
		th_cache_sk=sk;
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)
	{
		/*
		 *	Pull up the IP header.
		 */
		skb_pull(skb, skb->h.raw-skb->data);

		/*
		 *	Checksum the segment, using the device-supplied partial
		 *	checksum when available; drop silently on failure.
		 */
		if (
			(skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
			(!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
			)
		{
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}

		/* Cache the sequence numbers; SYN and FIN each occupy one
		   sequence number, hence the th->syn + th->fin terms. */
		skb->seq = ntohl(th->seq);
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset
			 *	(checked in tcp_reset), then discard the frame.
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/*
		 *	If the socket is busy, queue the segment on its backlog;
		 *	it will be replayed (redo != 0) when the owner releases
		 *	the socket.  Interrupts are off around the inuse test.
		 */
		cli();
		if (sk->inuse)
		{
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Backlog replay for a socket that has since gone away. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		/*
		 *	NOTE(review): this "can't happen" branch leaks skb and
		 *	leaves sk->inuse set (no kfree_skb/release_sock) — worth
		 *	confirming and fixing if this path is ever reachable.
		 */
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */

	skb->sk=sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			/* An ACK at a listener is bogus: reset the sender.
			   (These use the socket TOS.. might want to be the received TOS.) */
			if(th->ack)
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments).
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it.
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* Retransmitted SYN for an embryonic connection? Just drop it. */
		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected.
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/* Reset the ack - its an ack from a
					   different connection  [ th->rst is checked in tcp_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}

				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq = skb->seq+1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				/* No window option seen: fall back to a tiny default. */
				if(sk->max_window==0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

		/*
		 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
		 *	a more complex suggestion for fixing these reuse issues in RFC1644
		 *	but not yet ready for general use. Also see RFC1379.
		 *
		 *	A fresh in-window SYN on a dead TIME_WAIT socket kills it and
		 *	re-routes the SYN to any listener on the same port.
		 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			/* Uncharge the buffer from the dying socket before handoff. */
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				/* Offset the new ISN well past the old write sequence. */
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs.
	 */

	/* Step 1: sequence check — drop anything outside the window. */
	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/* Step 2: an in-window RST kills the connection. */
	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793: a SYN in
	 *	window on a synchronised connection is an error.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 *	Process the ACK
	 */


	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	if (sk->rmem_alloc >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}


	/*
	 *	Process urgent data
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;
}
/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it.
 *
 *	Used as a zero-window probe: either a sliver of real queued
 *	data is re-sent (when the peer has opened a non-zero window)
 *	or a bare ACK carrying sequence sent_seq-1 is emitted to
 *	provoke an acknowledgement.
 */

static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff,*skb;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states.  If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	)
	{
		return;
	}

	if ( before(sk->sent_seq, sk->window_seq) &&
	    (skb=skb_peek(&sk->write_queue)))
	{
		/*
		 *	We are probing the opening of a window
		 *	but the window size is != 0
		 *	must have been a result SWS advoidance ( sender ).
		 *	Build a fresh frame carrying just the bytes that
		 *	now fit in the peer's window.
		 */

		struct iphdr *iph;
		struct tcphdr *th;
		struct tcphdr *nth;
		unsigned long win_size;
#if 0
		unsigned long ow_size;
#endif
		void * tcp_data_start;

		/*
		 *	How many bytes can we send ?
		 */

		win_size = sk->window_seq - sk->sent_seq;

		/*
		 *	Recover the buffer pointers from the queued frame,
		 *	which already carries built IP and TCP headers.
		 */

		iph = (struct iphdr *)skb->ip_hdr;
		th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));

		/*
		 *	Grab the data for a temporary frame
		 */

		buff = sock_wmalloc(sk, win_size + th->doff * 4 +
				(iph->ihl << 2) +
				sk->prot->max_header + 15,
				1, GFP_ATOMIC);
		if ( buff == NULL )
			return;

		/*
		 *	If we strip the packet on the write queue we must
		 *	be ready to retransmit this one
		 */

		buff->free = /*0*/1;

		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put headers on the new packet
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, buff->truesize,
				sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		/*
		 *	Move the TCP header over
		 */

		buff->dev = dev;

		nth = (struct tcphdr *) skb_put(buff,th->doff*4);

		memcpy(nth, th, th->doff * 4);

		/*
		 *	Correct the new header: fresh ack/window, checksum
		 *	cleared before tcp_send_check below.
		 */

		nth->ack = 1;
		nth->ack_seq = htonl(sk->acked_seq);
		nth->window = htons(tcp_select_window(sk));
		nth->check = 0;

		/*
		 *	Find the first data byte.
		 */

		tcp_data_start = (char *) th + (th->doff << 2);

		/*
		 *	Add it to our new buffer
		 */

		memcpy(skb_put(buff,win_size), tcp_data_start, win_size);

		/*
		 *	Remember our right edge sequence number.
		 */

		buff->end_seq = sk->sent_seq + win_size;
		sk->sent_seq = buff->end_seq;		/* Hack */

		/* Clear URG if the urgent pointer fell inside the split-off part. */
		if(th->urg && ntohs(th->urg_ptr) < win_size)
			nth->urg = 0;

		/*
		 *	Checksum the split buffer
		 */

		tcp_send_check(nth, sk->saddr, sk->daddr,
			   nth->doff * 4 + win_size , sk);
	}
	else
	{
		/* Nothing sendable: emit a bare ACK with a stale sequence. */
		buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
		if (buff == NULL)
			return;

		buff->free = 1;
		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put in the IP header and routing stuff.
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
		memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

		/*
		 *	Use a previous sequence.
		 *	This should cause the other end to send an ack.
		 */

		t1->seq = htonl(sk->sent_seq-1);
		t1->ack = 1;
		t1->res1= 0;
		t1->res2= 0;
		t1->rst = 0;
		t1->urg = 0;
		t1->psh = 0;
		t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
		t1->syn = 0;
		t1->ack_seq = htonl(sk->acked_seq);
		t1->window = htons(tcp_select_window(sk));
		t1->doff = sizeof(*t1)/4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	}

	/*
	 *	Send it.
	 */

	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
5415 /*5416 * A window probe timeout has occurred.5417 */5418
5419 voidtcp_send_probe0(structsock *sk)
/* */5420 {5421 if (sk->zapped)
5422 return; /* After a valid reset we can send no more */5423
5424 tcp_write_wakeup(sk);
5425
5426 sk->backoff++;
5427 sk->rto = min(sk->rto << 1, 120*HZ);
5428 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5429 sk->retransmits++;
5430 sk->prot->retransmits ++;
5431 }5432
5433 /*5434 * Socket option code for TCP. 5435 */5436
5437 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5438 {5439 intval,err;
5440
5441 if(level!=SOL_TCP)
5442 returnip_setsockopt(sk,level,optname,optval,optlen);
5443
5444 if (optval == NULL)
5445 return(-EINVAL);
5446
5447 err=verify_area(VERIFY_READ, optval, sizeof(int));
5448 if(err)
5449 returnerr;
5450
5451 val = get_user((int *)optval);
5452
5453 switch(optname)
5454 {5455 caseTCP_MAXSEG:
5456 /*5457 * values greater than interface MTU won't take effect. however at5458 * the point when this call is done we typically don't yet know5459 * which interface is going to be used5460 */5461 if(val<1||val>MAX_WINDOW)
5462 return -EINVAL;
5463 sk->user_mss=val;
5464 return 0;
5465 caseTCP_NODELAY:
5466 sk->nonagle=(val==0)?0:1;
5467 return 0;
5468 default:
5469 return(-ENOPROTOOPT);
5470 }5471 }5472
5473 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5474 {5475 intval,err;
5476
5477 if(level!=SOL_TCP)
5478 returnip_getsockopt(sk,level,optname,optval,optlen);
5479
5480 switch(optname)
5481 {5482 caseTCP_MAXSEG:
5483 val=sk->user_mss;
5484 break;
5485 caseTCP_NODELAY:
5486 val=sk->nonagle;
5487 break;
5488 default:
5489 return(-ENOPROTOOPT);
5490 }5491 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5492 if(err)
5493 returnerr;
5494 put_user(sizeof(int),(int *) optlen);
5495
5496 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5497 if(err)
5498 returnerr;
5499 put_user(val,(int *)optval);
5500
5501 return(0);
5502 }5503
5504
/*
 *	The TCP operations table plugged into the generic INET socket
 *	layer.  Initialisers are positional; the slot labels below are
 *	inferred from the function names and the usual struct proto
 *	layout — confirm against the struct proto declaration in the
 *	socket header before reordering.
 */
struct proto tcp_prot = {
	tcp_close,		/* close */
	ip_build_header,	/* build_header */
	tcp_connect,		/* connect */
	tcp_accept,		/* accept */
	ip_queue_xmit,		/* queue_xmit */
	tcp_retransmit,		/* retransmit */
	tcp_write_wakeup,	/* write_wakeup (window probe) */
	tcp_read_wakeup,	/* read_wakeup */
	tcp_rcv,		/* rcv */
	tcp_select,		/* select */
	tcp_ioctl,		/* ioctl */
	NULL,			/* init — none needed for TCP */
	tcp_shutdown,		/* shutdown */
	tcp_setsockopt,		/* setsockopt */
	tcp_getsockopt,		/* getsockopt */
	tcp_sendmsg,		/* sendmsg */
	tcp_recvmsg,		/* recvmsg */
	NULL,			/* No special bind() */
	128,			/* max_header */
	0,			/* retransmits counter */
	"TCP",			/* protocol name */
	0, 0,			/* usage counters — TODO confirm field names */
	{NULL,}			/* socket hash array */
};