1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while 26 * sk->inuse=1 and was trying to connect 27 * (tcp_err()). 28 * Alan Cox : All icmp error handling was broken 29 * pointers passed where wrong and the 30 * socket was looked up backwards. Nobody 31 * tested any icmp error code obviously. 32 * Alan Cox : tcp_err() now handled properly. It 33 * wakes people on errors. select 34 * behaves and the icmp error race 35 * has gone by moving it into sock.c 36 * Alan Cox : tcp_reset() fixed to work for 37 * everything not just packets for 38 * unknown sockets. 39 * Alan Cox : tcp option processing. 40 * Alan Cox : Reset tweaked (still not 100%) [Had 41 * syn rule wrong] 42 * Herp Rosmanith : More reset fixes 43 * Alan Cox : No longer acks invalid rst frames. 44 * Acking any kind of RST is right out. 45 * Alan Cox : Sets an ignore me flag on an rst 46 * receive otherwise odd bits of prattle 47 * escape still 48 * Alan Cox : Fixed another acking RST frame bug. 49 * Should stop LAN workplace lockups. 
50 * Alan Cox : Some tidyups using the new skb list 51 * facilities 52 * Alan Cox : sk->keepopen now seems to work 53 * Alan Cox : Pulls options out correctly on accepts 54 * Alan Cox : Fixed assorted sk->rqueue->next errors 55 * Alan Cox : PSH doesn't end a TCP read. Switched a 56 * bit to skb ops. 57 * Alan Cox : Tidied tcp_data to avoid a potential 58 * nasty. 59 * Alan Cox : Added some better commenting, as the 60 * tcp is hard to follow 61 * Alan Cox : Removed incorrect check for 20 * psh 62 * Michael O'Reilly : ack < copied bug fix. 63 * Johannes Stille : Misc tcp fixes (not all in yet). 64 * Alan Cox : FIN with no memory -> CRASH 65 * Alan Cox : Added socket option proto entries. 66 * Also added awareness of them to accept. 67 * Alan Cox : Added TCP options (SOL_TCP) 68 * Alan Cox : Switched wakeup calls to callbacks, 69 * so the kernel can layer network 70 * sockets. 71 * Alan Cox : Use ip_tos/ip_ttl settings. 72 * Alan Cox : Handle FIN (more) properly (we hope). 73 * Alan Cox : RST frames sent on unsynchronised 74 * state ack error. 75 * Alan Cox : Put in missing check for SYN bit. 76 * Alan Cox : Added tcp_select_window() aka NET2E 77 * window non shrink trick. 78 * Alan Cox : Added a couple of small NET2E timer 79 * fixes 80 * Charles Hedrick : TCP fixes 81 * Toomas Tamm : TCP window fixes 82 * Alan Cox : Small URG fix to rlogin ^C ack fight 83 * Charles Hedrick : Rewrote most of it to actually work 84 * Linus : Rewrote tcp_read() and URG handling 85 * completely 86 * Gerhard Koerting: Fixed some missing timer handling 87 * Matthew Dillon : Reworked TCP machine states as per RFC 88 * Gerhard Koerting: PC/TCP workarounds 89 * Adam Caldwell : Assorted timer/timing errors 90 * Matthew Dillon : Fixed another RST bug 91 * Alan Cox : Move to kernel side addressing changes. 92 * Alan Cox : Beginning work on TCP fastpathing 93 * (not yet usable) 94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
95 * Alan Cox : TCP fast path debugging 96 * Alan Cox : Window clamping 97 * Michael Riepe : Bug in tcp_check() 98 * Matt Dillon : More TCP improvements and RST bug fixes 99 * Matt Dillon : Yet more small nasties remove from the 100 * TCP code (Be very nice to this man if 101 * tcp finally works 100%) 8) 102 * Alan Cox : BSD accept semantics. 103 * Alan Cox : Reset on closedown bug. 104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 105 * Michael Pall : Handle select() after URG properly in 106 * all cases. 107 * Michael Pall : Undo the last fix in tcp_read_urg() 108 * (multi URG PUSH broke rlogin). 109 * Michael Pall : Fix the multi URG PUSH problem in 110 * tcp_readable(), select() after URG 111 * works now. 112 * Michael Pall : recv(...,MSG_OOB) never blocks in the 113 * BSD api. 114 * Alan Cox : Changed the semantics of sk->socket to 115 * fix a race and a signal problem with 116 * accept() and async I/O. 117 * Alan Cox : Relaxed the rules on tcp_sendto(). 118 * Yury Shevchuk : Really fixed accept() blocking problem. 119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 120 * clients/servers which listen in on 121 * fixed ports. 122 * Alan Cox : Cleaned the above up and shrank it to 123 * a sensible code size. 124 * Alan Cox : Self connect lockup fix. 125 * Alan Cox : No connect to multicast. 126 * Ross Biro : Close unaccepted children on master 127 * socket close. 128 * Alan Cox : Reset tracing code. 129 * Alan Cox : Spurious resets on shutdown. 130 * Alan Cox : Giant 15 minute/60 second timer error 131 * Alan Cox : Small whoops in selecting before an 132 * accept. 133 * Alan Cox : Kept the state trace facility since 134 * it's handy for debugging. 135 * Alan Cox : More reset handler fixes. 
136 * Alan Cox : Started rewriting the code based on 137 * the RFC's for other useful protocol 138 * references see: Comer, KA9Q NOS, and 139 * for a reference on the difference 140 * between specifications and how BSD 141 * works see the 4.4lite source. 142 * A.N.Kuznetsov : Don't time wait on completion of tidy 143 * close. 144 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 145 * Linus Torvalds : Fixed BSD port reuse to work first syn 146 * Alan Cox : Reimplemented timers as per the RFC 147 * and using multiple timers for sanity. 148 * Alan Cox : Small bug fixes, and a lot of new 149 * comments. 150 * Alan Cox : Fixed dual reader crash by locking 151 * the buffers (much like datagram.c) 152 * Alan Cox : Fixed stuck sockets in probe. A probe 153 * now gets fed up of retrying without 154 * (even a no space) answer. 155 * Alan Cox : Extracted closing code better 156 * Alan Cox : Fixed the closing state machine to 157 * resemble the RFC. 158 * Alan Cox : More 'per spec' fixes. 159 * Jorge Cwik : Even faster checksumming. 160 * Alan Cox : tcp_data() doesn't ack illegal PSH 161 * only frames. At least one pc tcp stack 162 * generates them. 163 * Alan Cox : Cache last socket. 164 * Alan Cox : Per route irtt. 165 * Matt Day : Select() match BSD precisely on error 166 * Alan Cox : New buffers 167 * Marc Tamsky : Various sk->prot->retransmits and 168 * sk->retransmits misupdating fixed. 169 * Fixed tcp_write_timeout: stuck close, 170 * and TCP syn retries gets used now. 171 * Mark Yarvis : In tcp_read_wakeup(), don't send an 172 * ack if stat is TCP_CLOSED. 173 * Alan Cox : Look up device on a retransmit - routes may 174 * change. Doesn't yet cope with MSS shrink right 175 * but its a start! 176 * Marc Tamsky : Closing in closing fixes. 177 * Mike Shaver : RFC1122 verifications. 178 * Alan Cox : rcv_saddr errors. 179 * Alan Cox : Block double connect(). 180 * Alan Cox : Small hooks for enSKIP. 181 * Alexey Kuznetsov: Path MTU discovery. 
182 * 183 * 184 * To Fix: 185 * Fast path the code. Two things here - fix the window calculation 186 * so it doesn't iterate over the queue, also spot packets with no funny 187 * options arriving in order and process directly. 188 * 189 * Rewrite output state machine to use a single queue and do low window 190 * situations as per the spec (RFC 1122) 191 * Speed up input assembly algorithm. 192 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 193 * could do with it working on IPv4 194 * User settable/learned rtt/max window/mtu 195 * Cope with MTU/device switches when retransmitting in tcp. 196 * Fix the window handling to use PR's new code. 197 * 198 * Change the fundamental structure to a single send queue maintained 199 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 200 * active routes too]). Cut the queue off in tcp_retransmit/ 201 * tcp_transmit. 202 * Change the receive queue to assemble as it goes. This lets us 203 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 204 * tcp_data/tcp_read as well as the window shrink crud. 205 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 206 * tcp_queue_skb seem obvious routines to extract. 207 * 208 * This program is free software; you can redistribute it and/or 209 * modify it under the terms of the GNU General Public License 210 * as published by the Free Software Foundation; either version 211 * 2 of the License, or(at your option) any later version. 212 * 213 * Description of States: 214 * 215 * TCP_SYN_SENT sent a connection request, waiting for ack 216 * 217 * TCP_SYN_RECV received a connection request, sent ack, 218 * waiting for final ack in three-way handshake. 
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING.  Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown.  There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */
248 /* 249 * RFC1122 status: 250 * NOTE: I'm not going to be doing comments in the code for this one except 251 * for violations and the like. tcp.c is just too big... If I say something 252 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out 253 * with Alan. -- MS 950903 254 * 255 * Use of PSH (4.2.2.2) 256 * MAY aggregate data sent without the PSH flag. (does) 257 * MAY queue data recieved without the PSH flag. (does) 258 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) 259 * MAY implement PSH on send calls. (doesn't, thus:) 260 * MUST NOT buffer data indefinitely (doesn't [1 second]) 261 * MUST set PSH on last segment (does) 262 * MAY pass received PSH to application layer (doesn't) 263 * SHOULD send maximum-sized segment whenever possible. (almost always does) 264 * 265 * Window Size (4.2.2.3, 4.2.2.16) 266 * MUST treat window size as an unsigned number (does) 267 * SHOULD treat window size as a 32-bit number (does not) 268 * MUST NOT shrink window once it is offered (does not normally) 269 * 270 * Urgent Pointer (4.2.2.4) 271 * **MUST point urgent pointer to last byte of urgent data (not right 272 * after). (doesn't, to be like BSD) 273 * MUST inform application layer asynchronously of incoming urgent 274 * data. (does) 275 * MUST provide application with means of determining the amount of 276 * urgent data pending. (does) 277 * **MUST support urgent data sequence of arbitrary length. (doesn't, but 278 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) 279 * [Follows BSD 1 byte of urgent data] 280 * 281 * TCP Options (4.2.2.5) 282 * MUST be able to recieve TCP options in any segment. (does) 283 * MUST ignore unsupported options (does) 284 * 285 * Maximum Segment Size Option (4.2.2.6) 286 * MUST implement both sending and receiving MSS. (does) 287 * SHOULD send an MSS with every SYN where recieve MSS != 536 (MAY send 288 * it always). 
(does, even when MSS == 536, which is legal) 289 * MUST assume MSS == 536 if no MSS received at connection setup (does) 290 * MUST calculate "effective send MSS" correctly: 291 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) 292 * (does - but allows operator override) 293 * 294 * TCP Checksum (4.2.2.7) 295 * MUST generate and check TCP checksum. (does) 296 * 297 * Initial Sequence Number Selection (4.2.2.8) 298 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's 299 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is 300 * necessary for 10Mbps networks - and harder than BSD to spoof!) 301 * 302 * Simultaneous Open Attempts (4.2.2.10) 303 * MUST support simultaneous open attempts (does) 304 * 305 * Recovery from Old Duplicate SYN (4.2.2.11) 306 * MUST keep track of active vs. passive open (does) 307 * 308 * RST segment (4.2.2.12) 309 * SHOULD allow an RST segment to contain data (does, but doesn't do 310 * anything with it, which is standard) 311 * 312 * Closing a Connection (4.2.2.13) 313 * MUST inform application of whether connectin was closed by RST or 314 * normal close. (does) 315 * MAY allow "half-duplex" close (treat connection as closed for the 316 * local app, even before handshake is done). (does) 317 * MUST linger in TIME_WAIT for 2 * MSL (does) 318 * 319 * Retransmission Timeout (4.2.2.15) 320 * MUST implement Jacobson's slow start and congestion avoidance 321 * stuff. (does) 322 * 323 * Probing Zero Windows (4.2.2.17) 324 * MUST support probing of zero windows. (does) 325 * MAY keep offered window closed indefinitely. (does) 326 * MUST allow remote window to stay closed indefinitely. (does) 327 * 328 * Passive Open Calls (4.2.2.18) 329 * MUST NOT let new passive open affect other connections. (doesn't) 330 * MUST support passive opens (LISTENs) concurrently. (does) 331 * 332 * Time to Live (4.2.2.19) 333 * MUST make TCP TTL configurable. 
(does - IP_TTL option) 334 * 335 * Event Processing (4.2.2.20) 336 * SHOULD queue out-of-order segments. (does) 337 * MUST aggregate ACK segments whenever possible. (does but badly) 338 * 339 * Retransmission Timeout Calculation (4.2.3.1) 340 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO 341 * calculation. (does, or at least explains them in the comments 8*b) 342 * SHOULD initialize RTO to 0 and RTT to 3. (does) 343 * 344 * When to Send an ACK Segment (4.2.3.2) 345 * SHOULD implement delayed ACK. (does not) 346 * MUST keep ACK delay < 0.5 sec. (N/A) 347 * 348 * When to Send a Window Update (4.2.3.3) 349 * MUST implement receiver-side SWS. (does) 350 * 351 * When to Send Data (4.2.3.4) 352 * MUST implement sender-side SWS. (does - imperfectly) 353 * SHOULD implement Nagle algorithm. (does) 354 * 355 * TCP Connection Failures (4.2.3.5) 356 * MUST handle excessive retransmissions "properly" (see the RFC). (does) 357 * SHOULD inform application layer of soft errors. (doesn't) 358 * 359 * TCP Keep-Alives (4.2.3.6) 360 * MAY provide keep-alives. (does) 361 * MUST make keep-alives configurable on a per-connection basis. (does) 362 * MUST default to no keep-alives. (does) 363 * **MUST make keep-alive interval configurable. (doesn't) 364 * **MUST make default keep-alive interval > 2 hours. (doesn't) 365 * MUST NOT interpret failure to ACK keep-alive packet as dead 366 * connection. (doesn't) 367 * SHOULD send keep-alive with no data. (does) 368 * 369 * TCP Multihoming (4.2.3.7) 370 * MUST get source address from IP layer before sending first 371 * SYN. (does) 372 * MUST use same local address for all segments of a connection. (does) 373 * 374 * IP Options (4.2.3.8) 375 * (I don't think the IP layer sees the IP options, yet.) 376 * MUST ignore unsupported IP options. (does, I guess 8*b) 377 * MAY support Time Stamp and Record Route. (doesn't) 378 * **MUST allow application to specify a source route. (doesn't?) 
379 * **MUST allow receieved Source Route option to set route for all future 380 * segments on this connection. (doesn't, not that I think it's a 381 * huge problem) 382 * 383 * ICMP messages (4.2.3.9) 384 * MUST act on ICMP errors. (does) 385 * MUST slow transmission upon receipt of a Source Quench. (does) 386 * MUST NOT abort connection upon receipt of soft Destination 387 * Unreachables (0, 1, 5), Time Exceededs and Parameter 388 * Problems. (doesn't) 389 * SHOULD report soft Destination Unreachables etc. to the 390 * application. (doesn't) 391 * SHOULD abort connection upon receipt of hard Destination Unreachable 392 * messages (2, 3, 4). (does) 393 * 394 * Remote Address Validation (4.2.3.10) 395 * MUST reject as an error OPEN for invalid remote IP address. (does) 396 * MUST ignore SYN with invalid source address. (does) 397 * MUST silently discard incoming SYN for broadcast/multicast 398 * address. (does) 399 * 400 * Asynchronous Reports (4.2.4.1) 401 * **MUST provide mechanism for reporting soft errors to application 402 * layer. (doesn't) 403 * 404 * Type of Service (4.2.4.2) 405 * MUST allow application layer to set Type of Service. (does IP_TOS) 406 * 407 * (Whew. -- MS 950903) 408 **/ 409
410 #include <linux/types.h>
411 #include <linux/sched.h>
412 #include <linux/mm.h>
413 #include <linux/time.h>
414 #include <linux/string.h>
415 #include <linux/config.h>
416 #include <linux/socket.h>
417 #include <linux/sockios.h>
418 #include <linux/termios.h>
419 #include <linux/in.h>
420 #include <linux/fcntl.h>
421 #include <linux/inet.h>
422 #include <linux/netdevice.h>
423 #include <net/snmp.h>
424 #include <net/ip.h>
425 #include <net/protocol.h>
426 #include <net/icmp.h>
427 #include <net/tcp.h>
428 #include <net/arp.h>
429 #include <linux/skbuff.h>
430 #include <net/sock.h>
431 #include <net/route.h>
432 #include <linux/errno.h>
433 #include <linux/timer.h>
434 #include <asm/system.h>
435 #include <asm/segment.h>
436 #include <linux/mm.h>
437 #include <net/checksum.h>
438
439 /* 440 * The MSL timer is the 'normal' timer. 441 */ 442
443 #definereset_msl_timer(x,y,z) reset_timer(x,y,z)
444
445 #define SEQ_TICK 3
446 unsignedlongseq_offset;
447 structtcp_mibtcp_statistics;
448
449 /* 450 * Cached last hit socket 451 */ 452
453 volatileunsignedlongth_cache_saddr,th_cache_daddr;
454 volatileunsignedshortth_cache_dport, th_cache_sport;
455 volatilestructsock *th_cache_sk;
456
457 voidtcp_cache_zap(void)
/* */ 458 { 459 unsignedlongflags;
460 save_flags(flags);
461 cli();
462 th_cache_saddr=0;
463 th_cache_daddr=0;
464 th_cache_dport=0;
465 th_cache_sport=0;
466 th_cache_sk=NULL;
467 restore_flags(flags);
468 } 469
470 staticvoidtcp_close(structsock *sk, inttimeout);
471
472
473 /* 474 * The less said about this the better, but it works and will do for 1.2 475 */ 476
477 staticstructwait_queue *master_select_wakeup;
478
479 static__inline__intmin(unsignedinta, unsignedintb)
/* */ 480 { 481 if (a < b)
482 return(a);
483 return(b);
484 } 485
486 #undefSTATE_TRACE 487
488 #ifdefSTATE_TRACE 489 staticchar *statename[]={ 490 "Unused","Established","Syn Sent","Syn Recv",
491 "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
492 "Close Wait","Last ACK","Listen","Closing"
493 };
494 #endif 495
496 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 497 { 498 if(sk->state==TCP_ESTABLISHED)
499 tcp_statistics.TcpCurrEstab--;
500 #ifdefSTATE_TRACE 501 if(sk->debug)
502 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
503 #endif 504 /* This is a hack but it doesn't occur often and it's going to 505 be a real to fix nicely */ 506
507 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
508 { 509 wake_up_interruptible(&master_select_wakeup);
510 } 511 sk->state=state;
512 if(state==TCP_ESTABLISHED)
513 tcp_statistics.TcpCurrEstab++;
514 if(sk->state==TCP_CLOSE)
515 tcp_cache_zap();
516 } 517
518 /* 519 * This routine picks a TCP windows for a socket based on 520 * the following constraints 521 * 522 * 1. The window can never be shrunk once it is offered (RFC 793) 523 * 2. We limit memory per socket 524 * 525 * For now we use NET2E3's heuristic of offering half the memory 526 * we have handy. All is not as bad as this seems however because 527 * of two things. Firstly we will bin packets even within the window 528 * in order to get the data we are waiting for into the memory limit. 529 * Secondly we bin common duplicate forms at receive time 530 * Better heuristics welcome 531 */ 532
533 inttcp_select_window(structsock *sk)
/* */ 534 { 535 intnew_window = sock_rspace(sk);
536
537 if(sk->window_clamp)
538 new_window=min(sk->window_clamp,new_window);
539 /* 540 * Two things are going on here. First, we don't ever offer a 541 * window less than min(sk->mss, MAX_WINDOW/2). This is the 542 * receiver side of SWS as specified in RFC1122. 543 * Second, we always give them at least the window they 544 * had before, in order to avoid retracting window. This 545 * is technically allowed, but RFC1122 advises against it and 546 * in practice it causes trouble. 547 * 548 * Fixme: This doesn't correctly handle the case where 549 * new_window > sk->window but not by enough to allow for the 550 * shift in sequence space. 551 */ 552 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
553 return(sk->window);
554 return(new_window);
555 } 556
557 /* 558 * Find someone to 'accept'. Must be called with 559 * sk->inuse=1 or cli() 560 */ 561
562 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 563 { 564 structsk_buff *p=skb_peek(&s->receive_queue);
565 if(p==NULL)
566 returnNULL;
567 do 568 { 569 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
570 returnp;
571 p=p->next;
572 } 573 while(p!=(structsk_buff *)&s->receive_queue);
574 returnNULL;
575 } 576
577 /* 578 * Remove a completed connection and return it. This is used by 579 * tcp_accept() to get connections from the queue. 580 */ 581
582 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 583 { 584 structsk_buff *skb;
585 unsignedlongflags;
586 save_flags(flags);
587 cli();
588 skb=tcp_find_established(s);
589 if(skb!=NULL)
590 skb_unlink(skb); /* Take it off the queue */ 591 restore_flags(flags);
592 returnskb;
593 } 594
595 /* 596 * This routine closes sockets which have been at least partially 597 * opened, but not yet accepted. Currently it is only called by 598 * tcp_close, and timeout mirrors the value there. 599 */ 600
601 staticvoidtcp_close_pending (structsock *sk)
/* */ 602 { 603 structsk_buff *skb;
604
605 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
606 { 607 skb->sk->dead=1;
608 tcp_close(skb->sk, 0);
609 kfree_skb(skb, FREE_READ);
610 } 611 return;
612 } 613
614 /* 615 * Enter the time wait state. 616 */ 617
618 staticvoidtcp_time_wait(structsock *sk)
/* */ 619 { 620 tcp_set_state(sk,TCP_TIME_WAIT);
621 sk->shutdown = SHUTDOWN_MASK;
622 if (!sk->dead)
623 sk->state_change(sk);
624 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
625 } 626
627 /* 628 * A socket has timed out on its send queue and wants to do a 629 * little retransmitting. Currently this means TCP. 630 */ 631
632 voidtcp_do_retransmit(structsock *sk, intall)
/* */ 633 { 634 structsk_buff * skb;
635 structproto *prot;
636 structdevice *dev;
637 intct=0;
638 structrtable *rt;
639
640 prot = sk->prot;
641 skb = sk->send_head;
642
643 while (skb != NULL)
644 { 645 structtcphdr *th;
646 structiphdr *iph;
647 intsize;
648
649 dev = skb->dev;
650 IS_SKB(skb);
651 skb->when = jiffies;
652
653 /* 654 * Discard the surplus MAC header 655 */ 656
657 skb_pull(skb,((unsignedchar *)skb->ip_hdr)-skb->data);
658
659 /* 660 * In general it's OK just to use the old packet. However we 661 * need to use the current ack and window fields. Urg and 662 * urg_ptr could possibly stand to be updated as well, but we 663 * don't keep the necessary data. That shouldn't be a problem, 664 * if the other end is doing the right thing. Since we're 665 * changing the packet, we have to issue a new IP identifier. 666 */ 667
668 iph = (structiphdr *)skb->data;
669 th = (structtcphdr *)(((char *)iph) + (iph->ihl << 2));
670 size = ntohs(iph->tot_len) - (iph->ihl<<2);
671
672 /* 673 * Note: We ought to check for window limits here but 674 * currently this is done (less efficiently) elsewhere. 675 */ 676
677 /* 678 * Put a MAC header back on (may cause ARPing) 679 */ 680
681 { 682 /* ANK: UGLY, but the bug, that was here, should be fixed. 683 */ 684 structoptions * opt = (structoptions*)skb->proto_priv;
685 rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute);
686 } 687
688 iph->id = htons(ip_id_count++);
689 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY 690 if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
691 iph->frag_off &= ~htons(IP_DF);
692 #endif 693 ip_send_check(iph);
694
695 if (rt==NULL) /* Deep poo */ 696 { 697 if(skb->sk)
698 { 699 skb->sk->err=ENETUNREACH;
700 skb->sk->error_report(skb->sk);
701 } 702 } 703 else 704 { 705 dev=rt->rt_dev;
706 skb->raddr=rt->rt_gateway;
707 skb->dev=dev;
708 skb->arp=1;
709 if (rt->rt_hh)
710 { 711 memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
712 if (!rt->rt_hh->hh_uptodate)
713 { 714 skb->arp = 0;
715 #ifRT_CACHE_DEBUG >= 2
716 printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
717 #endif 718 } 719 } 720 elseif (dev->hard_header)
721 { 722 if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
723 skb->arp=0;
724 } 725
726 /* 727 * This is not the right way to handle this. We have to 728 * issue an up to date window and ack report with this 729 * retransmit to keep the odd buggy tcp that relies on 730 * the fact BSD does this happy. 731 * We don't however need to recalculate the entire 732 * checksum, so someone wanting a small problem to play 733 * with might like to implement RFC1141/RFC1624 and speed 734 * this up by avoiding a full checksum. 735 */ 736
737 th->ack_seq = ntohl(sk->acked_seq);
738 th->window = ntohs(tcp_select_window(sk));
739 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
740
741 /* 742 * If the interface is (still) up and running, kick it. 743 */ 744
745 if (dev->flags & IFF_UP)
746 { 747 /* 748 * If the packet is still being sent by the device/protocol 749 * below then don't retransmit. This is both needed, and good - 750 * especially with connected mode AX.25 where it stops resends 751 * occurring of an as yet unsent anyway frame! 752 * We still add up the counts as the round trip time wants 753 * adjusting. 754 */ 755 if (sk && !skb_device_locked(skb))
756 { 757 /* Remove it from any existing driver queue first! */ 758 skb_unlink(skb);
759 /* Now queue it */ 760 ip_statistics.IpOutRequests++;
761 dev_queue_xmit(skb, dev, sk->priority);
762 } 763 } 764 } 765
766 /* 767 * Count retransmissions 768 */ 769
770 ct++;
771 sk->prot->retransmits ++;
772 tcp_statistics.TcpRetransSegs++;
773
774
775 /* 776 * Only one retransmit requested. 777 */ 778
779 if (!all)
780 break;
781
782 /* 783 * This should cut it off before we send too many packets. 784 */ 785
786 if (ct >= sk->cong_window)
787 break;
788 skb = skb->link3;
789 } 790 } 791
792 /* 793 * Reset the retransmission timer 794 */ 795
796 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 797 { 798 del_timer(&sk->retransmit_timer);
799 sk->ip_xmit_timeout = why;
800 if((int)when < 0)
801 { 802 when=3;
803 printk("Error: Negative timer in xmit_timer\n");
804 } 805 sk->retransmit_timer.expires=jiffies+when;
806 add_timer(&sk->retransmit_timer);
807 } 808
809 /* 810 * This is the normal code called for timeouts. It does the retransmission 811 * and then does backoff. tcp_do_retransmit is separated out because 812 * tcp_ack needs to send stuff from the retransmit queue without 813 * initiating a backoff. 814 */ 815
816
817 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 818 { 819 tcp_do_retransmit(sk, all);
820
821 /* 822 * Increase the timeout each time we retransmit. Note that 823 * we do not increase the rtt estimate. rto is initialized 824 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 825 * that doubling rto each time is the least we can get away with. 826 * In KA9Q, Karn uses this for the first few times, and then 827 * goes to quadratic. netBSD doubles, but only goes up to *64, 828 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 829 * defined in the protocol as the maximum possible RTT. I guess 830 * we'll have to use something other than TCP to talk to the 831 * University of Mars. 832 * 833 * PAWS allows us longer timeouts and large windows, so once 834 * implemented ftp to mars will work nicely. We will have to fix 835 * the 120 second clamps though! 836 */ 837
838 sk->retransmits++;
839 sk->prot->retransmits++;
840 sk->backoff++;
841 sk->rto = min(sk->rto << 1, 120*HZ);
842 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
843 } 844
845
846 /* 847 * A timer event has trigger a tcp retransmit timeout. The 848 * socket xmit queue is ready and set up to send. Because 849 * the ack receive code keeps the queue straight we do 850 * nothing clever here. 851 */ 852
853 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 854 { 855 if (all)
856 { 857 tcp_retransmit_time(sk, all);
858 return;
859 } 860
861 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 862 /* sk->ssthresh in theory can be zero. I guess that's OK */ 863 sk->cong_count = 0;
864
865 sk->cong_window = 1;
866
867 /* Do the actual retransmit. */ 868 tcp_retransmit_time(sk, all);
869 } 870
871 /* 872 * A write timeout has occurred. Process the after effects. 873 */ 874
875 staticinttcp_write_timeout(structsock *sk)
/* */ 876 { 877 /* 878 * Look for a 'soft' timeout. 879 */ 880 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
881 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
882 { 883 /* 884 * Attempt to recover if arp has changed (unlikely!) or 885 * a route has shifted (not supported prior to 1.3). 886 */ 887 ip_rt_advice(&sk->ip_route_cache, 0);
888 } 889
890 /* 891 * Have we tried to SYN too many times (repent repent 8)) 892 */ 893
894 if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
895 { 896 sk->err=ETIMEDOUT;
897 sk->error_report(sk);
898 del_timer(&sk->retransmit_timer);
899 tcp_statistics.TcpAttemptFails++; /* Is this right ??? - FIXME - */ 900 tcp_set_state(sk,TCP_CLOSE);
901 /* Don't FIN, we got nothing back */ 902 release_sock(sk);
903 return 0;
904 } 905 /* 906 * Has it gone just too far ? 907 */ 908 if (sk->retransmits > TCP_RETR2)
909 { 910 sk->err = ETIMEDOUT;
911 sk->error_report(sk);
912 del_timer(&sk->retransmit_timer);
913 /* 914 * Time wait the socket 915 */ 916 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
917 { 918 tcp_set_state(sk,TCP_TIME_WAIT);
919 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
920 } 921 else 922 { 923 /* 924 * Clean up time. 925 */ 926 tcp_set_state(sk, TCP_CLOSE);
927 release_sock(sk);
928 return 0;
929 } 930 } 931 return 1;
932 } 933
934 /* 935 * The TCP retransmit timer. This lacks a few small details. 936 * 937 * 1. An initial rtt timeout on the probe0 should cause what we can 938 * of the first write queue buffer to be split and sent. 939 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report 940 * ETIMEDOUT if we know an additional 'soft' error caused this. 941 * tcp_err should save a 'soft error' for us. 942 */ 943
/*
 * Timer callback; 'data' is the struct sock * cast to a long when the
 * timer was armed.  Dispatches on sk->ip_xmit_timeout to do zero-window
 * probing, retransmission or keepalive probing, and first flushes any
 * backlogged ack.
 */
static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;	/* reason the timer was armed */

	/*
	 * Only process if the socket is not in use; otherwise re-arm
	 * the timer and retry the whole job in one second.
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;	/* take the socket lock ourselves */
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing: the peer has advertised a zero window */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);	/* check soft/hard timeout limits */
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing unacked in flight; the ack above was all */
				restore_flags(flags);
			}
			else
			{
				/*
				 * Kicked by a delayed ack. Reset timer
				 * correctly now: the head segment's RTO has
				 * not actually expired yet.
				 */
				if (jiffies < skb->when + sk->rto)
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 * Genuine retransmission timeout.
				 */
				sk->retransmits++;
				sk->prot->retransmits++;
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);	/* may reset/close after too many tries */
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * This reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
1045 /*1046 * This routine is called by the ICMP module when it gets some1047 * sort of error condition. If err < 0 then the socket should1048 * be closed and the error returned to the user. If err > 01049 * it's just the icmp type << 8 | icmp code. After adjustment1050 * header points to the first 8 bytes of the tcp header. We need1051 * to find the appropriate port.1052 */1053
/*
 * ICMP error handler for TCP (calling convention described above).
 * 'header' points at the first 8 bytes of the offending TCP header,
 * so only the port fields of 'th' may be trusted.
 */
void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
	__u32 saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th = (struct tcphdr *)header;
	struct sock *sk;

	/*
	 * This one is _WRONG_. FIXME urgently.
	 * (Presumably because stepping back a fixed sizeof(struct iphdr)
	 * ignores IP options - unverified.)
	 */
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
	struct iphdr *iph=(struct iphdr *)(header-sizeof(struct iphdr));
#endif
	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;		/* no matching socket: nothing to report to */

	if (type == ICMP_SOURCE_QUENCH)
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	if (type == ICMP_PARAMETERPROB)
	{
		sk->err=EPROTO;
		sk->error_report(sk);
		/* NOTE(review): no return here, so execution falls through to
		 * the generic code<13 handling below - confirm intended. */
	}

#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
	{
		struct rtable * rt;
		/*
		 * Ugly trick to pass MTU to protocol layer.
		 * Really we should add argument "info" to error handler.
		 */
		unsigned short new_mtu = ntohs(iph->id);

		/* Clamp both the cached route and the socket mss to the new MTU */
		if ((rt = sk->ip_route_cache) != NULL)
			if (rt->rt_mtu > new_mtu)
				rt->rt_mtu = new_mtu;

		if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr))
			sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);

		return;
	}
#endif

	/*
	 * If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 */

	if (code < 13 && (icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV))
	{
		sk->err = icmp_err_convert[code].errno;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
		{
			/* The connection attempt itself has failed */
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);	/* Wake people up to see the error (see connect in sock.c) */
		}
	}
	return;
}
1129
1130 /*1131 * Walk down the receive queue counting readable data until we hit the end or we find a gap1132 * in the received data queue (ie a frame missing that needs sending to us). Not1133 * sorting using two queues as data arrives makes life so much harder.1134 */1135
/*
 * Count how many bytes are readable on sk's receive queue, walking
 * forward from copied_seq until a sequence hole or a PSH boundary.
 * Returns 0 for a NULL socket or an empty queue.
 */
static int tcp_readable(struct sock *sk)
{
	unsigned long counted;
	unsigned long amount;
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	save_flags(flags);
	cli();		/* keep the queue stable while we walk it */
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 * Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->h.th->seq))	/* Found a hole so stops here */
			break;
		sum = skb->len -(counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;		/* SYN occupies a sequence number but no byte */
		if (sum > 0)
		{					/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;	/* don't count the SYN as readable data */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop. This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue. Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read(). And the queue scan in tcp_read()
		 * was correct. Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
1207 /*1208 * LISTEN is a special case for select..1209 */1210 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */1211 {1212 if (sel_type == SEL_IN) {1213 intretval;
1214
1215 sk->inuse = 1;
1216 retval = (tcp_find_established(sk) != NULL);
1217 release_sock(sk);
1218 if (!retval)
1219 select_wait(&master_select_wakeup,wait);
1220 returnretval;
1221 }1222 return 0;
1223 }1224
1225
/*
 * Wait for a TCP event.
 *
 * Note that we don't need to set "sk->inuse", as the upper select layers
 * take care of normal races (between the test and the event) and we don't
 * go look at any of the socket buffers directly.
 *
 * Returns 1 when the requested condition already holds; otherwise
 * registers on sk->sleep and returns 0 (SEL_OUT after SEND_SHUTDOWN
 * returns 0 without waiting).
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type)
	{
		case SEL_IN:
			if (sk->err)
				return 1;	/* a pending error is "readable" */
			if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
				break;		/* not connected yet: wait */

			if (sk->shutdown & RCV_SHUTDOWN)
				return 1;	/* EOF is readable */

			if (sk->acked_seq == sk->copied_seq)
				break;		/* nothing new has been acked in */

			/*
			 * Readable unless the single unread byte is
			 * out-of-line urgent data.
			 */
			if (sk->urg_seq != sk->copied_seq ||
			    sk->acked_seq != sk->copied_seq+1 ||
			    sk->urginline || !sk->urg_data)
				return 1;
			break;

		case SEL_OUT:
			if (sk->err)
				return 1;
			if (sk->shutdown & SEND_SHUTDOWN)
				return 0;	/* further writes can never succeed */
			if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
				break;
			/*
			 * This is now right thanks to a small fix
			 * by Matt Dillon.
			 */

			/* Writable only if a full-sized frame would fit */
			if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
				break;
			return 1;

		case SEL_EX:
			if (sk->urg_data)
				return 1;	/* urgent data is the exceptional condition */
			break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}
1282 inttcp_ioctl(structsock *sk, intcmd, unsignedlongarg)
/* */1283 {1284 interr;
1285 switch(cmd)
1286 {1287
1288 caseTIOCINQ:
1289 #ifdef FIXME /* FIXME: */1290 caseFIONREAD:
1291 #endif1292 {1293 unsignedlongamount;
1294
1295 if (sk->state == TCP_LISTEN)
1296 return(-EINVAL);
1297
1298 sk->inuse = 1;
1299 amount = tcp_readable(sk);
1300 release_sock(sk);
1301 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1302 if(err)
1303 returnerr;
1304 put_user(amount, (int *)arg);
1305 return(0);
1306 }1307 caseSIOCATMARK:
1308 {1309 intansw = sk->urg_data && sk->urg_seq == sk->copied_seq;
1310
1311 err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1312 if (err)
1313 returnerr;
1314 put_user(answ,(int *) arg);
1315 return(0);
1316 }1317 caseTIOCOUTQ:
1318 {1319 unsignedlongamount;
1320
1321 if (sk->state == TCP_LISTEN) return(-EINVAL);
1322 amount = sock_wspace(sk);
1323 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1324 if(err)
1325 returnerr;
1326 put_user(amount, (int *)arg);
1327 return(0);
1328 }1329 default:
1330 return(-EINVAL);
1331 }1332 }1333
1334
1335 /*1336 * This routine computes a TCP checksum. 1337 *1338 * Modified January 1995 from a go-faster DOS routine by1339 * Jorge Cwik <jorge@laser.satlink.net>1340 */1341
/*
 * Compute the TCP checksum: fold the pseudo-header (addresses, length,
 * protocol) into 'base', the partial sum already accumulated over the
 * TCP header and data (see tcp_send_check).  The 'th' argument is
 * unused here and kept only for the established signature.
 */
unsigned short tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}
1348
1349
1350 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1351 unsignedlongdaddr, intlen, structsock *sk)
1352 {1353 th->check = 0;
1354 th->check = tcp_check(th, len, saddr, daddr,
1355 csum_partial((char *)th,len,0));
1356 return;
1357 }1358
1359 /*1360 * This is the main buffer sending routine. We queue the buffer1361 * having checked it is sane seeming.1362 */1363
/*
 * This is the main buffer sending routine. We queue the buffer
 * having checked it is sane seeming: the frame is either appended to
 * the write queue (window/Nagle/congestion limits) or transmitted
 * immediately.  Bogus frames are freed and dropped.
 */
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 * length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 * Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 * If we have queued a header size packet.. (these crash a few
	 * tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 * Actual processing.
	 */

	tcp_statistics.TcpOutSegs++;
	/* Right edge of the segment in sequence space (data length only) */
	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;

	/*
	 * We must queue if
	 *
	 * a) The right edge of this frame exceeds the window
	 * b) We are retransmitting (Nagle's rule)
	 * c) We have too many packets 'in flight'
	 */

	if (after(skb->h.seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit. So
		 * we shouldn't need to set it at all. I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 * If we don't fit we have to start the zero window
		 * probes. This is broken - we really need to do a partial
		 * send _first_ (This is what causes the Cisco and PC/TCP
		 * grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 * This is going straight out
		 */

		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 * This is mad. The tcp retransmit queue is put together
		 * by the ip layer. This causes half the problems with
		 * unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 * Set for next retransmit based on expected ACK time.
		 * FIXME: We set this every time which means our
		 * retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1474 /*1475 * Locking problems lead us to a messy situation where we can have1476 * multiple partially complete buffers queued up. This is really bad1477 * as we don't want to be sending partial buffers. Fix this with1478 * a semaphore or similar to lock tcp_write per socket.1479 *1480 * These routines are pretty self descriptive.1481 */1482
1483 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1484 {1485 structsk_buff * skb;
1486 unsignedlongflags;
1487
1488 save_flags(flags);
1489 cli();
1490 skb = sk->partial;
1491 if (skb) {1492 sk->partial = NULL;
1493 del_timer(&sk->partial_timer);
1494 }1495 restore_flags(flags);
1496 returnskb;
1497 }1498
1499 /*1500 * Empty the partial queue1501 */1502
1503 staticvoidtcp_send_partial(structsock *sk)
/* */1504 {1505 structsk_buff *skb;
1506
1507 if (sk == NULL)
1508 return;
1509 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1510 tcp_send_skb(sk, skb);
1511 }1512
1513 /*1514 * Queue a partial frame1515 */1516
1517 voidtcp_enqueue_partial(structsk_buff * skb, structsock * sk)
/* */1518 {1519 structsk_buff * tmp;
1520 unsignedlongflags;
1521
1522 save_flags(flags);
1523 cli();
1524 tmp = sk->partial;
1525 if (tmp)
1526 del_timer(&sk->partial_timer);
1527 sk->partial = skb;
1528 init_timer(&sk->partial_timer);
1529 /*1530 * Wait up to 1 second for the buffer to fill.1531 */1532 sk->partial_timer.expires = jiffies+HZ;
1533 sk->partial_timer.function = (void (*)(unsignedlong)) tcp_send_partial;
1534 sk->partial_timer.data = (unsignedlong) sk;
1535 add_timer(&sk->partial_timer);
1536 restore_flags(flags);
1537 if (tmp)
1538 tcp_send_skb(sk, tmp);
1539 }1540
1541
1542 /*1543 * This routine sends an ack and also updates the window. 1544 */1545
/*
 * This routine sends an ack and also updates the window.
 *
 * 'sequence'/'ack' are the host-order sequence and ack numbers to
 * send; 'th' is the header of the segment being acknowledged (used as
 * a template for the reply).  If no memory is available the ack is
 * merely backlogged - ACKs are unreliable anyway.
 */
static void tcp_send_ack(u32 sequence, u32 ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 * Force it to send an ack. We don't have to do this
		 * (ACK is unreliable) but it's much better use of
		 * bandwidth on slow links to send a spare ack than
		 * resend packets.
		 */

		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 * Assemble a suitable TCP frame
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 * Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		/* Header build failed: give the buffer back */
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1, th, sizeof(*t1));	/* start from the peer's header as template */

	/*
	 * Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 * If we have nothing queued for transmit and the transmit timer
	 * is on we are just doing an ACK timeout and need to switch
	 * to a keepalive.
	 */

	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
			&& sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 * Fill in the packet and send it
	 */

	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1657
1658 /* 1659 * This routine builds a generic TCP header. 1660 */1661
1662 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1663 {1664
1665 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1666 th->seq = htonl(sk->write_seq);
1667 th->psh =(push == 0) ? 1 : 0;
1668 th->doff = sizeof(*th)/4;
1669 th->ack = 1;
1670 th->fin = 0;
1671 sk->ack_backlog = 0;
1672 sk->bytes_rcv = 0;
1673 sk->ack_timed = 0;
1674 th->ack_seq = htonl(sk->acked_seq);
1675 sk->window = tcp_select_window(sk);
1676 th->window = htons(sk->window);
1677
1678 return(sizeof(*th));
1679 }1680
1681 /*1682 * This routine copies from a user buffer into a socket,1683 * and starts the transmit system.1684 */1685
1686 staticinttcp_sendmsg(structsock *sk, structmsghdr *msg,
/* */1687 intlen, intnonblock, intflags)
1688 {1689 intcopied = 0;
1690 intcopy;
1691 inttmp;
1692 intseglen;
1693 intiovct=0;
1694 structsk_buff *skb;
1695 structsk_buff *send_tmp;
1696 structproto *prot;
1697 structdevice *dev = NULL;
1698 unsignedchar *from;
1699
1700 /*1701 * Do sanity checking for sendmsg/sendto/send1702 */1703
1704 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1705 return -EINVAL;
1706 if (msg->msg_name)
1707 {1708 structsockaddr_in *addr=(structsockaddr_in *)msg->msg_name;
1709 if(sk->state == TCP_CLOSE)
1710 return -ENOTCONN;
1711 if (msg->msg_namelen < sizeof(*addr))
1712 return -EINVAL;
1713 if (addr->sin_family && addr->sin_family != AF_INET)
1714 return -EINVAL;
1715 if (addr->sin_port != sk->dummy_th.dest)
1716 return -EISCONN;
1717 if (addr->sin_addr.s_addr != sk->daddr)
1718 return -EISCONN;
1719 }1720
1721 /*1722 * Ok commence sending1723 */1724
1725 while(iovct<msg->msg_iovlen)
1726 {1727 seglen=msg->msg_iov[iovct].iov_len;
1728 from=msg->msg_iov[iovct++].iov_base;
1729 sk->inuse=1;
1730 prot = sk->prot;
1731 while(seglen > 0)
1732 {1733 if (sk->err)
1734 {/* Stop on an error */1735 release_sock(sk);
1736 if (copied)
1737 return(copied);
1738 returnsock_error(sk);
1739 }1740
1741 /*1742 * First thing we do is make sure that we are established. 1743 */1744
1745 if (sk->shutdown & SEND_SHUTDOWN)
1746 {1747 release_sock(sk);
1748 sk->err = EPIPE;
1749 if (copied)
1750 return(copied);
1751 sk->err = 0;
1752 return(-EPIPE);
1753 }1754
1755 /* 1756 * Wait for a connection to finish.1757 */1758
1759 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
1760 {1761 if (sk->err)
1762 {1763 release_sock(sk);
1764 if (copied)
1765 return(copied);
1766 returnsock_error(sk);
1767 }1768
1769 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
1770 {1771 release_sock(sk);
1772 if (copied)
1773 return(copied);
1774
1775 if (sk->err)
1776 returnsock_error(sk);
1777
1778 if (sk->keepopen)
1779 {1780 send_sig(SIGPIPE, current, 0);
1781 }1782 return(-EPIPE);
1783 }1784
1785 if (nonblock || copied)
1786 {1787 release_sock(sk);
1788 if (copied)
1789 return(copied);
1790 return(-EAGAIN);
1791 }1792
1793 release_sock(sk);
1794 cli();
1795
1796 if (sk->state != TCP_ESTABLISHED &&
1797 sk->state != TCP_CLOSE_WAIT && sk->err == 0)
1798 {1799 interruptible_sleep_on(sk->sleep);
1800 if (current->signal & ~current->blocked)
1801 {1802 sti();
1803 if (copied)
1804 return(copied);
1805 return(-ERESTARTSYS);
1806 }1807 }1808 sk->inuse = 1;
1809 sti();
1810 }1811
1812 /*1813 * The following code can result in copy <= if sk->mss is ever1814 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).1815 * sk->mtu is constant once SYN processing is finished. I.e. we1816 * had better not get here until we've seen his SYN and at least one1817 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)1818 * But ESTABLISHED should guarantee that. sk->max_window is by definition1819 * non-decreasing. Note that any ioctl to set user_mss must be done1820 * before the exchange of SYN's. If the initial ack from the other1821 * end has a window of 0, max_window and thus mss will both be 0.1822 */1823
1824 /* 1825 * Now we need to check if we have a half built packet. 1826 */1827 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1828 /*1829 * FIXME: I'm almost sure that this fragment is BUG,1830 * but it works... I do not know why 8) --ANK1831 *1832 * Really, we should rebuild all the queues...1833 * It's difficult. Temprorary hack is to send all1834 * queued segments with allowed fragmentation.1835 */1836 {1837 intnew_mss = min(sk->mtu, sk->max_window);
1838 if (new_mss < sk->mss)
1839 {1840 tcp_send_partial(sk);
1841 sk->mss = new_mss;
1842 }1843 }1844 #endif1845
1846 if ((skb = tcp_dequeue_partial(sk)) != NULL)
1847 {1848 inthdrlen;
1849
1850 /* IP header + TCP header */1851 hdrlen = ((unsignedlong)skb->h.th - (unsignedlong)skb->data)
1852 + sizeof(structtcphdr);
1853
1854 /* Add more stuff to the end of skb->len */1855 if (!(flags & MSG_OOB))
1856 {1857 copy = min(sk->mss - (skb->len - hdrlen), seglen);
1858 if (copy <= 0)
1859 {1860 printk("TCP: **bug**: \"copy\" <= 0\n");
1861 return -EFAULT;
1862 }1863 memcpy_fromfs(skb_put(skb,copy), from, copy);
1864 from += copy;
1865 copied += copy;
1866 len -= copy;
1867 sk->write_seq += copy;
1868 seglen -= copy;
1869 }1870 if ((skb->len - hdrlen) >= sk->mss ||
1871 (flags & MSG_OOB) || !sk->packets_out)
1872 tcp_send_skb(sk, skb);
1873 else1874 tcp_enqueue_partial(skb, sk);
1875 continue;
1876 }1877
1878 /*1879 * We also need to worry about the window.1880 * If window < 1/2 the maximum window we've seen from this1881 * host, don't use it. This is sender side1882 * silly window prevention, as specified in RFC1122.1883 * (Note that this is different than earlier versions of1884 * SWS prevention, e.g. RFC813.). What we actually do is 1885 * use the whole MSS. Since the results in the right1886 * edge of the packet being outside the window, it will1887 * be queued for later rather than sent.1888 */1889
1890 copy = sk->window_seq - sk->write_seq;
1891 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1892 copy = sk->mss;
1893 if (copy > seglen)
1894 copy = seglen;
1895
1896 /*1897 * We should really check the window here also. 1898 */1899
1900 send_tmp = NULL;
1901 if (copy < sk->mss && !(flags & MSG_OOB))
1902 {1903 /*1904 * We will release the socket in case we sleep here. 1905 */1906 release_sock(sk);
1907 /*1908 * NB: following must be mtu, because mss can be increased.1909 * mss is always <= mtu 1910 */1911 skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1912 sk->inuse = 1;
1913 send_tmp = skb;
1914 }1915 else1916 {1917 /*1918 * We will release the socket in case we sleep here. 1919 */1920 release_sock(sk);
1921 skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1922 sk->inuse = 1;
1923 }1924
1925 /*1926 * If we didn't get any memory, we need to sleep. 1927 */1928
1929 if (skb == NULL)
1930 {1931 sk->socket->flags |= SO_NOSPACE;
1932 if (nonblock)
1933 {1934 release_sock(sk);
1935 if (copied)
1936 return(copied);
1937 return(-EAGAIN);
1938 }1939
1940 /*1941 * FIXME: here is another race condition. 1942 */1943
1944 tmp = sk->wmem_alloc;
1945 release_sock(sk);
1946 cli();
1947 /*1948 * Again we will try to avoid it. 1949 */1950 if (tmp <= sk->wmem_alloc &&
1951 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1952 && sk->err == 0)
1953 {1954 sk->socket->flags &= ~SO_NOSPACE;
1955 interruptible_sleep_on(sk->sleep);
1956 if (current->signal & ~current->blocked)
1957 {1958 sti();
1959 if (copied)
1960 return(copied);
1961 return(-ERESTARTSYS);
1962 }1963 }1964 sk->inuse = 1;
1965 sti();
1966 continue;
1967 }1968
1969 skb->sk = sk;
1970 skb->free = 0;
1971 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1972
1973 /*1974 * FIXME: we need to optimize this.1975 * Perhaps some hints here would be good.1976 */1977
1978 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1979 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1980 if (tmp < 0 )
1981 {1982 sock_wfree(sk, skb);
1983 release_sock(sk);
1984 if (copied)
1985 return(copied);
1986 return(tmp);
1987 }1988 #ifndefCONFIG_NO_PATH_MTU_DISCOVERY1989 skb->ip_hdr->frag_off |= htons(IP_DF);
1990 #endif1991 skb->dev = dev;
1992 skb->h.th =(structtcphdr *)skb_put(skb,sizeof(structtcphdr));
1993 tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
1994 if (tmp < 0)
1995 {1996 sock_wfree(sk, skb);
1997 release_sock(sk);
1998 if (copied)
1999 return(copied);
2000 return(tmp);
2001 }2002
2003 if (flags & MSG_OOB)
2004 {2005 skb->h.th->urg = 1;
2006 skb->h.th->urg_ptr = ntohs(copy);
2007 }2008
2009 memcpy_fromfs(skb_put(skb,copy), from, copy);
2010
2011 from += copy;
2012 copied += copy;
2013 len -= copy;
2014 seglen -= copy;
2015 skb->free = 0;
2016 sk->write_seq += copy;
2017
2018 if (send_tmp != NULL && sk->packets_out)
2019 {2020 tcp_enqueue_partial(send_tmp, sk);
2021 continue;
2022 }2023 tcp_send_skb(sk, skb);
2024 }2025 }2026 sk->err = 0;
2027
2028 /*2029 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly2030 * interactive fast network servers. It's meant to be on and2031 * it really improves the throughput though not the echo time2032 * on my slow slip link - Alan2033 */2034
2035 /*2036 * Avoid possible race on send_tmp - c/o Johannes Stille 2037 */2038
2039 if(sk->partial && ((!sk->packets_out)
2040 /* If not nagling we can send on the before case too.. */2041 || (sk->nonagle && before(sk->write_seq , sk->window_seq))
2042 ))
2043 tcp_send_partial(sk);
2044
2045 release_sock(sk);
2046 return(copied);
2047 }2048
2049 /*2050 * Send an ack if one is backlogged at this point. Ought to merge2051 * this with tcp_send_ack().2052 */2053
/*
 * Send an ack if one is backlogged at this point. Ought to merge
 * this with tcp_send_ack().  Builds a bare ACK segment from the
 * socket's template header; if memory is short it just re-arms the
 * timer to retry shortly.
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog)
		return;		/* nothing owed */

	/*
	 * If we're closed, don't send an ack, or we'll get a RST
	 * from the closed destination.
	 */
	if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
		return;

	/*
	 * FIXME: we need to put code here to prevent this routine from
	 * being called. Being called once in a while is ok, so only check
	 * if this is the second time in a row.
	 */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 * Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		/* Couldn't route it: drop the buffer */
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	/* Start from the socket template and mark it a pure ack */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2128
2129 /*2130 * FIXME:2131 * This routine frees used buffers.2132 * It should consider sending an ACK to let the2133 * other end know we now have a bigger window.2134 */2135
/*
 * Free fully-consumed buffers on the receive queue and, if that
 * changed the available receive space, arrange for the window update
 * to reach the peer (immediately or via a short ack timer).
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	left = sock_rspace(sk);	/* space before we free anything */

	/*
	 * We have to loop through all the buffer headers,
	 * and try to free up all the space we can.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		/* Stop at the first buffer still unread or in use */
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 * FIXME:
	 * At this point we should send an ack if the difference
	 * in the window, and the amount of space is bigger than
	 * TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sock_rspace(sk),
			left);
	if ((rspace=sock_rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble. The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window. If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately. Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
	/*
	 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
	 * if the other end is offering a window smaller than the agreed on MSS
	 * (called sk->mtu here). In theory there's no connection between send
	 * and receive, and so no reason to think that they're going to send
	 * small packets. For the moment I'm using the hack of reducing the mss
	 * only on the send side, so I'm putting mtu here.
	 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			/* NOTE(review): this compares against sk->timer.expires
			 * although the timer deleted above is
			 * sk->retransmit_timer - confirm which timer is meant. */
			if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);
		}
	}
}
2218
2219 /*2220 * Handle reading urgent data. BSD has very simple semantics for2221 * this, no blocking and very strange errors 8)2222 */2223
/*
 * Handle reading urgent data. BSD has very simple semantics for
 * this, no blocking and very strange errors 8)
 *
 * Returns 1 with the single urgent byte copied out, 0 at end of
 * connection, -EINVAL when no out-of-line urgent byte exists,
 * -EAGAIN when the byte is expected but not yet here.
 */
static int tcp_recv_urg(struct sock * sk, int nonblock,
	 struct msghdr *msg, int len, int flags, int *addr_len)
{
	/*
	 * No URG data to read
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
		return sock_error(sk);

	if (sk->state == TCP_CLOSE || sk->done)
	{
		if (!sk->done)
		{
			/* First EOF report succeeds with 0 */
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		/* Low byte of urg_data holds the urgent byte itself */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;	/* consume it */
		memcpy_toiovec(msg->msg_iov, &c, 1);
		if(msg->msg_name)
		{
			/* Fill in the peer's address for recvfrom() callers */
			struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
			sin->sin_family=AF_INET;
			sin->sin_addr.s_addr=sk->daddr;
			sin->sin_port=sk->dummy_th.dest;
		}
		if(addr_len)
			*addr_len=sizeof(struct sockaddr_in);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2282
2283 /*2284 * This routine copies from a sock struct into the user buffer. 2285 */2286
/*
 *	Copy received TCP data from the socket's receive queue into the
 *	user's iovec.  Handles peeking (MSG_PEEK), urgent-data boundaries,
 *	blocking/non-blocking operation, and FIN processing.
 *
 *	Returns the number of bytes copied, 0 at EOF, or a negative errno.
 */
static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags, int *addr_len)
{
	struct wait_queue wait = {current, NULL};
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */

	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 *	A PEEK reads through a private copy so the real
	 *	copied_seq is left untouched.
	 */

	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;		/* lock out the bottom half */
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 *	Are we at urgent data? Stop if we have read anything.
		 *	(Urgent data marks a read boundary.)
		 */

		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.  Set the task state before scanning
		 *	so a wakeup between the scan and schedule() is not lost.
		 */

		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* A hole in the sequence space: nothing usable yet. */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies one sequence number */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			/* Fully consumed buffer: mark it for cleanup_rbuf(). */
			if (!(flags & MSG_PEEK))
				skb->used = 1;
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* Return whatever we already copied rather than blocking. */
		if (copied)
			break;

		if (sk->err)
		{
			copied = sock_error(sk);
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;	/* first EOF report */
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Ack consumed data, drop the lock and wait for more. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */

		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */

		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?  If so, stop the copy at
		 *	the urgent byte and (unless urginline) skip over it.
		 */

		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					if (!sk->urginline)
					{
						++*seq;	/* step past the urgent byte */
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;	/* copy only up to it */
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */

		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */

		memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */

		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;	/* urgent byte is behind us now */
		if (used + offset < skb->len)
			continue;	/* more data left in this buffer */

		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;	/* buffer drained, reclaimable */
		continue;

	found_fin_ok:
		++*seq;		/* FIN occupies one sequence number */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */

		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}

	/* Fill in the peer's address for recvfrom() style callers. */
	if(copied>0 && msg->msg_name)
	{
		struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
		sin->sin_family=AF_INET;
		sin->sin_addr.s_addr=sk->daddr;
		sin->sin_port=sk->dummy_th.dest;
	}
	if(addr_len)
		*addr_len=sizeof(struct sockaddr_in);

	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2523
2524
2525 /*2526 * State processing on a close. This implements the state shift for2527 * sending our FIN frame. Note that we only send a FIN for some 2528 * states. A shutdown() may have already sent the FIN, or we may be2529 * closed.2530 */2531
/*
 *	Perform the TCP state transition for a local close, and decide
 *	whether a FIN must be transmitted.  'dead' is non-zero when the
 *	socket is being fully closed (not just shutdown).
 *
 *	Returns 1 if the caller should send a FIN, 0 otherwise.
 */
static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;
	int send_fin=0;
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
	}

	tcp_set_state(sk,ns);

	/*
	 *	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 *
	 *	Only arm the FIN_WAIT2 timeout if no other timer is already
	 *	pending (del_timer tells us, and we re-add it if it was).
	 */
	if(dead && ns==TCP_FIN_WAIT2)
	{
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}

	return send_fin;
}
2582 /*2583 * Send a fin.2584 */2585
2586 staticvoidtcp_send_fin(structsock *sk)
/* */2587 {2588 structproto *prot =(structproto *)sk->prot;
2589 structtcphdr *th =(structtcphdr *)&sk->dummy_th;
2590 structtcphdr *t1;
2591 structsk_buff *buff;
2592 structdevice *dev=NULL;
2593 inttmp;
2594
2595 release_sock(sk); /* in case the malloc sleeps. */2596
2597 buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2598 sk->inuse = 1;
2599
2600 if (buff == NULL)
2601 {2602 /* This is a disaster if it occurs */2603 printk("tcp_send_fin: Impossible malloc failure");
2604 return;
2605 }2606
2607 /*2608 * Administrivia2609 */2610
2611 buff->sk = sk;
2612 buff->localroute = sk->localroute;
2613
2614 /*2615 * Put in the IP header and routing stuff. 2616 */2617
2618 tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2619 IPPROTO_TCP, sk->opt,
2620 sizeof(structtcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2621 if (tmp < 0)
2622 {2623 intt;
2624 /*2625 * Finish anyway, treat this as a send that got lost. 2626 * (Not good).2627 */2628
2629 buff->free = 1;
2630 sock_wfree(sk,buff);
2631 sk->write_seq++;
2632 t=del_timer(&sk->timer);
2633 if(t)
2634 add_timer(&sk->timer);
2635 else2636 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2637 return;
2638 }2639
2640 /*2641 * We ought to check if the end of the queue is a buffer and2642 * if so simply add the fin to that buffer, not send it ahead.2643 */2644
2645 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2646 buff->dev = dev;
2647 memcpy(t1, th, sizeof(*t1));
2648 t1->seq = ntohl(sk->write_seq);
2649 sk->write_seq++;
2650 buff->h.seq = sk->write_seq;
2651 t1->ack = 1;
2652 t1->ack_seq = ntohl(sk->acked_seq);
2653 t1->window = ntohs(sk->window=tcp_select_window(sk));
2654 t1->fin = 1;
2655 t1->rst = 0;
2656 t1->doff = sizeof(*t1)/4;
2657 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2658
2659 /*2660 * If there is data in the write queue, the fin must be appended to2661 * the write queue.2662 */2663
2664 if (skb_peek(&sk->write_queue) != NULL)
2665 {2666 buff->free = 0;
2667 if (buff->next != NULL)
2668 {2669 printk("tcp_send_fin: next != NULL\n");
2670 skb_unlink(buff);
2671 }2672 skb_queue_tail(&sk->write_queue, buff);
2673 }2674 else2675 {2676 sk->sent_seq = sk->write_seq;
2677 sk->prot->queue_xmit(sk, dev, buff, 0);
2678 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2679 }2680 }2681
2682 /*2683 * Shutdown the sending side of a connection. Much like close except2684 * that we don't receive shut down or set sk->dead=1.2685 */2686
2687 voidtcp_shutdown(structsock *sk, inthow)
/* */2688 {2689 /*2690 * We need to grab some memory, and put together a FIN,2691 * and then put it into the queue to be sent.2692 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2693 */2694
2695 if (!(how & SEND_SHUTDOWN))
2696 return;
2697
2698 /*2699 * If we've already sent a FIN, or it's a closed state2700 */2701
2702 if (sk->state == TCP_FIN_WAIT1 ||
2703 sk->state == TCP_FIN_WAIT2 ||
2704 sk->state == TCP_CLOSING ||
2705 sk->state == TCP_LAST_ACK ||
2706 sk->state == TCP_TIME_WAIT ||
2707 sk->state == TCP_CLOSE ||
2708 sk->state == TCP_LISTEN2709 )
2710 {2711 return;
2712 }2713 sk->inuse = 1;
2714
2715 /*2716 * flag that the sender has shutdown2717 */2718
2719 sk->shutdown |= SEND_SHUTDOWN;
2720
2721 /*2722 * Clear out any half completed packets. 2723 */2724
2725 if (sk->partial)
2726 tcp_send_partial(sk);
2727
2728 /*2729 * FIN if needed2730 */2731
2732 if(tcp_close_state(sk,0))
2733 tcp_send_fin(sk);
2734
2735 release_sock(sk);
2736 }2737
2738 /*2739 * This routine will send an RST to the other tcp. 2740 */2741
/*
 *	Build and transmit an RST in response to the given segment.
 *	Used for segments arriving at closed or unsynchronised sockets.
 *	The reply's sequence/ack fields follow RFC 793: echo the ack_seq
 *	as our seq if the offending segment had ACK set, otherwise ack
 *	the segment's own sequence (plus one for a SYN).
 */
static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (Think about it).
	 */

	if(th->rst)
		return;

	/*
	 *	We need to grab some memory, and put together an RST,
	 *	and then put it into the queue to be sent.
	 *	(No owning socket: charge it to nobody, atomic allocation
	 *	since we may be in interrupt context.)
	 */

	buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	  	return;

	buff->sk = NULL;
	buff->dev = dev;
	buff->localroute = 0;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl,NULL);
	if (tmp < 0)
	{
  		buff->free = 1;
		sock_wfree(NULL, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	memcpy(t1, th, sizeof(*t1));	/* start from the offending header */

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;
	t1->window = 0;

	if(th->ack)
	{
		/* They acked something: our seq is what they acked. */
	  	t1->ack = 0;
	  	t1->seq = th->ack_seq;
	  	t1->ack_seq = 0;
	}
	else
	{
		/* No ACK: we must ack their segment (SYN counts as one octet). */
	  	t1->ack = 1;
	  	if(!th->syn)
			t1->ack_seq=htonl(th->seq);
		else
			t1->ack_seq=htonl(th->seq+1);
		t1->seq=0;
	}

	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2821
2822 /*2823 * Look for tcp options. Parses everything but only knows about MSS.2824 * This routine is always called with the packet containing the SYN.2825 * However it may also be called with the ack to the SYN. So you2826 * can't assume this is always the SYN. It's always called after2827 * we have set up sk->mtu to our own MTU.2828 *2829 * We need at minimum to add PAWS support here. Possibly large windows2830 * as Linux gets deployed on 100Mb/sec networks.2831 */2832
2833 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2834 {2835 unsignedchar *ptr;
2836 intlength=(th->doff*4)-sizeof(structtcphdr);
2837 intmss_seen = 0;
2838
2839 ptr = (unsignedchar *)(th + 1);
2840
2841 while(length>0)
2842 {2843 intopcode=*ptr++;
2844 intopsize=*ptr++;
2845 switch(opcode)
2846 {2847 caseTCPOPT_EOL:
2848 return;
2849 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2850 length--;
2851 ptr--; /* the opsize=*ptr++ above was a mistake */2852 continue;
2853
2854 default:
2855 if(opsize<=2) /* Avoid silly options looping forever */2856 return;
2857 switch(opcode)
2858 {2859 caseTCPOPT_MSS:
2860 if(opsize==4 && th->syn)
2861 {2862 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2863 mss_seen = 1;
2864 }2865 break;
2866 /* Add other options here as people feel the urge to implement stuff like large windows */2867 }2868 ptr+=opsize-2;
2869 length-=opsize;
2870 }2871 }2872 if (th->syn)
2873 {2874 if (! mss_seen)
2875 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2876 }2877 #ifdefCONFIG_INET_PCTCP2878 sk->mss = min(sk->max_window >> 1, sk->mtu);
2879 #else2880 sk->mss = min(sk->max_window, sk->mtu);
2881 #endif2882 }2883
/*
 *	Return the classful (A/B/C) network mask for a destination
 *	address.  Both the argument and the result are in network
 *	byte order.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);	/* classify in host order */
	unsigned long mask;

	if (IN_CLASSA(host))
		mask = IN_CLASSA_NET;
	else if (IN_CLASSB(host))
		mask = IN_CLASSB_NET;
	else
		mask = IN_CLASSC_NET;

	return htonl(mask);
}
2894 /*2895 * Default sequence number picking algorithm.2896 * As close as possible to RFC 793, which2897 * suggests using a 250kHz clock.2898 * Further reading shows this assumes 2MB/s networks.2899 * For 10MB/s ethernet, a 1MHz clock is appropriate.2900 * That's funny, Linux has one built in! Use it!2901 */2902
2903 externinlineu32tcp_init_seq(void)
/* */2904 {2905 structtimevaltv;
2906 do_gettimeofday(&tv);
2907 returntv.tv_usec+tv.tv_sec*1000000;
2908 }2909
2910 /*2911 * This routine handles a connection request.2912 * It should make sure we haven't already responded.2913 * Because of the way BSD works, we have to send a syn/ack now.2914 * This also means it will be harder to close a socket which is2915 * listening.2916 */2917
2918 staticvoidtcp_conn_request(structsock *sk, structsk_buff *skb,
/* */2919 unsignedlongdaddr, unsignedlongsaddr,
2920 structoptions *opt, structdevice *dev, u32seq)
2921 {2922 structsk_buff *buff;
2923 structtcphdr *t1;
2924 unsignedchar *ptr;
2925 structsock *newsk;
2926 structtcphdr *th;
2927 structdevice *ndev=NULL;
2928 inttmp;
2929 structrtable *rt;
2930
2931 th = skb->h.th;
2932
2933 /* If the socket is dead, don't accept the connection. */2934 if (!sk->dead)
2935 {2936 sk->data_ready(sk,0);
2937 }2938 else2939 {2940 if(sk->debug)
2941 printk("Reset on %p: Connect on dead socket.\n",sk);
2942 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2943 tcp_statistics.TcpAttemptFails++;
2944 kfree_skb(skb, FREE_READ);
2945 return;
2946 }2947
2948 /*2949 * Make sure we can accept more. This will prevent a2950 * flurry of syns from eating up all our memory.2951 */2952
2953 if (sk->ack_backlog >= sk->max_ack_backlog)
2954 {2955 tcp_statistics.TcpAttemptFails++;
2956 kfree_skb(skb, FREE_READ);
2957 return;
2958 }2959
2960 /*2961 * We need to build a new sock struct.2962 * It is sort of bad to have a socket without an inode attached2963 * to it, but the wake_up's will just wake up the listening socket,2964 * and if the listening socket is destroyed before this is taken2965 * off of the queue, this will take care of it.2966 */2967
2968 newsk = (structsock *) kmalloc(sizeof(structsock), GFP_ATOMIC);
2969 if (newsk == NULL)
2970 {2971 /* just ignore the syn. It will get retransmitted. */2972 tcp_statistics.TcpAttemptFails++;
2973 kfree_skb(skb, FREE_READ);
2974 return;
2975 }2976
2977 memcpy(newsk, sk, sizeof(*newsk));
2978 newsk->opt = NULL;
2979 newsk->ip_route_cache = NULL;
2980 if (opt && opt->optlen) {2981 sk->opt = (structoptions*)kmalloc(sizeof(structoptions)+opt->optlen, GFP_ATOMIC);
2982 if (!sk->opt) {2983 kfree_s(newsk, sizeof(structsock));
2984 tcp_statistics.TcpAttemptFails++;
2985 kfree_skb(skb, FREE_READ);
2986 return;
2987 }2988 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {2989 kfree_s(sk->opt, sizeof(structoptions)+opt->optlen);
2990 kfree_s(newsk, sizeof(structsock));
2991 tcp_statistics.TcpAttemptFails++;
2992 kfree_skb(skb, FREE_READ);
2993 return;
2994 }2995 }2996 skb_queue_head_init(&newsk->write_queue);
2997 skb_queue_head_init(&newsk->receive_queue);
2998 newsk->send_head = NULL;
2999 newsk->send_tail = NULL;
3000 skb_queue_head_init(&newsk->back_log);
3001 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/3002 newsk->rto = TCP_TIMEOUT_INIT;
3003 newsk->mdev = 0;
3004 newsk->max_window = 0;
3005 newsk->cong_window = 1;
3006 newsk->cong_count = 0;
3007 newsk->ssthresh = 0;
3008 newsk->backoff = 0;
3009 newsk->blog = 0;
3010 newsk->intr = 0;
3011 newsk->proc = 0;
3012 newsk->done = 0;
3013 newsk->partial = NULL;
3014 newsk->pair = NULL;
3015 newsk->wmem_alloc = 0;
3016 newsk->rmem_alloc = 0;
3017 newsk->localroute = sk->localroute;
3018
3019 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3020
3021 newsk->err = 0;
3022 newsk->shutdown = 0;
3023 newsk->ack_backlog = 0;
3024 newsk->acked_seq = skb->h.th->seq+1;
3025 newsk->copied_seq = skb->h.th->seq+1;
3026 newsk->fin_seq = skb->h.th->seq;
3027 newsk->state = TCP_SYN_RECV;
3028 newsk->timeout = 0;
3029 newsk->ip_xmit_timeout = 0;
3030 newsk->write_seq = seq;
3031 newsk->window_seq = newsk->write_seq;
3032 newsk->rcv_ack_seq = newsk->write_seq;
3033 newsk->urg_data = 0;
3034 newsk->retransmits = 0;
3035 newsk->linger=0;
3036 newsk->destroy = 0;
3037 init_timer(&newsk->timer);
3038 newsk->timer.data = (unsignedlong)newsk;
3039 newsk->timer.function = &net_timer;
3040 init_timer(&newsk->retransmit_timer);
3041 newsk->retransmit_timer.data = (unsignedlong)newsk;
3042 newsk->retransmit_timer.function=&retransmit_timer;
3043 newsk->dummy_th.source = skb->h.th->dest;
3044 newsk->dummy_th.dest = skb->h.th->source;
3045
3046 /*3047 * Swap these two, they are from our point of view. 3048 */3049
3050 newsk->daddr = saddr;
3051 newsk->saddr = daddr;
3052 newsk->rcv_saddr = daddr;
3053
3054 put_sock(newsk->num,newsk);
3055 newsk->dummy_th.res1 = 0;
3056 newsk->dummy_th.doff = 6;
3057 newsk->dummy_th.fin = 0;
3058 newsk->dummy_th.syn = 0;
3059 newsk->dummy_th.rst = 0;
3060 newsk->dummy_th.psh = 0;
3061 newsk->dummy_th.ack = 0;
3062 newsk->dummy_th.urg = 0;
3063 newsk->dummy_th.res2 = 0;
3064 newsk->acked_seq = skb->h.th->seq + 1;
3065 newsk->copied_seq = skb->h.th->seq + 1;
3066 newsk->socket = NULL;
3067
3068 /*3069 * Grab the ttl and tos values and use them 3070 */3071
3072 newsk->ip_ttl=sk->ip_ttl;
3073 newsk->ip_tos=skb->ip_hdr->tos;
3074
3075 /*3076 * Use 512 or whatever user asked for 3077 */3078
3079 /*3080 * Note use of sk->user_mss, since user has no direct access to newsk 3081 */3082
3083 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
3084 newsk->ip_route_cache = rt;
3085
3086 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3087 newsk->window_clamp = rt->rt_window;
3088 else3089 newsk->window_clamp = 0;
3090
3091 if (sk->user_mss)
3092 newsk->mtu = sk->user_mss;
3093 elseif (rt)
3094 newsk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
3095 else3096 newsk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
3097
3098 /*3099 * But not bigger than device MTU 3100 */3101
3102 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
3103
3104 #ifdefCONFIG_SKIP3105
3106 /*3107 * SKIP devices set their MTU to 65535. This is so they can take packets3108 * unfragmented to security process then fragment. They could lie to the3109 * TCP layer about a suitable MTU, but its easier to let skip sort it out3110 * simply because the final package we want unfragmented is going to be3111 *3112 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]3113 */3114
3115 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */3116 sk->mtu=skip_pick_mtu(sk->mtu,dev);
3117 #endif3118 /*3119 * This will min with what arrived in the packet 3120 */3121
3122 tcp_options(newsk,skb->h.th);
3123
3124 tcp_cache_zap();
3125
3126 buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3127 if (buff == NULL)
3128 {3129 sk->err = ENOMEM;
3130 newsk->dead = 1;
3131 newsk->state = TCP_CLOSE;
3132 /* And this will destroy it */3133 release_sock(newsk);
3134 kfree_skb(skb, FREE_READ);
3135 tcp_statistics.TcpAttemptFails++;
3136 return;
3137 }3138
3139 buff->sk = newsk;
3140 buff->localroute = newsk->localroute;
3141
3142 /*3143 * Put in the IP header and routing stuff. 3144 */3145
3146 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3147 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
3148
3149 /*3150 * Something went wrong. 3151 */3152
3153 if (tmp < 0)
3154 {3155 sk->err = tmp;
3156 buff->free = 1;
3157 kfree_skb(buff,FREE_WRITE);
3158 newsk->dead = 1;
3159 newsk->state = TCP_CLOSE;
3160 release_sock(newsk);
3161 skb->sk = sk;
3162 kfree_skb(skb, FREE_READ);
3163 tcp_statistics.TcpAttemptFails++;
3164 return;
3165 }3166
3167 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
3168
3169 memcpy(t1, skb->h.th, sizeof(*t1));
3170 buff->h.seq = newsk->write_seq;
3171 /*3172 * Swap the send and the receive. 3173 */3174 t1->dest = skb->h.th->source;
3175 t1->source = newsk->dummy_th.source;
3176 t1->seq = ntohl(newsk->write_seq++);
3177 t1->ack = 1;
3178 newsk->window = tcp_select_window(newsk);
3179 newsk->sent_seq = newsk->write_seq;
3180 t1->window = ntohs(newsk->window);
3181 t1->res1 = 0;
3182 t1->res2 = 0;
3183 t1->rst = 0;
3184 t1->urg = 0;
3185 t1->psh = 0;
3186 t1->syn = 1;
3187 t1->ack_seq = ntohl(skb->h.th->seq+1);
3188 t1->doff = sizeof(*t1)/4+1;
3189 ptr = skb_put(buff,4);
3190 ptr[0] = 2;
3191 ptr[1] = 4;
3192 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3193 ptr[3] =(newsk->mtu) & 0xff;
3194
3195 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3196 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3197 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3198 skb->sk = newsk;
3199
3200 /*3201 * Charge the sock_buff to newsk. 3202 */3203
3204 sk->rmem_alloc -= skb->truesize;
3205 newsk->rmem_alloc += skb->truesize;
3206
3207 skb_queue_tail(&sk->receive_queue,skb);
3208 sk->ack_backlog++;
3209 release_sock(newsk);
3210 tcp_statistics.TcpOutSegs++;
3211 }3212
3213
/*
 *	Close a TCP socket.  'timeout' non-zero means the close timed out
 *	(or linger expired) and the socket is killed immediately instead
 *	of going through the orderly FIN handshake.
 */
static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 * We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */

	sk->inuse = 1;		/* lock out the bottom half */

	if(th_cache_sk==sk)
		tcp_cache_zap();	/* drop stale header-prediction cache */
	if(sk->state == TCP_LISTEN)
	{
		/* Special case: a listener just discards its pending SYNs. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;

	if (!sk->dead)
	  	sk->state_change(sk);

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 *  We need to flush the recv. buffs.  We do this only on the
		 *  descriptor close, not protocol-sourced closes, because the
		 *  reader process may not have drained the data yet!
		 */

		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *	Get rid off any half-completed packets.
		 */

		if (sk->partial)
			tcp_send_partial(sk);
	}

	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */

	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* Orderly close: shift state and send our FIN if required. */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
3280
3281 /*3282 * This routine takes stuff off of the write queue,3283 * and puts it in the xmit queue. This happens as incoming acks3284 * open up the remote window for us.3285 */3286
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue. This happens as incoming acks
 *	open up the remote window for us.
 */
static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->h.seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 * put in the ack seq and window at this point rather than earlier,
			 * in order to keep them monotonic.  We really want to avoid taking
			 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
			 * Ack and window will in general have changed since this packet was put
			 * on the write queue.
			 */
			iph = skb->ip_hdr;
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			size = skb->len - (((unsigned char *) th) - skb->data);
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			/* Oversized for the path: clear DF and let IP fragment. */
			if (size > sk->mtu - sizeof(struct iphdr))
			{
				iph->frag_off &= ~htons(IP_DF);
				ip_send_check(iph);	/* recompute IP checksum */
			}
#endif

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3378
3379 /*3380 * This routine deals with incoming acks, but not outgoing ones.3381 */3382
3383 extern__inline__inttcp_ack(structsock *sk, structtcphdr *th, unsignedlongsaddr, intlen)
/* */3384 {3385 u32ack;
3386 intflag = 0;
3387
3388 /* 3389 * 1 - there was data in packet as well as ack or new data is sent or 3390 * in shutdown state3391 * 2 - data from retransmit queue was acked and removed3392 * 4 - window shrunk or data from retransmit queue was acked and removed3393 */3394
3395 if(sk->zapped)
3396 return(1); /* Dead, cant ack any more so why bother */3397
3398 /*3399 * Have we discovered a larger window3400 */3401
3402 ack = ntohl(th->ack_seq);
3403
3404 if (ntohs(th->window) > sk->max_window)
3405 {3406 sk->max_window = ntohs(th->window);
3407 #ifdefCONFIG_INET_PCTCP3408 /* Hack because we don't send partial packets to non SWS3409 handling hosts */3410 sk->mss = min(sk->max_window>>1, sk->mtu);
3411 #else3412 sk->mss = min(sk->max_window, sk->mtu);
3413 #endif3414 }3415
3416 /*3417 * We have dropped back to keepalive timeouts. Thus we have3418 * no retransmits pending.3419 */3420
3421 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3422 sk->retransmits = 0;
3423
3424 /*3425 * If the ack is newer than sent or older than previous acks3426 * then we can probably ignore it.3427 */3428
3429 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3430 {3431 if(sk->debug)
3432 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3433
3434 /*3435 * Keepalive processing.3436 */3437
3438 if (after(ack, sk->sent_seq))
3439 {3440 return(0);
3441 }3442
3443 /*3444 * Restart the keepalive timer.3445 */3446
3447 if (sk->keepopen)
3448 {3449 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3450 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3451 }3452 return(1);
3453 }3454
3455 /*3456 * If there is data set flag 13457 */3458
3459 if (len != th->doff*4)
3460 flag |= 1;
3461
3462 /*3463 * See if our window has been shrunk. 3464 */3465
3466 if (after(sk->window_seq, ack+ntohs(th->window)))
3467 {3468 /*3469 * We may need to move packets from the send queue3470 * to the write queue, if the window has been shrunk on us.3471 * The RFC says you are not allowed to shrink your window3472 * like this, but if the other end does, you must be able3473 * to deal with it.3474 */3475 structsk_buff *skb;
3476 structsk_buff *skb2;
3477 structsk_buff *wskb = NULL;
3478
3479 skb2 = sk->send_head;
3480 sk->send_head = NULL;
3481 sk->send_tail = NULL;
3482
3483 /*3484 * This is an artifact of a flawed concept. We want one3485 * queue and a smarter send routine when we send all.3486 */3487
3488 flag |= 4; /* Window changed */3489
3490 sk->window_seq = ack + ntohs(th->window);
3491 cli();
3492 while (skb2 != NULL)
3493 {3494 skb = skb2;
3495 skb2 = skb->link3;
3496 skb->link3 = NULL;
3497 if (after(skb->h.seq, sk->window_seq))
3498 {3499 if (sk->packets_out > 0)
3500 sk->packets_out--;
3501 /* We may need to remove this from the dev send list. */3502 if (skb->next != NULL)
3503 {3504 skb_unlink(skb);
3505 }3506 /* Now add it to the write_queue. */3507 if (wskb == NULL)
3508 skb_queue_head(&sk->write_queue,skb);
3509 else3510 skb_append(wskb,skb);
3511 wskb = skb;
3512 }3513 else3514 {3515 if (sk->send_head == NULL)
3516 {3517 sk->send_head = skb;
3518 sk->send_tail = skb;
3519 }3520 else3521 {3522 sk->send_tail->link3 = skb;
3523 sk->send_tail = skb;
3524 }3525 skb->link3 = NULL;
3526 }3527 }3528 sti();
3529 }3530
3531 /*3532 * Pipe has emptied3533 */3534
3535 if (sk->send_tail == NULL || sk->send_head == NULL)
3536 {3537 sk->send_head = NULL;
3538 sk->send_tail = NULL;
3539 sk->packets_out= 0;
3540 }3541
3542 /*3543 * Update the right hand window edge of the host3544 */3545
3546 sk->window_seq = ack + ntohs(th->window);
3547
3548 /*3549 * We don't want too many packets out there. 3550 */3551
3552 if (sk->ip_xmit_timeout == TIME_WRITE &&
3553 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3554 {3555 /* 3556 * This is Jacobson's slow start and congestion avoidance. 3557 * SIGCOMM '88, p. 328. Because we keep cong_window in integral3558 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a 3559 * counter and increment it once every cwnd times. It's possible3560 * that this should be done only if sk->retransmits == 0. I'm3561 * interpreting "new data is acked" as including data that has3562 * been retransmitted but is just now being acked.3563 */3564 if (sk->cong_window < sk->ssthresh)
3565 /* 3566 * In "safe" area, increase3567 */3568 sk->cong_window++;
3569 else3570 {3571 /*3572 * In dangerous area, increase slowly. In theory this is3573 * sk->cong_window += 1 / sk->cong_window3574 */3575 if (sk->cong_count >= sk->cong_window)
3576 {3577 sk->cong_window++;
3578 sk->cong_count = 0;
3579 }3580 else3581 sk->cong_count++;
3582 }3583 }3584
3585 /*3586 * Remember the highest ack received.3587 */3588
3589 sk->rcv_ack_seq = ack;
3590
3591 /*3592 * If this ack opens up a zero window, clear backoff. It was3593 * being used to time the probes, and is probably far higher than3594 * it needs to be for normal retransmission.3595 */3596
3597 if (sk->ip_xmit_timeout == TIME_PROBE0)
3598 {3599 sk->retransmits = 0; /* Our probe was answered */3600
3601 /*3602 * Was it a usable window open ?3603 */3604
3605 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */3606 ! before (sk->window_seq, sk->write_queue.next->h.seq))
3607 {3608 sk->backoff = 0;
3609
3610 /*3611 * Recompute rto from rtt. this eliminates any backoff.3612 */3613
3614 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3615 if (sk->rto > 120*HZ)
3616 sk->rto = 120*HZ;
3617 if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about3618 .2 of a second because of BSD delayed acks - on a 100Mb/sec link3619 .2 of a second is going to need huge windows (SIGH) */3620 sk->rto = 20;
3621 }3622 }3623
3624 /* 3625 * See if we can take anything off of the retransmit queue.3626 */3627
3628 while(sk->send_head != NULL)
3629 {3630 /* Check for a bug. */3631 if (sk->send_head->link3 &&
3632 after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
3633 printk("INET: tcp.c: *** bug send_list out of order.\n");
3634
3635 /*3636 * If our packet is before the ack sequence we can3637 * discard it as it's confirmed to have arrived the other end.3638 */3639
3640 if (before(sk->send_head->h.seq, ack+1))
3641 {3642 structsk_buff *oskb;
3643 if (sk->retransmits)
3644 {3645 /*3646 * We were retransmitting. don't count this in RTT est 3647 */3648 flag |= 2;
3649
3650 /*3651 * even though we've gotten an ack, we're still3652 * retransmitting as long as we're sending from3653 * the retransmit queue. Keeping retransmits non-zero3654 * prevents us from getting new data interspersed with3655 * retransmissions.3656 */3657
3658 if (sk->send_head->link3) /* Any more queued retransmits? */3659 sk->retransmits = 1;
3660 else3661 sk->retransmits = 0;
3662 }3663 /*3664 * Note that we only reset backoff and rto in the3665 * rtt recomputation code. And that doesn't happen3666 * if there were retransmissions in effect. So the3667 * first new packet after the retransmissions is3668 * sent with the backoff still in effect. Not until3669 * we get an ack from a non-retransmitted packet do3670 * we reset the backoff and rto. This allows us to deal3671 * with a situation where the network delay has increased3672 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)3673 */3674
3675 /*3676 * We have one less packet out there. 3677 */3678
3679 if (sk->packets_out > 0)
3680 sk->packets_out --;
3681 /* 3682 * Wake up the process, it can probably write more. 3683 */3684 if (!sk->dead)
3685 sk->write_space(sk);
3686 oskb = sk->send_head;
3687
3688 if (!(flag&2)) /* Not retransmitting */3689 {3690 longm;
3691
3692 /*3693 * The following amusing code comes from Jacobson's3694 * article in SIGCOMM '88. Note that rtt and mdev3695 * are scaled versions of rtt and mean deviation.3696 * This is designed to be as fast as possible 3697 * m stands for "measurement".3698 */3699
3700 m = jiffies - oskb->when; /* RTT */3701 if(m<=0)
3702 m=1; /* IS THIS RIGHT FOR <0 ??? */3703 m -= (sk->rtt >> 3); /* m is now error in rtt est */3704 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */3705 if (m < 0)
3706 m = -m; /* m is now abs(error) */3707 m -= (sk->mdev >> 2); /* similar update on mdev */3708 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */3709
3710 /*3711 * Now update timeout. Note that this removes any backoff.3712 */3713
3714 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3715 if (sk->rto > 120*HZ)
3716 sk->rto = 120*HZ;
3717 if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */3718 sk->rto = 20;
3719 sk->backoff = 0;
3720 }3721 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt 3722 In this case as we just set it up */3723 cli();
3724 oskb = sk->send_head;
3725 IS_SKB(oskb);
3726 sk->send_head = oskb->link3;
3727 if (sk->send_head == NULL)
3728 {3729 sk->send_tail = NULL;
3730 }3731
3732 /*3733 * We may need to remove this from the dev send list. 3734 */3735
3736 if (oskb->next)
3737 skb_unlink(oskb);
3738 sti();
3739 kfree_skb(oskb, FREE_WRITE); /* write. */3740 if (!sk->dead)
3741 sk->write_space(sk);
3742 }3743 else3744 {3745 break;
3746 }3747 }3748
3749 /*3750 * XXX someone ought to look at this too.. at the moment, if skb_peek()3751 * returns non-NULL, we complete ignore the timer stuff in the else3752 * clause. We ought to organize the code so that else clause can3753 * (should) be executed regardless, possibly moving the PROBE timer3754 * reset over. The skb_peek() thing should only move stuff to the3755 * write queue, NOT also manage the timer functions.3756 */3757
3758 /*3759 * Maybe we can take some stuff off of the write queue,3760 * and put it onto the xmit queue.3761 */3762 if (skb_peek(&sk->write_queue) != NULL)
3763 {3764 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3765 (sk->retransmits == 0 ||
3766 sk->ip_xmit_timeout != TIME_WRITE ||
3767 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3768 && sk->packets_out < sk->cong_window)
3769 {3770 /*3771 * Add more data to the send queue.3772 */3773 flag |= 1;
3774 tcp_write_xmit(sk);
3775 }3776 elseif (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3777 sk->send_head == NULL &&
3778 sk->ack_backlog == 0 &&
3779 sk->state != TCP_TIME_WAIT)
3780 {3781 /*3782 * Data to queue but no room.3783 */3784 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3785 }3786 }3787 else3788 {3789 /*3790 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets3791 * from TCP_CLOSE we don't do anything3792 *3793 * from anything else, if there is write data (or fin) pending,3794 * we use a TIME_WRITE timeout, else if keepalive we reset to3795 * a KEEPALIVE timeout, else we delete the timer.3796 *3797 * We do not set flag for nominal write data, otherwise we may3798 * force a state where we start to write itsy bitsy tidbits3799 * of data.3800 */3801
3802 switch(sk->state) {3803 caseTCP_TIME_WAIT:
3804 /*3805 * keep us in TIME_WAIT until we stop getting packets,3806 * reset the timeout.3807 */3808 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3809 break;
3810 caseTCP_CLOSE:
3811 /*3812 * don't touch the timer.3813 */3814 break;
3815 default:
3816 /*3817 * Must check send_head, write_queue, and ack_backlog3818 * to determine which timeout to use.3819 */3820 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {3821 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3822 }elseif (sk->keepopen) {3823 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3824 }else{3825 del_timer(&sk->retransmit_timer);
3826 sk->ip_xmit_timeout = 0;
3827 }3828 break;
3829 }3830 }3831
3832 /*3833 * We have nothing queued but space to send. Send any partial3834 * packets immediately (end of Nagle rule application).3835 */3836
3837 if (sk->packets_out == 0 && sk->partial != NULL &&
3838 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3839 {3840 flag |= 1;
3841 tcp_send_partial(sk);
3842 }3843
3844 /*3845 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and3846 * we are now waiting for an acknowledge to our FIN. The other end is3847 * already in TIME_WAIT.3848 *3849 * Move to TCP_CLOSE on success.3850 */3851
3852 if (sk->state == TCP_LAST_ACK)
3853 {3854 if (!sk->dead)
3855 sk->state_change(sk);
3856 if(sk->debug)
3857 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3858 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3859 if (sk->rcv_ack_seq == sk->write_seq/*&& sk->acked_seq == sk->fin_seq*/)
3860 {3861 flag |= 1;
3862 tcp_set_state(sk,TCP_CLOSE);
3863 sk->shutdown = SHUTDOWN_MASK;
3864 }3865 }3866
3867 /*3868 * Incoming ACK to a FIN we sent in the case of our initiating the close.3869 *3870 * Move to FIN_WAIT2 to await a FIN from the other end. Set3871 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.3872 */3873
3874 if (sk->state == TCP_FIN_WAIT1)
3875 {3876
3877 if (!sk->dead)
3878 sk->state_change(sk);
3879 if (sk->rcv_ack_seq == sk->write_seq)
3880 {3881 flag |= 1;
3882 sk->shutdown |= SEND_SHUTDOWN;
3883 tcp_set_state(sk, TCP_FIN_WAIT2);
3884 }3885 }3886
3887 /*3888 * Incoming ACK to a FIN we sent in the case of a simultaneous close.3889 *3890 * Move to TIME_WAIT3891 */3892
3893 if (sk->state == TCP_CLOSING)
3894 {3895
3896 if (!sk->dead)
3897 sk->state_change(sk);
3898 if (sk->rcv_ack_seq == sk->write_seq)
3899 {3900 flag |= 1;
3901 tcp_time_wait(sk);
3902 }3903 }3904
3905 /*3906 * Final ack of a three way shake 3907 */3908
3909 if(sk->state==TCP_SYN_RECV)
3910 {3911 tcp_set_state(sk, TCP_ESTABLISHED);
3912 tcp_options(sk,th);
3913 sk->dummy_th.dest=th->source;
3914 sk->copied_seq = sk->acked_seq;
3915 if(!sk->dead)
3916 sk->state_change(sk);
3917 if(sk->max_window==0)
3918 {3919 sk->max_window=32; /* Sanity check */3920 sk->mss=min(sk->max_window,sk->mtu);
3921 }3922 }3923
3924 /*3925 * I make no guarantees about the first clause in the following3926 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under3927 * what conditions "!flag" would be true. However I think the rest3928 * of the conditions would prevent that from causing any3929 * unnecessary retransmission. 3930 * Clearly if the first packet has expired it should be 3931 * retransmitted. The other alternative, "flag&2 && retransmits", is3932 * harder to explain: You have to look carefully at how and when the3933 * timer is set and with what timeout. The most recent transmission always3934 * sets the timer. So in general if the most recent thing has timed3935 * out, everything before it has as well. So we want to go ahead and3936 * retransmit some more. If we didn't explicitly test for this3937 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"3938 * would not be true. If you look at the pattern of timing, you can3939 * show that rto is increased fast enough that the next packet would3940 * almost never be retransmitted immediately. Then you'd end up3941 * waiting for a timeout to send each packet on the retransmission3942 * queue. With my implementation of the Karn sampling algorithm,3943 * the timeout would double each time. The net result is that it would3944 * take a hideous amount of time to recover from a single dropped packet.3945 * It's possible that there should also be a test for TIME_WRITE, but3946 * I think as long as "send_head != NULL" and "retransmit" is on, we've3947 * got to be in real retransmission mode.3948 * Note that tcp_do_retransmit is called with all==1. Setting cong_window3949 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.3950 * As long as no further losses occur, this seems reasonable.3951 */3952
3953 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3954 (((flag&2) && sk->retransmits) ||
3955 (sk->send_head->when + sk->rto < jiffies)))
3956 {3957 if(sk->send_head->when + sk->rto < jiffies)
3958 tcp_retransmit(sk,0);
3959 else3960 {3961 tcp_do_retransmit(sk, 1);
3962 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3963 }3964 }3965
3966 return(1);
3967 }3968
3969
3970 /*3971 * Process the FIN bit. This now behaves as it is supposed to work3972 * and the FIN takes effect when it is validly part of sequence3973 * space. Not before when we get holes.3974 *3975 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT3976 * (and thence onto LAST-ACK and finally, CLOSE, we never enter3977 * TIME-WAIT)3978 *3979 * If we are in FINWAIT-1, a received FIN indicates simultaneous3980 * close and we go into CLOSING (and later onto TIME-WAIT)3981 *3982 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.3983 *3984 */3985
3986 staticinttcp_fin(structsk_buff *skb, structsock *sk, structtcphdr *th)
/* */3987 {3988 sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3989
3990 if (!sk->dead)
3991 {3992 sk->state_change(sk);
3993 sock_wake_async(sk->socket, 1);
3994 }3995
3996 switch(sk->state)
3997 {3998 caseTCP_SYN_RECV:
3999 caseTCP_SYN_SENT:
4000 caseTCP_ESTABLISHED:
4001 /*4002 * move to CLOSE_WAIT, tcp_data() already handled4003 * sending the ack.4004 */4005 tcp_set_state(sk,TCP_CLOSE_WAIT);
4006 if (th->rst)
4007 sk->shutdown = SHUTDOWN_MASK;
4008 break;
4009
4010 caseTCP_CLOSE_WAIT:
4011 caseTCP_CLOSING:
4012 /*4013 * received a retransmission of the FIN, do4014 * nothing.4015 */4016 break;
4017 caseTCP_TIME_WAIT:
4018 /*4019 * received a retransmission of the FIN,4020 * restart the TIME_WAIT timer.4021 */4022 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4023 return(0);
4024 caseTCP_FIN_WAIT1:
4025 /*4026 * This case occurs when a simultaneous close4027 * happens, we must ack the received FIN and4028 * enter the CLOSING state.4029 *4030 * This causes a WRITE timeout, which will either4031 * move on to TIME_WAIT when we timeout, or resend4032 * the FIN properly (maybe we get rid of that annoying4033 * FIN lost hang). The TIME_WRITE code is already correct4034 * for handling this timeout.4035 */4036
4037 if(sk->ip_xmit_timeout != TIME_WRITE)
4038 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4039 tcp_set_state(sk,TCP_CLOSING);
4040 break;
4041 caseTCP_FIN_WAIT2:
4042 /*4043 * received a FIN -- send ACK and enter TIME_WAIT4044 */4045 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4046 sk->shutdown|=SHUTDOWN_MASK;
4047 tcp_set_state(sk,TCP_TIME_WAIT);
4048 break;
4049 caseTCP_CLOSE:
4050 /*4051 * already in CLOSE4052 */4053 break;
4054 default:
4055 tcp_set_state(sk,TCP_LAST_ACK);
4056
4057 /* Start the timers. */4058 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4059 return(0);
4060 }4061
4062 return(0);
4063 }4064
4065
4066
4067 /*4068 * This routine handles the data. If there is room in the buffer,4069 * it will be have already been moved into it. If there is no4070 * room, then we will just have to discard the packet.4071 */4072
4073 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */4074 unsignedlongsaddr, unsignedshortlen)
4075 {4076 structsk_buff *skb1, *skb2;
4077 structtcphdr *th;
4078 intdup_dumped=0;
4079 u32new_seq, shut_seq;
4080
4081 th = skb->h.th;
4082 skb_pull(skb,th->doff*4);
4083 skb_trim(skb,len-(th->doff*4));
4084
4085 /*4086 * The bytes in the receive read/assembly queue has increased. Needed for the4087 * low memory discard algorithm 4088 */4089
4090 sk->bytes_rcv += skb->len;
4091
4092 if (skb->len == 0 && !th->fin)
4093 {4094 /* 4095 * Don't want to keep passing ack's back and forth. 4096 * (someone sent us dataless, boring frame)4097 */4098 if (!th->ack)
4099 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4100 kfree_skb(skb, FREE_READ);
4101 return(0);
4102 }4103
4104 /*4105 * We no longer have anyone receiving data on this connection.4106 */4107
4108 #ifndef TCP_DONT_RST_SHUTDOWN
4109
4110 if(sk->shutdown & RCV_SHUTDOWN)
4111 {4112 /*4113 * FIXME: BSD has some magic to avoid sending resets to4114 * broken 4.2 BSD keepalives. Much to my surprise a few non4115 * BSD stacks still have broken keepalives so we want to4116 * cope with it.4117 */4118
4119 if(skb->len) /* We don't care if it's just an ack or4120 a keepalive/window probe */4121 {4122 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */4123
4124 /* Do this the way 4.4BSD treats it. Not what I'd4125 regard as the meaning of the spec but it's what BSD4126 does and clearly they know everything 8) */4127
4128 /*4129 * This is valid because of two things4130 *4131 * a) The way tcp_data behaves at the bottom.4132 * b) A fin takes effect when read not when received.4133 */4134
4135 shut_seq=sk->acked_seq+1; /* Last byte */4136
4137 if(after(new_seq,shut_seq))
4138 {4139 if(sk->debug)
4140 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4141 sk, new_seq, shut_seq, sk->blog);
4142 if(sk->dead)
4143 {4144 sk->acked_seq = new_seq + th->fin;
4145 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4146 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4147 tcp_statistics.TcpEstabResets++;
4148 tcp_set_state(sk,TCP_CLOSE);
4149 sk->err = EPIPE;
4150 sk->shutdown = SHUTDOWN_MASK;
4151 kfree_skb(skb, FREE_READ);
4152 return 0;
4153 }4154 }4155 }4156 }4157
4158 #endif4159
4160 /*4161 * Now we have to walk the chain, and figure out where this one4162 * goes into it. This is set up so that the last packet we received4163 * will be the first one we look at, that way if everything comes4164 * in order, there will be no performance loss, and if they come4165 * out of order we will be able to fit things in nicely.4166 *4167 * [AC: This is wrong. We should assume in order first and then walk4168 * forwards from the first hole based upon real traffic patterns.]4169 * 4170 */4171
4172 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */4173 {4174 skb_queue_head(&sk->receive_queue,skb);
4175 skb1= NULL;
4176 }4177 else4178 {4179 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
4180 {4181 if(sk->debug)
4182 {4183 printk("skb1=%p :", skb1);
4184 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
4185 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
4186 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4187 sk->acked_seq);
4188 }4189
4190 /*4191 * Optimisation: Duplicate frame or extension of previous frame from4192 * same sequence point (lost ack case).4193 * The frame contains duplicate data or replaces a previous frame4194 * discard the previous frame (safe as sk->inuse is set) and put4195 * the new one in its place.4196 */4197
4198 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
4199 {4200 skb_append(skb1,skb);
4201 skb_unlink(skb1);
4202 kfree_skb(skb1,FREE_READ);
4203 dup_dumped=1;
4204 skb1=NULL;
4205 break;
4206 }4207
4208 /*4209 * Found where it fits4210 */4211
4212 if (after(th->seq+1, skb1->h.th->seq))
4213 {4214 skb_append(skb1,skb);
4215 break;
4216 }4217
4218 /*4219 * See if we've hit the start. If so insert.4220 */4221 if (skb1 == skb_peek(&sk->receive_queue))
4222 {4223 skb_queue_head(&sk->receive_queue, skb);
4224 break;
4225 }4226 }4227 }4228
4229 /*4230 * Figure out what the ack value for this frame is4231 */4232
4233 th->ack_seq = th->seq + skb->len;
4234 if (th->syn)
4235 th->ack_seq++;
4236 if (th->fin)
4237 th->ack_seq++;
4238
4239 if (before(sk->acked_seq, sk->copied_seq))
4240 {4241 printk("*** tcp.c:tcp_data bug acked < copied\n");
4242 sk->acked_seq = sk->copied_seq;
4243 }4244
4245 /*4246 * Now figure out if we can ack anything. This is very messy because we really want two4247 * receive queues, a completed and an assembly queue. We also want only one transmit4248 * queue.4249 */4250
4251 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
4252 {4253 if (before(th->seq, sk->acked_seq+1))
4254 {4255 intnewwindow;
4256
4257 if (after(th->ack_seq, sk->acked_seq))
4258 {4259 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
4260 if (newwindow < 0)
4261 newwindow = 0;
4262 sk->window = newwindow;
4263 sk->acked_seq = th->ack_seq;
4264 }4265 skb->acked = 1;
4266
4267 /*4268 * When we ack the fin, we do the FIN 4269 * processing.4270 */4271
4272 if (skb->h.th->fin)
4273 {4274 tcp_fin(skb,sk,skb->h.th);
4275 }4276
4277 for(skb2 = skb->next;
4278 skb2 != (structsk_buff *)&sk->receive_queue;
4279 skb2 = skb2->next)
4280 {4281 if (before(skb2->h.th->seq, sk->acked_seq+1))
4282 {4283 if (after(skb2->h.th->ack_seq, sk->acked_seq))
4284 {4285 newwindow = sk->window -
4286 (skb2->h.th->ack_seq - sk->acked_seq);
4287 if (newwindow < 0)
4288 newwindow = 0;
4289 sk->window = newwindow;
4290 sk->acked_seq = skb2->h.th->ack_seq;
4291 }4292 skb2->acked = 1;
4293 /*4294 * When we ack the fin, we do4295 * the fin handling.4296 */4297 if (skb2->h.th->fin)
4298 {4299 tcp_fin(skb,sk,skb->h.th);
4300 }4301
4302 /*4303 * Force an immediate ack.4304 */4305
4306 sk->ack_backlog = sk->max_ack_backlog;
4307 }4308 else4309 {4310 break;
4311 }4312 }4313
4314 /*4315 * This also takes care of updating the window.4316 * This if statement needs to be simplified.4317 */4318 if (!sk->delay_acks ||
4319 sk->ack_backlog >= sk->max_ack_backlog ||
4320 sk->bytes_rcv > sk->max_unacked || th->fin) {4321 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4322 }4323 else4324 {4325 sk->ack_backlog++;
4326 if(sk->debug)
4327 printk("Ack queued.\n");
4328 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4329 }4330 }4331 }4332
4333 /*4334 * If we've missed a packet, send an ack.4335 * Also start a timer to send another.4336 */4337
4338 if (!skb->acked)
4339 {4340
4341 /*4342 * This is important. If we don't have much room left,4343 * we need to throw out a few packets so we have a good4344 * window. Note that mtu is used, not mss, because mss is really4345 * for the send side. He could be sending us stuff as large as mtu.4346 */4347
4348 while (sock_rspace(sk) < sk->mtu)
4349 {4350 skb1 = skb_peek(&sk->receive_queue);
4351 if (skb1 == NULL)
4352 {4353 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4354 break;
4355 }4356
4357 /*4358 * Don't throw out something that has been acked. 4359 */4360
4361 if (skb1->acked)
4362 {4363 break;
4364 }4365
4366 skb_unlink(skb1);
4367 kfree_skb(skb1, FREE_READ);
4368 }4369 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4370 sk->ack_backlog++;
4371 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4372 }4373 else4374 {4375 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4376 }4377
4378 /*4379 * Now tell the user we may have some data. 4380 */4381
4382 if (!sk->dead)
4383 {4384 if(sk->debug)
4385 printk("Data wakeup.\n");
4386 sk->data_ready(sk,0);
4387 }4388 return(0);
4389 }4390
4391
4392 /*4393 * This routine is only called when we have urgent data4394 * signalled. Its the 'slow' part of tcp_urg. It could be4395 * moved inline now as tcp_urg is only called from one4396 * place. We handle URGent data wrong. We have to - as4397 * BSD still doesn't use the correction from RFC961.4398 */4399
4400 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4401 {4402 u32ptr = ntohs(th->urg_ptr);
4403
4404 if (ptr)
4405 ptr--;
4406 ptr += th->seq;
4407
4408 /* ignore urgent data that we've already seen and read */4409 if (after(sk->copied_seq, ptr))
4410 return;
4411
4412 /* do we already have a newer (or duplicate) urgent pointer? */4413 if (sk->urg_data && !after(ptr, sk->urg_seq))
4414 return;
4415
4416 /* tell the world about our new urgent pointer */4417 if (sk->proc != 0) {4418 if (sk->proc > 0) {4419 kill_proc(sk->proc, SIGURG, 1);
4420 }else{4421 kill_pg(-sk->proc, SIGURG, 1);
4422 }4423 }4424 sk->urg_data = URG_NOTYET;
4425 sk->urg_seq = ptr;
4426 }4427
4428 /*4429 * This is the 'fast' part of urgent handling.4430 */4431
4432 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4433 unsignedlongsaddr, unsignedlonglen)
4434 {4435 u32ptr;
4436
4437 /*4438 * Check if we get a new urgent pointer - normally not 4439 */4440
4441 if (th->urg)
4442 tcp_check_urg(sk,th);
4443
4444 /*4445 * Do we wait for any urgent data? - normally not4446 */4447
4448 if (sk->urg_data != URG_NOTYET)
4449 return 0;
4450
4451 /*4452 * Is the urgent pointer pointing into this packet? 4453 */4454
4455 ptr = sk->urg_seq - th->seq + th->doff*4;
4456 if (ptr >= len)
4457 return 0;
4458
4459 /*4460 * Ok, got the correct packet, update info 4461 */4462
4463 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4464 if (!sk->dead)
4465 sk->data_ready(sk,0);
4466 return 0;
4467 }4468
4469 /*4470 * This will accept the next outstanding connection. 4471 */4472
4473 staticstructsock *tcp_accept(structsock *sk, intflags)
/* */4474 {4475 structsock *newsk;
4476 structsk_buff *skb;
4477
4478 /*4479 * We need to make sure that this socket is listening,4480 * and that it has something pending.4481 */4482
4483 if (sk->state != TCP_LISTEN)
4484 {4485 sk->err = EINVAL;
4486 return(NULL);
4487 }4488
4489 /* Avoid the race. */4490 cli();
4491 sk->inuse = 1;
4492
4493 while((skb = tcp_dequeue_established(sk)) == NULL)
4494 {4495 if (flags & O_NONBLOCK)
4496 {4497 sti();
4498 release_sock(sk);
4499 sk->err = EAGAIN;
4500 return(NULL);
4501 }4502
4503 release_sock(sk);
4504 interruptible_sleep_on(sk->sleep);
4505 if (current->signal & ~current->blocked)
4506 {4507 sti();
4508 sk->err = ERESTARTSYS;
4509 return(NULL);
4510 }4511 sk->inuse = 1;
4512 }4513 sti();
4514
4515 /*4516 * Now all we need to do is return skb->sk. 4517 */4518
4519 newsk = skb->sk;
4520
4521 kfree_skb(skb, FREE_READ);
4522 sk->ack_backlog--;
4523 release_sock(sk);
4524 return(newsk);
4525 }4526
4527
4528 /*4529 * This will initiate an outgoing connection. 4530 */4531
4532 staticinttcp_connect(structsock *sk, structsockaddr_in *usin, intaddr_len)
/* */4533 {4534 structsk_buff *buff;
4535 structdevice *dev=NULL;
4536 unsignedchar *ptr;
4537 inttmp;
4538 intatype;
4539 structtcphdr *t1;
4540 structrtable *rt;
4541
4542 if (sk->state != TCP_CLOSE)
4543 return(-EISCONN);
4544
4545 /*4546 * Don't allow a double connect.4547 */4548
4549 if(sk->daddr)
4550 return -EINVAL;
4551
4552 if (addr_len < 8)
4553 return(-EINVAL);
4554
4555 if (usin->sin_family && usin->sin_family != AF_INET)
4556 return(-EAFNOSUPPORT);
4557
4558 /*4559 * connect() to INADDR_ANY means loopback (BSD'ism).4560 */4561
4562 if(usin->sin_addr.s_addr==INADDR_ANY)
4563 usin->sin_addr.s_addr=ip_my_addr();
4564
4565 /*4566 * Don't want a TCP connection going to a broadcast address 4567 */4568
4569 if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
4570 return -ENETUNREACH;
4571
4572 sk->inuse = 1;
4573 sk->daddr = usin->sin_addr.s_addr;
4574 sk->write_seq = tcp_init_seq();
4575 sk->window_seq = sk->write_seq;
4576 sk->rcv_ack_seq = sk->write_seq -1;
4577 sk->err = 0;
4578 sk->dummy_th.dest = usin->sin_port;
4579 release_sock(sk);
4580
4581 buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4582 if (buff == NULL)
4583 {4584 return(-ENOMEM);
4585 }4586 sk->inuse = 1;
4587 buff->sk = sk;
4588 buff->free = 0;
4589 buff->localroute = sk->localroute;
4590
4591
4592 /*4593 * Put in the IP header and routing stuff.4594 */4595
4596 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4597 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
4598 if (tmp < 0)
4599 {4600 sock_wfree(sk, buff);
4601 release_sock(sk);
4602 return(-ENETUNREACH);
4603 }4604 if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
4605 sk->saddr = rt->rt_src;
4606 sk->rcv_saddr = sk->saddr;
4607
4608 t1 = (structtcphdr *) skb_put(buff,sizeof(structtcphdr));
4609
4610 memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4611 t1->seq = ntohl(sk->write_seq++);
4612 sk->sent_seq = sk->write_seq;
4613 buff->h.seq = sk->write_seq;
4614 t1->ack = 0;
4615 t1->window = 2;
4616 t1->res1=0;
4617 t1->res2=0;
4618 t1->rst = 0;
4619 t1->urg = 0;
4620 t1->psh = 0;
4621 t1->syn = 1;
4622 t1->urg_ptr = 0;
4623 t1->doff = 6;
4624 /* use 512 or whatever user asked for */4625
4626 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4627 sk->window_clamp=rt->rt_window;
4628 else4629 sk->window_clamp=0;
4630
4631 if (sk->user_mss)
4632 sk->mtu = sk->user_mss;
4633 elseif (rt)
4634 sk->mtu = rt->rt_mtu - sizeof(structiphdr) - sizeof(structtcphdr);
4635 else4636 sk->mtu = 576 - sizeof(structiphdr) - sizeof(structtcphdr);
4637
4638 /*4639 * but not bigger than device MTU 4640 */4641
4642 if(sk->mtu <32)
4643 sk->mtu = 32; /* Sanity limit */4644
4645 sk->mtu = min(sk->mtu, dev->mtu - sizeof(structiphdr) - sizeof(structtcphdr));
4646
4647 #ifdefCONFIG_SKIP4648
4649 /*4650 * SKIP devices set their MTU to 65535. This is so they can take packets4651 * unfragmented to security process then fragment. They could lie to the4652 * TCP layer about a suitable MTU, but its easier to let skip sort it out4653 * simply because the final package we want unfragmented is going to be4654 *4655 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]4656 */4657
4658 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */4659 sk->mtu=skip_pick_mtu(sk->mtu,dev);
4660 #endif4661
4662 /*4663 * Put in the TCP options to say MTU. 4664 */4665
4666 ptr = skb_put(buff,4);
4667 ptr[0] = 2;
4668 ptr[1] = 4;
4669 ptr[2] = (sk->mtu) >> 8;
4670 ptr[3] = (sk->mtu) & 0xff;
4671 tcp_send_check(t1, sk->saddr, sk->daddr,
4672 sizeof(structtcphdr) + 4, sk);
4673
4674 /*4675 * This must go first otherwise a really quick response will get reset. 4676 */4677
4678 tcp_cache_zap();
4679 tcp_set_state(sk,TCP_SYN_SENT);
4680 if(rt&&rt->rt_flags&RTF_IRTT)
4681 sk->rto = rt->rt_irtt;
4682 else4683 sk->rto = TCP_TIMEOUT_INIT;
4684 sk->retransmit_timer.function=&retransmit_timer;
4685 sk->retransmit_timer.data = (unsignedlong)sk;
4686 reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */4687 sk->retransmits = 0; /* Now works the right way instead of a hacked 4688 initial setting */4689
4690 sk->prot->queue_xmit(sk, dev, buff, 0);
4691 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4692 tcp_statistics.TcpActiveOpens++;
4693 tcp_statistics.TcpOutSegs++;
4694
4695 release_sock(sk);
4696 return(0);
4697 }4698
4699
4700 /*4701 * This functions checks to see if the tcp header is actually acceptable. 4702 */4703
4704 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4705 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4706 {4707 u32next_seq;
4708
4709 next_seq = len - 4*th->doff;
4710 if (th->fin)
4711 next_seq++;
4712 /* if we have a zero window, we can't have any data in the packet.. */4713 if (next_seq && !sk->window)
4714 gotoignore_it;
4715 next_seq += th->seq;
4716
4717 /*4718 * This isn't quite right. sk->acked_seq could be more recent4719 * than sk->window. This is however close enough. We will accept4720 * slightly more packets than we should, but it should not cause4721 * problems unless someone is trying to forge packets.4722 */4723
4724 /* have we already seen all of this packet? */4725 if (!after(next_seq+1, sk->acked_seq))
4726 gotoignore_it;
4727 /* or does it start beyond the window? */4728 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4729 gotoignore_it;
4730
4731 /* ok, at least part of this packet would seem interesting.. */4732 return 1;
4733
4734 ignore_it:
4735 if (th->rst)
4736 return 0;
4737
4738 /*4739 * Send a reset if we get something not ours and we are4740 * unsynchronized. Note: We don't do anything to our end. We4741 * are just killing the bogus remote connection then we will4742 * connect again and it will work (with luck).4743 */4744
4745 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4746 {4747 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4748 return 1;
4749 }4750
4751 /* Try to resync things. */4752 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4753 return 0;
4754 }4755
4756 /*4757 * When we get a reset we do this.4758 */4759
/*
 *	Standard handling for an incoming RST: record the right error for
 *	the current state, close the socket, wake the owner, and drop the
 *	frame.
 */
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Pick the errno the state implies. */
	sk->err = ECONNRESET;
	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk, TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk, TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 *
 *	Entry point installed in tcp_prot; called from the IP layer for
 *	each received TCP segment, and again (redo != 0) when replaying
 *	a segment that was queued on the socket backlog.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	tcp_statistics.TcpInSegs++;
	/* Segments not addressed to this host are dropped silently. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket, using the last hit cache if applicable.
	 */

	if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
	{
		sk=(struct sock *)th_cache_sk;
		/*
		 *	We think this is causing the bug so
		 *	(sanity check: the cached socket must agree with a
		 *	fresh lookup; mismatch indicates cache corruption)
		 */
		if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
			printk("Cache mismatch on TCP.\n");
	}
	else
	{
		sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
		th_cache_saddr=saddr;
		th_cache_daddr=daddr;
		th_cache_dport=th->dest;
		th_cache_sport=th->source;
		th_cache_sk=sk;
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)
	{
		/*
		 *	Pull up the IP header.
		 */
		skb_pull(skb, skb->h.raw-skb->data);
		/*
		 *	Try to use the device checksum if provided.
		 */
		if (
			(skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
			(!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
			)
		{
			/* Bad checksum: discard without touching the socket. */
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}
		/* Sequence numbers are kept in host order from here on. */
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 *	Discard frame
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/*
		 *	We may need to add it to the backlog here.
		 *	Interrupts are disabled around the inuse test so a
		 *	segment cannot race with the socket owner.
		 */
		cli();
		if (sk->inuse)
		{
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Backlog replay: the checksum was verified on first pass. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		/* Should be impossible: every live TCP socket has a proto. */
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */

	skb->sk=sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/* Reset the ack - its an ack from a
					   different connection  [ th->rst is checked in tcp_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				/* Peer sent no window-scale info: assume a tiny window. */
				if(sk->max_window==0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
	 *	a more complex suggestion for fixing these reuse issues in RFC1644
	 *	but not yet ready for general use. Also see RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			/* Uncharge this buffer: it moves to the listener below. */
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			/* Re-look-up: a listener may be waiting on this port. */
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				/* seq+128000 keeps the new ISN ahead of the old one. */
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 *	Process the ACK
	 */


	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	if (sk->rmem_alloc >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}


	/*
	 *	Process urgent data
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;
}
/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it.
 *
 *	Used as the zero-window probe: either re-send a slice of the
 *	queued data that now fits the peer's window, or send a bare
 *	ACK carrying sent_seq-1 to provoke a window update.
 */

static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff,*skb;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states.  If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	)
	{
		return;
	}
	if ( before(sk->sent_seq, sk->window_seq) &&
	    (skb=skb_peek(&sk->write_queue)))
	{
		/*
		 * We are probing the opening of a window
		 * but the window size is != 0
		 * must have been a result SWS advoidance ( sender )
		 */

		struct iphdr *iph;
		struct tcphdr *th;
		struct tcphdr *nth;
		unsigned long win_size;
#if 0
		unsigned long ow_size;
#endif
		void * tcp_data_start;

		/*
		 *	How many bytes can we send ?
		 */

		win_size = sk->window_seq - sk->sent_seq;

		/*
		 *	Recover the buffer pointers
		 */

		iph = (struct iphdr *)skb->ip_hdr;
		th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));

		/*
		 *	Grab the data for a temporary frame
		 */

		buff = sock_wmalloc(sk, win_size + th->doff * 4 +
				     (iph->ihl << 2) +
				     sk->prot->max_header + 15,
				     1, GFP_ATOMIC);
		if ( buff == NULL )
			return;

		/*
		 *	If we strip the packet on the write queue we must
		 *	be ready to retransmit this one
		 */

		buff->free = /*0*/1;

		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put headers on the new packet
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					     IPPROTO_TCP, sk->opt, buff->truesize,
					     sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		/*
		 *	Move the TCP header over
		 */

		buff->dev = dev;

		nth = (struct tcphdr *) skb_put(buff,th->doff*4);

		memcpy(nth, th, th->doff * 4);

		/*
		 *	Correct the new header
		 */

		nth->ack = 1;
		/* NOTE(review): ntohl/ntohs used where htonl/htons might be
		   expected; both are the same byte-swap on these platforms. */
		nth->ack_seq = ntohl(sk->acked_seq);
		nth->window = ntohs(tcp_select_window(sk));
		nth->check = 0;

		/*
		 *	Find the first data byte.
		 */

		tcp_data_start = skb->data + skb->dev->hard_header_len +
			(iph->ihl << 2) + th->doff * 4;

		/*
		 *	Add it to our new buffer
		 */
		memcpy(skb_put(buff,win_size), tcp_data_start, win_size);

		/*
		 *	Remember our right edge sequence number.
		 */

		buff->h.seq = sk->sent_seq + win_size;
		sk->sent_seq = buff->h.seq;		/* Hack */
#if 0

		/*
		 *	now: shrink the queue head segment
		 *	(disabled alternative that trims the copied bytes off
		 *	the original queued skb and fixes up its URG pointer)
		 */

		th->check = 0;
		ow_size = skb->len - win_size -
			((unsigned long) (tcp_data_start - (void *) skb->data));

		memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
		skb_trim(skb,skb->len-win_size);
		sk->sent_seq += win_size;
		th->seq = htonl(sk->sent_seq);
		if (th->urg)
		{
			unsigned short urg_ptr;

			urg_ptr = ntohs(th->urg_ptr);
			if (urg_ptr <= win_size)
				th->urg = 0;
			else
			{
				urg_ptr -= win_size;
				th->urg_ptr = htons(urg_ptr);
				nth->urg_ptr = htons(win_size);
			}
		}
#else
		/* Urgent pointer falls inside the copied slice: clear URG. */
		if(th->urg && ntohs(th->urg_ptr) < win_size)
			nth->urg = 0;
#endif

		/*
		 *	Checksum the split buffer
		 */

		tcp_send_check(nth, sk->saddr, sk->daddr,
			       nth->doff * 4 + win_size , sk);
	}
	else
	{
		buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
		if (buff == NULL)
			return;

		buff->free = 1;
		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put in the IP header and routing stuff.
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
		if (tmp < 0)
		{
			sock_wfree(sk, buff);
			return;
		}

		t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
		memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

		/*
		 *	Use a previous sequence.
		 *	This should cause the other end to send an ack.
		 */

		t1->seq = htonl(sk->sent_seq-1);
		t1->ack = 1;
		t1->res1= 0;
		t1->res2= 0;
		t1->rst = 0;
		t1->urg = 0;
		t1->psh = 0;
		t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
		t1->syn = 0;
		t1->ack_seq = ntohl(sk->acked_seq);
		t1->window = ntohs(tcp_select_window(sk));
		t1->doff = sizeof(*t1)/4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	}

	/*
	 *	Send it.
	 */

	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
5429 /*5430 * A window probe timeout has occurred.5431 */5432
5433 voidtcp_send_probe0(structsock *sk)
/* */5434 {5435 if (sk->zapped)
5436 return; /* After a valid reset we can send no more */5437
5438 tcp_write_wakeup(sk);
5439
5440 sk->backoff++;
5441 sk->rto = min(sk->rto << 1, 120*HZ);
5442 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5443 sk->retransmits++;
5444 sk->prot->retransmits ++;
5445 }5446
5447 /*5448 * Socket option code for TCP. 5449 */5450
5451 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5452 {5453 intval,err;
5454
5455 if(level!=SOL_TCP)
5456 returnip_setsockopt(sk,level,optname,optval,optlen);
5457
5458 if (optval == NULL)
5459 return(-EINVAL);
5460
5461 err=verify_area(VERIFY_READ, optval, sizeof(int));
5462 if(err)
5463 returnerr;
5464
5465 val = get_user((int *)optval);
5466
5467 switch(optname)
5468 {5469 caseTCP_MAXSEG:
5470 /*5471 * values greater than interface MTU won't take effect. however at5472 * the point when this call is done we typically don't yet know5473 * which interface is going to be used5474 */5475 if(val<1||val>MAX_WINDOW)
5476 return -EINVAL;
5477 sk->user_mss=val;
5478 return 0;
5479 caseTCP_NODELAY:
5480 sk->nonagle=(val==0)?0:1;
5481 return 0;
5482 default:
5483 return(-ENOPROTOOPT);
5484 }5485 }5486
5487 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5488 {5489 intval,err;
5490
5491 if(level!=SOL_TCP)
5492 returnip_getsockopt(sk,level,optname,optval,optlen);
5493
5494 switch(optname)
5495 {5496 caseTCP_MAXSEG:
5497 val=sk->user_mss;
5498 break;
5499 caseTCP_NODELAY:
5500 val=sk->nonagle;
5501 break;
5502 default:
5503 return(-ENOPROTOOPT);
5504 }5505 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5506 if(err)
5507 returnerr;
5508 put_user(sizeof(int),(int *) optlen);
5509
5510 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5511 if(err)
5512 returnerr;
5513 put_user(val,(int *)optval);
5514
5515 return(0);
5516 }5517
5518
/*
 *	The TCP protocol operations table registered with the socket
 *	layer. Slot order must match struct proto exactly; the comments
 *	below name each slot after the handler installed there (slot
 *	names for the trailing scalars are inferred -- TODO confirm
 *	against the struct proto declaration in the headers).
 */
struct proto tcp_prot = {
	tcp_close,			/* close */
	ip_build_header,		/* build_header */
	tcp_connect,			/* connect */
	tcp_accept,			/* accept */
	ip_queue_xmit,			/* queue_xmit */
	tcp_retransmit,			/* retransmit */
	tcp_write_wakeup,		/* write_wakeup */
	tcp_read_wakeup,		/* read_wakeup */
	tcp_rcv,			/* rcv */
	tcp_select,			/* select */
	tcp_ioctl,			/* ioctl */
	NULL,				/* no special init */
	tcp_shutdown,			/* shutdown */
	tcp_setsockopt,			/* setsockopt */
	tcp_getsockopt,			/* getsockopt */
	tcp_sendmsg,			/* sendmsg */
	tcp_recvmsg,			/* recvmsg */
	NULL,				/* No special bind() */
	128,				/* max_header -- presumably */
	0,				/* retransmit count, starts at zero */
	"TCP",				/* protocol name */
	0, 0,				/* usage counters -- TODO confirm field names */
	{NULL,}				/* per-port socket array, initially empty */
};