1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: @(#)tcp.c 1.0.16 05/25/93
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * Fixes:
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while
26 * sk->inuse=1 and was trying to connect
27 * (tcp_err()).
28 * Alan Cox : All icmp error handling was broken;
29 * pointers passed were wrong and the
30 * socket was looked up backwards. Nobody
31 * tested any icmp error code obviously.
32 * Alan Cox : tcp_err() now handled properly. It
33 * wakes people on errors. select
34 * behaves and the icmp error race
35 * has gone by moving it into sock.c
36 * Alan Cox : tcp_reset() fixed to work for
37 * everything not just packets for
38 * unknown sockets.
39 * Alan Cox : tcp option processing.
40 * Alan Cox : Reset tweaked (still not 100%) [Had
41 * syn rule wrong]
42 * Herp Rosmanith : More reset fixes
43 * Alan Cox : No longer acks invalid rst frames.
44 * Acking any kind of RST is right out.
45 * Alan Cox : Sets an ignore me flag on an rst
46 * receive otherwise odd bits of prattle
47 * escape still
48 * Alan Cox : Fixed another acking RST frame bug.
49 * Should stop LAN workplace lockups.
50 * Alan Cox : Some tidyups using the new skb list
51 * facilities
52 * Alan Cox : sk->keepopen now seems to work
53 * Alan Cox : Pulls options out correctly on accepts
54 * Alan Cox : Fixed assorted sk->rqueue->next errors
55 * Alan Cox : PSH doesn't end a TCP read. Switched a
56 * bit to skb ops.
57 * Alan Cox : Tidied tcp_data to avoid a potential
58 * nasty.
59 * Alan Cox : Added some better commenting, as the
60 * tcp is hard to follow
61 * Alan Cox : Removed incorrect check for 20 * psh
62 * Michael O'Reilly : ack < copied bug fix.
63 * Johannes Stille : Misc tcp fixes (not all in yet).
64 * Alan Cox : FIN with no memory -> CRASH
65 * Alan Cox : Added socket option proto entries.
66 * Also added awareness of them to accept.
67 * Alan Cox : Added TCP options (SOL_TCP)
68 * Alan Cox : Switched wakeup calls to callbacks,
69 * so the kernel can layer network
70 * sockets.
71 * Alan Cox : Use ip_tos/ip_ttl settings.
72 * Alan Cox : Handle FIN (more) properly (we hope).
73 * Alan Cox : RST frames sent on unsynchronised
74 * state ack error.
75 * Alan Cox : Put in missing check for SYN bit.
76 * Alan Cox : Added tcp_select_window() aka NET2E
77 * window non shrink trick.
78 * Alan Cox : Added a couple of small NET2E timer
79 * fixes
80 * Charles Hedrick : TCP fixes
81 * Toomas Tamm : TCP window fixes
82 * Alan Cox : Small URG fix to rlogin ^C ack fight
83 * Charles Hedrick : Rewrote most of it to actually work
84 * Linus : Rewrote tcp_read() and URG handling
85 * completely
86 * Gerhard Koerting: Fixed some missing timer handling
87 * Matthew Dillon : Reworked TCP machine states as per RFC
88 * Gerhard Koerting: PC/TCP workarounds
89 * Adam Caldwell : Assorted timer/timing errors
90 * Matthew Dillon : Fixed another RST bug
91 * Alan Cox : Move to kernel side addressing changes.
92 * Alan Cox : Beginning work on TCP fastpathing
93 * (not yet usable)
94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
95 * Alan Cox : TCP fast path debugging
96 * Alan Cox : Window clamping
97 * Michael Riepe : Bug in tcp_check()
98 * Matt Dillon : More TCP improvements and RST bug fixes
99 * Matt Dillon : Yet more small nasties removed from the
100 * TCP code (Be very nice to this man if
101 * tcp finally works 100%) 8)
102 * Alan Cox : BSD accept semantics.
103 * Alan Cox : Reset on closedown bug.
104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
105 * Michael Pall : Handle select() after URG properly in
106 * all cases.
107 * Michael Pall : Undo the last fix in tcp_read_urg()
108 * (multi URG PUSH broke rlogin).
109 * Michael Pall : Fix the multi URG PUSH problem in
110 * tcp_readable(), select() after URG
111 * works now.
112 * Michael Pall : recv(...,MSG_OOB) never blocks in the
113 * BSD api.
114 * Alan Cox : Changed the semantics of sk->socket to
115 * fix a race and a signal problem with
116 * accept() and async I/O.
117 * Alan Cox : Relaxed the rules on tcp_sendto().
118 * Yury Shevchuk : Really fixed accept() blocking problem.
119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
120 * clients/servers which listen in on
121 * fixed ports.
122 * Alan Cox : Cleaned the above up and shrank it to
123 * a sensible code size.
124 * Alan Cox : Self connect lockup fix.
125 * Alan Cox : No connect to multicast.
126 * Ross Biro : Close unaccepted children on master
127 * socket close.
128 * Alan Cox : Reset tracing code.
129 * Alan Cox : Spurious resets on shutdown.
130 * Alan Cox : Giant 15 minute/60 second timer error
131 * Alan Cox : Small whoops in selecting before an
132 * accept.
133 * Alan Cox : Kept the state trace facility since
134 * it's handy for debugging.
135 * Alan Cox : More reset handler fixes.
136 * Alan Cox : Started rewriting the code based on
137 * the RFCs. For other useful protocol
138 * references see: Comer, KA9Q NOS; and
139 * for a reference on the difference
140 * between the specifications and how BSD
141 * actually works, see the 4.4lite source.
142 * A.N.Kuznetsov : Don't time wait on completion of tidy
143 * close.
144 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
145 * Linus Torvalds : Fixed BSD port reuse to work on first syn
146 * Alan Cox : Reimplemented timers as per the RFC
147 * and using multiple timers for sanity.
148 * Alan Cox : Small bug fixes, and a lot of new
149 * comments.
150 * Alan Cox : Fixed dual reader crash by locking
151 * the buffers (much like datagram.c)
152 * Alan Cox : Fixed stuck sockets in probe. A probe
153 * now gets fed up with retrying when it
154 * gets no answer (not even a 'no space' one).
155 * Alan Cox : Extracted closing code better
156 * Alan Cox : Fixed the closing state machine to
157 * resemble the RFC.
158 * Alan Cox : More 'per spec' fixes.
159 * Jorge Cwik : Even faster checksumming.
160 * Alan Cox : tcp_data() doesn't ack illegal PSH
161 * only frames. At least one pc tcp stack
162 * generates them.
163 * Alan Cox : Cache last socket.
164 * Alan Cox : Per route irtt.
165 * Matt Day : Select() match BSD precisely on error
166 * Alan Cox : New buffers
167 * Marc Tamsky : Various sk->prot->retransmits and
168 * sk->retransmits misupdating fixed.
169 * Fixed tcp_write_timeout: stuck close,
170 * and TCP syn retries get used now.
171 * Mark Yarvis : In tcp_read_wakeup(), don't send an
172 * ack if state is TCP_CLOSED.
173 * Alan Cox : Look up device on a retransmit - routes may
174 * change. Doesn't yet cope with MSS shrink right
175 * but it's a start!
176 * Marc Tamsky : Closing in closing fixes.
177 * Mike Shaver : RFC1122 verifications
178 *
179 *
180 * To Fix:
181 * Fast path the code. Two things here - fix the window calculation
182 * so it doesn't iterate over the queue, also spot packets with no funny
183 * options arriving in order and process directly.
184 *
185 * Implement RFC 1191 [Path MTU discovery]
186 * Look at the effect of implementing RFC 1337 suggestions and their impact.
187 * Rewrite output state machine to use a single queue and do low window
188 * situations as per the spec (RFC 1122)
189 * Speed up input assembly algorithm.
190 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
191 * could do with it working on IPv4
192 * User settable/learned rtt/max window/mtu
193 * Cope with MTU/device switches when retransmitting in tcp.
194 * Fix the window handling to use PR's new code.
195 *
196 * Change the fundamental structure to a single send queue maintained
197 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on
198 * active routes too]). Cut the queue off in tcp_retransmit/
199 * tcp_transmit.
200 * Change the receive queue to assemble as it goes. This lets us
201 * dispose of most of tcp_sequence, half of tcp_ack and chunks of
202 * tcp_data/tcp_read as well as the window shrink crud.
203 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
204 * tcp_queue_skb seem obvious routines to extract.
205 *
206 * This program is free software; you can redistribute it and/or
207 * modify it under the terms of the GNU General Public License
208 * as published by the Free Software Foundation; either version
209 * 2 of the License, or(at your option) any later version.
210 *
211 * Description of States:
212 *
213 * TCP_SYN_SENT sent a connection request, waiting for ack
214 *
215 * TCP_SYN_RECV received a connection request, sent ack,
216 * waiting for final ack in three-way handshake.
217 *
218 * TCP_ESTABLISHED connection established
219 *
220 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
221 * transmission of remaining buffered data
222 *
223 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
224 * to shutdown
225 *
226 * TCP_CLOSING both sides have shutdown but we still have
227 * data we have to finish sending
228 *
229 * TCP_TIME_WAIT timeout to catch resent junk before entering
230 * closed, can only be entered from FIN_WAIT2
231 * or CLOSING. Required because the other end
232 * may not have gotten our last ACK causing it
233 * to retransmit the data packet (which we ignore)
234 *
235 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
236 * us to finish writing our data and to shutdown
237 * (we have to close() to move on to LAST_ACK)
238 *
239 * TCP_LAST_ACK our side has shutdown after remote has
240 * shutdown. There may still be data in our
241 * buffer that we have to finish sending
242 *
243 * TCP_CLOSE socket is finished
244 */
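/*
 * An illustrative sketch (assumed names, not kernel API) of the
 * closing-side transitions described above, restated as code. It
 * follows the RFC 793 state machine the comment summarises: TIME_WAIT
 * is reachable only from FIN_WAIT2 or CLOSING, and CLOSE_WAIT needs a
 * local close() to move on to LAST_ACK.
 */

enum tcp_sketch_state {
	SK_ESTABLISHED, SK_FIN_WAIT1, SK_FIN_WAIT2, SK_CLOSING,
	SK_TIME_WAIT, SK_CLOSE_WAIT, SK_LAST_ACK, SK_CLOSE
};

enum tcp_sketch_event { EV_LOCAL_CLOSE, EV_FIN_RCVD, EV_FIN_ACKED };

static enum tcp_sketch_state
tcp_close_transition(enum tcp_sketch_state s, enum tcp_sketch_event e)
{
	switch (s) {
	case SK_ESTABLISHED:
		if (e == EV_LOCAL_CLOSE) return SK_FIN_WAIT1;	/* we shut down first */
		if (e == EV_FIN_RCVD)	 return SK_CLOSE_WAIT;	/* remote shut down first */
		break;
	case SK_FIN_WAIT1:
		if (e == EV_FIN_ACKED)	 return SK_FIN_WAIT2;
		if (e == EV_FIN_RCVD)	 return SK_CLOSING;	/* simultaneous close */
		break;
	case SK_FIN_WAIT2:
		if (e == EV_FIN_RCVD)	 return SK_TIME_WAIT;	/* 2*MSL wait follows */
		break;
	case SK_CLOSING:
		if (e == EV_FIN_ACKED)	 return SK_TIME_WAIT;
		break;
	case SK_CLOSE_WAIT:
		if (e == EV_LOCAL_CLOSE) return SK_LAST_ACK;	/* app finally calls close() */
		break;
	case SK_LAST_ACK:
		if (e == EV_FIN_ACKED)	 return SK_CLOSE;
		break;
	default:
		break;
	}
	return s;	/* no transition for this event */
}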
245
246 /*
247 * RFC1122 status:
248 * NOTE: I'm not going to be doing comments in the code for this one except
249 * for violations and the like. tcp.c is just too big... If I say something
250 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
251 * with Alan. -- MS 950903
252 *
253 * Use of PSH (4.2.2.2)
254 * MAY aggregate data sent without the PSH flag. (does)
255 * MAY queue data received without the PSH flag. (does)
256 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
257 * MAY implement PSH on send calls. (doesn't, thus:)
258 * MUST NOT buffer data indefinitely (doesn't [1 second])
259 * MUST set PSH on last segment (does)
260 * MAY pass received PSH to application layer (doesn't)
261 * SHOULD send maximum-sized segment whenever possible. (almost always does)
262 *
263 * Window Size (4.2.2.3, 4.2.2.16)
264 * MUST treat window size as an unsigned number (does)
265 * SHOULD treat window size as a 32-bit number (does not)
266 * MUST NOT shrink window once it is offered (does not normally)
267 *
268 * Urgent Pointer (4.2.2.4)
269 * **MUST point urgent pointer to last byte of urgent data (not right
270 * after). (doesn't, to be like BSD)
271 * MUST inform application layer asynchronously of incoming urgent
272 * data. (does)
273 * MUST provide application with means of determining the amount of
274 * urgent data pending. (does)
275 * **MUST support urgent data sequence of arbitrary length. (doesn't, but
276 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
277 * [Follows BSD 1 byte of urgent data]
278 *
279 * TCP Options (4.2.2.5)
280 * MUST be able to receive TCP options in any segment. (does)
281 * MUST ignore unsupported options (does)
282 *
283 * Maximum Segment Size Option (4.2.2.6)
284 * MUST implement both sending and receiving MSS. (does)
285 * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
286 * it always). (does, even when MSS == 536, which is legal)
287 * MUST assume MSS == 536 if no MSS received at connection setup (does)
288 * MUST calculate "effective send MSS" correctly:
289 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
290 * (does - but allows operator override)
291 *
292 * TCP Checksum (4.2.2.7)
293 * MUST generate and check TCP checksum. (does)
294 *
295 * Initial Sequence Number Selection (4.2.2.8)
296 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's
297 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
298 * necessary for 10Mbps networks - and harder than BSD to spoof!)
299 *
300 * Simultaneous Open Attempts (4.2.2.10)
301 * MUST support simultaneous open attempts (does)
302 *
303 * Recovery from Old Duplicate SYN (4.2.2.11)
304 * MUST keep track of active vs. passive open (does)
305 *
306 * RST segment (4.2.2.12)
307 * SHOULD allow an RST segment to contain data (does, but doesn't do
308 * anything with it, which is standard)
309 *
310 * Closing a Connection (4.2.2.13)
311 * MUST inform application of whether connection was closed by RST or
312 * normal close. (does)
313 * MAY allow "half-duplex" close (treat connection as closed for the
314 * local app, even before handshake is done). (does)
315 * MUST linger in TIME_WAIT for 2 * MSL (does)
316 *
317 * Retransmission Timeout (4.2.2.15)
318 * MUST implement Jacobson's slow start and congestion avoidance
319 * stuff. (does)
320 *
321 * Probing Zero Windows (4.2.2.17)
322 * MUST support probing of zero windows. (does)
323 * MAY keep offered window closed indefinitely. (does)
324 * MUST allow remote window to stay closed indefinitely. (does)
325 *
326 * Passive Open Calls (4.2.2.18)
327 * MUST NOT let new passive open affect other connections. (doesn't)
328 * MUST support passive opens (LISTENs) concurrently. (does)
329 *
330 * Time to Live (4.2.2.19)
331 * MUST make TCP TTL configurable. (does - IP_TTL option)
332 *
333 * Event Processing (4.2.2.20)
334 * SHOULD queue out-of-order segments. (does)
335 * MUST aggregate ACK segments whenever possible. (does but badly)
336 *
337 * Retransmission Timeout Calculation (4.2.3.1)
338 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO
339 * calculation. (does, or at least explains them in the comments 8*b)
340 * SHOULD initialize RTT to 0 and RTO to 3. (does)
341 *
342 * When to Send an ACK Segment (4.2.3.2)
343 * SHOULD implement delayed ACK. (does not)
344 * MUST keep ACK delay < 0.5 sec. (N/A)
345 *
346 * When to Send a Window Update (4.2.3.3)
347 * MUST implement receiver-side SWS. (does)
348 *
349 * When to Send Data (4.2.3.4)
350 * MUST implement sender-side SWS. (does - imperfectly)
351 * SHOULD implement Nagle algorithm. (does)
352 *
353 * TCP Connection Failures (4.2.3.5)
354 * MUST handle excessive retransmissions "properly" (see the RFC). (does)
355 * SHOULD inform application layer of soft errors. (doesn't)
356 *
357 * TCP Keep-Alives (4.2.3.6)
358 * MAY provide keep-alives. (does)
359 * MUST make keep-alives configurable on a per-connection basis. (does)
360 * MUST default to no keep-alives. (does)
361 * **MUST make keep-alive interval configurable. (doesn't)
362 * **MUST make default keep-alive interval > 2 hours. (doesn't)
363 * MUST NOT interpret failure to ACK keep-alive packet as dead
364 * connection. (doesn't)
365 * SHOULD send keep-alive with no data. (does)
366 *
367 * TCP Multihoming (4.2.3.7)
368 * MUST get source address from IP layer before sending first
369 * SYN. (does)
370 * MUST use same local address for all segments of a connection. (does)
371 *
372 * IP Options (4.2.3.8)
373 * (I don't think the IP layer sees the IP options, yet.)
374 * MUST ignore unsupported IP options. (does, I guess 8*b)
375 * MAY support Time Stamp and Record Route. (doesn't)
376 * **MUST allow application to specify a source route. (doesn't?)
377 * **MUST allow received Source Route option to set route for all future
378 * segments on this connection. (doesn't, not that I think it's a
379 * huge problem)
380 *
381 * ICMP messages (4.2.3.9)
382 * MUST act on ICMP errors. (does)
383 * MUST slow transmission upon receipt of a Source Quench. (does)
384 * MUST NOT abort connection upon receipt of soft Destination
385 * Unreachables (0, 1, 5), Time Exceededs and Parameter
386 * Problems. (doesn't)
387 * SHOULD report soft Destination Unreachables etc. to the
388 * application. (doesn't)
389 * SHOULD abort connection upon receipt of hard Destination Unreachable
390 * messages (2, 3, 4). (does)
391 *
392 * Remote Address Validation (4.2.3.10)
393 * MUST reject as an error OPEN for invalid remote IP address. (does)
394 * MUST ignore SYN with invalid source address. (does)
395 * MUST silently discard incoming SYN for broadcast/multicast
396 * address. (does)
397 *
398 * Asynchronous Reports (4.2.4.1)
399 * **MUST provide mechanism for reporting soft errors to application
400 * layer. (doesn't)
401 *
402 * Type of Service (4.2.4.2)
403 * MUST allow application layer to set Type of Service. (does IP_TOS)
404 *
405 * (Whew. -- MS 950903)
406 **/
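/*
 * A worked sketch of the "effective send MSS" rule from RFC 1122
 * 4.2.2.6 quoted above. Function and parameter names are assumptions
 * for illustration only.
 */

static int effective_send_mss(int physical_mtu, int remote_mss,
			      int tcp_hdr_len, int ip_opts_len)
{
	/* min(physical_MTU, remote_MSS + 20) - sizeof(tcphdr) - sizeof(ipopts) */
	int limit = remote_mss + 20;	/* remote MSS plus a standard IP header */
	if (physical_mtu < limit)
		limit = physical_mtu;
	return limit - tcp_hdr_len - ip_opts_len;
}

/*
 * e.g. Ethernet MTU 1500, remote MSS 1460, 20 byte TCP header, no IP
 * options: min(1500, 1480) - 20 - 0 = 1460 bytes of data per segment.
 */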
407
408 #include <linux/types.h>
409 #include <linux/sched.h>
410 #include <linux/mm.h>
411 #include <linux/time.h>
412 #include <linux/string.h>
413 #include <linux/config.h>
414 #include <linux/socket.h>
415 #include <linux/sockios.h>
416 #include <linux/termios.h>
417 #include <linux/in.h>
418 #include <linux/fcntl.h>
419 #include <linux/inet.h>
420 #include <linux/netdevice.h>
421 #include <net/snmp.h>
422 #include <net/ip.h>
423 #include <net/protocol.h>
424 #include <net/icmp.h>
425 #include <net/tcp.h>
426 #include <net/arp.h>
427 #include <linux/skbuff.h>
428 #include <net/sock.h>
429 #include <net/route.h>
430 #include <linux/errno.h>
431 #include <linux/timer.h>
432 #include <asm/system.h>
433 #include <asm/segment.h>
434 #include <linux/mm.h>
435 #include <net/checksum.h>
436
437 /*
438 * The MSL timer is the 'normal' timer.
439 */
440
441 #define reset_msl_timer(x,y,z) reset_timer(x,y,z)
442
443 #define SEQ_TICK 3
444 unsigned long seq_offset;
445 struct tcp_mib tcp_statistics;
446
447 /*
448 * Cached last hit socket
449 */
450
451 volatile unsigned long th_cache_saddr,th_cache_daddr;
452 volatile unsigned short th_cache_dport, th_cache_sport;
453 volatile struct sock *th_cache_sk;
454
455 void tcp_cache_zap(void)
456 {
457 unsigned long flags;
458 save_flags(flags);
459 cli();
460 th_cache_saddr=0;
461 th_cache_daddr=0;
462 th_cache_dport=0;
463 th_cache_sport=0;
464 th_cache_sk=NULL;
465 restore_flags(flags);
466 }
467
468 static void tcp_close(struct sock *sk, int timeout);
469
470
471 /*
472 * The less said about this the better, but it works and will do for 1.2
473 */
474
475 static struct wait_queue *master_select_wakeup;
476
477 static __inline__ int min(unsigned int a, unsigned int b)
478 {
479 if (a < b)
480 return(a);
481 return(b);
482 }
483
484 #undef STATE_TRACE
485
486 #ifdef STATE_TRACE
487 static char *statename[]={
488 "Unused","Established","Syn Sent","Syn Recv",
489 "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
490 "Close Wait","Last ACK","Listen","Closing"
491 };
492 #endif
493
494 static __inline__ void tcp_set_state(struct sock *sk, int state)
495 {
496 if(sk->state==TCP_ESTABLISHED)
497 tcp_statistics.TcpCurrEstab--;
498 #ifdef STATE_TRACE
499 if(sk->debug)
500 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
501 #endif
502 /* This is a hack but it doesn't occur often and it's going to
503 be a real pain to fix nicely */
504
505 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
506 {
507 wake_up_interruptible(&master_select_wakeup);
508 }
509 sk->state=state;
510 if(state==TCP_ESTABLISHED)
511 tcp_statistics.TcpCurrEstab++;
512 }
513
514 /*
515 * This routine picks a TCP window for a socket based on
516 * the following constraints
517 *
518 * 1. The window can never be shrunk once it is offered (RFC 793)
519 * 2. We limit memory per socket
520 *
521 * For now we use NET2E3's heuristic of offering half the memory
522 * we have handy. All is not as bad as this seems however because
523 * of two things. Firstly we will bin packets even within the window
524 * in order to get the data we are waiting for into the memory limit.
525 * Secondly we bin common duplicate forms at receive time
526 * Better heuristics welcome
527 */
528
529 int tcp_select_window(struct sock *sk)
530 {
531 int new_window = sk->prot->rspace(sk);
532
533 if(sk->window_clamp)
534 new_window=min(sk->window_clamp,new_window);
535 /*
536 * Two things are going on here. First, we don't ever offer a
537 * window less than min(sk->mss, MAX_WINDOW/2). This is the
538 * receiver side of SWS as specified in RFC1122.
539 * Second, we always give them at least the window they
540 * had before, in order to avoid retracting window. This
541 * is technically allowed, but RFC1122 advises against it and
542 * in practice it causes trouble.
543 *
544 * Fixme: This doesn't correctly handle the case where
545 * new_window > sk->window but not by enough to allow for the
546 * shift in sequence space.
547 */
548 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
549 return(sk->window);
550 return(new_window);
551 }
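/*
 * The two constraints tcp_select_window() enforces, restated as a
 * standalone sketch (assumed names): never offer less than
 * min(mss, MAX_WINDOW/2), which is receiver-side SWS avoidance, and
 * never offer less than we offered before, which is the RFC 793
 * no-shrink rule.
 */

static unsigned int select_window_sketch(unsigned int free_space,
					 unsigned int clamp,
					 unsigned int mss,
					 unsigned int max_window_half,
					 unsigned int cur_window)
{
	unsigned int new_window = free_space;
	unsigned int floor = mss < max_window_half ? mss : max_window_half;

	if (clamp && clamp < new_window)
		new_window = clamp;
	/* A tiny offer invites silly window syndrome; a smaller offer
	 * than last time would shrink the window. Keep the old one. */
	if (new_window < floor || new_window < cur_window)
		return cur_window;
	return new_window;
}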
552
553 /*
554 * Find someone to 'accept'. Must be called with
555 * sk->inuse=1 or cli()
556 */
557
558 static struct sk_buff *tcp_find_established(struct sock *s)
559 {
560 struct sk_buff *p=skb_peek(&s->receive_queue);
561 if(p==NULL)
562 return NULL;
563 do
564 {
565 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
566 return p;
567 p=p->next;
568 }
569 while(p!=(struct sk_buff *)&s->receive_queue);
570 return NULL;
571 }
572
573 /*
574 * Remove a completed connection and return it. This is used by
575 * tcp_accept() to get connections from the queue.
576 */
577
578 static struct sk_buff *tcp_dequeue_established(struct sock *s)
579 {
580 struct sk_buff *skb;
581 unsigned long flags;
582 save_flags(flags);
583 cli();
584 skb=tcp_find_established(s);
585 if(skb!=NULL)
586 skb_unlink(skb); /* Take it off the queue */
587 restore_flags(flags);
588 return skb;
589 }
590
591 /*
592 * This routine closes sockets which have been at least partially
593 * opened, but not yet accepted. Currently it is only called by
594 * tcp_close, and timeout mirrors the value there.
595 */
596
597 static void tcp_close_pending (struct sock *sk)
598 {
599 struct sk_buff *skb;
600
601 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
602 {
603 skb->sk->dead=1;
604 tcp_close(skb->sk, 0);
605 kfree_skb(skb, FREE_READ);
606 }
607 return;
608 }
609
610 /*
611 * Enter the time wait state.
612 */
613
614 static void tcp_time_wait(struct sock *sk)
615 {
616 tcp_set_state(sk,TCP_TIME_WAIT);
617 sk->shutdown = SHUTDOWN_MASK;
618 if (!sk->dead)
619 sk->state_change(sk);
620 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
621 }
622
623 /*
624 * A socket has timed out on its send queue and wants to do a
625 * little retransmitting. Currently this means TCP.
626 */
627
628 void tcp_do_retransmit(struct sock *sk, int all)
629 {
630 struct sk_buff * skb;
631 struct proto *prot;
632 struct device *dev;
633 int ct=0;
634 struct rtable *rt;
635
636 prot = sk->prot;
637 skb = sk->send_head;
638
639 while (skb != NULL)
640 {
641 struct tcphdr *th;
642 struct iphdr *iph;
643 int size;
644
645 dev = skb->dev;
646 IS_SKB(skb);
647 skb->when = jiffies;
648
649 /*
650 * Discard the surplus MAC header
651 */
652
653 skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);
654
655 /*
656 * In general it's OK just to use the old packet. However we
657 * need to use the current ack and window fields. Urg and
658 * urg_ptr could possibly stand to be updated as well, but we
659 * don't keep the necessary data. That shouldn't be a problem,
660 * if the other end is doing the right thing. Since we're
661 * changing the packet, we have to issue a new IP identifier.
662 */
663
664 iph = (struct iphdr *)skb->data;
665 th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
666 size = ntohs(iph->tot_len) - (iph->ihl<<2);
667
668 /*
669 * Note: We ought to check for window limits here but
670 * currently this is done (less efficiently) elsewhere.
671 */
672
673 iph->id = htons(ip_id_count++);
674 ip_send_check(iph);
675
676 /*
677 * Put a MAC header back on (may cause ARPing)
678 */
679
680 if(skb->localroute)
681 rt=ip_rt_local(iph->daddr,NULL,NULL);
682 else
683 rt=ip_rt_route(iph->daddr,NULL,NULL);
684
685 if(rt==NULL) /* Deep poo */
686 {
687 if(skb->sk)
688 {
689 skb->sk->err=ENETUNREACH;
690 skb->sk->error_report(skb->sk);
691 }
692 }
693 else
694 {
695 dev=rt->rt_dev;
696 skb->raddr=rt->rt_gateway;
697 if(skb->raddr==0)
698 skb->raddr=iph->daddr;
699 skb->dev=dev;
700 skb->arp=1;
701 if(dev->hard_header)
702 {
703 if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
704 skb->arp=0;
705 }
706
707 /*
708 * This is not the right way to handle this. We have to
709 * issue an up to date window and ack report with this
710 * retransmit to keep the odd buggy tcp that relies on
711 * the fact BSD does this happy.
712 * We don't however need to recalculate the entire
713 * checksum, so someone wanting a small problem to play
714 * with might like to implement RFC1141/RFC1624 and speed
715 * this up by avoiding a full checksum.
716 */
717
718 th->ack_seq = ntohl(sk->acked_seq);
719 th->window = ntohs(tcp_select_window(sk));
720 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
721
722 /*
723 * If the interface is (still) up and running, kick it.
724 */
725
726 if (dev->flags & IFF_UP)
727 {
728 /*
729 * If the packet is still being sent by the device/protocol
730 * below then don't retransmit. This is both needed, and good -
731 * especially with connected mode AX.25 where it stops resends
732 * occurring of an as yet unsent anyway frame!
733 * We still add up the counts as the round trip time wants
734 * adjusting.
735 */
736 if (sk && !skb_device_locked(skb))
737 {
738 /* Remove it from any existing driver queue first! */
739 skb_unlink(skb);
740 /* Now queue it */
741 ip_statistics.IpOutRequests++;
742 dev_queue_xmit(skb, dev, sk->priority);
743 }
744 }
745 }
746
747 /*
748 * Count retransmissions
749 */
750
751 ct++;
752 sk->prot->retransmits ++;
753 tcp_statistics.TcpRetransSegs++;
754
755
756 /*
757 * Only one retransmit requested.
758 */
759
760 if (!all)
761 break;
762
763 /*
764 * This should cut it off before we send too many packets.
765 */
766
767 if (ct >= sk->cong_window)
768 break;
769 skb = skb->link3;
770 }
771 }
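/*
 * The comment above suggests avoiding the full checksum on retransmit
 * by patching it incrementally. A sketch of the RFC 1624 update
 * (HC' = ~(~HC + ~m + m')) for one changed 16-bit word; the function
 * name is an assumption, no such helper exists here.
 */

static unsigned short csum_incremental_update(unsigned short old_check,
					      unsigned short old_word,
					      unsigned short new_word)
{
	unsigned long sum;

	sum = (unsigned short)~old_check;
	sum += (unsigned short)~old_word;
	sum += new_word;
	/* Fold the carries back into 16 bits (one's complement sum). */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (unsigned short)~sum;
}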
772
773 /*
774 * Reset the retransmission timer
775 */
776
777 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
778 {
779 del_timer(&sk->retransmit_timer);
780 sk->ip_xmit_timeout = why;
781 if((int)when < 0)
782 {
783 when=3;
784 printk("Error: Negative timer in xmit_timer\n");
785 }
786 sk->retransmit_timer.expires=jiffies+when;
787 add_timer(&sk->retransmit_timer);
788 }
789
790 /*
791 * This is the normal code called for timeouts. It does the retransmission
792 * and then does backoff. tcp_do_retransmit is separated out because
793 * tcp_ack needs to send stuff from the retransmit queue without
794 * initiating a backoff.
795 */
796
797
798 void tcp_retransmit_time(struct sock *sk, int all)
799 {
800 tcp_do_retransmit(sk, all);
801
802 /*
803 * Increase the timeout each time we retransmit. Note that
804 * we do not increase the rtt estimate. rto is initialized
805 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
806 * that doubling rto each time is the least we can get away with.
807 * In KA9Q, Karn uses this for the first few times, and then
808 * goes to quadratic. netBSD doubles, but only goes up to *64,
809 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
810 * defined in the protocol as the maximum possible RTT. I guess
811 * we'll have to use something other than TCP to talk to the
812 * University of Mars.
813 *
814 * PAWS allows us longer timeouts and large windows, so once
815 * implemented ftp to mars will work nicely. We will have to fix
816 * the 120 second clamps though!
817 */
818
819 sk->retransmits++;
820 sk->prot->retransmits++;
821 sk->backoff++;
822 sk->rto = min(sk->rto << 1, 120*HZ);
823 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
824 }
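/*
 * The backoff above in isolation: at least double the timeout per
 * retransmission (Jacobson, SIGCOMM 88), clamped at the 120 second
 * protocol maximum. A sketch with assumed names, mirroring
 * sk->rto = min(sk->rto << 1, 120*HZ). Starting from 3 seconds the
 * series runs 3, 6, 12, 24, 48, 96, 120, 120, ...
 */

static unsigned long backoff_rto(unsigned long rto, unsigned long hz)
{
	unsigned long max_rto = 120 * hz;	/* maximum RTT the protocol allows */

	rto <<= 1;
	return rto < max_rto ? rto : max_rto;
}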
825
826
827 /*
828 * A timer event has triggered a TCP retransmit timeout. The
829 * socket xmit queue is ready and set up to send. Because
830 * the ack receive code keeps the queue straight we do
831 * nothing clever here.
832 */
833
834 static void tcp_retransmit(struct sock *sk, int all)
835 {
836 if (all)
837 {
838 tcp_retransmit_time(sk, all);
839 return;
840 }
841
842 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
843 /* sk->ssthresh in theory can be zero. I guess that's OK */
844 sk->cong_count = 0;
845
846 sk->cong_window = 1;
847
848 /* Do the actual retransmit. */
849 tcp_retransmit_time(sk, all);
850 }
851
852 /*
853 * A write timeout has occurred. Process the after effects.
854 */
855
856 static int tcp_write_timeout(struct sock *sk)
857 {
858 /*
859 * Look for a 'soft' timeout.
860 */
861 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
862 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
863 {
864 /*
865 * Attempt to recover if arp has changed (unlikely!) or
866 * a route has shifted (not supported prior to 1.3).
867 */
868 arp_destroy (sk->daddr, 0);
869 /*ip_route_check (sk->daddr);*/
870 }
871
872 /*
873 * Have we tried to SYN too many times (repent repent 8))
874 */
875
876 if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
877 {
878 sk->err=ETIMEDOUT;
879 sk->error_report(sk);
880 del_timer(&sk->retransmit_timer);
881 tcp_statistics.TcpAttemptFails++; /* Is this right ??? - FIXME - */
882 tcp_set_state(sk,TCP_CLOSE);
883 /* Don't FIN, we got nothing back */
884 release_sock(sk);
885 return 0;
886 }
887 /*
888 * Has it gone just too far ?
889 */
890 if (sk->retransmits > TCP_RETR2)
891 {
892 sk->err = ETIMEDOUT;
893 sk->error_report(sk);
894 del_timer(&sk->retransmit_timer);
895 /*
896 * Time wait the socket
897 */
898 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
899 {
900 tcp_set_state(sk,TCP_TIME_WAIT);
901 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
902 }
903 else
904 {
905 /*
906 * Clean up time.
907 */
908 tcp_set_state(sk, TCP_CLOSE);
909 release_sock(sk);
910 return 0;
911 }
912 }
913 return 1;
914 }
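/*
 * The escalation policy of tcp_write_timeout(), distilled into a
 * sketch (assumed names and constants; the real routine checks the
 * soft case first and can act on both in one call): a soft timeout
 * re-checks arp/routing, SYNs give up earliest, and everything gives
 * up at the hard retry limit.
 */

enum timeout_action { TO_KEEP_TRYING, TO_RECHECK_ROUTE, TO_GIVE_UP };

static enum timeout_action write_timeout_sketch(int established, int syn_sent,
						int retransmits, int retr1,
						int retr2, int syn_retries)
{
	if (syn_sent && retransmits > syn_retries)
		return TO_GIVE_UP;		/* never got a SYN-ACK back */
	if (retransmits > retr2)
		return TO_GIVE_UP;		/* hard limit for every state */
	if ((established && retransmits && !(retransmits & 7)) ||
	    (!established && retransmits > retr1))
		return TO_RECHECK_ROUTE;	/* soft timeout: path may have moved */
	return TO_KEEP_TRYING;
}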
915
916 /*
917 * The TCP retransmit timer. This lacks a few small details.
918 *
919 * 1. An initial rtt timeout on the probe0 should cause as much as
920 * we can of the first write queue buffer to be split and sent.
921 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report
922 * ETIMEDOUT if we know an additional 'soft' error caused this.
923 * tcp_err should save a 'soft error' for us.
924 */
925
926 static void retransmit_timer(unsigned long data)
927 {
928 struct sock *sk = (struct sock*)data;
929 int why = sk->ip_xmit_timeout;
930
931 /*
932 * only process if socket is not in use
933 */
934
935 cli();
936 if (sk->inuse || in_bh)
937 {
938 /* Try again in 1 second */
939 sk->retransmit_timer.expires = jiffies+HZ;
940 add_timer(&sk->retransmit_timer);
941 sti();
942 return;
943 }
944
945 sk->inuse = 1;
946 sti();
947
948 /* Always see if we need to send an ack. */
949
950 if (sk->ack_backlog && !sk->zapped)
951 {
952 sk->prot->read_wakeup (sk);
953 if (! sk->dead)
954 sk->data_ready(sk,0);
955 }
956
957 /* Now we need to figure out why the socket was on the timer. */
958
959 switch (why)
960 {
961 /* Window probing */
962 case TIME_PROBE0:
963 tcp_send_probe0(sk);
964 tcp_write_timeout(sk);
965 break;
966 /* Retransmitting */
967 case TIME_WRITE:
968 /* It could be we got here because we needed to send an ack.
969 * So we need to check for that.
970 */
971 {
972 struct sk_buff *skb;
973 unsigned long flags;
974
975 save_flags(flags);
976 cli();
977 skb = sk->send_head;
978 if (!skb)
979 {
980 restore_flags(flags);
981 }
982 else
983 {
984 /*
985 * Kicked by a delayed ack. Reset timer
986 * correctly now
987 */
988 if (jiffies < skb->when + sk->rto)
989 {
990 reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
991 restore_flags(flags);
992 break;
993 }
994 restore_flags(flags);
995 /*
996 * Retransmission
997 */
998 sk->retransmits++;
999 sk->prot->retransmits++;
1000 sk->prot->retransmit (sk, 0);
1001 tcp_write_timeout(sk);
1002 }
1003 break;
1004 }
1005 /* Sending Keepalives */
1006 case TIME_KEEPOPEN:
1007 /*
1008 * this reset_timer() call is a hack, this is not
1009 * how KEEPOPEN is supposed to work.
1010 */
1011 reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1012
1013 /* Send something to keep the connection open. */
1014 if (sk->prot->write_wakeup)
1015 sk->prot->write_wakeup (sk);
1016 sk->retransmits++;
1017 sk->prot->retransmits++;
1018 tcp_write_timeout(sk);
1019 break;
1020 default:
1021 printk ("rexmit_timer: timer expired - reason unknown\n");
1022 break;
1023 }
1024 release_sock(sk);
1025 }
1026
1027 /*
1028 * This routine is called by the ICMP module when it gets some
1029 * sort of error condition. If err < 0 then the socket should
1030 * be closed and the error returned to the user. If err > 0
1031 * it's just the icmp type << 8 | icmp code. After adjustment
1032 * header points to the first 8 bytes of the tcp header. We need
1033 * to find the appropriate port.
1034 */
1035
1036 void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
1037 __u32 saddr, struct inet_protocol *protocol)
1038 {
1039 struct tcphdr *th;
1040 struct sock *sk;
1041 struct iphdr *iph=(struct iphdr *)header;
1042
1043 header+=4*iph->ihl;
1044
1045
1046 th =(struct tcphdr *)header;
1047 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
1048
1049 if (sk == NULL)
1050 return;
1051
1052 if (type == ICMP_SOURCE_QUENCH)
1053 {
1054 /*
1055 * FIXME:
1056 * For now we will just trigger a linear backoff.
1057 * The slow start code should cause a real backoff here.
1058 */
1059 if (sk->cong_window > 4)
1060 sk->cong_window--;
1061 return;
1062 }
1063
1064 if (type == ICMP_PARAMETERPROB)
1065 {
1066 sk->err=EPROTO;
1067 sk->error_report(sk);
1068 }
1069
1070 /*
1071 * If we've already connected we will keep trying
1072 * until we time out, or the user gives up.
1073 */
1074
1075 if (code < 13 && (icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV))
1076 {
1077 sk->err = icmp_err_convert[code].errno;
1078 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1079 {
1080 tcp_statistics.TcpAttemptFails++;
1081 tcp_set_state(sk,TCP_CLOSE);
1082 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
1083 }
1084 }
1085 return;
1086 }
1087
1088
1089 /*
1090 * Walk down the receive queue counting readable data until we hit the end or we find a gap
1091 * in the received data queue (i.e. a missing frame that has yet to be sent to us). Not
1092 * sorting into two queues as data arrives makes life so much harder.
1093 */
1094
1095 static int tcp_readable(struct sock *sk)
1096 {
1097 unsigned long counted;
1098 unsigned long amount;
1099 struct sk_buff *skb;
1100 int sum;
1101 unsigned long flags;
1102
1103 if(sk && sk->debug)
1104 printk("tcp_readable: %p - ",sk);
1105
1106 save_flags(flags);
1107 cli();
1108 if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
1109 {
1110 restore_flags(flags);
1111 if(sk && sk->debug)
1112 printk("empty\n");
1113 return(0);
1114 }
1115
1116 counted = sk->copied_seq; /* Where we are at the moment */
1117 amount = 0;
1118
1119 /*
1120 * Do until a push or until we are out of data.
1121 */
1122
1123 do
1124 {
1125 if (before(counted, skb->h.th->seq)) /* Found a hole so stop here */
1126 break;
1127 sum = skb->len -(counted - skb->h.th->seq); /* Length - header but start from where we are up to (avoid overlaps) */
1128 if (skb->h.th->syn)
1129 sum++;
1130 if (sum > 0)
1131 { /* Add it up, move on */
1132 amount += sum;
1133 if (skb->h.th->syn)
1134 amount--;
1135 counted += sum;
1136 }
1137 /*
1138 * Don't count urg data ... but do it in the right place!
1139 * Consider: "old_data (ptr is here) URG PUSH data"
1140 * The old code would stop at the first push because
1141 * it counted the urg (amount==1) and then does amount--
1142 * *after* the loop. This means tcp_readable() always
1143 * returned zero if any URG PUSH was in the queue, even
1144 * though there was normal data available. If we subtract
1145 * the urg data right here, we even get it to work for more
1146 * than one URG PUSH skb without normal data.
1147 * This means that select() finally works now with urg data
1148 * in the queue. Note that rlogin was never affected
1149 * because it doesn't use select(); it uses two processes
1150 * and a blocking read(). And the queue scan in tcp_read()
1151 * was correct. Mike <pall@rz.uni-karlsruhe.de>
1152 */
1153 if (skb->h.th->urg)
1154 amount--; /* don't count urg data */
1155 if (amount && skb->h.th->psh) break;
1156 skb = skb->next;
1157 }
1158 while(skb != (struct sk_buff *)&sk->receive_queue);
1159
1160 restore_flags(flags);
1161 if(sk->debug)
1162 printk("got %lu bytes.\n",amount);
1163 return(amount);
1164 }
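/*
 * The counting rule from the loop above, reduced to one segment and
 * ignoring the SYN bookkeeping (sketch, assumed names): given how far
 * we have copied, how many new readable bytes does a segment add? The
 * urgent byte is subtracted here, per segment, which is exactly the
 * Mike Pall fix described in the comment.
 */

static int readable_in_skb(unsigned long copied_seq, unsigned long seq,
			   int len, int urg)
{
	int sum = len - (int)(copied_seq - seq);	/* skip bytes already copied */

	if (sum <= 0)
		return 0;	/* segment fully consumed or overlapping */
	if (urg)
		sum--;		/* urgent data doesn't count as readable */
	return sum;
}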
1165
1166 /*
1167 * LISTEN is a special case for select..
1168 */
1169 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
1170 {
1171 if (sel_type == SEL_IN) {
1172 int retval;
1173
1174 sk->inuse = 1;
1175 retval = (tcp_find_established(sk) != NULL);
1176 release_sock(sk);
1177 if (!retval)
1178 select_wait(&master_select_wakeup,wait);
1179 return retval;
1180 }
1181 return 0;
1182 }
1183
1184
1185 /*
1186 * Wait for a TCP event.
1187 *
1188 * Note that we don't need to set "sk->inuse", as the upper select layers
1189 * take care of normal races (between the test and the event) and we don't
1190 * go look at any of the socket buffers directly.
1191 */
1192 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
1193 {
1194 if (sk->state == TCP_LISTEN)
1195 return tcp_listen_select(sk, sel_type, wait);
1196
1197 switch(sel_type) {
1198 case SEL_IN:
1199 if (sk->err)
1200 return 1;
1201 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1202 break;
1203
1204 if (sk->shutdown & RCV_SHUTDOWN)
1205 return 1;
1206
1207 if (sk->acked_seq == sk->copied_seq)
1208 break;
1209
1210 if (sk->urg_seq != sk->copied_seq ||
1211 sk->acked_seq != sk->copied_seq+1 ||
1212 sk->urginline || !sk->urg_data)
1213 return 1;
1214 break;
1215
1216 case SEL_OUT:
1217 if (sk->err)
1218 return 1;
1219 if (sk->shutdown & SEND_SHUTDOWN)
1220 return 0;
1221 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1222 break;
1223 /*
1224 * This is now right thanks to a small fix
1225 * by Matt Dillon.
1226 */
1227
1228 if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
1229 break;
1230 return 1;
1231
1232 case SEL_EX:
1233 if (sk->urg_data)
1234 return 1;
1235 break;
1236 }
1237 select_wait(sk->sleep, wait);
1238 return 0;
1239 }
1240
1241 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
1242 {
1243 int err;
1244 switch(cmd)
1245 {
1246
1247 case TIOCINQ:
1248 #ifdef FIXME /* FIXME: */
1249 case FIONREAD:
1250 #endif
1251 {
1252 unsigned long amount;
1253
1254 if (sk->state == TCP_LISTEN)
1255 return(-EINVAL);
1256
1257 sk->inuse = 1;
1258 amount = tcp_readable(sk);
1259 release_sock(sk);
1260 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1261 if(err)
1262 return err;
1263 put_user(amount, (int *)arg);
1264 return(0);
1265 }
1266 case SIOCATMARK:
1267 {
1268 int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
1269
1270 err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1271 if (err)
1272 return err;
1273 put_user(answ,(int *) arg);
1274 return(0);
1275 }
1276 case TIOCOUTQ:
1277 {
1278 unsigned long amount;
1279
1280 if (sk->state == TCP_LISTEN) return(-EINVAL);
1281 amount = sk->prot->wspace(sk);
1282 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1283 if(err)
1284 return err;
1285 put_user(amount, (int *)arg);
1286 return(0);
1287 }
1288 default:
1289 return(-EINVAL);
1290 }
1291 }
1292
1293
1294 /*
1295 * This routine computes a TCP checksum.
1296 *
1297 * Modified January 1995 from a go-faster DOS routine by
1298 * Jorge Cwik <jorge@laser.satlink.net>
1299 */
1300
1301 unsigned short tcp_check(struct tcphdr *th, int len,
1302 unsigned long saddr, unsigned long daddr, unsigned long base)
1303 {
1304 return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
1305 }
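/*
 * What the csum_partial()/csum_tcpudp_magic() pair computes, written
 * out as a portable sketch: the 16-bit one's complement sum over the
 * TCP pseudo header (addresses, protocol, length) and the segment
 * itself. Names are assumptions and addresses are taken as plain
 * 32-bit values; the real helpers are hand-tuned per architecture.
 */

static unsigned short tcp_check_sketch(const unsigned char *seg, int len,
				       unsigned long saddr, unsigned long daddr)
{
	unsigned long sum = 0;
	int i;

	/* Pseudo header: source, destination, zero/protocol, TCP length. */
	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += 6;		/* IPPROTO_TCP */
	sum += len;

	/* The segment, 16 bits at a time. */
	for (i = 0; i + 1 < len; i += 2)
		sum += (seg[i] << 8) | seg[i + 1];
	if (len & 1)
		sum += seg[len - 1] << 8;	/* pad the odd trailing byte */

	/* Fold carries and complement. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (unsigned short)~sum;
}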
1306
1307
1308
1309 void tcp_send_check(struct tcphdr *th, unsigned long saddr,
1310 unsigned long daddr, int len, struct sock *sk)
1311 {
1312 th->check = 0;
1313 th->check = tcp_check(th, len, saddr, daddr,
1314 csum_partial((char *)th,len,0));
1315 return;
1316 }
1317
1318 /*
1319 * This is the main buffer sending routine. We queue the buffer
1320 * having checked it is sane seeming.
1321 */
1322
1323 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
1324 {
1325 int size;
1326 struct tcphdr * th = skb->h.th;
1327
1328 /*
1329 * length of packet (not counting length of pre-tcp headers)
1330 */
1331
1332 size = skb->len - ((unsigned char *) th - skb->data);
1333
1334 /*
1335 * Sanity check it..
1336 */
1337
1338 if (size < sizeof(struct tcphdr) || size > skb->len)
1339 {
1340 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1341 skb, skb->data, th, skb->len);
1342 kfree_skb(skb, FREE_WRITE);
1343 return;
1344 }
1345
1346 /*
1347 * If we have queued a header size packet.. (these crash a few
1348 * tcp stacks if ack is not set)
1349 */
1350
1351 if (size == sizeof(struct tcphdr))
1352 {
1353 /* If it's got a syn or fin it's notionally included in the size..*/
1354 if(!th->syn && !th->fin)
1355 {
1356 printk("tcp_send_skb: attempt to queue a bogon.\n");
1357 kfree_skb(skb,FREE_WRITE);
1358 return;
1359 }
1360 }
1361
1362 /*
1363 * Actual processing.
1364 */
1365
1366 tcp_statistics.TcpOutSegs++;
1367 skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1368
1369 /*
1370 * We must queue if
1371 *
1372 * a) The right edge of this frame exceeds the window
1373 * b) We are retransmitting (Nagle's rule)
1374 * c) We have too many packets 'in flight'
1375 */
1376
1377 if (after(skb->h.seq, sk->window_seq) ||
1378 (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1379 sk->packets_out >= sk->cong_window)
1380 {
1381 /* checksum will be supplied by tcp_write_xmit. So
1382 * we shouldn't need to set it at all. I'm being paranoid */
1383 th->check = 0;
1384 if (skb->next != NULL)
1385 {
1386 printk("tcp_send_partial: next != NULL\n");
1387 skb_unlink(skb);
1388 }
1389 skb_queue_tail(&sk->write_queue, skb);
1390
1391 /*
1392 * If we don't fit we have to start the zero window
1393 * probes. This is broken - we really need to do a partial
1394 * send _first_ (This is what causes the Cisco and PC/TCP
1395 * grief).
1396 */
1397
1398 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1399 sk->send_head == NULL && sk->ack_backlog == 0)
1400 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1401 }
1402 else
1403 {
1404 /*
1405 * This is going straight out
1406 */
1407
1408 th->ack_seq = ntohl(sk->acked_seq);
1409 th->window = ntohs(tcp_select_window(sk));
1410
1411 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1412
1413 sk->sent_seq = sk->write_seq;
1414
1415 /*
1416 * This is mad. The tcp retransmit queue is put together
1417 * by the ip layer. This causes half the problems with
1418 * unroutable FIN's and other things.
1419 */
1420
1421 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1422
1423 /*
1424 * Set for next retransmit based on expected ACK time.
1425 * FIXME: We set this every time which means our
1426 * retransmits are really about a window behind.
1427 */
1428
1429 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1430 }
1431 }
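/*
 * The queue-or-send decision above, distilled into a predicate
 * (sketch, assumed names). We must queue rather than transmit when
 * (a) the frame's right edge is beyond the offered window, (b) we are
 * already retransmitting (Nagle's rule), or (c) the congestion window
 * is full.
 */

static int tcp_must_queue(unsigned long end_seq, unsigned long window_seq,
			  int retransmitting, unsigned long packets_out,
			  unsigned long cong_window)
{
	if ((long)(end_seq - window_seq) > 0)	/* (a) after(end_seq, window_seq) */
		return 1;
	if (retransmitting)			/* (b) no new data while recovering */
		return 1;
	if (packets_out >= cong_window)		/* (c) too many frames in flight */
		return 1;
	return 0;
}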
1432
1433 /*
1434 * Locking problems lead us to a messy situation where we can have
1435 * multiple partially complete buffers queued up. This is really bad
1436 * as we don't want to be sending partial buffers. Fix this with
1437 * a semaphore or similar to lock tcp_write per socket.
1438 *
1439 * These routines are pretty self descriptive.
1440 */
1441
1442 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
1443 {
1444 struct sk_buff * skb;
1445 unsigned long flags;
1446
1447 save_flags(flags);
1448 cli();
1449 skb = sk->partial;
1450 if (skb) {
1451 sk->partial = NULL;
1452 del_timer(&sk->partial_timer);
1453 }
1454 restore_flags(flags);
1455 return skb;
1456 }
1457
1458 /*
1459 * Empty the partial queue
1460 */
1461
1462 static void tcp_send_partial(struct sock *sk)
1463 {
1464 struct sk_buff *skb;
1465
1466 if (sk == NULL)
1467 return;
1468 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1469 tcp_send_skb(sk, skb);
1470 }
1471
1472 /*
1473 * Queue a partial frame
1474 */
1475
1476 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
1477 {
1478 struct sk_buff * tmp;
1479 unsigned long flags;
1480
1481 save_flags(flags);
1482 cli();
1483 tmp = sk->partial;
1484 if (tmp)
1485 del_timer(&sk->partial_timer);
1486 sk->partial = skb;
1487 init_timer(&sk->partial_timer);
1488 /*
1489 * Wait up to 1 second for the buffer to fill.
1490 */
1491 sk->partial_timer.expires = jiffies+HZ;
1492 sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
1493 sk->partial_timer.data = (unsigned long) sk;
1494 add_timer(&sk->partial_timer);
1495 restore_flags(flags);
1496 if (tmp)
1497 tcp_send_skb(sk, tmp);
1498 }
1499
1500
1501 /*
1502 * This routine sends an ack and also updates the window.
1503 */
1504
1505 static void tcp_send_ack(u32 sequence, u32 ack,
1506 struct sock *sk,
1507 struct tcphdr *th, unsigned long daddr)
1508 {
1509 struct sk_buff *buff;
1510 struct tcphdr *t1;
1511 struct device *dev = NULL;
1512 int tmp;
1513
1514 if(sk->zapped)
1515 return; /* We have been reset, we may not send again */
1516
1517 /*
1518 * We need to grab some memory, and put together an ack,
1519 * and then put it into the queue to be sent.
1520 */
1521
1522 buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1523 if (buff == NULL)
1524 {
1525 /*
1526 * Force it to send an ack. We don't have to do this
1527 * (ACK is unreliable) but it's much better use of
1528 * bandwidth on slow links to send a spare ack than
1529 * resend packets.
1530 */
1531
1532 sk->ack_backlog++;
1533 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
1534 {
1535 reset_xmit_timer(sk, TIME_WRITE, HZ);
1536 }
1537 return;
1538 }
1539
1540 /*
1541 * Assemble a suitable TCP frame
1542 */
1543
1544 buff->sk = sk;
1545 buff->localroute = sk->localroute;
1546
1547 /*
1548 * Put in the IP header and routing stuff.
1549 */
1550
1551 tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1552 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1553 if (tmp < 0)
1554 {
1555 buff->free = 1;
1556 sk->prot->wfree(sk, buff);
1557 return;
1558 }
1559 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
1560
1561 memcpy(t1, th, sizeof(*t1));
1562
1563 /*
1564 * Swap the send and the receive.
1565 */
1566
1567 t1->dest = th->source;
1568 t1->source = th->dest;
1569 t1->seq = ntohl(sequence);
1570 t1->ack = 1;
1571 sk->window = tcp_select_window(sk);
1572 t1->window = ntohs(sk->window);
1573 t1->res1 = 0;
1574 t1->res2 = 0;
1575 t1->rst = 0;
1576 t1->urg = 0;
1577 t1->syn = 0;
1578 t1->psh = 0;
1579 t1->fin = 0;
1580
1581 /*
1582 * If we have nothing queued for transmit and the transmit timer
1583 * is on we are just doing an ACK timeout and need to switch
1584 * to a keepalive.
1585 */
1586
1587 if (ack == sk->acked_seq)
1588 {
1589 sk->ack_backlog = 0;
1590 sk->bytes_rcv = 0;
1591 sk->ack_timed = 0;
1592 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
1593 && sk->ip_xmit_timeout == TIME_WRITE)
1594 {
1595 if(sk->keepopen) {
1596 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1597 } else {
1598 delete_timer(sk);
1599 }
1600 }
1601 }
1602
1603 /*
1604 * Fill in the packet and send it
1605 */
1606
1607 t1->ack_seq = ntohl(ack);
1608 t1->doff = sizeof(*t1)/4;
1609 tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1610 if (sk->debug)
1611 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1612 tcp_statistics.TcpOutSegs++;
1613 sk->prot->queue_xmit(sk, dev, buff, 1);
1614 }
1615
1616
1617 /*
1618 * This routine builds a generic TCP header.
1619 */
1620
1621 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
1622 {
1623
1624 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1625 th->seq = htonl(sk->write_seq);
1626 th->psh =(push == 0) ? 1 : 0;
1627 th->doff = sizeof(*th)/4;
1628 th->ack = 1;
1629 th->fin = 0;
1630 sk->ack_backlog = 0;
1631 sk->bytes_rcv = 0;
1632 sk->ack_timed = 0;
1633 th->ack_seq = htonl(sk->acked_seq);
1634 sk->window = tcp_select_window(sk);
1635 th->window = htons(sk->window);
1636
1637 return(sizeof(*th));
1638 }
1639
1640 /*
1641 * This routine copies from a user buffer into a socket,
1642 * and starts the transmit system.
1643 */
1644
1645 static int tcp_write(struct sock *sk, const unsigned char *from,
1646 int len, int nonblock, unsigned flags)
1647 {
1648 int copied = 0;
1649 int copy;
1650 int tmp;
1651 struct sk_buff *skb;
1652 struct sk_buff *send_tmp;
1653 struct proto *prot;
1654 struct device *dev = NULL;
1655
1656 sk->inuse=1;
1657 prot = sk->prot;
1658 while(len > 0)
1659 {
1660 if (sk->err)
1661 { /* Stop on an error */
1662 release_sock(sk);
1663 if (copied)
1664 return(copied);
1665 tmp = -sk->err;
1666 sk->err = 0;
1667 return(tmp);
1668 }
1669
1670 /*
1671 * First thing we do is make sure that we are established.
1672 */
1673
1674 if (sk->shutdown & SEND_SHUTDOWN)
1675 {
1676 release_sock(sk);
1677 sk->err = EPIPE;
1678 if (copied)
1679 return(copied);
1680 sk->err = 0;
1681 return(-EPIPE);
1682 }
1683
1684 /*
1685 * Wait for a connection to finish.
1686 */
1687
1688 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
1689 {
1690 if (sk->err)
1691 {
1692 release_sock(sk);
1693 if (copied)
1694 return(copied);
1695 tmp = -sk->err;
1696 sk->err = 0;
1697 return(tmp);
1698 }
1699
1700 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
1701 {
1702 release_sock(sk);
1703 if (copied)
1704 return(copied);
1705
1706 if (sk->err)
1707 {
1708 tmp = -sk->err;
1709 sk->err = 0;
1710 return(tmp);
1711 }
1712
1713 if (sk->keepopen)
1714 {
1715 send_sig(SIGPIPE, current, 0);
1716 }
1717 return(-EPIPE);
1718 }
1719
1720 if (nonblock || copied)
1721 {
1722 release_sock(sk);
1723 if (copied)
1724 return(copied);
1725 return(-EAGAIN);
1726 }
1727
1728 release_sock(sk);
1729 cli();
1730
1731 if (sk->state != TCP_ESTABLISHED &&
1732 sk->state != TCP_CLOSE_WAIT && sk->err == 0)
1733 {
1734 interruptible_sleep_on(sk->sleep);
1735 if (current->signal & ~current->blocked)
1736 {
1737 sti();
1738 if (copied)
1739 return(copied);
1740 return(-ERESTARTSYS);
1741 }
1742 }
1743 sk->inuse = 1;
1744 sti();
1745 }
1746
1747 /*
1748 * The following code can result in copy <= 0 if sk->mss is ever
1749 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
1750 * sk->mtu is constant once SYN processing is finished. I.e. we
1751 * had better not get here until we've seen his SYN and at least one
1752 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
1753 * But ESTABLISHED should guarantee that. sk->max_window is by definition
1754 * non-decreasing. Note that any ioctl to set user_mss must be done
1755 * before the exchange of SYN's. If the initial ack from the other
1756 * end has a window of 0, max_window and thus mss will both be 0.
1757 */
1758
1759 /*
1760 * Now we need to check if we have a half built packet.
1761 */
1762
1763 if ((skb = tcp_dequeue_partial(sk)) != NULL)
1764 {
1765 int hdrlen;
1766
1767 /* IP header + TCP header */
1768 hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1769 + sizeof(struct tcphdr);
1770
1771 /* Add more stuff to the end of skb->len */
1772 if (!(flags & MSG_OOB))
1773 {
1774 copy = min(sk->mss - (skb->len - hdrlen), len);
1775 /* FIXME: this is really a bug. */
1776 if (copy <= 0)
1777 {
1778 printk("TCP: **bug**: \"copy\" <= 0!!\n");
1779 copy = 0;
1780 }
1781
1782 memcpy_fromfs(skb_put(skb,copy), from, copy);
1783 from += copy;
1784 copied += copy;
1785 len -= copy;
1786 sk->write_seq += copy;
1787 }
1788 if ((skb->len - hdrlen) >= sk->mss ||
1789 (flags & MSG_OOB) || !sk->packets_out)
1790 tcp_send_skb(sk, skb);
1791 else
1792 tcp_enqueue_partial(skb, sk);
1793 continue;
1794 }
1795
1796 /*
1797 * We also need to worry about the window.
1798 * If window < 1/2 the maximum window we've seen from this
1799 * host, don't use it. This is sender side
1800 * silly window prevention, as specified in RFC1122.
1801 * (Note that this is different from earlier versions of
1802 * SWS prevention, e.g. RFC 813.) What we actually do is
1803 * use the whole MSS. Since this results in the right
1804 * edge of the packet being outside the window, it will
1805 * be queued for later rather than sent.
1806 */
1807
1808 copy = sk->window_seq - sk->write_seq;
1809 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1810 copy = sk->mss;
1811 if (copy > len)
1812 copy = len;
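/*
 * Editorial sketch, with assumed figures: suppose mss = 1460 and
 * max_window = 8192. If window_seq - write_seq = 700, then
 * 700 < (8192 >> 1), so copy is bumped up to the full mss of 1460.
 * The segment's right edge now lies beyond the offered window, so
 * tcp_write_xmit() will keep it on the write queue rather than
 * dribble out a silly 700 byte fragment.
 */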
1813
1814 /*
1815 * We should really check the window here also.
1816 */
1817
1818 send_tmp = NULL;
1819 if (copy < sk->mss && !(flags & MSG_OOB))
1820 {
1821 /*
1822 * We will release the socket in case we sleep here.
1823 */
1824 release_sock(sk);
1825 /*
1826 * NB: following must be mtu, because mss can be increased.
1827 * mss is always <= mtu
1828 */
1829 skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1830 sk->inuse = 1;
1831 send_tmp = skb;
1832 }
1833 else
1834 {
1835 /*
1836 * We will release the socket in case we sleep here.
1837 */
1838 release_sock(sk);
1839 skb = prot->wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1840 sk->inuse = 1;
1841 }
1842
1843 /*
1844 * If we didn't get any memory, we need to sleep.
1845 */
1846
1847 if (skb == NULL)
1848 {
1849 sk->socket->flags |= SO_NOSPACE;
1850 if (nonblock)
1851 {
1852 release_sock(sk);
1853 if (copied)
1854 return(copied);
1855 return(-EAGAIN);
1856 }
1857
1858 /*
1859 * FIXME: here is another race condition.
1860 */
1861
1862 tmp = sk->wmem_alloc;
1863 release_sock(sk);
1864 cli();
1865 /*
1866 * Again we will try to avoid it.
1867 */
1868 if (tmp <= sk->wmem_alloc &&
1869 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1870 && sk->err == 0)
1871 {
1872 sk->socket->flags &= ~SO_NOSPACE;
1873 interruptible_sleep_on(sk->sleep);
1874 if (current->signal & ~current->blocked)
1875 {
1876 sti();
1877 if (copied)
1878 return(copied);
1879 return(-ERESTARTSYS);
1880 }
1881 }
1882 sk->inuse = 1;
1883 sti();
1884 continue;
1885 }
1886
1887 skb->sk = sk;
1888 skb->free = 0;
1889 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1890
1891 /*
1892 * FIXME: we need to optimize this.
1893 * Perhaps some hints here would be good.
1894 */
1895
1896 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1897 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
1898 if (tmp < 0 )
1899 {
1900 prot->wfree(sk, skb);
1901 release_sock(sk);
1902 if (copied)
1903 return(copied);
1904 return(tmp);
1905 }
1906 skb->dev = dev;
1907 skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
1908 tmp = tcp_build_header(skb->h.th, sk, len-copy);
1909 if (tmp < 0)
1910 {
1911 prot->wfree(sk, skb);
1912 release_sock(sk);
1913 if (copied)
1914 return(copied);
1915 return(tmp);
1916 }
1917
1918 if (flags & MSG_OOB)
1919 {
1920 skb->h.th->urg = 1;
1921 skb->h.th->urg_ptr = ntohs(copy);
1922 }
1923
1924 memcpy_fromfs(skb_put(skb,copy), from, copy);
1925
1926 from += copy;
1927 copied += copy;
1928 len -= copy;
1929 skb->free = 0;
1930 sk->write_seq += copy;
1931
1932 if (send_tmp != NULL && sk->packets_out)
1933 {
1934 tcp_enqueue_partial(send_tmp, sk);
1935 continue;
1936 }
1937 tcp_send_skb(sk, skb);
1938 }
1939 sk->err = 0;
1940
1941 /*
1942 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
1943 * interactive fast network servers. It's meant to be on and
1944 * it really improves the throughput though not the echo time
1945 * on my slow slip link - Alan
1946 */
1947
1948 /*
1949 * Avoid possible race on send_tmp - c/o Johannes Stille
1950 */
1951
1952 if(sk->partial && ((!sk->packets_out)
1953 /* If not nagling we can send on the before case too.. */
1954 || (sk->nonagle && before(sk->write_seq , sk->window_seq))
1955 ))
1956 tcp_send_partial(sk);
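/*
 * Editorial note on the test above: with Nagle in force
 * (sk->nonagle == 0) a partial packet is flushed here only once
 * packets_out reaches zero, i.e. everything previously sent has been
 * acked. With TCP_NODELAY the before(write_seq, window_seq) clause
 * also lets it go as soon as the window has room, trading extra
 * small segments for lower latency.
 */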
1957
1958 release_sock(sk);
1959 return(copied);
1960 }
1961
1962 /*
1963 * This is just a wrapper.
1964 */
1965
1966 static int tcp_sendto(struct sock *sk, const unsigned char *from,
1967 int len, int nonblock, unsigned flags,
1968 struct sockaddr_in *addr, int addr_len)
1969 {
1970 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1971 return -EINVAL;
1972 if (sk->state == TCP_CLOSE)
1973 return -ENOTCONN;
1974 if (addr_len < sizeof(*addr))
1975 return -EINVAL;
1976 if (addr->sin_family && addr->sin_family != AF_INET)
1977 return -EINVAL;
1978 if (addr->sin_port != sk->dummy_th.dest)
1979 return -EISCONN;
1980 if (addr->sin_addr.s_addr != sk->daddr)
1981 return -EISCONN;
1982 return tcp_write(sk, from, len, nonblock, flags);
1983 }
1984
1985
1986 /*
1987 * Send an ack if one is backlogged at this point. Ought to merge
1988 * this with tcp_send_ack().
1989 */
1990
1991 static void tcp_read_wakeup(struct sock *sk)
1992 {
1993 int tmp;
1994 struct device *dev = NULL;
1995 struct tcphdr *t1;
1996 struct sk_buff *buff;
1997
1998 if (!sk->ack_backlog)
1999 return;
2000
2001 /*
2002 * If we're closed, don't send an ack, or we'll get a RST
2003 * from the closed destination.
2004 */
2005 if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
2006 return;
2007
2008 /*
2009 * FIXME: we need to put code here to prevent this routine from
2010 * being called. Being called once in a while is ok, so only check
2011 * if this is the second time in a row.
2012 */
2013
2014 /*
2015 * We need to grab some memory, and put together an ack,
2016 * and then put it into the queue to be sent.
2017 */
2018
2019 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
2020 if (buff == NULL)
2021 {
2022 /* Try again real soon. */
2023 reset_xmit_timer(sk, TIME_WRITE, HZ);
2024 return;
2025 }
2026
2027 buff->sk = sk;
2028 buff->localroute = sk->localroute;
2029
2030 /*
2031 * Put in the IP header and routing stuff.
2032 */
2033
2034 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
2035 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
2036 if (tmp < 0)
2037 {
2038 buff->free = 1;
2039 sk->prot->wfree(sk, buff);
2040 return;
2041 }
2042
2043 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2044
2045 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
2046 t1->seq = htonl(sk->sent_seq);
2047 t1->ack = 1;
2048 t1->res1 = 0;
2049 t1->res2 = 0;
2050 t1->rst = 0;
2051 t1->urg = 0;
2052 t1->syn = 0;
2053 t1->psh = 0;
2054 sk->ack_backlog = 0;
2055 sk->bytes_rcv = 0;
2056 sk->window = tcp_select_window(sk);
2057 t1->window = ntohs(sk->window);
2058 t1->ack_seq = ntohl(sk->acked_seq);
2059 t1->doff = sizeof(*t1)/4;
2060 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2061 sk->prot->queue_xmit(sk, dev, buff, 1);
2062 tcp_statistics.TcpOutSegs++;
2063 }
2064
2065
2066 /*
2067 * FIXME:
2068 * This routine frees used buffers.
2069 * It should consider sending an ACK to let the
2070 * other end know we now have a bigger window.
2071 */
2072
2073 static void cleanup_rbuf(struct sock *sk)
2074 {
2075 unsigned long flags;
2076 unsigned long left;
2077 struct sk_buff *skb;
2078 unsigned long rspace;
2079
2080 if(sk->debug)
2081 printk("cleaning rbuf for sk=%p\n", sk);
2082
2083 save_flags(flags);
2084 cli();
2085
2086 left = sk->prot->rspace(sk);
2087
2088 /*
2089 * We have to loop through all the buffer headers,
2090 * and try to free up all the space we can.
2091 */
2092
2093 while((skb=skb_peek(&sk->receive_queue)) != NULL)
2094 {
2095 if (!skb->used || skb->users)
2096 break;
2097 skb_unlink(skb);
2098 skb->sk = sk;
2099 kfree_skb(skb, FREE_READ);
2100 }
2101
2102 restore_flags(flags);
2103
2104 /*
2105 * FIXME:
2106 * At this point we should send an ack if the difference
2107 * in the window, and the amount of space is bigger than
2108 * TCP_WINDOW_DIFF.
2109 */
2110
2111 if(sk->debug)
2112 printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
2113 left);
2114 if ((rspace=sk->prot->rspace(sk)) != left)
2115 {
2116 /*
2117 * This area has caused the most trouble. The current strategy
2118 * is to simply do nothing if the other end has room to send at
2119 * least 3 full packets, because the ack from those will auto-
2120 * matically update the window. If the other end doesn't think
2121 * we have much space left, but we have room for at least 1 more
2122 * complete packet than it thinks we do, we will send an ack
2123 * immediately. Otherwise we will wait up to .5 seconds in case
2124 * the user reads some more.
2125 */
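	/*
	 * Worked example (editorial, figures assumed): say we last
	 * advertised window = 4096, have bytes_rcv = 2048 unread and
	 * mtu = 1460. The peer believes it can still send
	 * 4096 - 2048 = 2048 bytes. If our real rspace now exceeds
	 * 2048 + 1460 = 3508 bytes - room for one more full segment
	 * than the peer thinks - the test below acks at once;
	 * otherwise we arm a short timer and hope the reader drains
	 * more first.
	 */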
2126 sk->ack_backlog++;
2127 /*
2128 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
2129 * if the other end is offering a window smaller than the agreed on MSS
2130 * (called sk->mtu here). In theory there's no connection between send
2131 * and receive, and so no reason to think that they're going to send
2132 * small packets. For the moment I'm using the hack of reducing the mss
2133 * only on the send side, so I'm putting mtu here.
2134 */
2135
2136 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
2137 {
2138 /* Send an ack right now. */
2139 tcp_read_wakeup(sk);
2140 }
2141 else
2142 {
2143 /* Force it to send an ack soon. */
2144 int was_active = del_timer(&sk->retransmit_timer);
2145 if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
2146 {
2147 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
2148 }
2149 else
2150 add_timer(&sk->retransmit_timer);
2151 }
2152 }
2153 }
2154
2155
2156 /*
2157 * Handle reading urgent data. BSD has very simple semantics for
2158 * this, no blocking and very strange errors 8)
2159 */
2160
2161 static int tcp_read_urg(struct sock * sk, int nonblock,
2162 unsigned char *to, int len, unsigned flags)
2163 {
2164 /*
2165 * No URG data to read
2166 */
2167 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
2168 return -EINVAL; /* Yes this is right ! */
2169
2170 if (sk->err)
2171 {
2172 int tmp = -sk->err;
2173 sk->err = 0;
2174 return tmp;
2175 }
2176
2177 if (sk->state == TCP_CLOSE || sk->done)
2178 {
2179 if (!sk->done) {
2180 sk->done = 1;
2181 return 0;
2182 }
2183 return -ENOTCONN;
2184 }
2185
2186 if (sk->shutdown & RCV_SHUTDOWN)
2187 {
2188 sk->done = 1;
2189 return 0;
2190 }
2191 sk->inuse = 1;
2192 if (sk->urg_data & URG_VALID)
2193 {
2194 char c = sk->urg_data;
2195 if (!(flags & MSG_PEEK))
2196 sk->urg_data = URG_READ;
2197 put_fs_byte(c, to);
2198 release_sock(sk);
2199 return 1;
2200 }
2201 release_sock(sk);
2202
2203 /*
2204 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
2205 * the available implementations agree in this case:
2206 * this call should never block, independent of the
2207 * blocking state of the socket.
2208 * Mike <pall@rz.uni-karlsruhe.de>
2209 */
2210 return -EAGAIN;
2211 }
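/*
 * Editorial sketch of the user-visible semantics above (assumed
 * userspace code, not part of this file):
 *
 *	char c;
 *	int n = recv(fd, &c, 1, MSG_OOB);
 *
 * n is 1 if the urgent byte was pending; otherwise recv() fails with
 * EINVAL (SO_OOBINLINE set, no urgent data, or already read) or with
 * EAGAIN when urgent data is announced but not yet arrived - even on
 * a blocking socket.
 */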
2212
2213
2214 /*
2215 * This routine copies from a sock struct into the user buffer.
2216 */
2217
2218 static int tcp_read(struct sock *sk, unsigned char *to,
2219 int len, int nonblock, unsigned flags)
2220 {
2221 struct wait_queue wait = { current, NULL };
2222 int copied = 0;
2223 u32 peek_seq;
2224 volatile u32 *seq; /* So gcc doesn't overoptimise */
2225 unsigned long used;
2226
2227 /*
2228 * This error should be checked.
2229 */
2230
2231 if (sk->state == TCP_LISTEN)
2232 return -ENOTCONN;
2233
2234 /*
2235 * Urgent data needs to be handled specially.
2236 */
2237
2238 if (flags & MSG_OOB)
2239 return tcp_read_urg(sk, nonblock, to, len, flags);
2240
2241 /*
2242 * Copying sequence to update. This is volatile to handle
2243 * the multi-reader case neatly (memcpy_to/fromfs might be
2244 * inline and thus not flush cached variables otherwise).
2245 */
2246
2247 peek_seq = sk->copied_seq;
2248 seq = &sk->copied_seq;
2249 if (flags & MSG_PEEK)
2250 seq = &peek_seq;
2251
2252 add_wait_queue(sk->sleep, &wait);
2253 sk->inuse = 1;
2254 while (len > 0)
2255 {
2256 struct sk_buff * skb;
2257 u32 offset;
2258
2259 /*
2260 * Are we at urgent data? Stop if we have read anything.
2261 */
2262
2263 if (copied && sk->urg_data && sk->urg_seq == *seq)
2264 break;
2265
2266 /*
2267 * Next get a buffer.
2268 */
2269
2270 current->state = TASK_INTERRUPTIBLE;
2271
2272 skb = skb_peek(&sk->receive_queue);
2273 do
2274 {
2275 if (!skb)
2276 break;
2277 if (before(*seq, skb->h.th->seq))
2278 break;
2279 offset = *seq - skb->h.th->seq;
2280 if (skb->h.th->syn)
2281 offset--;
2282 if (offset < skb->len)
2283 goto found_ok_skb;
2284 if (skb->h.th->fin)
2285 goto found_fin_ok;
2286 if (!(flags & MSG_PEEK))
2287 skb->used = 1;
2288 skb = skb->next;
2289 }
2290 while (skb != (struct sk_buff *)&sk->receive_queue);
2291
2292 if (copied)
2293 break;
2294
2295 if (sk->err)
2296 {
2297 copied = -sk->err;
2298 sk->err = 0;
2299 break;
2300 }
2301
2302 if (sk->state == TCP_CLOSE)
2303 {
2304 if (!sk->done)
2305 {
2306 sk->done = 1;
2307 break;
2308 }
2309 copied = -ENOTCONN;
2310 break;
2311 }
2312
2313 if (sk->shutdown & RCV_SHUTDOWN)
2314 {
2315 sk->done = 1;
2316 break;
2317 }
2318
2319 if (nonblock)
2320 {
2321 copied = -EAGAIN;
2322 break;
2323 }
2324
2325 cleanup_rbuf(sk);
2326 release_sock(sk);
2327 sk->socket->flags |= SO_WAITDATA;
2328 schedule();
2329 sk->socket->flags &= ~SO_WAITDATA;
2330 sk->inuse = 1;
2331
2332 if (current->signal & ~current->blocked)
2333 {
2334 copied = -ERESTARTSYS;
2335 break;
2336 }
2337 continue;
2338
2339 found_ok_skb:
2340 /*
2341 * Lock the buffer. We can be fairly relaxed as
2342 * an interrupt will never steal a buffer we are
2343 * using unless I've missed something serious in
2344 * tcp_data.
2345 */
2346
2347 skb->users++;
2348
2349 /*
2350 * Ok so how much can we use ?
2351 */
2352
2353 used = skb->len - offset;
2354 if (len < used)
2355 used = len;
2356 /*
2357 * Do we have urgent data here?
2358 */
2359
2360 if (sk->urg_data)
2361 {
2362 u32 urg_offset = sk->urg_seq - *seq;
2363 if (urg_offset < used)
2364 {
2365 if (!urg_offset)
2366 {
2367 if (!sk->urginline)
2368 {
2369 ++*seq;
2370 offset++;
2371 used--;
2372 }
2373 }
2374 else
2375 used = urg_offset;
2376 }
2377 }
2378
2379 /*
2380 * Copy it - We _MUST_ update *seq first so that we
2381 * don't ever double read when we have dual readers
2382 */
2383
2384 *seq += used;
2385
2386 /*
2387 * This memcpy_tofs can sleep. If it sleeps and we
2388 * do a second read it relies on the skb->users to avoid
2389 * a crash when cleanup_rbuf() gets called.
2390 */
2391
2392 memcpy_tofs(to,((unsigned char *)skb->h.th) +
2393 skb->h.th->doff*4 + offset, used);
2394 copied += used;
2395 len -= used;
2396 to += used;
2397
2398 /*
2399 * We now will not sleep again until we are finished
2400 * with skb. Sorry if you are doing the SMP port
2401 * but you'll just have to fix it neatly ;)
2402 */
2403
2404 skb->users --;
2405
2406 if (after(sk->copied_seq,sk->urg_seq))
2407 sk->urg_data = 0;
2408 if (used + offset < skb->len)
2409 continue;
2410
2411 /*
2412 * Process the FIN.
2413 */
2414
2415 if (skb->h.th->fin)
2416 goto found_fin_ok;
2417 if (flags & MSG_PEEK)
2418 continue;
2419 skb->used = 1;
2420 continue;
2421
2422 found_fin_ok:
2423 ++*seq;
2424 if (flags & MSG_PEEK)
2425 break;
2426
2427 /*
2428 * All is done
2429 */
2430
2431 skb->used = 1;
2432 sk->shutdown |= RCV_SHUTDOWN;
2433 break;
2434
2435 }
2436 remove_wait_queue(sk->sleep, &wait);
2437 current->state = TASK_RUNNING;
2438
2439 /* Clean up data we have read: This will do ACK frames */
2440 cleanup_rbuf(sk);
2441 release_sock(sk);
2442 return copied;
2443 }
2444
2445 /*
2446 * State processing on a close. This implements the state shift for
2447 * sending our FIN frame. Note that we only send a FIN for some
2448 * states. A shutdown() may have already sent the FIN, or we may be
2449 * closed.
2450 */
2451
2452 static int tcp_close_state(struct sock *sk, int dead)
2453 {
2454 int ns=TCP_CLOSE;
2455 int send_fin=0;
2456 switch(sk->state)
2457 {
2458 case TCP_SYN_SENT: /* No SYN back, no FIN needed */
2459 break;
2460 case TCP_SYN_RECV:
2461 case TCP_ESTABLISHED: /* Closedown begin */
2462 ns=TCP_FIN_WAIT1;
2463 send_fin=1;
2464 break;
2465 case TCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */
2466 case TCP_FIN_WAIT2:
2467 case TCP_CLOSING:
2468 ns=sk->state;
2469 break;
2470 case TCP_CLOSE:
2471 case TCP_LISTEN:
2472 break;
2473 case TCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and
2474 wait only for the ACK */
2475 ns=TCP_LAST_ACK;
2476 send_fin=1;
2477 }
2478
2479 tcp_set_state(sk,ns);
2480
2481 /*
2482 * This is a (useful) BSD violation of the RFC. There is a
2483 * problem with TCP as specified in that the other end could
2484 * keep a socket open forever with no application left at this end.
2485 * We use a 3 minute timeout (about the same as BSD) then kill
2486 * our end. If they send after that then tough - BUT: long enough
2487 * that we won't make the old 4*rto = almost no time - whoops
2488 * reset mistake.
2489 */
2490 if(dead && ns==TCP_FIN_WAIT2)
2491 {
2492 int timer_active=del_timer(&sk->timer);
2493 if(timer_active)
2494 add_timer(&sk->timer);
2495 else
2496 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2497 }
2498
2499 return send_fin;
2500 }
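/*
 * Summary of the shifts above (editorial):
 *
 *	old state		new state	FIN sent?
 *	SYN_SENT		CLOSE		no
 *	SYN_RECV/ESTABLISHED	FIN_WAIT1	yes
 *	CLOSE_WAIT		LAST_ACK	yes
 *	FIN_WAIT1/2, CLOSING	unchanged	no
 *	anything else		CLOSE		no
 */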
2501
2502 /*
2503 * Send a fin.
2504 */
2505
2506 static void tcp_send_fin(struct sock *sk)
2507 {
2508 struct proto *prot =(struct proto *)sk->prot;
2509 struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
2510 struct tcphdr *t1;
2511 struct sk_buff *buff;
2512 struct device *dev=NULL;
2513 int tmp;
2514
2515 release_sock(sk); /* in case the malloc sleeps. */
2516
2517 buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2518 sk->inuse = 1;
2519
2520 if (buff == NULL)
2521 {
2522 /* This is a disaster if it occurs */
2523 printk("tcp_send_fin: Impossible malloc failure\n");
2524 return;
2525 }
2526
2527 /*
2528 * Administrivia
2529 */
2530
2531 buff->sk = sk;
2532 buff->localroute = sk->localroute;
2533
2534 /*
2535 * Put in the IP header and routing stuff.
2536 */
2537
2538 tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2539 IPPROTO_TCP, sk->opt,
2540 sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
2541 if (tmp < 0)
2542 {
2543 int t;
2544 /*
2545 * Finish anyway, treat this as a send that got lost.
2546 * (Not good).
2547 */
2548
2549 buff->free = 1;
2550 prot->wfree(sk,buff);
2551 sk->write_seq++;
2552 t=del_timer(&sk->timer);
2553 if(t)
2554 add_timer(&sk->timer);
2555 else
2556 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2557 return;
2558 }
2559
2560 /*
2561 * We ought to check if the end of the queue is a buffer and
2562 * if so simply add the fin to that buffer, not send it ahead.
2563 */
2564
2565 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2566 buff->dev = dev;
2567 memcpy(t1, th, sizeof(*t1));
2568 t1->seq = ntohl(sk->write_seq);
2569 sk->write_seq++;
2570 buff->h.seq = sk->write_seq;
2571 t1->ack = 1;
2572 t1->ack_seq = ntohl(sk->acked_seq);
2573 t1->window = ntohs(sk->window=tcp_select_window(sk));
2574 t1->fin = 1;
2575 t1->rst = 0;
2576 t1->doff = sizeof(*t1)/4;
2577 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2578
2579 /*
2580 * If there is data in the write queue, the fin must be appended to
2581 * the write queue.
2582 */
2583
2584 if (skb_peek(&sk->write_queue) != NULL)
2585 {
2586 buff->free = 0;
2587 if (buff->next != NULL)
2588 {
2589 printk("tcp_send_fin: next != NULL\n");
2590 skb_unlink(buff);
2591 }
2592 skb_queue_tail(&sk->write_queue, buff);
2593 }
2594 else
2595 {
2596 sk->sent_seq = sk->write_seq;
2597 sk->prot->queue_xmit(sk, dev, buff, 0);
2598 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2599 }
2600 }
2601
2602 /*
2603 * Shutdown the sending side of a connection. Much like close except
2604 * that we don't shut down the receive side or set sk->dead=1.
2605 */
2606
2607 void tcp_shutdown(struct sock *sk, int how)
2608 {
2609 /*
2610 * We need to grab some memory, and put together a FIN,
2611 * and then put it into the queue to be sent.
2612 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2613 */
2614
2615 if (!(how & SEND_SHUTDOWN))
2616 return;
2617
2618 /*
2619 * If we've already sent a FIN, or it's a closed state
2620 */
2621
2622 if (sk->state == TCP_FIN_WAIT1 ||
2623 sk->state == TCP_FIN_WAIT2 ||
2624 sk->state == TCP_CLOSING ||
2625 sk->state == TCP_LAST_ACK ||
2626 sk->state == TCP_TIME_WAIT ||
2627 sk->state == TCP_CLOSE ||
2628 sk->state == TCP_LISTEN
2629 )
2630 {
2631 return;
2632 }
2633 sk->inuse = 1;
2634
2635 /*
2636 * flag that the sender has shutdown
2637 */
2638
2639 sk->shutdown |= SEND_SHUTDOWN;
2640
2641 /*
2642 * Clear out any half completed packets.
2643 */
2644
2645 if (sk->partial)
2646 tcp_send_partial(sk);
2647
2648 /*
2649 * FIN if needed
2650 */
2651
2652 if(tcp_close_state(sk,0))
2653 tcp_send_fin(sk);
2654
2655 release_sock(sk);
2656 }
2657
2658
2659 static int
2660 tcp_recvfrom(struct sock *sk, unsigned char *to,
2661 int to_len, int nonblock, unsigned flags,
2662 struct sockaddr_in *addr, int *addr_len)
2663 {
2664 int result;
2665
2666 /*
2667 * We have to check these first, unlike the old code. If
2668 * we checked them afterwards we could lose data on an error,
2669 * which would be wrong.
2670 */
2671
2672 if(addr_len)
2673 *addr_len = sizeof(*addr);
2674 result=tcp_read(sk, to, to_len, nonblock, flags);
2675
2676 if (result < 0)
2677 return(result);
2678
2679 if(addr)
2680 {
2681 addr->sin_family = AF_INET;
2682 addr->sin_port = sk->dummy_th.dest;
2683 addr->sin_addr.s_addr = sk->daddr;
2684 }
2685 return(result);
2686 }
2687
2688
2689 /*
2690 * This routine will send an RST to the other tcp.
2691 */
2692
2693 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
2694 struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2695 {
2696 struct sk_buff *buff;
2697 struct tcphdr *t1;
2698 int tmp;
2699 struct device *ndev=NULL;
2700
2701 /*
2702 * Cannot reset a reset (Think about it).
2703 */
2704
2705 if(th->rst)
2706 return;
2707
2708 /*
2709 * We need to grab some memory, and put together an RST,
2710 * and then put it into the queue to be sent.
2711 */
2712
2713 buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2714 if (buff == NULL)
2715 return;
2716
2717 buff->sk = NULL;
2718 buff->dev = dev;
2719 buff->localroute = 0;
2720
2721 /*
2722 * Put in the IP header and routing stuff.
2723 */
2724
2725 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2726 sizeof(struct tcphdr),tos,ttl);
2727 if (tmp < 0)
2728 {
2729 buff->free = 1;
2730 prot->wfree(NULL, buff);
2731 return;
2732 }
2733
2734 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2735 memcpy(t1, th, sizeof(*t1));
2736
2737 /*
2738 * Swap the send and the receive.
2739 */
2740
2741 t1->dest = th->source;
2742 t1->source = th->dest;
2743 t1->rst = 1;
2744 t1->window = 0;
2745
2746 if(th->ack)
2747 {
2748 t1->ack = 0;
2749 t1->seq = th->ack_seq;
2750 t1->ack_seq = 0;
2751 }
2752 else
2753 {
2754 t1->ack = 1;
2755 if(!th->syn)
2756 t1->ack_seq=htonl(th->seq);
2757 else
2758 t1->ack_seq=htonl(th->seq+1);
2759 t1->seq=0;
2760 }
2761
2762 t1->syn = 0;
2763 t1->urg = 0;
2764 t1->fin = 0;
2765 t1->psh = 0;
2766 t1->doff = sizeof(*t1)/4;
2767 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2768 prot->queue_xmit(NULL, ndev, buff, 1);
2769 tcp_statistics.TcpOutSegs++;
2770 }
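/*
 * Editorial note: the swap above follows the RFC 793 reset rules. If
 * the offending segment carried an ACK we answer
 * <SEQ=SEG.ACK><CTL=RST>; otherwise we answer
 * <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK>, approximated here by
 * SEG.SEQ (+1 for a SYN) since the segment length is not known at
 * this point.
 */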
2771
2772
2773 /*
2774 * Look for tcp options. Parses everything but only knows about MSS.
2775 * This routine is always called with the packet containing the SYN.
2776 * However it may also be called with the ack to the SYN. So you
2777 * can't assume this is always the SYN. It's always called after
2778 * we have set up sk->mtu to our own MTU.
2779 *
2780 * We need at minimum to add PAWS support here. Possibly large windows
2781 * as Linux gets deployed on 100Mb/sec networks.
2782 */
2783
2784 static void tcp_options(struct sock *sk, struct tcphdr *th)
2785 {
2786 unsigned char *ptr;
2787 int length=(th->doff*4)-sizeof(struct tcphdr);
2788 int mss_seen = 0;
2789
2790 ptr = (unsigned char *)(th + 1);
2791
2792 while(length>0)
2793 {
2794 int opcode=*ptr++;
2795 int opsize=*ptr++;
2796 switch(opcode)
2797 {
2798 case TCPOPT_EOL:
2799 return;
2800 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
2801 length--;
2802 ptr--; /* the opsize=*ptr++ above was a mistake */
2803 continue;
2804
2805 default:
2806 if(opsize<=2) /* Avoid silly options looping forever */
2807 return;
2808 switch(opcode)
2809 {
2810 case TCPOPT_MSS:
2811 if(opsize==4 && th->syn)
2812 {
2813 sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2814 mss_seen = 1;
2815 }
2816 break;
2817 /* Add other options here as people feel the urge to implement stuff like large windows */
2818 }
2819 ptr+=opsize-2;
2820 length-=opsize;
2821 }
2822 }
2823 if (th->syn)
2824 {
2825 if (! mss_seen)
2826 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
2827 }
2828 #ifdef CONFIG_INET_PCTCP
2829 sk->mss = min(sk->max_window >> 1, sk->mtu);
2830 #else
2831 sk->mss = min(sk->max_window, sk->mtu);
2832 #endif
2833 }
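/*
 * Editorial example of the wire format parsed above: the MSS option
 * in a SYN is kind=2, length=4, then the 16-bit MSS in network byte
 * order, so an MSS of 1460 appears on the wire as
 *
 *	02 04 05 b4
 *
 * A NOP (kind=1) is a single byte with no length octet, which is why
 * the parser backs ptr up again for TCPOPT_NOP.
 */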
2834
2835 static inline unsigned long default_mask(unsigned long dst)
2836 {
2837 dst = ntohl(dst);
2838 if (IN_CLASSA(dst))
2839 return htonl(IN_CLASSA_NET);
2840 if (IN_CLASSB(dst))
2841 return htonl(IN_CLASSB_NET);
2842 return htonl(IN_CLASSC_NET);
2843 }
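/*
 * Editorial example: default_mask() applied to 10.1.2.3 (class A)
 * yields 255.0.0.0, to 172.16.0.1 (class B) yields 255.255.0.0, and
 * to anything else yields the class C mask 255.255.255.0.
 */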
2844
2845 /*
2846 * Default sequence number picking algorithm.
2847 * As close as possible to RFC 793, which
2848 * suggests using a 250kHz clock.
2849 * Further reading shows this assumes 2MB/s networks.
2850 * For 10MB/s ethernet, a 1MHz clock is appropriate.
2851 * That's funny, Linux has one built in! Use it!
2852 */
2853
2854 extern inline u32 tcp_init_seq(void)
2855 {
2856 struct timeval tv;
2857 do_gettimeofday(&tv);
2858 return tv.tv_usec+tv.tv_sec*1000000;
2859 }
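/*
 * Editorial note: the sequence space above advances one unit per
 * microsecond, so the 32-bit ISN wraps every 2^32 usec, roughly 71.6
 * minutes - comfortably longer than any plausible segment lifetime,
 * as RFC 793 intends.
 */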
2860
2861 /*
2862 * This routine handles a connection request.
2863 * It should make sure we haven't already responded.
2864 * Because of the way BSD works, we have to send a syn/ack now.
2865 * This also means it will be harder to close a socket which is
2866 * listening.
2867 */
2868
2869 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
2870 unsigned long daddr, unsigned long saddr,
2871 struct options *opt, struct device *dev, u32 seq)
2872 {
2873 struct sk_buff *buff;
2874 struct tcphdr *t1;
2875 unsigned char *ptr;
2876 struct sock *newsk;
2877 struct tcphdr *th;
2878 struct device *ndev=NULL;
2879 int tmp;
2880 struct rtable *rt;
2881
2882 th = skb->h.th;
2883
2884 /* If the socket is dead, don't accept the connection. */
2885 if (!sk->dead)
2886 {
2887 sk->data_ready(sk,0);
2888 }
2889 else
2890 {
2891 if(sk->debug)
2892 printk("Reset on %p: Connect on dead socket.\n",sk);
2893 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2894 tcp_statistics.TcpAttemptFails++;
2895 kfree_skb(skb, FREE_READ);
2896 return;
2897 }
2898
2899 /*
2900 * Make sure we can accept more. This will prevent a
2901 * flurry of syns from eating up all our memory.
2902 */
2903
2904 if (sk->ack_backlog >= sk->max_ack_backlog)
2905 {
2906 tcp_statistics.TcpAttemptFails++;
2907 kfree_skb(skb, FREE_READ);
2908 return;
2909 }
2910
2911 /*
2912 * We need to build a new sock struct.
2913 * It is sort of bad to have a socket without an inode attached
2914 * to it, but the wake_up's will just wake up the listening socket,
2915 * and if the listening socket is destroyed before this is taken
2916 * off of the queue, this will take care of it.
2917 */
2918
2919 newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2920 if (newsk == NULL)
2921 {
2922 /* just ignore the syn. It will get retransmitted. */
2923 tcp_statistics.TcpAttemptFails++;
2924 kfree_skb(skb, FREE_READ);
2925 return;
2926 }
2927
2928 memcpy(newsk, sk, sizeof(*newsk));
2929 skb_queue_head_init(&newsk->write_queue);
2930 skb_queue_head_init(&newsk->receive_queue);
2931 newsk->send_head = NULL;
2932 newsk->send_tail = NULL;
2933 skb_queue_head_init(&newsk->back_log);
2934 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/
2935 newsk->rto = TCP_TIMEOUT_INIT;
2936 newsk->mdev = 0;
2937 newsk->max_window = 0;
2938 newsk->cong_window = 1;
2939 newsk->cong_count = 0;
2940 newsk->ssthresh = 0;
2941 newsk->backoff = 0;
2942 newsk->blog = 0;
2943 newsk->intr = 0;
2944 newsk->proc = 0;
2945 newsk->done = 0;
2946 newsk->partial = NULL;
2947 newsk->pair = NULL;
2948 newsk->wmem_alloc = 0;
2949 newsk->rmem_alloc = 0;
2950 newsk->localroute = sk->localroute;
2951
2952 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
2953
2954 newsk->err = 0;
2955 newsk->shutdown = 0;
2956 newsk->ack_backlog = 0;
2957 newsk->acked_seq = skb->h.th->seq+1;
2958 newsk->copied_seq = skb->h.th->seq+1;
2959 newsk->fin_seq = skb->h.th->seq;
2960 newsk->state = TCP_SYN_RECV;
2961 newsk->timeout = 0;
2962 newsk->ip_xmit_timeout = 0;
2963 newsk->write_seq = seq;
2964 newsk->window_seq = newsk->write_seq;
2965 newsk->rcv_ack_seq = newsk->write_seq;
2966 newsk->urg_data = 0;
2967 newsk->retransmits = 0;
2968 newsk->linger=0;
2969 newsk->destroy = 0;
2970 init_timer(&newsk->timer);
2971 newsk->timer.data = (unsigned long)newsk;
2972 newsk->timer.function = &net_timer;
2973 init_timer(&newsk->retransmit_timer);
2974 newsk->retransmit_timer.data = (unsigned long)newsk;
2975 newsk->retransmit_timer.function=&retransmit_timer;
2976 newsk->dummy_th.source = skb->h.th->dest;
2977 newsk->dummy_th.dest = skb->h.th->source;
2978
2979 /*
2980 * Swap these two, they are from our point of view.
2981 */
2982
2983 newsk->daddr = saddr;
2984 newsk->saddr = daddr;
2985
2986 put_sock(newsk->num,newsk);
2987 newsk->dummy_th.res1 = 0;
2988 newsk->dummy_th.doff = 6;
2989 newsk->dummy_th.fin = 0;
2990 newsk->dummy_th.syn = 0;
2991 newsk->dummy_th.rst = 0;
2992 newsk->dummy_th.psh = 0;
2993 newsk->dummy_th.ack = 0;
2994 newsk->dummy_th.urg = 0;
2995 newsk->dummy_th.res2 = 0;
2996 newsk->acked_seq = skb->h.th->seq + 1;
2997 newsk->copied_seq = skb->h.th->seq + 1;
2998 newsk->socket = NULL;
2999
3000 /*
3001 * Grab the ttl and tos values and use them
3002 */
3003
3004 newsk->ip_ttl=sk->ip_ttl;
3005 newsk->ip_tos=skb->ip_hdr->tos;
3006
3007 /*
3008 * Use 512 or whatever user asked for
3009 */
3010
3011 /*
3012 * Note use of sk->user_mss, since user has no direct access to newsk
3013 */
3014
3015 rt=ip_rt_route(saddr, NULL,NULL);
3016
3017 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3018 newsk->window_clamp = rt->rt_window;
3019 else
3020 newsk->window_clamp = 0;
3021
3022 if (sk->user_mss)
3023 newsk->mtu = sk->user_mss;
3024 else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
3025 newsk->mtu = rt->rt_mss - sizeof(struct iphdr) - sizeof(struct tcphdr);
3026 else
3027 {
3028 #ifdef CONFIG_INET_SNARL /* Sub Nets Are Local */
3029 if ((saddr ^ daddr) & default_mask(saddr))
3030 #else
3031 if ((saddr ^ daddr) & dev->pa_mask)
3032 #endif
3033 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
3034 else
3035 newsk->mtu = MAX_WINDOW;
3036 }
3037
3038 /*
3039 * But not bigger than device MTU
3040 */
3041
3042 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
3043
3044 /*
3045 * This will min with what arrived in the packet
3046 */
3047
3048 tcp_options(newsk,skb->h.th);
3049
3050 tcp_cache_zap();
3051
3052 buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3053 if (buff == NULL)
3054 {
3055 sk->err = ENOMEM;
3056 newsk->dead = 1;
3057 newsk->state = TCP_CLOSE;
3058 /* And this will destroy it */
3059 release_sock(newsk);
3060 kfree_skb(skb, FREE_READ);
3061 tcp_statistics.TcpAttemptFails++;
3062 return;
3063 }
3064
3065 buff->sk = newsk;
3066 buff->localroute = newsk->localroute;
3067
3068 /*
3069 * Put in the IP header and routing stuff.
3070 */
3071
3072 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3073 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
3074
3075 /*
3076 * Something went wrong.
3077 */
3078
3079 if (tmp < 0)
3080 {
3081 sk->err = tmp;
3082 buff->free = 1;
3083 kfree_skb(buff,FREE_WRITE);
3084 newsk->dead = 1;
3085 newsk->state = TCP_CLOSE;
3086 release_sock(newsk);
3087 skb->sk = sk;
3088 kfree_skb(skb, FREE_READ);
3089 tcp_statistics.TcpAttemptFails++;
3090 return;
3091 }
3092
3093 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
3094
3095 memcpy(t1, skb->h.th, sizeof(*t1));
3096 buff->h.seq = newsk->write_seq;
3097 /*
3098 * Swap the send and the receive.
3099 */
3100 t1->dest = skb->h.th->source;
3101 t1->source = newsk->dummy_th.source;
3102 t1->seq = ntohl(newsk->write_seq++);
3103 t1->ack = 1;
3104 newsk->window = tcp_select_window(newsk);
3105 newsk->sent_seq = newsk->write_seq;
3106 t1->window = ntohs(newsk->window);
3107 t1->res1 = 0;
3108 t1->res2 = 0;
3109 t1->rst = 0;
3110 t1->urg = 0;
3111 t1->psh = 0;
3112 t1->syn = 1;
3113 t1->ack_seq = ntohl(skb->h.th->seq+1);
3114 t1->doff = sizeof(*t1)/4+1;
3115 ptr = skb_put(buff,4);
3116 ptr[0] = 2;
3117 ptr[1] = 4;
3118 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3119 ptr[3] =(newsk->mtu) & 0xff;
3120
3121 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3122 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3123 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3124 skb->sk = newsk;
3125
3126 /*
3127 * Charge the sock_buff to newsk.
3128 */
3129
3130 sk->rmem_alloc -= skb->truesize;
3131 newsk->rmem_alloc += skb->truesize;
3132
3133 skb_queue_tail(&sk->receive_queue,skb);
3134 sk->ack_backlog++;
3135 release_sock(newsk);
3136 tcp_statistics.TcpOutSegs++;
3137 }
3138
3139
3140 static void tcp_close(struct sock *sk, int timeout)
3141 {
3142 /*
3143 * We need to grab some memory, and put together a FIN,
3144 * and then put it into the queue to be sent.
3145 */
3146
3147 sk->inuse = 1;
3148
3149 if(th_cache_sk==sk)
3150 tcp_cache_zap();
3151 if(sk->state == TCP_LISTEN)
3152 {
3153 /* Special case */
3154 tcp_set_state(sk, TCP_CLOSE);
3155 tcp_close_pending(sk);
3156 release_sock(sk);
3157 return;
3158 }
3159
3160 sk->keepopen = 1;
3161 sk->shutdown = SHUTDOWN_MASK;
3162
3163 if (!sk->dead)
3164 sk->state_change(sk);
3165
3166 if (timeout == 0)
3167 {
3168 struct sk_buff *skb;
3169
3170 /*
3171 * We need to flush the recv. buffs. We do this only on the
3172 * descriptor close, not protocol-sourced closes, because the
3173 * reader process may not have drained the data yet!
3174 */
3175
3176 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3177 kfree_skb(skb, FREE_READ);
3178 /*
3179 * Get rid of any half-completed packets.
3180 */
3181
3182 if (sk->partial)
3183 tcp_send_partial(sk);
3184 }
3185
3186
3187 /*
3188 * Timeout is not the same thing - however the code likes
3189 * to send both the same way (sigh).
3190 */
3191
3192 if(timeout)
3193 {
3194 tcp_set_state(sk, TCP_CLOSE); /* Dead */
3195 }
3196 else
3197 {
3198 if(tcp_close_state(sk,1)==1)
3199 {
3200 tcp_send_fin(sk);
3201 }
3202 }
3203 release_sock(sk);
3204 }
3205
3206
3207 /*
3208 * This routine takes stuff off of the write queue,
3209 * and puts it in the xmit queue. This happens as incoming acks
3210 * open up the remote window for us.
3211 */
3212
3213 static void tcp_write_xmit(struct sock *sk)
3214 {
3215 struct sk_buff *skb;
3216
3217 /*
3218 * The bytes will have to remain here. In time closedown will
3219 * empty the write queue and all will be happy.
3220 */
3221
3222 if(sk->zapped)
3223 return;
3224
3225 /*
3226 * Anything on the transmit queue that fits the window can
3227 * be added providing we are not
3228 *
3229 * a) retransmitting (Nagle's rule)
3230 * b) exceeding our congestion window.
3231 */
3232
3233 while((skb = skb_peek(&sk->write_queue)) != NULL &&
3234 before(skb->h.seq, sk->window_seq + 1) &&
3235 (sk->retransmits == 0 ||
3236 sk->ip_xmit_timeout != TIME_WRITE ||
3237 before(skb->h.seq, sk->rcv_ack_seq + 1))
3238 && sk->packets_out < sk->cong_window)
3239 {
3240 IS_SKB(skb);
3241 skb_unlink(skb);
3242
3243 /*
3244 * See if we really need to send the packet.
3245 */
3246
3247 if (before(skb->h.seq, sk->rcv_ack_seq +1))
3248 {
3249 /*
3250 * This is acked data. We can discard it. This
3251 * cannot currently occur.
3252 */
3253
3254 sk->retransmits = 0;
3255 kfree_skb(skb, FREE_WRITE);
3256 if (!sk->dead)
3257 sk->write_space(sk);
3258 }
3259 else
3260 {
3261 struct tcphdr *th;
3262 struct iphdr *iph;
3263 int size;
3264 /*
3265 * put in the ack seq and window at this point rather than earlier,
3266 * in order to keep them monotonic. We really want to avoid taking
3267 * back window allocations. That's legal, but RFC1122 says it's frowned on.
3268 * Ack and window will in general have changed since this packet was put
3269 * on the write queue.
3270 */
3271 iph = skb->ip_hdr;
3272 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
3273 size = skb->len - (((unsigned char *) th) - skb->data);
3274
3275 th->ack_seq = ntohl(sk->acked_seq);
3276 th->window = ntohs(tcp_select_window(sk));
3277
3278 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
3279
3280 sk->sent_seq = skb->h.seq;
3281
3282 /*
3283 * IP manages our queue for some crazy reason
3284 */
3285
3286 sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3287
3288 /*
3289 * Again we slide the timer wrongly
3290 */
3291
3292 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3293 }
3294 }
3295 }
3296
3297
3298 /*
3299 * This routine deals with incoming acks, but not outgoing ones.
3300 */
3301
3302 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
3303 {
3304 u32 ack;
3305 int flag = 0;
3306
3307 /*
3308 * 1 - there was data in the packet as well as the ack, new data was
3309 * sent, or we are in a shutdown state
3310 * 2 - data from retransmit queue was acked and removed
3311 * 4 - window shrunk or data from retransmit queue was acked and removed
3312 */
3313
3314 if(sk->zapped)
3315 return(1); /* Dead, can't ack any more so why bother */
3316
3317 /*
3318 * Have we discovered a larger window
3319 */
3320
3321 ack = ntohl(th->ack_seq);
3322
3323 if (ntohs(th->window) > sk->max_window)
3324 {
3325 sk->max_window = ntohs(th->window);
3326 #ifdef CONFIG_INET_PCTCP
3327 /* Hack because we don't send partial packets to non SWS
3328 handling hosts */
3329 sk->mss = min(sk->max_window>>1, sk->mtu);
3330 #else
3331 sk->mss = min(sk->max_window, sk->mtu);
3332 #endif
3333 }
3334
3335 /*
3336 * We have dropped back to keepalive timeouts. Thus we have
3337 * no retransmits pending.
3338 */
3339
3340 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3341 sk->retransmits = 0;
3342
3343 /*
3344 * If the ack is newer than sent or older than previous acks
3345 * then we can probably ignore it.
3346 */
3347
3348 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3349 {
3350 if(sk->debug)
3351 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3352
3353 /*
3354 * Keepalive processing.
3355 */
3356
3357 if (after(ack, sk->sent_seq))
3358 {
3359 return(0);
3360 }
3361
3362 /*
3363 * Restart the keepalive timer.
3364 */
3365
3366 if (sk->keepopen)
3367 {
3368 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3369 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3370 }
3371 return(1);
3372 }
3373
3374 /*
3375 * If there is data, set flag bit 1
3376 */
3377
3378 if (len != th->doff*4)
3379 flag |= 1;
3380
3381 /*
3382 * See if our window has been shrunk.
3383 */
3384
3385 if (after(sk->window_seq, ack+ntohs(th->window)))
3386 {
3387 /*
3388 * We may need to move packets from the send queue
3389 * to the write queue, if the window has been shrunk on us.
3390 * The RFC says you are not allowed to shrink your window
3391 * like this, but if the other end does, you must be able
3392 * to deal with it.
3393 */
3394 struct sk_buff *skb;
3395 struct sk_buff *skb2;
3396 struct sk_buff *wskb = NULL;
3397
3398 skb2 = sk->send_head;
3399 sk->send_head = NULL;
3400 sk->send_tail = NULL;
3401
3402 /*
3403 * This is an artifact of a flawed concept. We want one
3404 * queue and a smarter send routine when we send all.
3405 */
3406
3407 flag |= 4; /* Window changed */
3408
3409 sk->window_seq = ack + ntohs(th->window);
3410 cli();
3411 while (skb2 != NULL)
3412 {
3413 skb = skb2;
3414 skb2 = skb->link3;
3415 skb->link3 = NULL;
3416 if (after(skb->h.seq, sk->window_seq))
3417 {
3418 if (sk->packets_out > 0)
3419 sk->packets_out--;
3420 /* We may need to remove this from the dev send list. */
3421 if (skb->next != NULL)
3422 {
3423 skb_unlink(skb);
3424 }
3425 /* Now add it to the write_queue. */
3426 if (wskb == NULL)
3427 skb_queue_head(&sk->write_queue,skb);
3428 else
3429 skb_append(wskb,skb);
3430 wskb = skb;
3431 }
3432 else
3433 {
3434 if (sk->send_head == NULL)
3435 {
3436 sk->send_head = skb;
3437 sk->send_tail = skb;
3438 }
3439 else
3440 {
3441 sk->send_tail->link3 = skb;
3442 sk->send_tail = skb;
3443 }
3444 skb->link3 = NULL;
3445 }
3446 }
3447 sti();
3448 }
3449
3450 /*
3451 * Pipe has emptied
3452 */
3453
3454 if (sk->send_tail == NULL || sk->send_head == NULL)
3455 {
3456 sk->send_head = NULL;
3457 sk->send_tail = NULL;
3458 sk->packets_out= 0;
3459 }
3460
3461 /*
3462 * Update the right hand window edge of the host
3463 */
3464
3465 sk->window_seq = ack + ntohs(th->window);
3466
3467 /*
3468 * We don't want too many packets out there.
3469 */
3470
3471 if (sk->ip_xmit_timeout == TIME_WRITE &&
3472 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3473 {
3474 /*
3475 * This is Jacobson's slow start and congestion avoidance.
3476 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
3477 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
3478 * counter and increment it once every cwnd times. It's possible
3479 * that this should be done only if sk->retransmits == 0. I'm
3480 * interpreting "new data is acked" as including data that has
3481 * been retransmitted but is just now being acked.
3482 */
3483 if (sk->cong_window < sk->ssthresh)
3484 /*
3485 * In "safe" area, increase
3486 */
3487 sk->cong_window++;
3488 else
3489 {
3490 /*
3491 * In dangerous area, increase slowly. In theory this is
3492 * sk->cong_window += 1 / sk->cong_window
3493 */
3494 if (sk->cong_count >= sk->cong_window)
3495 {
3496 sk->cong_window++;
3497 sk->cong_count = 0;
3498 }
3499 else
3500 sk->cong_count++;
3501 }
3502 }
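	/*
	 * Growth pattern implied above (editorial, assuming
	 * ssthresh = 8 and one ack per segment): cong_window rises by
	 * one per ack while below ssthresh, roughly doubling each
	 * round trip; above ssthresh it takes cong_window acks per
	 * increment, i.e. about one extra segment per round trip -
	 * the linear congestion avoidance regime.
	 */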
3503
3504 /*
3505 * Remember the highest ack received.
3506 */
3507
3508 sk->rcv_ack_seq = ack;
3509
3510 /*
3511 * If this ack opens up a zero window, clear backoff. It was
3512 * being used to time the probes, and is probably far higher than
3513 * it needs to be for normal retransmission.
3514 */
3515
3516 if (sk->ip_xmit_timeout == TIME_PROBE0)
3517 {
3518 sk->retransmits = 0; /* Our probe was answered */
3519
3520 /*
3521 * Was it a usable window open ?
3522 */
3523
3524 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */
3525 ! before (sk->window_seq, sk->write_queue.next->h.seq))
3526 {
3527 sk->backoff = 0;
3528
3529 /*
3530 * Recompute rto from rtt. this eliminates any backoff.
3531 */
3532
3533 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3534 if (sk->rto > 120*HZ)
3535 sk->rto = 120*HZ;
3536 if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about
3537 .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3538 .2 of a second is going to need huge windows (SIGH) */
3539 sk->rto = 20;
3540 }
3541 }
3542
3543 /*
3544 * See if we can take anything off of the retransmit queue.
3545 */
3546
3547 while(sk->send_head != NULL)
3548 {
3549 /* Check for a bug. */
3550 if (sk->send_head->link3 &&
3551 after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
3552 printk("INET: tcp.c: *** bug send_list out of order.\n");
3553
3554 /*
3555 * If our packet is before the ack sequence we can
3556 * discard it as it's confirmed to have arrived at the other end.
3557 */
3558
3559 if (before(sk->send_head->h.seq, ack+1))
3560 {
3561 struct sk_buff *oskb;
3562 if (sk->retransmits)
3563 {
3564 /*
3565 * We were retransmitting. don't count this in RTT est
3566 */
3567 flag |= 2;
3568
3569 /*
3570 * even though we've gotten an ack, we're still
3571 * retransmitting as long as we're sending from
3572 * the retransmit queue. Keeping retransmits non-zero
3573 * prevents us from getting new data interspersed with
3574 * retransmissions.
3575 */
3576
3577 if (sk->send_head->link3) /* Any more queued retransmits? */
3578 sk->retransmits = 1;
3579 else
3580 sk->retransmits = 0;
3581 }
3582 /*
3583 * Note that we only reset backoff and rto in the
3584 * rtt recomputation code. And that doesn't happen
3585 * if there were retransmissions in effect. So the
3586 * first new packet after the retransmissions is
3587 * sent with the backoff still in effect. Not until
3588 * we get an ack from a non-retransmitted packet do
3589 * we reset the backoff and rto. This allows us to deal
3590 * with a situation where the network delay has increased
3591 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3592 */
3593
3594 /*
3595 * We have one less packet out there.
3596 */
3597
3598 if (sk->packets_out > 0)
3599 sk->packets_out --;
3600 /*
3601 * Wake up the process, it can probably write more.
3602 */
3603 if (!sk->dead)
3604 sk->write_space(sk);
3605 oskb = sk->send_head;
3606
3607 if (!(flag&2)) /* Not retransmitting */
3608 {
3609 long m;
3610
3611 /*
3612 * The following amusing code comes from Jacobson's
3613 * article in SIGCOMM '88. Note that rtt and mdev
3614 * are scaled versions of rtt and mean deviation.
3615 * This is designed to be as fast as possible
3616 * m stands for "measurement".
3617 */
3618
3619 m = jiffies - oskb->when; /* RTT */
3620 if(m<=0)
3621 m=1; /* IS THIS RIGHT FOR <0 ??? */
3622 m -= (sk->rtt >> 3); /* m is now error in rtt est */
3623 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
3624 if (m < 0)
3625 m = -m; /* m is now abs(error) */
3626 m -= (sk->mdev >> 2); /* similar update on mdev */
3627 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
3628
3629 /*
3630 * Now update timeout. Note that this removes any backoff.
3631 */
3632
3633 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3634 if (sk->rto > 120*HZ)
3635 sk->rto = 120*HZ;
3636 if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3637 sk->rto = 20;
3638 sk->backoff = 0;
3639 }
3640 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt
3641 in this case' as we just set it up */
3642 cli();
3643 oskb = sk->send_head;
3644 IS_SKB(oskb);
3645 sk->send_head = oskb->link3;
3646 if (sk->send_head == NULL)
3647 {
3648 sk->send_tail = NULL;
3649 }
3650
3651 /*
3652 * We may need to remove this from the dev send list.
3653 */
3654
3655 if (oskb->next)
3656 skb_unlink(oskb);
3657 sti();
3658 kfree_skb(oskb, FREE_WRITE); /* write. */
3659 if (!sk->dead)
3660 sk->write_space(sk);
3661 }
3662 else
3663 {
3664 break;
3665 }
3666 }
3667
3668 /*
3669 * XXX someone ought to look at this too... at the moment, if skb_peek()
3670 * returns non-NULL, we completely ignore the timer stuff in the else
3671 * clause. We ought to organize the code so that the else clause can
3672 * (should) be executed regardless, possibly moving the PROBE timer
3673 * reset over. The skb_peek() thing should only move stuff to the
3674 * write queue, NOT also manage the timer functions.
3675 */
3676
3677 /*
3678 * Maybe we can take some stuff off of the write queue,
3679 * and put it onto the xmit queue.
3680 */
3681 if (skb_peek(&sk->write_queue) != NULL)
3682 {
3683 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3684 (sk->retransmits == 0 ||
3685 sk->ip_xmit_timeout != TIME_WRITE ||
3686 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3687 && sk->packets_out < sk->cong_window)
3688 {
3689 /*
3690 * Add more data to the send queue.
3691 */
3692 flag |= 1;
3693 tcp_write_xmit(sk);
3694 }
3695 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3696 sk->send_head == NULL &&
3697 sk->ack_backlog == 0 &&
3698 sk->state != TCP_TIME_WAIT)
3699 {
3700 /*
3701 * Data to queue but no room.
3702 */
3703 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3704 }
3705 }
3706 else
3707 {
3708 /*
3709 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3710 * from TCP_CLOSE we don't do anything
3711 *
3712 * from anything else, if there is write data (or fin) pending,
3713 * we use a TIME_WRITE timeout, else if keepalive we reset to
3714 * a KEEPALIVE timeout, else we delete the timer.
3715 *
3716 * We do not set flag for nominal write data, otherwise we may
3717 * force a state where we start to write itsy bitsy tidbits
3718 * of data.
3719 */
3720
3721 switch(sk->state) {
3722 case TCP_TIME_WAIT:
3723 /*
3724 * keep us in TIME_WAIT until we stop getting packets,
3725 * reset the timeout.
3726 */
3727 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3728 break;
3729 case TCP_CLOSE:
3730 /*
3731 * don't touch the timer.
3732 */
3733 break;
3734 default:
3735 /*
3736 * Must check send_head, write_queue, and ack_backlog
3737 * to determine which timeout to use.
3738 */
3739 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3740 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3741 } else if (sk->keepopen) {
3742 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3743 } else {
3744 del_timer(&sk->retransmit_timer);
3745 sk->ip_xmit_timeout = 0;
3746 }
3747 break;
3748 }
3749 }
3750
3751 /*
3752 * We have nothing queued but space to send. Send any partial
3753 * packets immediately (end of Nagle rule application).
3754 */
3755
3756 if (sk->packets_out == 0 && sk->partial != NULL &&
3757 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3758 {
3759 flag |= 1;
3760 tcp_send_partial(sk);
3761 }
3762
3763 /*
3764 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
3765 * we are now waiting for an acknowledge to our FIN. The other end is
3766 * already in TIME_WAIT.
3767 *
3768 * Move to TCP_CLOSE on success.
3769 */
3770
3771 if (sk->state == TCP_LAST_ACK)
3772 {
3773 if (!sk->dead)
3774 sk->state_change(sk);
3775 if(sk->debug)
3776 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3777 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3778 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
3779 {
3780 flag |= 1;
3781 tcp_set_state(sk,TCP_CLOSE);
3782 sk->shutdown = SHUTDOWN_MASK;
3783 }
3784 }
3785
3786 /*
3787 * Incoming ACK to a FIN we sent in the case of our initiating the close.
3788 *
3789 * Move to FIN_WAIT2 to await a FIN from the other end. Set
3790 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3791 */
3792
3793 if (sk->state == TCP_FIN_WAIT1)
3794 {
3795
3796 if (!sk->dead)
3797 sk->state_change(sk);
3798 if (sk->rcv_ack_seq == sk->write_seq)
3799 {
3800 flag |= 1;
3801 sk->shutdown |= SEND_SHUTDOWN;
3802 tcp_set_state(sk, TCP_FIN_WAIT2);
3803 }
3804 }
3805
3806 /*
3807 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
3808 *
3809 * Move to TIME_WAIT
3810 */
3811
3812 if (sk->state == TCP_CLOSING)
3813 {
3814
3815 if (!sk->dead)
3816 sk->state_change(sk);
3817 if (sk->rcv_ack_seq == sk->write_seq)
3818 {
3819 flag |= 1;
3820 tcp_time_wait(sk);
3821 }
3822 }
3823
3824 /*
3825 * Final ack of a three way shake
3826 */
3827
3828 if(sk->state==TCP_SYN_RECV)
3829 {
3830 tcp_set_state(sk, TCP_ESTABLISHED);
3831 tcp_options(sk,th);
3832 sk->dummy_th.dest=th->source;
3833 sk->copied_seq = sk->acked_seq;
3834 if(!sk->dead)
3835 sk->state_change(sk);
3836 if(sk->max_window==0)
3837 {
3838 sk->max_window=32; /* Sanity check */
3839 sk->mss=min(sk->max_window,sk->mtu);
3840 }
3841 }
3842
3843 /*
3844 * I make no guarantees about the first clause in the following
3845 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
3846 * what conditions "!flag" would be true. However I think the rest
3847 * of the conditions would prevent that from causing any
3848 * unnecessary retransmission.
3849 * Clearly if the first packet has expired it should be
3850 * retransmitted. The other alternative, "flag&2 && retransmits", is
3851 * harder to explain: You have to look carefully at how and when the
3852 * timer is set and with what timeout. The most recent transmission always
3853 * sets the timer. So in general if the most recent thing has timed
3854 * out, everything before it has as well. So we want to go ahead and
3855 * retransmit some more. If we didn't explicitly test for this
3856 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3857 * would not be true. If you look at the pattern of timing, you can
3858 * show that rto is increased fast enough that the next packet would
3859 * almost never be retransmitted immediately. Then you'd end up
3860 * waiting for a timeout to send each packet on the retransmission
3861 * queue. With my implementation of the Karn sampling algorithm,
3862 * the timeout would double each time. The net result is that it would
3863 * take a hideous amount of time to recover from a single dropped packet.
3864 * It's possible that there should also be a test for TIME_WRITE, but
3865 * I think as long as "send_head != NULL" and "retransmit" is on, we've
3866 * got to be in real retransmission mode.
3867 * Note that tcp_do_retransmit is called with all==1. Setting cong_window
3868 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3869 * As long as no further losses occur, this seems reasonable.
3870 */
3871
3872 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3873 (((flag&2) && sk->retransmits) ||
3874 (sk->send_head->when + sk->rto < jiffies)))
3875 {
3876 if(sk->send_head->when + sk->rto < jiffies)
3877 tcp_retransmit(sk,0);
3878 else
3879 {
3880 tcp_do_retransmit(sk, 1);
3881 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3882 }
3883 }
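	/*
	 *	Illustrative sketch (not part of the original file): the Karn
	 *	backoff the comment above leans on. Each expiry doubles rto,
	 *	with the same 120 second cap tcp_send_probe0() applies below;
	 *	the variables here are hypothetical.
	 */
#if 0
	rto = min(rto << 1, 120*HZ);	/* successive timeouts back off 1x, 2x, 4x, ... */
#endif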
3884
3885 return(1);
3886 }
3887
3888
3889 /*
3890 * Process the FIN bit. This now behaves as it is supposed to work
3891 * and the FIN takes effect when it is validly part of sequence
3892	 * space, not before, i.e. not while there are still holes ahead of it.
3893 *
3894 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3895 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
3896 * TIME-WAIT)
3897 *
3898 * If we are in FINWAIT-1, a received FIN indicates simultaneous
3899 * close and we go into CLOSING (and later onto TIME-WAIT)
3900 *
3901 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3902 *
3903 */
3904
3905 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
3906 {
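	/* A SYN or FIN occupies one unit of sequence space, so both are
	 * counted into the last sequence number this FIN accounts for. */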
3907 sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3908
3909 if (!sk->dead)
3910 {
3911 sk->state_change(sk);
3912 sock_wake_async(sk->socket, 1);
3913 }
3914
3915 switch(sk->state)
3916 {
3917 case TCP_SYN_RECV:
3918 case TCP_SYN_SENT:
3919 case TCP_ESTABLISHED:
3920 /*
3921 * move to CLOSE_WAIT, tcp_data() already handled
3922 * sending the ack.
3923 */
3924 tcp_set_state(sk,TCP_CLOSE_WAIT);
3925 if (th->rst)
3926 sk->shutdown = SHUTDOWN_MASK;
3927 break;
3928
3929 case TCP_CLOSE_WAIT:
3930 case TCP_CLOSING:
3931 /*
3932 * received a retransmission of the FIN, do
3933 * nothing.
3934 */
3935 break;
3936 case TCP_TIME_WAIT:
3937 /*
3938 * received a retransmission of the FIN,
3939 * restart the TIME_WAIT timer.
3940 */
3941 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3942 return(0);
3943 case TCP_FIN_WAIT1:
3944 /*
3945 * This case occurs when a simultaneous close
3946 * happens, we must ack the received FIN and
3947 * enter the CLOSING state.
3948 *
3949 * This causes a WRITE timeout, which will either
3950 * move on to TIME_WAIT when we timeout, or resend
3951 * the FIN properly (maybe we get rid of that annoying
3952 * FIN lost hang). The TIME_WRITE code is already correct
3953 * for handling this timeout.
3954 */
3955
3956 if(sk->ip_xmit_timeout != TIME_WRITE)
3957 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3958 tcp_set_state(sk,TCP_CLOSING);
3959 break;
3960 case TCP_FIN_WAIT2:
3961 /*
3962 * received a FIN -- send ACK and enter TIME_WAIT
3963 */
3964 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3965 sk->shutdown|=SHUTDOWN_MASK;
3966 tcp_set_state(sk,TCP_TIME_WAIT);
3967 break;
3968 case TCP_CLOSE:
3969 /*
3970 * already in CLOSE
3971 */
3972 break;
3973 default:
3974 tcp_set_state(sk,TCP_LAST_ACK);
3975
3976 /* Start the timers. */
3977 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3978 return(0);
3979 }
3980
3981 return(0);
3982 }
3983
3984
3985
3986 /*
3987 * This routine handles the data. If there is room in the buffer,
3988	 *	it will already have been moved into it. If there is no
3989 * room, then we will just have to discard the packet.
3990 */
3991
3992 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk,
3993 unsigned long saddr, unsigned short len)
3994 {
3995 struct sk_buff *skb1, *skb2;
3996 struct tcphdr *th;
3997 int dup_dumped=0;
3998 u32 new_seq, shut_seq;
3999
4000 th = skb->h.th;
4001 skb_pull(skb,th->doff*4);
4002 skb_trim(skb,len-(th->doff*4));
4003
4004 /*
4005	 *	The bytes in the receive read/assembly queue have increased. Needed for the
4006 * low memory discard algorithm
4007 */
4008
4009 sk->bytes_rcv += skb->len;
4010
4011 if (skb->len == 0 && !th->fin)
4012 {
4013 /*
4014 * Don't want to keep passing ack's back and forth.
4015 * (someone sent us dataless, boring frame)
4016 */
4017 if (!th->ack)
4018 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4019 kfree_skb(skb, FREE_READ);
4020 return(0);
4021 }
4022
4023 /*
4024 * We no longer have anyone receiving data on this connection.
4025 */
4026
4027 #ifndef TCP_DONT_RST_SHUTDOWN
4028
4029 if(sk->shutdown & RCV_SHUTDOWN)
4030 {
4031 /*
4032 * FIXME: BSD has some magic to avoid sending resets to
4033 * broken 4.2 BSD keepalives. Much to my surprise a few non
4034 * BSD stacks still have broken keepalives so we want to
4035 * cope with it.
4036 */
4037
4038 if(skb->len) /* We don't care if it's just an ack or
4039 a keepalive/window probe */
4040 {
4041 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */
4042
4043 /* Do this the way 4.4BSD treats it. Not what I'd
4044 regard as the meaning of the spec but it's what BSD
4045 does and clearly they know everything 8) */
4046
4047 /*
4048 * This is valid because of two things
4049 *
4050 * a) The way tcp_data behaves at the bottom.
4051 * b) A fin takes effect when read not when received.
4052 */
4053
4054 shut_seq=sk->acked_seq+1; /* Last byte */
4055
4056 if(after(new_seq,shut_seq))
4057 {
4058 if(sk->debug)
4059 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4060 sk, new_seq, shut_seq, sk->blog);
4061 if(sk->dead)
4062 {
4063 sk->acked_seq = new_seq + th->fin;
4064 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4065 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4066 tcp_statistics.TcpEstabResets++;
4067 tcp_set_state(sk,TCP_CLOSE);
4068 sk->err = EPIPE;
4069 sk->shutdown = SHUTDOWN_MASK;
4070 kfree_skb(skb, FREE_READ);
4071 return 0;
4072 }
4073 }
4074 }
4075 }
4076
4077 #endif
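	/*
	 *	Illustrative sketch (not part of the original file): before()
	 *	and after(), used throughout this function, compare sequence
	 *	numbers modulo 2^32 via the sign of the signed difference and
	 *	so stay correct across wrap. Hypothetical equivalents, assuming
	 *	a signed 32-bit s32 type:
	 */
#if 0
#define seq_before(a,b)	((s32)((a)-(b)) < 0)
#define seq_after(a,b)	((s32)((b)-(a)) < 0)
#endif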
4078
4079 /*
4080 * Now we have to walk the chain, and figure out where this one
4081 * goes into it. This is set up so that the last packet we received
4082 * will be the first one we look at, that way if everything comes
4083 * in order, there will be no performance loss, and if they come
4084 * out of order we will be able to fit things in nicely.
4085 *
4086 * [AC: This is wrong. We should assume in order first and then walk
4087 * forwards from the first hole based upon real traffic patterns.]
4088 *
4089 */
4090
4091 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */
4092 {
4093 skb_queue_head(&sk->receive_queue,skb);
4094 skb1= NULL;
4095 }
4096 else
4097 {
4098 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
4099 {
4100 if(sk->debug)
4101 {
4102 printk("skb1=%p :", skb1);
4103 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
4104 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
4105 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4106 sk->acked_seq);
4107 }
4108
4109 /*
4110 * Optimisation: Duplicate frame or extension of previous frame from
4111 * same sequence point (lost ack case).
4112 * The frame contains duplicate data or replaces a previous frame
4113 * discard the previous frame (safe as sk->inuse is set) and put
4114 * the new one in its place.
4115 */
4116
4117 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
4118 {
4119 skb_append(skb1,skb);
4120 skb_unlink(skb1);
4121 kfree_skb(skb1,FREE_READ);
4122 dup_dumped=1;
4123 skb1=NULL;
4124 break;
4125 }
4126
4127 /*
4128 * Found where it fits
4129 */
4130
4131 if (after(th->seq+1, skb1->h.th->seq))
4132 {
4133 skb_append(skb1,skb);
4134 break;
4135 }
4136
4137 /*
4138 * See if we've hit the start. If so insert.
4139 */
4140 if (skb1 == skb_peek(&sk->receive_queue))
4141 {
4142 skb_queue_head(&sk->receive_queue, skb);
4143 break;
4144 }
4145 }
4146 }
4147
4148 /*
4149 * Figure out what the ack value for this frame is
4150 */
4151
4152 th->ack_seq = th->seq + skb->len;
4153 if (th->syn)
4154 th->ack_seq++;
4155 if (th->fin)
4156 th->ack_seq++;
4157
4158 if (before(sk->acked_seq, sk->copied_seq))
4159 {
4160 printk("*** tcp.c:tcp_data bug acked < copied\n");
4161 sk->acked_seq = sk->copied_seq;
4162 }
4163
4164 /*
4165 * Now figure out if we can ack anything. This is very messy because we really want two
4166 * receive queues, a completed and an assembly queue. We also want only one transmit
4167 * queue.
4168 */
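	/*
	 *	Design sketch only (not in the original file): the shape the
	 *	comment above wishes for. The names are hypothetical.
	 */
#if 0
	struct rcv_queues {
		struct sk_buff_head completed;	/* in order, ready to ack and read */
		struct sk_buff_head assembly;	/* out of order, awaiting holes to fill */
	};
#endif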
4169
4170 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
4171 {
4172 if (before(th->seq, sk->acked_seq+1))
4173 {
4174 int newwindow;
4175
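			/*
			 *	Shrink our advertised window by the data newly
			 *	accepted; it re-opens as the reader consumes it.
			 */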
4176 if (after(th->ack_seq, sk->acked_seq))
4177 {
4178 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
4179 if (newwindow < 0)
4180 newwindow = 0;
4181 sk->window = newwindow;
4182 sk->acked_seq = th->ack_seq;
4183 }
4184 skb->acked = 1;
4185
4186 /*
4187 * When we ack the fin, we do the FIN
4188 * processing.
4189 */
4190
4191 if (skb->h.th->fin)
4192 {
4193 tcp_fin(skb,sk,skb->h.th);
4194 }
4195
4196 for(skb2 = skb->next;
4197 skb2 != (struct sk_buff *)&sk->receive_queue;
4198 skb2 = skb2->next)
4199 {
4200 if (before(skb2->h.th->seq, sk->acked_seq+1))
4201 {
4202 if (after(skb2->h.th->ack_seq, sk->acked_seq))
4203 {
4204 newwindow = sk->window -
4205 (skb2->h.th->ack_seq - sk->acked_seq);
4206 if (newwindow < 0)
4207 newwindow = 0;
4208 sk->window = newwindow;
4209 sk->acked_seq = skb2->h.th->ack_seq;
4210 }
4211 skb2->acked = 1;
4212 /*
4213 * When we ack the fin, we do
4214 * the fin handling.
4215 */
4216 if (skb2->h.th->fin)
4217 {
4218						tcp_fin(skb2,sk,skb2->h.th);	/* note: operate on skb2, the frame being acked */
4219 }
4220
4221 /*
4222 * Force an immediate ack.
4223 */
4224
4225 sk->ack_backlog = sk->max_ack_backlog;
4226 }
4227 else
4228 {
4229 break;
4230 }
4231 }
4232
4233 /*
4234 * This also takes care of updating the window.
4235 * This if statement needs to be simplified.
4236 */
4237 if (!sk->delay_acks ||
4238 sk->ack_backlog >= sk->max_ack_backlog ||
4239 sk->bytes_rcv > sk->max_unacked || th->fin) {
4240	/*			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4241			}	/* no action needed here: skb->acked is set, so the unconditional tcp_send_ack() below fires at once */
4242 else
4243 {
4244 sk->ack_backlog++;
4245 if(sk->debug)
4246 printk("Ack queued.\n");
4247 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4248 }
4249 }
4250 }
4251
4252 /*
4253 * If we've missed a packet, send an ack.
4254 * Also start a timer to send another.
4255 */
4256
4257 if (!skb->acked)
4258 {
4259
4260 /*
4261 * This is important. If we don't have much room left,
4262 * we need to throw out a few packets so we have a good
4263 * window. Note that mtu is used, not mss, because mss is really
4264 * for the send side. He could be sending us stuff as large as mtu.
4265 */
4266
4267 while (sk->prot->rspace(sk) < sk->mtu)
4268 {
4269 skb1 = skb_peek(&sk->receive_queue);
4270 if (skb1 == NULL)
4271 {
4272 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4273 break;
4274 }
4275
4276 /*
4277 * Don't throw out something that has been acked.
4278 */
4279
4280 if (skb1->acked)
4281 {
4282 break;
4283 }
4284
4285 skb_unlink(skb1);
4286 kfree_skb(skb1, FREE_READ);
4287 }
4288 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4289 sk->ack_backlog++;
4290 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4291 }
4292 else
4293 {
4294 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4295 }
4296
4297 /*
4298 * Now tell the user we may have some data.
4299 */
4300
4301 if (!sk->dead)
4302 {
4303 if(sk->debug)
4304 printk("Data wakeup.\n");
4305 sk->data_ready(sk,0);
4306 }
4307 return(0);
4308 }
4309
4310
4311 /*
4312 * This routine is only called when we have urgent data
4313	 *	signalled. It's the 'slow' part of tcp_urg. It could be
4314 * moved inline now as tcp_urg is only called from one
4315 * place. We handle URGent data wrong. We have to - as
4316 * BSD still doesn't use the correction from RFC961.
4317 */
4318
4319 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
4320 {
4321 u32 ptr = ntohs(th->urg_ptr);
4322
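	/* The urgent pointer is taken to point one past the urgent byte
	 * (the uncorrected reading noted above), so step back to address
	 * the urgent byte itself before adding the segment sequence. */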
4323 if (ptr)
4324 ptr--;
4325 ptr += th->seq;
4326
4327 /* ignore urgent data that we've already seen and read */
4328 if (after(sk->copied_seq, ptr))
4329 return;
4330
4331 /* do we already have a newer (or duplicate) urgent pointer? */
4332 if (sk->urg_data && !after(ptr, sk->urg_seq))
4333 return;
4334
4335 /* tell the world about our new urgent pointer */
4336 if (sk->proc != 0) {
4337 if (sk->proc > 0) {
4338 kill_proc(sk->proc, SIGURG, 1);
4339 } else {
4340 kill_pg(-sk->proc, SIGURG, 1);
4341 }
4342 }
4343 sk->urg_data = URG_NOTYET;
4344 sk->urg_seq = ptr;
4345 }
4346
4347 /*
4348 * This is the 'fast' part of urgent handling.
4349 */
4350
4351 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
4352 unsigned long saddr, unsigned long len)
4353 {
4354 u32 ptr;
4355
4356 /*
4357 * Check if we get a new urgent pointer - normally not
4358 */
4359
4360 if (th->urg)
4361 tcp_check_urg(sk,th);
4362
4363 /*
4364 * Do we wait for any urgent data? - normally not
4365 */
4366
4367 if (sk->urg_data != URG_NOTYET)
4368 return 0;
4369
4370 /*
4371 * Is the urgent pointer pointing into this packet?
4372 */
4373
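	/* Convert the urgent byte's sequence number into a byte offset
	 * from the start of the TCP header, hence the th->doff*4 term. */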
4374 ptr = sk->urg_seq - th->seq + th->doff*4;
4375 if (ptr >= len)
4376 return 0;
4377
4378 /*
4379 * Ok, got the correct packet, update info
4380 */
4381
4382 sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4383 if (!sk->dead)
4384 sk->data_ready(sk,0);
4385 return 0;
4386 }
4387
4388 /*
4389 * This will accept the next outstanding connection.
4390 */
4391
4392 static struct sock *tcp_accept(struct sock *sk, int flags)
4393 {
4394 struct sock *newsk;
4395 struct sk_buff *skb;
4396
4397 /*
4398 * We need to make sure that this socket is listening,
4399 * and that it has something pending.
4400 */
4401
4402 if (sk->state != TCP_LISTEN)
4403 {
4404 sk->err = EINVAL;
4405 return(NULL);
4406 }
4407
4408 /* Avoid the race. */
4409 cli();
4410 sk->inuse = 1;
4411
4412 while((skb = tcp_dequeue_established(sk)) == NULL)
4413 {
4414 if (flags & O_NONBLOCK)
4415 {
4416 sti();
4417 release_sock(sk);
4418 sk->err = EAGAIN;
4419 return(NULL);
4420 }
4421
4422 release_sock(sk);
4423 interruptible_sleep_on(sk->sleep);
4424 if (current->signal & ~current->blocked)
4425 {
4426 sti();
4427 sk->err = ERESTARTSYS;
4428 return(NULL);
4429 }
4430 sk->inuse = 1;
4431 }
4432 sti();
4433
4434 /*
4435 * Now all we need to do is return skb->sk.
4436 */
4437
4438 newsk = skb->sk;
4439
4440 kfree_skb(skb, FREE_READ);
4441 sk->ack_backlog--;
4442 release_sock(sk);
4443 return(newsk);
4444 }
4445
4446
4447 /*
4448 * This will initiate an outgoing connection.
4449 */
4450
4451 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
4452 {
4453 struct sk_buff *buff;
4454 struct device *dev=NULL;
4455 unsigned char *ptr;
4456 int tmp;
4457 int atype;
4458 struct tcphdr *t1;
4459 struct rtable *rt;
4460
4461 if (sk->state != TCP_CLOSE)
4462 {
4463 return(-EISCONN);
4464 }
4465
4466 if (addr_len < 8)
4467 return(-EINVAL);
4468
4469 if (usin->sin_family && usin->sin_family != AF_INET)
4470 return(-EAFNOSUPPORT);
4471
4472 /*
4473 * connect() to INADDR_ANY means loopback (BSD'ism).
4474 */
4475
4476 if(usin->sin_addr.s_addr==INADDR_ANY)
4477 usin->sin_addr.s_addr=ip_my_addr();
4478
4479 /*
4480 * Don't want a TCP connection going to a broadcast address
4481 */
4482
4483 if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
4484 return -ENETUNREACH;
4485
4486 sk->inuse = 1;
4487 sk->daddr = usin->sin_addr.s_addr;
4488 sk->write_seq = tcp_init_seq();
4489 sk->window_seq = sk->write_seq;
4490 sk->rcv_ack_seq = sk->write_seq -1;
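	/* Primed one below the initial sequence so the peer's ACK of our
	 * SYN always registers as new when tcp_ack() examines it. */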
4491 sk->err = 0;
4492 sk->dummy_th.dest = usin->sin_port;
4493 release_sock(sk);
4494
4495 buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4496 if (buff == NULL)
4497 {
4498 return(-ENOMEM);
4499 }
4500 sk->inuse = 1;
4501 buff->sk = sk;
4502 buff->free = 0;
4503 buff->localroute = sk->localroute;
4504
4505
4506 /*
4507 * Put in the IP header and routing stuff.
4508 */
4509
4510 rt=ip_rt_route(sk->daddr, NULL, NULL);
4511
4512
4513 /*
4514 * We need to build the routing stuff from the things saved in skb.
4515 */
4516
4517 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4518 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
4519 if (tmp < 0)
4520 {
4521 sk->prot->wfree(sk, buff);
4522 release_sock(sk);
4523 return(-ENETUNREACH);
4524 }
4525
4526 t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
4527
4528 memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4529 t1->seq = ntohl(sk->write_seq++);
4530 sk->sent_seq = sk->write_seq;
4531 buff->h.seq = sk->write_seq;
4532 t1->ack = 0;
4533 t1->window = 2;
4534 t1->res1=0;
4535 t1->res2=0;
4536 t1->rst = 0;
4537 t1->urg = 0;
4538 t1->psh = 0;
4539 t1->syn = 1;
4540 t1->urg_ptr = 0;
4541	t1->doff = 6;	/* six 32-bit words: 20 byte header + 4 byte MSS option */
4542 /* use 512 or whatever user asked for */
4543
4544 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4545 sk->window_clamp=rt->rt_window;
4546 else
4547 sk->window_clamp=0;
4548
4549 if (sk->user_mss)
4550 sk->mtu = sk->user_mss;
4551 else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
4552 sk->mtu = rt->rt_mss;
4553 else
4554 {
4555 #ifdef CONFIG_INET_SNARL
4556 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
4557 #else
4558 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
4559 #endif
4560 sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
4561 else
4562 sk->mtu = MAX_WINDOW;
4563 }
4564 /*
4565 * but not bigger than device MTU
4566 */
4567
4568 if(sk->mtu <32)
4569 sk->mtu = 32; /* Sanity limit */
4570
4571 sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
4572
4573 /*
4574 * Put in the TCP options to say MTU.
4575 */
4576
4577	ptr = skb_put(buff,4);
4578	ptr[0] = 2;			/* option kind: maximum segment size */
4579	ptr[1] = 4;			/* option length in bytes */
4580	ptr[2] = (sk->mtu) >> 8;	/* MSS value, high byte first */
4581	ptr[3] = (sk->mtu) & 0xff;
4582 tcp_send_check(t1, sk->saddr, sk->daddr,
4583 sizeof(struct tcphdr) + 4, sk);
4584
4585 /*
4586 * This must go first otherwise a really quick response will get reset.
4587 */
4588
4589 tcp_cache_zap();
4590 tcp_set_state(sk,TCP_SYN_SENT);
4591 if(rt&&rt->rt_flags&RTF_IRTT)
4592 sk->rto = rt->rt_irtt;
4593 else
4594 sk->rto = TCP_TIMEOUT_INIT;
4595 sk->retransmit_timer.function=&retransmit_timer;
4596 sk->retransmit_timer.data = (unsigned long)sk;
4597 reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */
4598 sk->retransmits = 0; /* Now works the right way instead of a hacked initial setting */
4599
4600 sk->prot->queue_xmit(sk, dev, buff, 0);
4601 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4602 tcp_statistics.TcpActiveOpens++;
4603 tcp_statistics.TcpOutSegs++;
4604
4605 release_sock(sk);
4606 return(0);
4607 }
4608
4609
4610 /* This function checks to see if the tcp header is actually acceptable. */
4611 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
4612 struct options *opt, unsigned long saddr, struct device *dev)
4613 {
4614 u32 next_seq;
4615
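	/* next_seq first holds just the segment's data length (a FIN
	 * counts as one unit); th->seq is added below to form the true
	 * end-of-segment sequence number. */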
4616 next_seq = len - 4*th->doff;
4617 if (th->fin)
4618 next_seq++;
4619 /* if we have a zero window, we can't have any data in the packet.. */
4620 if (next_seq && !sk->window)
4621 goto ignore_it;
4622 next_seq += th->seq;
4623
4624 /*
4625 * This isn't quite right. sk->acked_seq could be more recent
4626 * than sk->window. This is however close enough. We will accept
4627 * slightly more packets than we should, but it should not cause
4628 * problems unless someone is trying to forge packets.
4629 */
4630
4631 /* have we already seen all of this packet? */
4632 if (!after(next_seq+1, sk->acked_seq))
4633 goto ignore_it;
4634 /* or does it start beyond the window? */
4635 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4636 goto ignore_it;
4637
4638 /* ok, at least part of this packet would seem interesting.. */
4639 return 1;
4640
4641 ignore_it:
4642 if (th->rst)
4643 return 0;
4644
4645 /*
4646 * Send a reset if we get something not ours and we are
4647 * unsynchronized. Note: We don't do anything to our end. We
4648	 *	are just killing the bogus remote connection; then we will
4649 * connect again and it will work (with luck).
4650 */
4651
4652 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4653 {
4654 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4655 return 1;
4656 }
4657
4658 /* Try to resync things. */
4659 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4660 return 0;
4661 }
4662
4663 /*
4664 * When we get a reset we do this.
4665 */
4666
4667 static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
4668 {
4669 sk->zapped = 1;
4670 sk->err = ECONNRESET;
4671 if (sk->state == TCP_SYN_SENT)
4672 sk->err = ECONNREFUSED;
4673 if (sk->state == TCP_CLOSE_WAIT)
4674 sk->err = EPIPE;
4675 #ifdef TCP_DO_RFC1337
4676 /*
4677 * Time wait assassination protection [RFC1337]
4678 */
4679 if(sk->state!=TCP_TIME_WAIT)
4680 {
4681 tcp_set_state(sk,TCP_CLOSE);
4682 sk->shutdown = SHUTDOWN_MASK;
4683 }
4684 #else
4685 tcp_set_state(sk,TCP_CLOSE);
4686 sk->shutdown = SHUTDOWN_MASK;
4687 #endif
4688 if (!sk->dead)
4689 sk->state_change(sk);
4690 kfree_skb(skb, FREE_READ);
4691 release_sock(sk);
4692 return(0);
4693 }
4694
4695 /*
4696 * A TCP packet has arrived.
4697 * skb->h.raw is the TCP header.
4698 */
4699
4700 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
4701 __u32 daddr, unsigned short len,
4702 __u32 saddr, int redo, struct inet_protocol * protocol)
4703 {
4704 struct tcphdr *th;
4705 struct sock *sk;
4706 int syn_ok=0;
4707
4708 tcp_statistics.TcpInSegs++;
4709 if(skb->pkt_type!=PACKET_HOST)
4710 {
4711 kfree_skb(skb,FREE_READ);
4712 return(0);
4713 }
4714
4715 th = skb->h.th;
4716
4717 /*
4718 * Find the socket, using the last hit cache if applicable.
4719 */
4720
4721 if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
4722 sk=(struct sock *)th_cache_sk;
4723 else
4724 {
4725 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4726 th_cache_saddr=saddr;
4727 th_cache_daddr=daddr;
4728 th_cache_dport=th->dest;
4729 th_cache_sport=th->source;
4730 th_cache_sk=sk;
4731 }
4732
4733 /*
4734 * If this socket has got a reset it's to all intents and purposes
4735 * really dead. Count closed sockets as dead.
4736 *
4737 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4738 * simply drops data. This seems incorrect as a 'closed' TCP doesn't
4739	 *	exist, so it should cause resets as if the port were unreachable.
4740 */
4741
4742 if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4743 sk=NULL;
4744
4745 if (!redo)
4746 {
4747 /*
4748 * Pull up the IP header.
4749 */
4750 skb_pull(skb, skb->h.raw-skb->data);
4751 /*
4752 * Try to use the device checksum if provided.
4753 */
4754 if (
4755 (skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
4756 (!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
4757 )
4758 {
4759 skb->sk = NULL;
4760 kfree_skb(skb,FREE_READ);
4761 /*
4762 * We don't release the socket because it was
4763 * never marked in use.
4764 */
4765 return(0);
4766 }
4767 th->seq = ntohl(th->seq);
4768
4769 /* See if we know about the socket. */
4770 if (sk == NULL)
4771 {
4772 /*
4773 * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4774 */
4775 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4776 skb->sk = NULL;
4777 /*
4778 * Discard frame
4779 */
4780 kfree_skb(skb, FREE_READ);
4781 return(0);
4782 }
4783
4784 /* skb->len = len;*/
4785 skb->acked = 0;
4786 skb->used = 0;
4787 skb->free = 0;
4788 skb->saddr = daddr;
4789 skb->daddr = saddr;
4790
4791 /* We may need to add it to the backlog here. */
4792 cli();
4793 if (sk->inuse)
4794 {
4795 skb_queue_tail(&sk->back_log, skb);
4796 sti();
4797 return(0);
4798 }
4799 sk->inuse = 1;
4800 sti();
4801 }
4802 else
4803 {
4804 if (sk==NULL)
4805 {
4806 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4807 skb->sk = NULL;
4808 kfree_skb(skb, FREE_READ);
4809 return(0);
4810 }
4811 }
4812
4813
4814 if (!sk->prot)
4815 {
4816 printk("IMPOSSIBLE 3\n");
4817 return(0);
4818 }
4819
4820
4821 /*
4822 * Charge the memory to the socket.
4823 */
4824
4825 if (sk->rmem_alloc + skb->truesize >= sk->rcvbuf)
4826 {
4827 kfree_skb(skb, FREE_READ);
4828 release_sock(sk);
4829 return(0);
4830 }
4831
4832 skb->sk=sk;
4833 sk->rmem_alloc += skb->truesize;
4834
4835 /*
4836 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4837 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4838 * compatibility. We also set up variables more thoroughly [Karn notes in the
4839 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4840 */
4841
4842 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */
4843 {
4844
4845 /*
4846 * Now deal with unusual cases.
4847 */
4848
4849 if(sk->state==TCP_LISTEN)
4850 {
4851 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */
4852 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4853
4854 /*
4855 * We don't care for RST, and non SYN are absorbed (old segments)
4856		 *	Broadcast/multicast SYN isn't allowed. Note: there is a bug in that
4857		 *	if you change the netmask on a running connection it can go broadcast.
4858		 *	Even Suns have this problem, so I'm ignoring it.
4859 */
4860
4861 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4862 {
4863 kfree_skb(skb, FREE_READ);
4864 release_sock(sk);
4865 return 0;
4866 }
4867
4868 /*
4869 * Guess we need to make a new socket up
4870 */
4871
4872 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4873
4874 /*
4875 * Now we have several options: In theory there is nothing else
4876 * in the frame. KA9Q has an option to send data with the syn,
4877 * BSD accepts data with the syn up to the [to be] advertised window
4878 * and Solaris 2.1 gives you a protocol error. For now we just ignore
4879 * it, that fits the spec precisely and avoids incompatibilities. It
4880 * would be nice in future to drop through and process the data.
4881 */
4882
4883 release_sock(sk);
4884 return 0;
4885 }
4886
4887 /* retransmitted SYN? */
4888 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4889 {
4890 kfree_skb(skb, FREE_READ);
4891 release_sock(sk);
4892 return 0;
4893 }
4894
4895 /*
4896 * SYN sent means we have to look for a suitable ack and either reset
4897 * for bad matches or go to connected
4898 */
4899
4900 if(sk->state==TCP_SYN_SENT)
4901 {
4902 /* Crossed SYN or previous junk segment */
4903 if(th->ack)
4904 {
4905 /* We got an ack, but it's not a good ack */
4906 if(!tcp_ack(sk,th,saddr,len))
4907 {
4908				/* Reset the ack - it's an ack from a
4909				   different connection  [ th->rst is checked in tcp_reset()] */
4910 tcp_statistics.TcpAttemptFails++;
4911 tcp_reset(daddr, saddr, th,
4912 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4913 kfree_skb(skb, FREE_READ);
4914 release_sock(sk);
4915 return(0);
4916 }
4917 if(th->rst)
4918 return tcp_std_reset(sk,skb);
4919 if(!th->syn)
4920 {
4921				/* A valid ack from a different connection
4922				   attempt. Shouldn't happen, but cover it */
4923 kfree_skb(skb, FREE_READ);
4924 release_sock(sk);
4925 return 0;
4926 }
4927 /*
4928 * Ok.. it's good. Set up sequence numbers and
4929 * move to established.
4930 */
4931 syn_ok=1; /* Don't reset this connection for the syn */
4932 sk->acked_seq=th->seq+1;
4933 sk->fin_seq=th->seq;
4934 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4935 tcp_set_state(sk, TCP_ESTABLISHED);
4936 tcp_options(sk,th);
4937 sk->dummy_th.dest=th->source;
4938 sk->copied_seq = sk->acked_seq;
4939 if(!sk->dead)
4940 {
4941 sk->state_change(sk);
4942 sock_wake_async(sk->socket, 0);
4943 }
4944 if(sk->max_window==0)
4945 {
4946 sk->max_window = 32;
4947 sk->mss = min(sk->max_window, sk->mtu);
4948 }
4949 }
4950 else
4951 {
4952 /* See if SYN's cross. Drop if boring */
4953 if(th->syn && !th->rst)
4954 {
4955 /* Crossed SYN's are fine - but talking to
4956 yourself is right out... */
4957 if(sk->saddr==saddr && sk->daddr==daddr &&
4958 sk->dummy_th.source==th->source &&
4959 sk->dummy_th.dest==th->dest)
4960 {
4961 tcp_statistics.TcpAttemptFails++;
4962 return tcp_std_reset(sk,skb);
4963 }
4964 tcp_set_state(sk,TCP_SYN_RECV);
4965
4966 /*
4967 * FIXME:
4968 * Must send SYN|ACK here
4969 */
4970 }
4971 /* Discard junk segment */
4972 kfree_skb(skb, FREE_READ);
4973 release_sock(sk);
4974 return 0;
4975 }
4976 /*
4977 * SYN_RECV with data maybe.. drop through
4978 */
4979 goto rfc_step6;
4980 }
4981
4982 /*
4983 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
4984 * a more complex suggestion for fixing these reuse issues in RFC1644
4985 * but not yet ready for general use. Also see RFC1379.
4986 */
4987
4988 #define BSD_TIME_WAIT
4989 #ifdef BSD_TIME_WAIT
4990 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
4991 after(th->seq, sk->acked_seq) && !th->rst)
4992 {
4993 u32 seq = sk->write_seq;
4994 if(sk->debug)
4995 printk("Doing a BSD time wait\n");
4996 tcp_statistics.TcpEstabResets++;
4997 sk->rmem_alloc -= skb->truesize;
4998 skb->sk = NULL;
4999 sk->err=ECONNRESET;
5000 tcp_set_state(sk, TCP_CLOSE);
5001 sk->shutdown = SHUTDOWN_MASK;
5002 release_sock(sk);
5003 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
5004 if (sk && sk->state==TCP_LISTEN)
5005 {
5006 sk->inuse=1;
5007 skb->sk = sk;
5008 sk->rmem_alloc += skb->truesize;
5009 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
5010 release_sock(sk);
5011 return 0;
5012 }
5013 kfree_skb(skb, FREE_READ);
5014 return 0;
5015 }
5016 #endif
5017 }
5018
5019 /*
5020 * We are now in normal data flow (see the step list in the RFC)
5021 * Note most of these are inline now. I'll inline the lot when
5022 * I have time to test it hard and look at what gcc outputs
5023 */
5024
5025 if(!tcp_sequence(sk,th,len,opt,saddr,dev))
5026 {
5027 kfree_skb(skb, FREE_READ);
5028 release_sock(sk);
5029 return 0;
5030 }
5031
5032 if(th->rst)
5033 return tcp_std_reset(sk,skb);
5034
5035 /*
5036 * !syn_ok is effectively the state test in RFC793.
5037 */
5038
5039 if(th->syn && !syn_ok)
5040 {
5041 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
5042 return tcp_std_reset(sk,skb);
5043 }
5044
5045 /*
5046 * Process the ACK
5047 */
5048
5049
5050 if(th->ack && !tcp_ack(sk,th,saddr,len))
5051 {
5052 /*
5053 * Our three way handshake failed.
5054 */
5055
5056 if(sk->state==TCP_SYN_RECV)
5057 {
5058 tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
5059 }
5060 kfree_skb(skb, FREE_READ);
5061 release_sock(sk);
5062 return 0;
5063 }
5064
5065 rfc_step6: /* I'll clean this up later */
5066
5067 /*
5068 * Process urgent data
5069 */
5070
5071 if(tcp_urg(sk, th, saddr, len))
5072 {
5073 kfree_skb(skb, FREE_READ);
5074 release_sock(sk);
5075 return 0;
5076 }
5077
5078
5079 /*
5080 * Process the encapsulated data
5081 */
5082
5083 if(tcp_data(skb,sk, saddr, len))
5084 {
5085 kfree_skb(skb, FREE_READ);
5086 release_sock(sk);
5087 return 0;
5088 }
5089
5090 /*
5091 * And done
5092 */
5093
5094 release_sock(sk);
5095 return 0;
5096 }
5097
5098 /*
5099 * This routine sends a packet with an out of date sequence
5100 * number. It assumes the other end will try to ack it.
5101 */
5102
5103 static void tcp_write_wakeup(struct sock *sk)
5104 {
5105 struct sk_buff *buff,*skb;
5106 struct tcphdr *t1;
5107 struct device *dev=NULL;
5108 int tmp;
5109
5110 if (sk->zapped)
5111 return; /* After a valid reset we can send no more */
5112
5113 /*
5114 * Write data can still be transmitted/retransmitted in the
5115 * following states. If any other state is encountered, return.
5116 * [listen/close will never occur here anyway]
5117 */
5118
5119 if (sk->state != TCP_ESTABLISHED &&
5120 sk->state != TCP_CLOSE_WAIT &&
5121 sk->state != TCP_FIN_WAIT1 &&
5122 sk->state != TCP_LAST_ACK &&
5123 sk->state != TCP_CLOSING
5124 )
5125 {
5126 return;
5127 }
5128 if ( before(sk->sent_seq, sk->window_seq) &&
5129 (skb=skb_peek(&sk->write_queue)))
5130 {
5131 /*
5132		 *	We are probing the opening of a window
5133		 *	but the window size is != 0; this must be
5134		 *	a result of SWS avoidance at the sender.
5135 */
5136
5137 struct iphdr *iph;
5138 struct tcphdr *th;
5139 struct tcphdr *nth;
5140 unsigned long win_size, ow_size;
5141 void * tcp_data_start;
5142
5143 /*
5144 * How many bytes can we send ?
5145 */
5146
5147 win_size = sk->window_seq - sk->sent_seq;
5148
5149 /*
5150 * Recover the buffer pointers
5151 */
5152
5153 iph = (struct iphdr *)skb->ip_hdr;
5154 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
5155
5156 /*
5157 * Grab the data for a temporary frame
5158 */
5159
5160 buff = sk->prot->wmalloc(sk, win_size + th->doff * 4 +
5161 (iph->ihl << 2) +
5162 sk->prot->max_header + 15,
5163 1, GFP_ATOMIC);
5164 if ( buff == NULL )
5165 return;
5166
5167 /*
5168 * If we strip the packet on the write queue we must
5169 * be ready to retransmit this one
5170 */
5171
5172 buff->free = /*0*/1;
5173
5174 buff->sk = sk;
5175 buff->localroute = sk->localroute;
5176
5177 /*
5178 * Put headers on the new packet
5179 */
5180
5181 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5182 IPPROTO_TCP, sk->opt, buff->truesize,
5183 sk->ip_tos,sk->ip_ttl);
5184 if (tmp < 0)
5185 {
5186 sk->prot->wfree(sk, buff);
5187 return;
5188 }
5189
5190 /*
5191 * Move the TCP header over
5192 */
5193
5194 buff->dev = dev;
5195
5196 nth = (struct tcphdr *) skb_put(buff,th->doff*4);
5197
5198 memcpy(nth, th, th->doff * 4);
5199
5200 /*
5201 * Correct the new header
5202 */
5203
5204 nth->ack = 1;
5205 nth->ack_seq = ntohl(sk->acked_seq);
5206 nth->window = ntohs(tcp_select_window(sk));
5207 nth->check = 0;
5208
5209 /*
5210 * Find the first data byte.
5211 */
5212
5213 tcp_data_start = skb->data + skb->dev->hard_header_len +
5214 (iph->ihl << 2) + th->doff * 4;
5215
5216 /*
5217 * Add it to our new buffer
5218 */
5219 memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
5220
5221 /*
5222 * Remember our right edge sequence number.
5223 */
5224
5225 buff->h.seq = sk->sent_seq + win_size;
5226 sk->sent_seq = buff->h.seq; /* Hack */
5227 #if 0
5228
5229 /*
5230 * now: shrink the queue head segment
5231 */
5232
5233 th->check = 0;
5234 ow_size = skb->len - win_size -
5235 ((unsigned long) (tcp_data_start - (void *) skb->data));
5236
5237 memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
5238 skb_trim(skb,skb->len-win_size);
5239 sk->sent_seq += win_size;
5240 th->seq = htonl(sk->sent_seq);
5241 if (th->urg)
5242 {
5243 unsigned short urg_ptr;
5244
5245 urg_ptr = ntohs(th->urg_ptr);
5246 if (urg_ptr <= win_size)
5247 th->urg = 0;
5248 else
5249 {
5250 urg_ptr -= win_size;
5251 th->urg_ptr = htons(urg_ptr);
5252 nth->urg_ptr = htons(win_size);
5253 }
5254 }
5255 #else
5256 if(th->urg && ntohs(th->urg_ptr) < win_size)
5257 nth->urg = 0;
5258 #endif
5259
5260 /*
5261 * Checksum the split buffer
5262 */
5263
5264 tcp_send_check(nth, sk->saddr, sk->daddr,
5265 nth->doff * 4 + win_size , sk);
5266 }
5267 else
5268 {
5269 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
5270 if (buff == NULL)
5271 return;
5272
5273 buff->free = 1;
5274 buff->sk = sk;
5275 buff->localroute = sk->localroute;
5276
5277 /*
5278 * Put in the IP header and routing stuff.
5279 */
5280
5281 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5282 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
5283 if (tmp < 0)
5284 {
5285 sk->prot->wfree(sk, buff);
5286 return;
5287 }
5288
5289 t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
5290 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5291
5292 /*
5293 * Use a previous sequence.
5294 * This should cause the other end to send an ack.
5295 */
5296
5297 t1->seq = htonl(sk->sent_seq-1);
5298 t1->ack = 1;
5299 t1->res1= 0;
5300 t1->res2= 0;
5301 t1->rst = 0;
5302 t1->urg = 0;
5303 t1->psh = 0;
5304 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
5305 t1->syn = 0;
5306 t1->ack_seq = ntohl(sk->acked_seq);
5307 t1->window = ntohs(tcp_select_window(sk));
5308 t1->doff = sizeof(*t1)/4;
5309 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5310
5311 }
5312
5313 /*
5314 * Send it.
5315 */
5316
5317 sk->prot->queue_xmit(sk, dev, buff, 1);
5318 tcp_statistics.TcpOutSegs++;
5319 }
5320
5321 /*
5322 * A window probe timeout has occurred.
5323 */
5324
5325 void tcp_send_probe0(struct sock *sk)
5326 {
5327 if (sk->zapped)
5328 return; /* After a valid reset we can send no more */
5329
5330 tcp_write_wakeup(sk);
5331
5332 sk->backoff++;
5333 sk->rto = min(sk->rto << 1, 120*HZ);
5334 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5335 sk->retransmits++;
5336 sk->prot->retransmits ++;
5337 }
5338
5339 /*
5340 * Socket option code for TCP.
5341 */
5342
5343 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
5344 {
5345 int val,err;
5346
5347 if(level!=SOL_TCP)
5348 return ip_setsockopt(sk,level,optname,optval,optlen);
5349
5350 if (optval == NULL)
5351 return(-EINVAL);
5352
5353 err=verify_area(VERIFY_READ, optval, sizeof(int));
5354 if(err)
5355 return err;
5356
5357 val = get_user((int *)optval);
5358
5359 switch(optname)
5360 {
5361 case TCP_MAXSEG:
5362 /*
5363			 *	Values greater than the interface MTU won't take effect. However,
5364			 *	at the point when this call is made we typically don't yet know
5365			 *	which interface is going to be used.
5366 */
5367 if(val<1||val>MAX_WINDOW)
5368 return -EINVAL;
5369 sk->user_mss=val;
5370 return 0;
5371 case TCP_NODELAY:
5372 sk->nonagle=(val==0)?0:1;
5373 return 0;
5374 default:
5375 return(-ENOPROTOOPT);
5376 }
5377 }
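/*
 *	Usage sketch (user space, illustrative only, not part of this file):
 *	turning off the Nagle algorithm via the TCP_NODELAY option handled
 *	above. 'fd' is assumed to be a connected TCP socket.
 */
#if 0
	int one = 1;
	setsockopt(fd, SOL_TCP, TCP_NODELAY, (char *)&one, sizeof(one));
#endif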
5378
5379 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
5380 {
5381 int val,err;
5382
5383 if(level!=SOL_TCP)
5384 return ip_getsockopt(sk,level,optname,optval,optlen);
5385
5386 switch(optname)
5387 {
5388 case TCP_MAXSEG:
5389 val=sk->user_mss;
5390 break;
5391 case TCP_NODELAY:
5392 val=sk->nonagle;
5393 break;
5394 default:
5395 return(-ENOPROTOOPT);
5396 }
5397 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5398 if(err)
5399 return err;
5400 put_user(sizeof(int),(int *) optlen);
5401
5402 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5403 if(err)
5404 return err;
5405 put_user(val,(int *)optval);
5406
5407 return(0);
5408 }
5409
5410
5411 struct proto tcp_prot = {
5412 sock_wmalloc,
5413 sock_rmalloc,
5414 sock_wfree,
5415 sock_rfree,
5416 sock_rspace,
5417 sock_wspace,
5418 tcp_close,
5419 tcp_read,
5420 tcp_write,
5421 tcp_sendto,
5422 tcp_recvfrom,
5423 ip_build_header,
5424 tcp_connect,
5425 tcp_accept,
5426 ip_queue_xmit,
5427 tcp_retransmit,
5428 tcp_write_wakeup,
5429 tcp_read_wakeup,
5430 tcp_rcv,
5431 tcp_select,
5432 tcp_ioctl,
5433 	NULL,			/* init */
5434 	tcp_shutdown,
5435 	tcp_setsockopt,
5436 	tcp_getsockopt,
5437 	128,			/* max_header */
5438 	0,			/* retransmits */
5439 	"TCP",			/* name */
5440 	0, 0,			/* inuse, highestinuse */
5441 	{NULL,}			/* sock_array */
5442 };