net/ipv4/tcp.c

/* */
This source file includes following definitions.
tcp_cache_zap
min
tcp_set_state
tcp_select_window
tcp_find_established
tcp_dequeue_established
tcp_close_pending
tcp_time_wait
tcp_do_retransmit
reset_xmit_timer
tcp_retransmit_time
tcp_retransmit
tcp_write_timeout
retransmit_timer
tcp_err
tcp_readable
tcp_listen_select
tcp_select
tcp_ioctl
tcp_check
tcp_send_check
tcp_send_skb
tcp_dequeue_partial
tcp_send_partial
tcp_enqueue_partial
tcp_send_ack
tcp_build_header
tcp_sendmsg
tcp_read_wakeup
cleanup_rbuf
tcp_recv_urg
tcp_recvmsg
tcp_close_state
tcp_send_fin
tcp_shutdown
tcp_reset
tcp_options
default_mask
tcp_init_seq
tcp_conn_request
tcp_close
tcp_write_xmit
tcp_ack
tcp_fin
tcp_data
tcp_check_urg
tcp_urg
tcp_accept
tcp_connect
tcp_sequence
tcp_std_reset
tcp_rcv
tcp_write_wakeup
tcp_send_probe0
tcp_setsockopt
tcp_getsockopt
   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:       
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect 
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed where wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It 
  33  *                                      wakes people on errors. select 
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_reset() fixed to work for 
  37  *                                      everything not just packets for 
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had 
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames. 
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst 
  46  *                                      receive otherwise odd bits of prattle 
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug. 
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list 
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential 
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the 
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries. 
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks, 
  69  *                                      so the kernel can layer network 
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised 
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer 
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing 
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
   99  *              Matt Dillon     :       Yet more small nasties removed from the
 100  *                                      TCP code (Be very nice to this man if 
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics. 
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle select() after URG properly in 
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg() 
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in 
 110  *                                      tcp_readable(), select() after URG 
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the 
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to 
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in selecting before an 
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since 
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on 
 137  *                                      the RFC's for other useful protocol 
 138  *                                      references see: Comer, KA9Q NOS, and 
 139  *                                      for a reference on the difference 
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC 
 147  *                                      and using multiple timers for sanity. 
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       Select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and 
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if stat is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
  175  *                                      but it's a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *              Alan Cox        :       Block double connect().
 180  *              Alan Cox        :       Small hooks for enSKIP.
 181  *              Alexey Kuznetsov:       Path MTU discovery.
 182  *
 183  *
 184  * To Fix:
 185  *              Fast path the code. Two things here - fix the window calculation
 186  *              so it doesn't iterate over the queue, also spot packets with no funny
 187  *              options arriving in order and process directly.
 188  *
 189  *              Rewrite output state machine to use a single queue and do low window
 190  *              situations as per the spec (RFC 1122)
 191  *              Speed up input assembly algorithm.
 192  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 193  *              could do with it working on IPv4
 194  *              User settable/learned rtt/max window/mtu
 195  *              Cope with MTU/device switches when retransmitting in tcp.
 196  *              Fix the window handling to use PR's new code.
 197  *
 198  *              Change the fundamental structure to a single send queue maintained
 199  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 200  *              active routes too]). Cut the queue off in tcp_retransmit/
 201  *              tcp_transmit.
 202  *              Change the receive queue to assemble as it goes. This lets us
 203  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 204  *              tcp_data/tcp_read as well as the window shrink crud.
 205  *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 206  *              tcp_queue_skb seem obvious routines to extract.
 207  *      
 208  *              This program is free software; you can redistribute it and/or
 209  *              modify it under the terms of the GNU General Public License
 210  *              as published by the Free Software Foundation; either version
 211  *              2 of the License, or(at your option) any later version.
 212  *
 213  * Description of States:
 214  *
 215  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 216  *
 217  *      TCP_SYN_RECV            received a connection request, sent ack,
 218  *                              waiting for final ack in three-way handshake.
 219  *
 220  *      TCP_ESTABLISHED         connection established
 221  *
 222  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 223  *                              transmission of remaining buffered data
 224  *
 225  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 226  *                              to shutdown
 227  *
 228  *      TCP_CLOSING             both sides have shutdown but we still have
 229  *                              data we have to finish sending
 230  *
 231  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 232  *                              closed, can only be entered from FIN_WAIT2
 233  *                              or CLOSING.  Required because the other end
 234  *                              may not have gotten our last ACK causing it
 235  *                              to retransmit the data packet (which we ignore)
 236  *
 237  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 238  *                              us to finish writing our data and to shutdown
 239  *                              (we have to close() to move on to LAST_ACK)
 240  *
  241  *      TCP_LAST_ACK            our side has shutdown after remote has
 242  *                              shutdown.  There may still be data in our
 243  *                              buffer that we have to finish sending
 244  *              
 245  *      TCP_CLOSE               socket is finished
 246  */
 247 
 248 /*
 249  * RFC1122 status:
 250  * NOTE: I'm not going to be doing comments in the code for this one except
 251  * for violations and the like.  tcp.c is just too big... If I say something
 252  * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 253  * with Alan. -- MS 950903
 254  * 
 255  * Use of PSH (4.2.2.2)
 256  *   MAY aggregate data sent without the PSH flag. (does)
  257  *   MAY queue data received without the PSH flag. (does)
 258  *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 259  *   MAY implement PSH on send calls. (doesn't, thus:)
 260  *     MUST NOT buffer data indefinitely (doesn't [1 second])
 261  *     MUST set PSH on last segment (does)
 262  *   MAY pass received PSH to application layer (doesn't)
 263  *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 264  * 
 265  * Window Size (4.2.2.3, 4.2.2.16)
 266  *   MUST treat window size as an unsigned number (does)
 267  *   SHOULD treat window size as a 32-bit number (does not)
 268  *   MUST NOT shrink window once it is offered (does not normally)
 269  *   
 270  * Urgent Pointer (4.2.2.4)
 271  * **MUST point urgent pointer to last byte of urgent data (not right
 272  *     after). (doesn't, to be like BSD)
 273  *   MUST inform application layer asynchronously of incoming urgent
 274  *     data. (does)
 275  *   MUST provide application with means of determining the amount of
 276  *     urgent data pending. (does)
 277  * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 278  *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 279  *      [Follows BSD 1 byte of urgent data]
 280  * 
 281  * TCP Options (4.2.2.5)
  282  *   MUST be able to receive TCP options in any segment. (does)
 283  *   MUST ignore unsupported options (does)
 284  *   
 285  * Maximum Segment Size Option (4.2.2.6)
 286  *   MUST implement both sending and receiving MSS. (does)
  287  *   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 288  *     it always). (does, even when MSS == 536, which is legal)
 289  *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 290  *   MUST calculate "effective send MSS" correctly:
 291  *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 292  *     (does - but allows operator override)
 293  *  
 294  * TCP Checksum (4.2.2.7)
 295  *   MUST generate and check TCP checksum. (does)
 296  * 
 297  * Initial Sequence Number Selection (4.2.2.8)
 298  *   MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 299  *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 300  *     necessary for 10Mbps networks - and harder than BSD to spoof!)
 301  * 
 302  * Simultaneous Open Attempts (4.2.2.10)
 303  *   MUST support simultaneous open attempts (does)
 304  * 
 305  * Recovery from Old Duplicate SYN (4.2.2.11)
 306  *   MUST keep track of active vs. passive open (does)
 307  * 
 308  * RST segment (4.2.2.12)
 309  *   SHOULD allow an RST segment to contain data (does, but doesn't do
 310  *     anything with it, which is standard)
 311  * 
 312  * Closing a Connection (4.2.2.13)
  313  *   MUST inform application of whether connection was closed by RST or
 314  *     normal close. (does)
 315  *   MAY allow "half-duplex" close (treat connection as closed for the
 316  *     local app, even before handshake is done). (does)
 317  *   MUST linger in TIME_WAIT for 2 * MSL (does)
 318  * 
 319  * Retransmission Timeout (4.2.2.15)
 320  *   MUST implement Jacobson's slow start and congestion avoidance
 321  *     stuff. (does) 
 322  * 
 323  * Probing Zero Windows (4.2.2.17)
 324  *   MUST support probing of zero windows. (does)
 325  *   MAY keep offered window closed indefinitely. (does)
 326  *   MUST allow remote window to stay closed indefinitely. (does)
 327  * 
 328  * Passive Open Calls (4.2.2.18)
 329  *   MUST NOT let new passive open affect other connections. (doesn't)
 330  *   MUST support passive opens (LISTENs) concurrently. (does)
 331  *   
 332  * Time to Live (4.2.2.19)
 333  *   MUST make TCP TTL configurable. (does - IP_TTL option)
 334  * 
 335  * Event Processing (4.2.2.20)
 336  *   SHOULD queue out-of-order segments. (does)
 337  *   MUST aggregate ACK segments whenever possible. (does but badly)
 338  *   
 339  * Retransmission Timeout Calculation (4.2.3.1)
 340  *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 341  *     calculation. (does, or at least explains them in the comments 8*b)
  342  *   SHOULD initialize RTT to 0 and RTO to 3 seconds. (does)
 343  * 
 344  * When to Send an ACK Segment (4.2.3.2)
 345  *   SHOULD implement delayed ACK. (does not)
 346  *   MUST keep ACK delay < 0.5 sec. (N/A)
 347  * 
 348  * When to Send a Window Update (4.2.3.3)
 349  *   MUST implement receiver-side SWS. (does)
 350  *   
 351  * When to Send Data (4.2.3.4)
 352  *   MUST implement sender-side SWS. (does - imperfectly)
 353  *   SHOULD implement Nagle algorithm. (does)
 354  * 
 355  * TCP Connection Failures (4.2.3.5)
 356  *  MUST handle excessive retransmissions "properly" (see the RFC). (does)
 357  *   SHOULD inform application layer of soft errors. (doesn't)
 358  *   
 359  * TCP Keep-Alives (4.2.3.6)
 360  *   MAY provide keep-alives. (does)
 361  *   MUST make keep-alives configurable on a per-connection basis. (does)
 362  *   MUST default to no keep-alives. (does)
 363  * **MUST make keep-alive interval configurable. (doesn't)
  364  * **MUST make default keep-alive interval >= 2 hours. (doesn't)
 365  *   MUST NOT interpret failure to ACK keep-alive packet as dead
 366  *     connection. (doesn't)
 367  *   SHOULD send keep-alive with no data. (does)
 368  * 
 369  * TCP Multihoming (4.2.3.7)
 370  *   MUST get source address from IP layer before sending first
 371  *     SYN. (does)
 372  *   MUST use same local address for all segments of a connection. (does)
 373  * 
 374  * IP Options (4.2.3.8)
 375  *   (I don't think the IP layer sees the IP options, yet.)
 376  *   MUST ignore unsupported IP options. (does, I guess 8*b)
 377  *   MAY support Time Stamp and Record Route. (doesn't)
 378  * **MUST allow application to specify a source route. (doesn't?)
  379  * **MUST allow received Source Route option to set route for all future
 380  *     segments on this connection. (doesn't, not that I think it's a
 381  *     huge problem)
 382  * 
 383  * ICMP messages (4.2.3.9)
 384  *   MUST act on ICMP errors. (does)
 385  *   MUST slow transmission upon receipt of a Source Quench. (does)
 386  *   MUST NOT abort connection upon receipt of soft Destination
 387  *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 388  *     Problems. (doesn't)
 389  *   SHOULD report soft Destination Unreachables etc. to the
 390  *     application. (doesn't)
 391  *   SHOULD abort connection upon receipt of hard Destination Unreachable
 392  *     messages (2, 3, 4). (does)
 393  * 
 394  * Remote Address Validation (4.2.3.10)
 395  *   MUST reject as an error OPEN for invalid remote IP address. (does)
 396  *   MUST ignore SYN with invalid source address. (does)
 397  *   MUST silently discard incoming SYN for broadcast/multicast
 398  *     address. (does) 
 399  * 
 400  * Asynchronous Reports (4.2.4.1)
 401  * **MUST provide mechanism for reporting soft errors to application
 402  *     layer. (doesn't)
 403  * 
 404  * Type of Service (4.2.4.2)
 405  *   MUST allow application layer to set Type of Service. (does IP_TOS)
 406  * 
 407  * (Whew. -- MS 950903)
 408  **/
 409 
 410 #include <linux/types.h>
 411 #include <linux/sched.h>
 412 #include <linux/mm.h>
 413 #include <linux/time.h>
 414 #include <linux/string.h>
 415 #include <linux/config.h>
 416 #include <linux/socket.h>
 417 #include <linux/sockios.h>
 418 #include <linux/termios.h>
 419 #include <linux/in.h>
 420 #include <linux/fcntl.h>
 421 #include <linux/inet.h>
 422 #include <linux/netdevice.h>
 423 #include <net/snmp.h>
 424 #include <net/ip.h>
 425 #include <net/protocol.h>
 426 #include <net/icmp.h>
 427 #include <net/tcp.h>
 428 #include <net/arp.h>
 429 #include <linux/skbuff.h>
 430 #include <net/sock.h>
 431 #include <net/route.h>
 432 #include <linux/errno.h>
 433 #include <linux/timer.h>
 434 #include <asm/system.h>
 435 #include <asm/segment.h>
 436 #include <linux/mm.h>
 437 #include <net/checksum.h>
 438 
 439 /*
 440  *      The MSL timer is the 'normal' timer.
 441  */
 442  
  443 #define reset_msl_timer(x,y,z)  reset_timer(x,y,z)
     /*
      *      reset_msl_timer() is a straight alias: it forwards its three
      *      arguments unchanged to reset_timer(). tcp_time_wait() uses it to
      *      arm the TIME_CLOSE timer for TCP_TIMEWAIT_LEN.
      */
  444 
  445 #define SEQ_TICK 3
  446 unsigned long seq_offset;
     /*
      *      NOTE(review): SEQ_TICK and seq_offset are not referenced in the
      *      part of the file visible here; presumably they belong to initial
      *      sequence number selection - confirm against tcp_init_seq().
      */
  447 struct tcp_mib  tcp_statistics;
     /*
      *      File-wide TCP statistics block. tcp_set_state() keeps its
      *      TcpCurrEstab field in step with sockets entering and leaving
      *      TCP_ESTABLISHED.
      */
 448 
 449 /*
 450  *      Cached last hit socket
 451  */
 452  
  453 volatile unsigned long  th_cache_saddr,th_cache_daddr;
  454 volatile unsigned short  th_cache_dport, th_cache_sport;
  455 volatile struct sock *th_cache_sk;
     /*
      *      The five variables above make up a single cache entry: two
      *      addresses, two ports and the socket they resolved to.
      *      tcp_cache_zap() clears all five together. The code that fills
      *      and consults the entry is not in the part of the file visible
      *      here.
      *
      *      NOTE(review): as written, the 'volatile' on th_cache_sk qualifies
      *      the struct sock being pointed at, not the pointer variable itself
      *      (that would be 'struct sock * volatile'). Confirm which of the
      *      two was intended.
      */
 456 
  457 void tcp_cache_zap(void)
     /*
      *      Invalidate the cached last-hit socket entry.
      *
      *      Zeroes the four cached key fields (source/destination address,
      *      source/destination port) and sets the cached socket pointer to
      *      NULL. The stores sit inside a save_flags()/cli()/restore_flags()
      *      bracket, so the caller's interrupt state is put back afterwards
      *      rather than being forced on.
      *
      *      Takes no arguments and returns nothing. Within the visible code
      *      it is called from tcp_set_state() whenever a socket enters
      *      TCP_CLOSE.
      */
  458 {
  459         unsigned long flags;
  460         save_flags(flags);
  461         cli();
  462         th_cache_saddr=0;
  463         th_cache_daddr=0;
  464         th_cache_dport=0;
  465         th_cache_sport=0;
  466         th_cache_sk=NULL;
  467         restore_flags(flags);
  468 }
 469 
  470 static void tcp_close(struct sock *sk, int timeout);
     /*
      *      Forward declaration only: tcp_close_pending() calls tcp_close()
      *      before the point in the file where tcp_close() is defined.
      */
 471 
 472 
 473 /*
 474  *      The less said about this the better, but it works and will do for 1.2 
 475  */
 476 
  477 static struct wait_queue *master_select_wakeup;
     /*
      *      A single wait queue shared by the whole file. tcp_set_state()
      *      wakes it with wake_up_interruptible() whenever a socket goes from
      *      TCP_SYN_RECV to TCP_ESTABLISHED. The code that sleeps on it is
      *      not in the part of the file visible here.
      */
 478 
  479 static __inline__ int min(unsigned int a, unsigned int b)
     /*
      *      Return the smaller of a and b.
      *
      *      Both parameters are unsigned int, so a negative int handed in by
      *      a caller is converted to a large unsigned value before the
      *      comparison and will therefore never be picked as the minimum.
      *      The result is handed back as a plain int, so it is converted
      *      once more on the way out.
      */
  480 {
  481         if (a < b) 
  482                 return(a);
  483         return(b);
  484 }
 485 
  486 #undef STATE_TRACE
     /*
      *      STATE_TRACE is explicitly undefined just above, so both the
      *      statename[] table below and the printk() in tcp_set_state() that
      *      uses it are compiled out. Turn the #undef into a #define to get
      *      state transition tracing.
      */
  487 
  488 #ifdef STATE_TRACE
     /*
      *      Printable names for the socket states. tcp_set_state() indexes
      *      this table directly with sk->state and with the new state value,
      *      so the order of the entries has to match the numeric values of
      *      the TCP_* state constants (defined outside this file).
      */
  489 static char *statename[]={
  490         "Unused","Established","Syn Sent","Syn Recv",
  491         "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
  492         "Close Wait","Last ACK","Listen","Closing"
  493 };
  494 #endif
 495 
  496 static __inline__ void tcp_set_state(struct sock *sk, int state)
     /*
      *      Move sk into the given TCP state and do the bookkeeping that is
      *      tied to the transition:
      *
      *        - tcp_statistics.TcpCurrEstab is decremented when the old
      *          state is TCP_ESTABLISHED and incremented when the new state
      *          is TCP_ESTABLISHED (so ESTABLISHED -> ESTABLISHED nets out
      *          to no change);
      *        - a TCP_SYN_RECV -> TCP_ESTABLISHED transition wakes the
      *          sleepers on master_select_wakeup;
      *        - entering TCP_CLOSE calls tcp_cache_zap().
      *
      *      When STATE_TRACE is defined and sk->debug is set, the transition
      *      is also logged with printk().
      *
      *      sk    - socket whose state field is updated.
      *      state - the new state value, stored into sk->state.
      */
  497 {
  498         if(sk->state==TCP_ESTABLISHED)
  499                 tcp_statistics.TcpCurrEstab--;
  500 #ifdef STATE_TRACE
  501         if(sk->debug)
  502                 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
  503 #endif  
  504         /* This is a hack but it doesn't occur often and it's going to
  505            be a real        to fix nicely */
  506            
  507         if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)        /* must run before sk->state is overwritten below */
  508         {
  509                 wake_up_interruptible(&master_select_wakeup);
  510         }
  511         sk->state=state;
  512         if(state==TCP_ESTABLISHED)
  513                 tcp_statistics.TcpCurrEstab++;
  514         if(sk->state==TCP_CLOSE)        /* sk->state already holds the new state here */
  515                 tcp_cache_zap();
  516 }
 517 
 518 /*
  519  *      This routine picks a TCP window for a socket based on
 520  *      the following constraints
 521  *  
 522  *      1. The window can never be shrunk once it is offered (RFC 793)
 523  *      2. We limit memory per socket
 524  *   
 525  *      For now we use NET2E3's heuristic of offering half the memory
 526  *      we have handy. All is not as bad as this seems however because
 527  *      of two things. Firstly we will bin packets even within the window
 528  *      in order to get the data we are waiting for into the memory limit.
 529  *      Secondly we bin common duplicate forms at receive time
 530  *      Better heuristics welcome
 531  */
 532    
  533 int tcp_select_window(struct sock *sk)
     /*
      *      Work out the receive window to offer for sk and return it.
      *
      *      The candidate starts as sock_rspace(sk). If sk->window_clamp is
      *      non-zero the candidate is limited to it. The previously offered
      *      sk->window is returned instead of the candidate when the
      *      candidate is below min(sk->mss, MAX_WINDOW/2) or below
      *      sk->window; otherwise the candidate is returned.
      *
      *      This function only computes a value: it does not store the
      *      result into sk->window.
      *
      *      Note that min() takes unsigned int parameters, so new_window is
      *      converted to unsigned for the clamp comparison.
      */
  534 {
  535         int new_window = sock_rspace(sk);
  536         
  537         if(sk->window_clamp)
  538                 new_window=min(sk->window_clamp,new_window);
  539         /*
  540          *      Two things are going on here.  First, we don't ever offer a
  541          *      window less than min(sk->mss, MAX_WINDOW/2).  This is the
  542          *      receiver side of SWS as specified in RFC1122.
  543          *      Second, we always give them at least the window they
  544          *      had before, in order to avoid retracting window.  This
  545          *      is technically allowed, but RFC1122 advises against it and
  546          *      in practice it causes trouble.
  547          *
  548          *      Fixme: This doesn't correctly handle the case where
  549          *      new_window > sk->window but not by enough to allow for the
  550          *      shift in sequence space. 
  551          */
  552         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
  553                 return(sk->window);
  554         return(new_window);
  555 }
 556 
 557 /*
 558  *      Find someone to 'accept'. Must be called with
 559  *      sk->inuse=1 or cli()
 560  */ 
 561 
  562 static struct sk_buff *tcp_find_established(struct sock *s)
     /*
      *      Scan s->receive_queue from its head and return the first buffer
      *      whose owning socket (p->sk) is in TCP_ESTABLISHED, or in any
      *      state whose value is >= TCP_FIN_WAIT1. Returns NULL when the
      *      queue is empty or no buffer qualifies.
      *
      *      The buffer is only located, not removed; see
      *      tcp_dequeue_established() for the unlinking variant.
      *
      *      NOTE(review): the >= test depends on the numeric ordering of the
      *      TCP_* state constants, which are defined outside this file -
      *      confirm which states compare at or above TCP_FIN_WAIT1.
      */
  563 {
  564         struct sk_buff *p=skb_peek(&s->receive_queue);
  565         if(p==NULL)
  566                 return NULL;
  567         do
  568         {
  569                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
  570                         return p;
  571                 p=p->next;
  572         }
  573         while(p!=(struct sk_buff *)&s->receive_queue);        /* stop once ->next leads back to the queue head */
  574         return NULL;
  575 }
 576 
 577 /*
 578  *      Remove a completed connection and return it. This is used by
 579  *      tcp_accept() to get connections from the queue.
 580  */
 581 
  582 static struct sk_buff *tcp_dequeue_established(struct sock *s)
     /*
      *      Locate a completed connection on s->receive_queue with
      *      tcp_find_established() and, if there is one, take its buffer off
      *      the queue with skb_unlink().
      *
      *      The search and the unlink both happen inside one
      *      save_flags()/cli()/restore_flags() bracket, which meets the
      *      "sk->inuse=1 or cli()" calling requirement stated for
      *      tcp_find_established().
      *
      *      Returns the unlinked buffer, or NULL if nothing qualified (in
      *      which case the queue is left untouched).
      */
  583 {
  584         struct sk_buff *skb;
  585         unsigned long flags;
  586         save_flags(flags);
  587         cli(); 
  588         skb=tcp_find_established(s);
  589         if(skb!=NULL)
  590                 skb_unlink(skb);        /* Take it off the queue */
  591         restore_flags(flags);
  592         return skb;
  593 }
 594 
 595 /* 
 596  *      This routine closes sockets which have been at least partially
 597  *      opened, but not yet accepted. Currently it is only called by
  598  *      tcp_close. Each pending socket is closed with a timeout of 0.
 599  */
 600 
  601 static void tcp_close_pending (struct sock *sk) 
     /*
      *      Drain sk->receive_queue. For every buffer taken off the queue:
      *      the socket attached to it (skb->sk) is marked dead, that socket
      *      is closed with tcp_close(skb->sk, 0), and the buffer itself is
      *      released with kfree_skb(skb, FREE_READ).
      *
      *      The loop ends when skb_dequeue() returns NULL, i.e. when the
      *      queue is empty. Nothing is returned.
      */
  602 {
  603         struct sk_buff *skb;
  604 
  605         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) 
  606         {
  607                 skb->sk->dead=1;
  608                 tcp_close(skb->sk, 0);
  609                 kfree_skb(skb, FREE_READ);
  610         }
  611         return;
  612 }
 613 
 614 /*
 615  *      Enter the time wait state. 
 616  */
 617 
 618 static void tcp_time_wait(struct sock *sk)
     /*  */
 619 {
 620         tcp_set_state(sk,TCP_TIME_WAIT);
 621         sk->shutdown = SHUTDOWN_MASK;
 622         if (!sk->dead)
 623                 sk->state_change(sk);
 624         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 625 }
 626 
 627 /*
 628  *      A socket has timed out on its send queue and wants to do a
 629  *      little retransmitting. Currently this means TCP.
 630  */
 631 
 632 void tcp_do_retransmit(struct sock *sk, int all)
     /*  */
 633 {
 634         struct sk_buff * skb;
 635         struct proto *prot;
 636         struct device *dev;
 637         int ct=0;
 638         struct rtable *rt;
 639 
 640         prot = sk->prot;
 641         skb = sk->send_head;
 642 
 643         while (skb != NULL)
 644         {
 645                 struct tcphdr *th;
 646                 struct iphdr *iph;
 647                 int size;
 648 
 649                 dev = skb->dev;
 650                 IS_SKB(skb);
 651                 skb->when = jiffies;
 652 
 653                 /*
 654                  *      Discard the surplus MAC header
 655                  */
 656                  
 657                 skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);
 658 
 659                 /*
 660                  * In general it's OK just to use the old packet.  However we
 661                  * need to use the current ack and window fields.  Urg and
 662                  * urg_ptr could possibly stand to be updated as well, but we
 663                  * don't keep the necessary data.  That shouldn't be a problem,
 664                  * if the other end is doing the right thing.  Since we're
 665                  * changing the packet, we have to issue a new IP identifier.
 666                  */
 667 
 668                 iph = (struct iphdr *)skb->data;
 669                 th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
 670                 size = ntohs(iph->tot_len) - (iph->ihl<<2);
 671                 
 672                 /*
 673                  *      Note: We ought to check for window limits here but
 674                  *      currently this is done (less efficiently) elsewhere.
 675                  */
 676 
 677                 /*
 678                  *      Put a MAC header back on (may cause ARPing)
 679                  */
 680                  
 681                 {
 682                         /* ANK: UGLY, but the bug, that was here, should be fixed.
 683                          */
 684                         struct options *  opt = (struct options*)skb->proto_priv;
 685                         rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute);
 686                 }
 687 
 688                 iph->id = htons(ip_id_count++);
 689 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
 690                 if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
 691                         iph->frag_off &= ~htons(IP_DF);
 692 #endif
 693                 ip_send_check(iph);
 694                         
 695                 if (rt==NULL)   /* Deep poo */
 696                 {
 697                         if(skb->sk)
 698                         {
 699                                 skb->sk->err=ENETUNREACH;
 700                                 skb->sk->error_report(skb->sk);
 701                         }
 702                 }
 703                 else
 704                 {
 705                         dev=rt->rt_dev;
 706                         skb->raddr=rt->rt_gateway;
 707                         skb->dev=dev;
 708                         skb->arp=1;
 709                         if (rt->rt_hh)
 710                         {
 711                                 memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
 712                                 if (!rt->rt_hh->hh_uptodate)
 713                                 {
 714                                         skb->arp = 0;
 715 #if RT_CACHE_DEBUG >= 2
 716                                         printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
 717 #endif
 718                                 }
 719                         }
 720                         else if (dev->hard_header)
 721                         {
 722                                 if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
 723                                         skb->arp=0;
 724                         }
 725                 
 726                         /*
 727                          *      This is not the right way to handle this. We have to
 728                          *      issue an up to date window and ack report with this 
 729                          *      retransmit to keep the odd buggy tcp that relies on 
 730                          *      the fact BSD does this happy. 
 731                          *      We don't however need to recalculate the entire 
 732                          *      checksum, so someone wanting a small problem to play
 733                          *      with might like to implement RFC1141/RFC1624 and speed
 734                          *      this up by avoiding a full checksum.
 735                          */
 736                  
 737                         th->ack_seq = htonl(sk->acked_seq);
 738                         th->window = ntohs(tcp_select_window(sk));
 739                         tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
 740                 
 741                         /*
 742                          *      If the interface is (still) up and running, kick it.
 743                          */
 744         
 745                         if (dev->flags & IFF_UP)
 746                         {
 747                                 /*
 748                                  *      If the packet is still being sent by the device/protocol
 749                                  *      below then don't retransmit. This is both needed, and good -
 750                                  *      especially with connected mode AX.25 where it stops resends
 751                                  *      occurring of an as yet unsent anyway frame!
 752                                  *      We still add up the counts as the round trip time wants
 753                                  *      adjusting.
 754                                  */
 755                                 if (sk && !skb_device_locked(skb))
 756                                 {
 757                                         /* Remove it from any existing driver queue first! */
 758                                         skb_unlink(skb);
 759                                         /* Now queue it */
 760                                         ip_statistics.IpOutRequests++;
 761                                         dev_queue_xmit(skb, dev, sk->priority);
 762                                 }
 763                         }
 764                 }
 765                 
 766                 /*
 767                  *      Count retransmissions
 768                  */
 769                  
 770                 ct++;
 771                 sk->prot->retransmits ++;
 772                 tcp_statistics.TcpRetransSegs++;
 773                 
 774 
 775                 /*
 776                  *      Only one retransmit requested.
 777                  */
 778         
 779                 if (!all)
 780                         break;
 781 
 782                 /*
 783                  *      This should cut it off before we send too many packets.
 784                  */
 785 
 786                 if (ct >= sk->cong_window)
 787                         break;
 788                 skb = skb->link3;
 789         }
 790 }
 791 
 792 /*
 793  *      Reset the retransmission timer
 794  */
 795  
 796 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
     /*  */
 797 {
 798         del_timer(&sk->retransmit_timer);
 799         sk->ip_xmit_timeout = why;
 800         if((int)when < 0)
 801         {
 802                 when=3;
 803                 printk("Error: Negative timer in xmit_timer\n");
 804         }
 805         sk->retransmit_timer.expires=jiffies+when;
 806         add_timer(&sk->retransmit_timer);
 807 }
 808 
 809 /*
 810  *      This is the normal code called for timeouts.  It does the retransmission
 811  *      and then does backoff.  tcp_do_retransmit is separated out because
 812  *      tcp_ack needs to send stuff from the retransmit queue without
 813  *      initiating a backoff.
 814  */
 815 
 816 
 817 void tcp_retransmit_time(struct sock *sk, int all)
     /*  */
 818 {
 819         tcp_do_retransmit(sk, all);
 820 
 821         /*
 822          * Increase the timeout each time we retransmit.  Note that
 823          * we do not increase the rtt estimate.  rto is initialized
 824          * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
 825          * that doubling rto each time is the least we can get away with.
 826          * In KA9Q, Karn uses this for the first few times, and then
 827          * goes to quadratic.  netBSD doubles, but only goes up to *64,
 828          * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
 829          * defined in the protocol as the maximum possible RTT.  I guess
 830          * we'll have to use something other than TCP to talk to the
 831          * University of Mars.
 832          *
 833          * PAWS allows us longer timeouts and large windows, so once
 834          * implemented ftp to mars will work nicely. We will have to fix
 835          * the 120 second clamps though!
 836          */
 837 
 838         sk->retransmits++;
 839         sk->prot->retransmits++;
 840         sk->backoff++;
 841         sk->rto = min(sk->rto << 1, 120*HZ);
 842         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
 843 }
 844 
 845 
 846 /*
 847  *      A timer event has trigger a tcp retransmit timeout. The
 848  *      socket xmit queue is ready and set up to send. Because
 849  *      the ack receive code keeps the queue straight we do
 850  *      nothing clever here.
 851  */
 852 
 853 static void tcp_retransmit(struct sock *sk, int all)
     /*  */
 854 {
 855         if (all) 
 856         {
 857                 tcp_retransmit_time(sk, all);
 858                 return;
 859         }
 860 
 861         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 862         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 863         sk->cong_count = 0;
 864 
 865         sk->cong_window = 1;
 866 
 867         /* Do the actual retransmit. */
 868         tcp_retransmit_time(sk, all);
 869 }
 870 
 871 /*
 872  *      A write timeout has occurred. Process the after effects.
 873  */
 874 
 875 static int tcp_write_timeout(struct sock *sk)
     /*  */
 876 {
 877         /*
 878          *      Look for a 'soft' timeout.
 879          */
 880         if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
 881                 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
 882         {
 883                 /*
 884                  *      Attempt to recover if arp has changed (unlikely!) or
 885                  *      a route has shifted (not supported prior to 1.3).
 886                  */
 887                 ip_rt_advice(&sk->ip_route_cache, 0);
 888         }
 889         
 890         /*
 891          *      Have we tried to SYN too many times (repent repent 8))
 892          */
 893          
 894         if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
 895         {
 896                 sk->err=ETIMEDOUT;
 897                 sk->error_report(sk);
 898                 del_timer(&sk->retransmit_timer);
 899                 tcp_statistics.TcpAttemptFails++;       /* Is this right ??? - FIXME - */
 900                 tcp_set_state(sk,TCP_CLOSE);
 901                 /* Don't FIN, we got nothing back */
 902                 release_sock(sk);
 903                 return 0;
 904         }
 905         /*
 906          *      Has it gone just too far ?
 907          */
 908         if (sk->retransmits > TCP_RETR2) 
 909         {
 910                 sk->err = ETIMEDOUT;
 911                 sk->error_report(sk);
 912                 del_timer(&sk->retransmit_timer);
 913                 /*
 914                  *      Time wait the socket 
 915                  */
 916                 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) 
 917                 {
 918                         tcp_set_state(sk,TCP_TIME_WAIT);
 919                         reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 920                 }
 921                 else
 922                 {
 923                         /*
 924                          *      Clean up time.
 925                          */
 926                         tcp_set_state(sk, TCP_CLOSE);
 927                         release_sock(sk);
 928                         return 0;
 929                 }
 930         }
 931         return 1;
 932 }
 933 
 934 /*
 935  *      The TCP retransmit timer. This lacks a few small details.
 936  *
 937  *      1.      An initial rtt timeout on the probe0 should cause what we can
 938  *              of the first write queue buffer to be split and sent.
 939  *      2.      On a 'major timeout' as defined by RFC1122 we shouldn't report
 940  *              ETIMEDOUT if we know an additional 'soft' error caused this.
 941  *              tcp_err should save a 'soft error' for us.
 942  */
 943 
 944 static void retransmit_timer(unsigned long data)
     /*  */
 945 {
 946         struct sock *sk = (struct sock*)data;
 947         int why = sk->ip_xmit_timeout;
 948 
 949         /* 
 950          * only process if socket is not in use
 951          */
 952 
 953         cli();
 954         if (sk->inuse || in_bh) 
 955         {
 956                 /* Try again in 1 second */
 957                 sk->retransmit_timer.expires = jiffies+HZ;
 958                 add_timer(&sk->retransmit_timer);
 959                 sti();
 960                 return;
 961         }
 962 
 963         sk->inuse = 1;
 964         sti();
 965 
 966         /* Always see if we need to send an ack. */
 967 
 968         if (sk->ack_backlog && !sk->zapped) 
 969         {
 970                 sk->prot->read_wakeup (sk);
 971                 if (! sk->dead)
 972                         sk->data_ready(sk,0);
 973         }
 974 
 975         /* Now we need to figure out why the socket was on the timer. */
 976 
 977         switch (why) 
 978         {
 979                 /* Window probing */
 980                 case TIME_PROBE0:
 981                         tcp_send_probe0(sk);
 982                         tcp_write_timeout(sk);
 983                         break;
 984                 /* Retransmitting */
 985                 case TIME_WRITE:
 986                         /* It could be we got here because we needed to send an ack.
 987                          * So we need to check for that.
 988                          */
 989                 {
 990                         struct sk_buff *skb;
 991                         unsigned long flags;
 992 
 993                         save_flags(flags);
 994                         cli();
 995                         skb = sk->send_head;
 996                         if (!skb) 
 997                         {
 998                                 restore_flags(flags);
 999                         } 
1000                         else 
1001                         {
1002                                 /*
1003                                  *      Kicked by a delayed ack. Reset timer
1004                                  *      correctly now
1005                                  */
1006                                 if (jiffies < skb->when + sk->rto) 
1007                                 {
1008                                         reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
1009                                         restore_flags(flags);
1010                                         break;
1011                                 }
1012                                 restore_flags(flags);
1013                                 /*
1014                                  *      Retransmission
1015                                  */
1016                                 sk->retransmits++;
1017                                 sk->prot->retransmits++;
1018                                 sk->prot->retransmit (sk, 0);
1019                                 tcp_write_timeout(sk);
1020                         }
1021                         break;
1022                 }
1023                 /* Sending Keepalives */
1024                 case TIME_KEEPOPEN:
1025                         /* 
1026                          * this reset_timer() call is a hack, this is not
1027                          * how KEEPOPEN is supposed to work.
1028                          */
1029                         reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1030 
1031                         /* Send something to keep the connection open. */
1032                         if (sk->prot->write_wakeup)
1033                                   sk->prot->write_wakeup (sk);
1034                         sk->retransmits++;
1035                         sk->prot->retransmits++;
1036                         tcp_write_timeout(sk);
1037                         break;
1038                 default:
1039                         printk ("rexmit_timer: timer expired - reason unknown\n");
1040                         break;
1041         }
1042         release_sock(sk);
1043 }
1044 
1045 /*
1046  * This routine is called by the ICMP module when it gets some
1047  * sort of error condition.  If err < 0 then the socket should
1048  * be closed and the error returned to the user.  If err > 0
1049  * it's just the icmp type << 8 | icmp code.  After adjustment
1050  * header points to the first 8 bytes of the tcp header.  We need
1051  * to find the appropriate port.
1052  */
1053 
1054 void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
     /*  */
1055         __u32 saddr, struct inet_protocol *protocol)
1056 {
1057         struct tcphdr *th = (struct tcphdr *)header;
1058         struct sock *sk;
1059         
1060         /*
1061          *      This one is _WRONG_. FIXME urgently.
1062          */
1063 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY     
1064         struct iphdr *iph=(struct iphdr *)(header-sizeof(struct iphdr));
1065 #endif  
1066         th =(struct tcphdr *)header;
1067         sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
1068 
1069         if (sk == NULL) 
1070                 return;
1071   
1072         if (type == ICMP_SOURCE_QUENCH) 
1073         {
1074                 /*
1075                  * FIXME:
1076                  * For now we will just trigger a linear backoff.
1077                  * The slow start code should cause a real backoff here.
1078                  */
1079                 if (sk->cong_window > 4)
1080                         sk->cong_window--;
1081                 return;
1082         }
1083         
1084         if (type == ICMP_PARAMETERPROB)
1085         {
1086                 sk->err=EPROTO;
1087                 sk->error_report(sk);
1088         }
1089 
1090 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
1091         if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
1092         {
1093                 struct rtable * rt;
1094                 /*
1095                  * Ugly trick to pass MTU to protocol layer.
1096                  * Really we should add argument "info" to error handler.
1097                  */
1098                 unsigned short new_mtu = ntohs(iph->id);
1099 
1100                 if ((rt = sk->ip_route_cache) != NULL)
1101                         if (rt->rt_mtu > new_mtu)
1102                                 rt->rt_mtu = new_mtu;
1103 
1104                 if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr))
1105                         sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
1106 
1107                 return;
1108         }
1109 #endif
1110 
1111         /*
1112          * If we've already connected we will keep trying
1113          * until we time out, or the user gives up.
1114          */
1115 
1116         if (code < 13 && (icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV))
1117         {
1118                 sk->err = icmp_err_convert[code].errno;
1119                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) 
1120                 {
1121                         tcp_statistics.TcpAttemptFails++;
1122                         tcp_set_state(sk,TCP_CLOSE);
1123                         sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
1124                 }
1125         }
1126         return;
1127 }
1128 
1129 
1130 /*
1131  *      Walk down the receive queue counting readable data until we hit the end or we find a gap
1132  *      in the received data queue (ie a frame missing that needs sending to us). Not
1133  *      sorting using two queues as data arrives makes life so much harder.
1134  */
1135 
1136 static int tcp_readable(struct sock *sk)
     /*  */
1137 {
1138         unsigned long counted;
1139         unsigned long amount;
1140         struct sk_buff *skb;
1141         int sum;
1142         unsigned long flags;
1143 
1144         if(sk && sk->debug)
1145                 printk("tcp_readable: %p - ",sk);
1146 
1147         save_flags(flags);
1148         cli();
1149         if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
1150         {
1151                 restore_flags(flags);
1152                 if(sk && sk->debug) 
1153                         printk("empty\n");
1154                 return(0);
1155         }
1156   
1157         counted = sk->copied_seq;       /* Where we are at the moment */
1158         amount = 0;
1159   
1160         /* 
1161          *      Do until a push or until we are out of data. 
1162          */
1163          
1164         do 
1165         {
1166                 if (before(counted, skb->seq))          /* Found a hole so stops here */
1167                         break;
1168                 sum = skb->len - (counted - skb->seq);  /* Length - header but start from where we are up to (avoid overlaps) */
1169                 if (skb->h.th->syn)
1170                         sum++;
1171                 if (sum > 0) 
1172                 {                                       /* Add it up, move on */
1173                         amount += sum;
1174                         if (skb->h.th->syn) 
1175                                 amount--;
1176                         counted += sum;
1177                 }
1178                 /*
1179                  * Don't count urg data ... but do it in the right place!
1180                  * Consider: "old_data (ptr is here) URG PUSH data"
1181                  * The old code would stop at the first push because
1182                  * it counted the urg (amount==1) and then does amount--
1183                  * *after* the loop.  This means tcp_readable() always
1184                  * returned zero if any URG PUSH was in the queue, even
1185                  * though there was normal data available. If we subtract
1186                  * the urg data right here, we even get it to work for more
1187                  * than one URG PUSH skb without normal data.
1188                  * This means that select() finally works now with urg data
1189                  * in the queue.  Note that rlogin was never affected
1190                  * because it doesn't use select(); it uses two processes
1191                  * and a blocking read().  And the queue scan in tcp_read()
1192                  * was correct.  Mike <pall@rz.uni-karlsruhe.de>
1193                  */
1194                 if (skb->h.th->urg)
1195                         amount--;       /* don't count urg data */
1196                 if (amount && skb->h.th->psh) break;
1197                 skb = skb->next;
1198         }
1199         while(skb != (struct sk_buff *)&sk->receive_queue);
1200 
1201         restore_flags(flags);
1202         if(sk->debug)
1203                 printk("got %lu bytes.\n",amount);
1204         return(amount);
1205 }
1206 
1207 /*
1208  * LISTEN is a special case for select..
1209  */
1210 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /*  */
1211 {
1212         if (sel_type == SEL_IN) {
1213                 int retval;
1214 
1215                 sk->inuse = 1;
1216                 retval = (tcp_find_established(sk) != NULL);
1217                 release_sock(sk);
1218                 if (!retval)
1219                         select_wait(&master_select_wakeup,wait);
1220                 return retval;
1221         }
1222         return 0;
1223 }
1224 
1225 
1226 /*
1227  *      Wait for a TCP event.
1228  *
1229  *      Note that we don't need to set "sk->inuse", as the upper select layers
1230  *      take care of normal races (between the test and the event) and we don't
1231  *      go look at any of the socket buffers directly.
1232  */
1233 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
     /*  */
1234 {
1235         if (sk->state == TCP_LISTEN)
1236                 return tcp_listen_select(sk, sel_type, wait);
1237 
1238         switch(sel_type) {
1239         case SEL_IN:
1240                 if (sk->err)
1241                         return 1;
1242                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1243                         break;
1244 
1245                 if (sk->shutdown & RCV_SHUTDOWN)
1246                         return 1;
1247                         
1248                 if (sk->acked_seq == sk->copied_seq)
1249                         break;
1250 
1251                 if (sk->urg_seq != sk->copied_seq ||
1252                     sk->acked_seq != sk->copied_seq+1 ||
1253                     sk->urginline || !sk->urg_data)
1254                         return 1;
1255                 break;
1256 
1257         case SEL_OUT:
1258                 if (sk->err)
1259                         return 1;
1260                 if (sk->shutdown & SEND_SHUTDOWN) 
1261                         return 0;
1262                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1263                         break;
1264                 /*
1265                  * This is now right thanks to a small fix
1266                  * by Matt Dillon.
1267                  */
1268 
1269                 if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
1270                         break;
1271                 return 1;
1272 
1273         case SEL_EX:
1274                 if (sk->urg_data)
1275                         return 1;
1276                 break;
1277         }
1278         select_wait(sk->sleep, wait);
1279         return 0;
1280 }
1281 
1282 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
     /*  */
1283 {
1284         int err;
1285         switch(cmd) 
1286         {
1287 
1288                 case TIOCINQ:
1289 #ifdef FIXME    /* FIXME: */
1290                 case FIONREAD:
1291 #endif
1292                 {
1293                         unsigned long amount;
1294 
1295                         if (sk->state == TCP_LISTEN) 
1296                                 return(-EINVAL);
1297 
1298                         sk->inuse = 1;
1299                         amount = tcp_readable(sk);
1300                         release_sock(sk);
1301                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1302                         if(err)
1303                                 return err;
1304                         put_user(amount, (int *)arg);
1305                         return(0);
1306                 }
1307                 case SIOCATMARK:
1308                 {
1309                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
1310 
1311                         err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1312                         if (err)
1313                                 return err;
1314                         put_user(answ,(int *) arg);
1315                         return(0);
1316                 }
1317                 case TIOCOUTQ:
1318                 {
1319                         unsigned long amount;
1320 
1321                         if (sk->state == TCP_LISTEN) return(-EINVAL);
1322                         amount = sock_wspace(sk);
1323                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1324                         if(err)
1325                                 return err;
1326                         put_user(amount, (int *)arg);
1327                         return(0);
1328                 }
1329                 default:
1330                         return(-EINVAL);
1331         }
1332 }
1333 
1334 
1335 /*
1336  *      This routine computes a TCP checksum. 
1337  *
1338  *      Modified January 1995 from a go-faster DOS routine by
1339  *      Jorge Cwik <jorge@laser.satlink.net>
1340  */
1341  
1342 unsigned short tcp_check(struct tcphdr *th, int len,
     /*  */
1343           unsigned long saddr, unsigned long daddr, unsigned long base)
1344 {     
1345         return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
1346 }
1347 
1348 
1349 
1350 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
     /*  */
1351                 unsigned long daddr, int len, struct sock *sk)
1352 {
1353         th->check = 0;
1354         th->check = tcp_check(th, len, saddr, daddr,
1355                 csum_partial((char *)th,len,0));
1356         return;
1357 }
1358 
1359 /*
1360  *      This is the main buffer sending routine. We queue the buffer
1361  *      having checked it is sane seeming.
      *
      *      sk  - socket the segment belongs to.
      *      skb - fully built segment; skb->h.th must already point at its
      *            TCP header.
      *
      *      The buffer is always consumed: it is freed when a sanity check
      *      fails, appended to sk->write_queue when it may not go out yet,
      *      or handed to sk->prot->queue_xmit() for immediate transmission.
1362  */
1363  
1364 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
     /*  */
1365 {
1366         int size;
1367         struct tcphdr * th = skb->h.th;
1368 
1369         /*
1370          *      length of packet (not counting length of pre-tcp headers) 
1371          */
1372          
1373         size = skb->len - ((unsigned char *) th - skb->data);
1374 
1375         /*
1376          *      Sanity check it.. 
              *      NOTE(review): size is an int compared against sizeof();
              *      a negative size is presumably caught by the comparison
              *      with skb->len (printed below with %lu) - confirm.
1377          */
1378          
1379         if (size < sizeof(struct tcphdr) || size > skb->len) 
1380         {
1381                 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1382                         skb, skb->data, th, skb->len);
1383                 kfree_skb(skb, FREE_WRITE);
1384                 return;
1385         }
1386 
1387         /*
1388          *      If we have queued a header size packet.. (these crash a few
1389          *      tcp stacks if ack is not set)
1390          */
1391          
1392         if (size == sizeof(struct tcphdr)) 
1393         {
1394                 /* If it's got a syn or fin it's notionally included in the size..*/
1395                 if(!th->syn && !th->fin) 
1396                 {
1397                         printk("tcp_send_skb: attempt to queue a bogon.\n");
1398                         kfree_skb(skb,FREE_WRITE);
1399                         return;
1400                 }
1401         }
1402 
1403         /*
1404          *      Actual processing.
              *      Record the sequence range covered by this segment:
              *      seq comes from the header, end_seq adds the payload
              *      length (segment size minus the 4*doff header bytes).
1405          */
1406          
1407         tcp_statistics.TcpOutSegs++;  
1408         skb->seq = ntohl(th->seq);
1409         skb->end_seq = skb->seq + size - 4*th->doff;
1410         
1411         /*
1412          *      We must queue if
1413          *
1414          *      a) The right edge of this frame exceeds the window
1415          *      b) We are retransmitting (Nagle's rule)
1416          *      c) We have too many packets 'in flight'
1417          */
1418          
1419         if (after(skb->end_seq, sk->window_seq) ||
1420             (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1421              sk->packets_out >= sk->cong_window) 
1422         {
1423                 /* checksum will be supplied by tcp_write_xmit.  So
1424                  * we shouldn't need to set it at all.  I'm being paranoid */
1425                 th->check = 0;
1426                 if (skb->next != NULL) 
1427                 {
                              /* The buffer is still linked on some list; report it under this function's own name and unlink it. */
1428                         printk("tcp_send_skb: next != NULL\n");
1429                         skb_unlink(skb);
1430                 }
1431                 skb_queue_tail(&sk->write_queue, skb);
1432                 
1433                 /*
1434                  *      If we don't fit we have to start the zero window
1435                  *      probes. This is broken - we really need to do a partial
1436                  *      send _first_ (This is what causes the Cisco and PC/TCP
1437                  *      grief).
1438                  */
1439                  
1440                 if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
1441                     sk->send_head == NULL && sk->ack_backlog == 0)
1442                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1443         } 
1444         else 
1445         {
1446                 /*
1447                  *      This is going straight out
                      *      Fill in the fields that depend on current receive
                      *      state (ack number, window) and then checksum.
1448                  */
1449                  
1450                 th->ack_seq = htonl(sk->acked_seq);
1451                 th->window = htons(tcp_select_window(sk));
1452 
1453                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1454 
1455                 sk->sent_seq = sk->write_seq;
1456                 
1457                 /*
1458                  *      This is mad. The tcp retransmit queue is put together
1459                  *      by the ip layer. This causes half the problems with
1460                  *      unroutable FIN's and other things.
1461                  */
1462                  
1463                 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1464                 
1465                 /*
1466                  *      Set for next retransmit based on expected ACK time.
1467                  *      FIXME: We set this every time which means our 
1468                  *      retransmits are really about a window behind.
1469                  */
1470 
1471                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1472         }
1473 }
1474 
1475 /*
1476  *      Locking problems lead us to a messy situation where we can have
1477  *      multiple partially complete buffers queued up. This is really bad
1478  *      as we don't want to be sending partial buffers. Fix this with
1479  *      a semaphore or similar to lock tcp_write per socket.
1480  *
1481  *      These routines are pretty self descriptive.
      *
      *      tcp_dequeue_partial: detach and return the socket's pending
      *      partial frame. With interrupts disabled it reads sk->partial
      *      and, if one is present, clears the slot and cancels
      *      sk->partial_timer. Returns the buffer, or NULL when nothing
      *      was pending. The callers in this file either send the returned
      *      buffer or queue it again.
1482  */
1483  
1484 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
     /*  */
1485 {
1486         struct sk_buff * skb;
1487         unsigned long flags;
1488 
             /* Interrupts off while the slot and its timer are changed together. */
1489         save_flags(flags);
1490         cli();
1491         skb = sk->partial;
1492         if (skb) {
1493                 sk->partial = NULL;
1494                 del_timer(&sk->partial_timer);
1495         }
1496         restore_flags(flags);
1497         return skb;
1498 }
1499 
1500 /*
1501  *      Empty the partial queue
      *
      *      Repeatedly dequeues sk->partial with tcp_dequeue_partial() and
      *      passes each buffer to tcp_send_skb() until none is left.
      *      A NULL sk is ignored. tcp_enqueue_partial() also installs this
      *      function (through a cast) as the partial_timer handler, with
      *      the socket pointer carried in the timer's data word.
1502  */
1503  
1504 static void tcp_send_partial(struct sock *sk)
     /*  */
1505 {
1506         struct sk_buff *skb;
1507 
1508         if (sk == NULL)
1509                 return;
1510         while ((skb = tcp_dequeue_partial(sk)) != NULL)
1511                 tcp_send_skb(sk, skb);
1512 }
1513 
1514 /*
1515  *      Queue a partial frame
      *
      *      skb - the not yet full segment to hold back.
      *      sk  - owning socket.
      *
      *      Makes skb the socket's single pending partial frame and
      *      (re)arms sk->partial_timer to call tcp_send_partial(sk) HZ
      *      jiffies from now. If another partial frame was already
      *      pending it is displaced and sent with tcp_send_skb() once
      *      interrupts have been restored.
1516  */
1517  
1518 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
     /*  */
1519 {
1520         struct sk_buff * tmp;
1521         unsigned long flags;
1522 
1523         save_flags(flags);
1524         cli();
             /* Remember any frame already waiting; its timer is cancelled before being re-armed below. */
1525         tmp = sk->partial;
1526         if (tmp)
1527                 del_timer(&sk->partial_timer);
1528         sk->partial = skb;
1529         init_timer(&sk->partial_timer);
1530         /*
1531          *      Wait up to 1 second for the buffer to fill.
1532          */
1533         sk->partial_timer.expires = jiffies+HZ;
1534         sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
1535         sk->partial_timer.data = (unsigned long) sk;
1536         add_timer(&sk->partial_timer);
1537         restore_flags(flags);
             /* The displaced frame is sent outside the interrupts-off section. */
1538         if (tmp)
1539                 tcp_send_skb(sk, tmp);
1540 }
1541 
1542 
1543 /*
1544  *      This routine sends an ack and also updates the window. 
      *
      *      sequence - sequence number to place in the outgoing header.
      *      ack      - acknowledgement number to place in the header.
      *      sk       - socket on whose behalf the ACK is sent.
      *      th       - TCP header used as the template; its source and
      *                 destination ports are swapped for the reply.
      *      daddr    - destination address for routing and the checksum.
      *
      *      Nothing is sent when sk->zapped is set or when the IP header
      *      cannot be built. When no buffer can be allocated the ACK is
      *      counted in sk->ack_backlog and, for a connected socket whose
      *      timer is not already TIME_WRITE, a TIME_WRITE timer of HZ is
      *      started.
1545  */
1546  
1547 static void tcp_send_ack(u32 sequence, u32 ack,
     /*  */
1548              struct sock *sk,
1549              struct tcphdr *th, unsigned long daddr)
1550 {
1551         struct sk_buff *buff;
1552         struct tcphdr *t1;
1553         struct device *dev = NULL;
1554         int tmp;
1555 
1556         if(sk->zapped)
1557                 return;         /* We have been reset, we may not send again */
1558                 
1559         /*
1560          * We need to grab some memory, and put together an ack,
1561          * and then put it into the queue to be sent.
1562          */
1563 
1564         buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1565         if (buff == NULL) 
1566         {
1567                 /* 
1568                  *      Force it to send an ack. We don't have to do this
1569                  *      (ACK is unreliable) but it's much better use of 
1570                  *      bandwidth on slow links to send a spare ack than
1571                  *      resend packets. 
1572                  */
1573                  
1574                 sk->ack_backlog++;
1575                 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) 
1576                 {
1577                         reset_xmit_timer(sk, TIME_WRITE, HZ);
1578                 }
1579                 return;
1580         }
1581 
1582         /*
1583          *      Assemble a suitable TCP frame
1584          */
1585          
1586         buff->sk = sk;
1587         buff->localroute = sk->localroute;
1588 
1589         /* 
1590          *      Put in the IP header and routing stuff. 
1591          */
1592          
1593         tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1594                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1595         if (tmp < 0) 
1596         {
1597                 buff->free = 1;
1598                 sock_wfree(sk, buff);
1599                 return;
1600         }
1601         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
1602 
1603         memcpy(t1, th, sizeof(*t1));
1604 
1605         /*
1606          *      Swap the send and the receive. 
              *      The sequence number and window are host-order values
              *      being written into the header, hence htonl()/htons().
1607          */
1608          
1609         t1->dest = th->source;
1610         t1->source = th->dest;
1611         t1->seq = htonl(sequence);
1612         t1->ack = 1;
1613         sk->window = tcp_select_window(sk);
1614         t1->window = htons(sk->window);
1615         t1->res1 = 0;
1616         t1->res2 = 0;
1617         t1->rst = 0;
1618         t1->urg = 0;
1619         t1->syn = 0;
1620         t1->psh = 0;
1621         t1->fin = 0;
1622         
1623         /*
1624          *      If we have nothing queued for transmit and the transmit timer
1625          *      is on we are just doing an ACK timeout and need to switch
1626          *      to a keepalive.
              *      This only applies when the ACK being sent covers everything
              *      received so far (ack == sk->acked_seq); the pending-ACK
              *      bookkeeping is cleared in that case as well.
1627          */
1628          
1629         if (ack == sk->acked_seq) 
1630         {
1631                 sk->ack_backlog = 0;
1632                 sk->bytes_rcv = 0;
1633                 sk->ack_timed = 0;
1634                 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
1635                                   && sk->ip_xmit_timeout == TIME_WRITE) 
1636                 {
1637                         if(sk->keepopen) {
1638                                 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1639                         } else {
1640                                 delete_timer(sk);
1641                         }
1642                 }
1643         }
1644         
1645         /*
1646          *      Fill in the packet and send it
1647          */
1648          
1649         t1->ack_seq = htonl(ack);
1650         t1->doff = sizeof(*t1)/4;
1651         tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1652         if (sk->debug)
1653                  printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1654         tcp_statistics.TcpOutSegs++;
1655         sk->prot->queue_xmit(sk, dev, buff, 1);
1656 }
1657 
1658 
1659 /* 
1660  *      This routine builds a generic TCP header. 
      *
      *      th   - header to fill in; it starts as a copy of sk->dummy_th.
      *      sk   - socket supplying the sequence, ack and window values.
      *      push - PSH is set when this is 0 and cleared otherwise.
      *
      *      The header always carries ACK and never FIN. Because it
      *      acknowledges sk->acked_seq, the socket's pending-ACK state
      *      (ack_backlog, bytes_rcv, ack_timed) is reset, and sk->window
      *      is refreshed from tcp_select_window().
      *
      *      Returns sizeof(*th); no failure value is produced here.
1661  */
1662  
1663 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
     /*  */
1664 {
1665 
1666         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1667         th->seq = htonl(sk->write_seq);
1668         th->psh =(push == 0) ? 1 : 0;
1669         th->doff = sizeof(*th)/4;
1670         th->ack = 1;
1671         th->fin = 0;
1672         sk->ack_backlog = 0;
1673         sk->bytes_rcv = 0;
1674         sk->ack_timed = 0;
1675         th->ack_seq = htonl(sk->acked_seq);
1676         sk->window = tcp_select_window(sk);
1677         th->window = htons(sk->window);
1678 
1679         return(sizeof(*th));
1680 }
1681 
1682 /*
1683  *      This routine copies from a user buffer into a socket,
1684  *      and starts the transmit system.
      *
      *      sk       - sending socket.
      *      msg      - message descriptor; every entry of msg->msg_iov is
      *                 sent in order. If msg->msg_name is set it must name
      *                 the peer the socket is already connected to.
      *      len      - total length; only decremented as data is consumed.
      *      nonblock - non-zero: fail with -EAGAIN instead of sleeping.
      *      flags    - only MSG_OOB and MSG_DONTROUTE are accepted.
      *
      *      Returns the number of bytes copied from user space. When
      *      nothing was copied it returns a negative error instead:
      *      -EINVAL, -ENOTCONN, -EISCONN, -EPIPE, -EAGAIN, -ERESTARTSYS,
      *      -EFAULT, the socket's pending error, or the error reported by
      *      the protocol's build_header().
      *
      *      The socket is marked in use (sk->inuse = 1) while data is
      *      being queued; every exit path releases it again.
1685  */
1686 
1687 static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
     /*  */
1688           int len, int nonblock, int flags)
1689 {
1690         int copied = 0;
1691         int copy;
1692         int tmp;
1693         int seglen;
1694         int iovct=0;
1695         struct sk_buff *skb;
1696         struct sk_buff *send_tmp;
1697         struct proto *prot;
1698         struct device *dev = NULL;
1699         unsigned char *from;
1700         
1701         /*
1702          *      Do sanity checking for sendmsg/sendto/send
1703          */
1704          
1705         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1706                 return -EINVAL;
1707         if (msg->msg_name)
1708         {
1709                 struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
1710                 if(sk->state == TCP_CLOSE)
1711                         return -ENOTCONN;
1712                 if (msg->msg_namelen < sizeof(*addr))
1713                         return -EINVAL;
1714                 if (addr->sin_family && addr->sin_family != AF_INET) 
1715                         return -EINVAL;
1716                 if (addr->sin_port != sk->dummy_th.dest) 
1717                         return -EISCONN;
1718                 if (addr->sin_addr.s_addr != sk->daddr) 
1719                         return -EISCONN;
1720         }
1721         
1722         /*
1723          *      Ok commence sending
1724          */
1725         
1726         while(iovct<msg->msg_iovlen)
1727         {
1728                 seglen=msg->msg_iov[iovct].iov_len;
1729                 from=msg->msg_iov[iovct++].iov_base;
1730                 sk->inuse=1;
1731                 prot = sk->prot;
1732                 while(seglen > 0) 
1733                 {
1734                         if (sk->err) 
1735                         {                       /* Stop on an error */
1736                                 release_sock(sk);
1737                                 if (copied) 
1738                                         return(copied);
1739                                 return sock_error(sk);
1740                         }
1741 
1742                         /*
1743                          *      First thing we do is make sure that we are established. 
                              *      After a send shutdown: a partial write still
                              *      returns its count and leaves EPIPE pending in
                              *      sk->err; otherwise the error is cleared and
                              *      -EPIPE is returned directly.
1744                          */
1745         
1746                         if (sk->shutdown & SEND_SHUTDOWN) 
1747                         {
1748                                 release_sock(sk);
1749                                 sk->err = EPIPE;
1750                                 if (copied) 
1751                                         return(copied);
1752                                 sk->err = 0;
1753                                 return(-EPIPE);
1754                         }
1755 
1756                         /* 
1757                          *      Wait for a connection to finish.
1758                          */
1759                 
1760                         while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
1761                         {
1762                                 if (sk->err) 
1763                                 {
1764                                         release_sock(sk);
1765                                         if (copied) 
1766                                                 return(copied);
1767                                         return sock_error(sk);
1768                                 }               
1769         
1770                                 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
1771                                 {
1772                                         release_sock(sk);
1773                                         if (copied) 
1774                                                 return(copied);
1775         
1776                                         if (sk->err) 
1777                                                 return sock_error(sk);
1778 
1779                                         if (sk->keepopen) 
1780                                         {
1781                                                 send_sig(SIGPIPE, current, 0);
1782                                         }
1783                                         return(-EPIPE);
1784                                 }
1785         
1786                                 if (nonblock || copied) 
1787                                 {
1788                                         release_sock(sk);
1789                                         if (copied) 
1790                                                 return(copied);
1791                                         return(-EAGAIN);
1792                                 }
1793         
                                      /*
                                       *      Drop the socket before sleeping; the state is
                                       *      re-tested with interrupts off so a change made
                                       *      after release_sock() is not missed.
                                       */
1794                                 release_sock(sk);
1795                                 cli();
1796                         
1797                                 if (sk->state != TCP_ESTABLISHED &&
1798                                         sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
1799                                 {
1800                                         interruptible_sleep_on(sk->sleep);      
1801                                         if (current->signal & ~current->blocked)
1802                                         {
1803                                                 sti();
1804                                                 if (copied) 
1805                                                         return(copied);
1806                                                 return(-ERESTARTSYS);
1807                                         }
1808                                 }
1809                                 sk->inuse = 1;
1810                                 sti();
1811                         }
1812         
1813                 /*
1814                  * The following code can result in copy <= 0 if sk->mss is ever
1815                  * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
1816                  * sk->mtu is constant once SYN processing is finished.  I.e. we
1817                  * had better not get here until we've seen his SYN and at least one
1818                  * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
1819                  * But ESTABLISHED should guarantee that.  sk->max_window is by definition
1820                  * non-decreasing.  Note that any ioctl to set user_mss must be done
1821                  * before the exchange of SYN's.  If the initial ack from the other
1822                  * end has a window of 0, max_window and thus mss will both be 0.
1823                  */
1824         
1825                 /* 
1826                  *      Now we need to check if we have a half built packet. 
1827                  */
1828 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
1829                 /*
1830                  *      FIXME:  I'm almost sure that this fragment is BUG,
1831                  *              but it works... I do not know why 8) --ANK
1832                  *
1833                  *      Really, we should rebuild all the queues...
1834                  *      It's difficult. Temporary hack is to send all
1835                  *      queued segments with allowed fragmentation.
1836                  */
1837                 {
1838                         int new_mss = min(sk->mtu, sk->max_window);
1839                         if (new_mss < sk->mss)
1840                         {
1841                                 tcp_send_partial(sk);
1842                                 sk->mss = new_mss;
1843                         }
1844                 }
1845 #endif
1846         
1847                         if ((skb = tcp_dequeue_partial(sk)) != NULL) 
1848                         {
1849                                 int hdrlen;
1850 
1851                                  /* IP header + TCP header */
1852                                 hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1853                                          + sizeof(struct tcphdr);
1854         
1855                                 /* Add more stuff to the end of skb->len */
1856                                 if (!(flags & MSG_OOB)) 
1857                                 {
1858                                         copy = min(sk->mss - (skb->len - hdrlen), seglen);
1859                                         if (copy <= 0) 
1860                                         {
                                                     /*
                                                      *      The dequeued partial frame already holds
                                                      *      at least sk->mss bytes. Do not abandon it:
                                                      *      its data is already counted in write_seq,
                                                      *      so pass it to the normal send path, then
                                                      *      release the socket like every other exit
                                                      *      and report any bytes already accepted.
                                                      */
1861                                                 printk("TCP: **bug**: \"copy\" <= 0\n");
                                                     tcp_send_skb(sk, skb);
                                                     release_sock(sk);
                                                     if (copied)
                                                             return(copied);
1862                                                 return -EFAULT;
1863                                         }                 
1864                                         memcpy_fromfs(skb_put(skb,copy), from, copy);
1865                                         from += copy;
1866                                         copied += copy;
1867                                         len -= copy;
1868                                         sk->write_seq += copy;
1869                                         seglen -= copy;
1870                                 }
                                      /* Send once full, for urgent data, or when nothing is in flight; otherwise keep waiting for more data. */
1871                                 if ((skb->len - hdrlen) >= sk->mss ||
1872                                         (flags & MSG_OOB) || !sk->packets_out)
1873                                         tcp_send_skb(sk, skb);
1874                                 else
1875                                         tcp_enqueue_partial(skb, sk);
1876                                 continue;
1877                         }
1878 
1879                 /*
1880                  * We also need to worry about the window.
1881                  * If window < 1/2 the maximum window we've seen from this
1882                  *   host, don't use it.  This is sender side
1883                  *   silly window prevention, as specified in RFC1122.
1884                  *   (Note that this is different than earlier versions of
1885                  *   SWS prevention, e.g. RFC813.).  What we actually do is 
1886                  *   use the whole MSS.  Since the results in the right
1887                  *   edge of the packet being outside the window, it will
1888                  *   be queued for later rather than sent.
1889                  */
1890 
1891                         copy = sk->window_seq - sk->write_seq;
1892                         if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1893                                 copy = sk->mss;
1894                         if (copy > seglen)
1895                                 copy = seglen;
1896 
1897                 /*
1898                  *      We should really check the window here also. 
                      *      A short, non-urgent write gets a buffer big enough
                      *      to be filled later as a partial frame (send_tmp);
                      *      otherwise the buffer is sized for this copy only.
1899                  */
1900                  
1901                         send_tmp = NULL;
1902                         if (copy < sk->mss && !(flags & MSG_OOB)) 
1903                         {
1904                                 /*
1905                                  *      We will release the socket in case we sleep here. 
1906                                  */
1907                                 release_sock(sk);
1908                                 /*
1909                                  *      NB: following must be mtu, because mss can be increased.
1910                                  *      mss is always <= mtu 
1911                                  */
1912                                 skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1913                                 sk->inuse = 1;
1914                                 send_tmp = skb;
1915                         } 
1916                         else 
1917                         {
1918                                 /*
1919                                  *      We will release the socket in case we sleep here. 
1920                                  */
1921                                 release_sock(sk);
1922                                 skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1923                                 sk->inuse = 1;
1924                         }
1925         
1926                         /*
1927                          *      If we didn't get any memory, we need to sleep. 
1928                          */
1929         
1930                         if (skb == NULL) 
1931                         {
1932                                 sk->socket->flags |= SO_NOSPACE;
1933                                 if (nonblock) 
1934                                 {
1935                                         release_sock(sk);
1936                                         if (copied) 
1937                                                 return(copied);
1938                                         return(-EAGAIN);
1939                                 }
1940 
1941                                 /*
1942                                  *      FIXME: here is another race condition. 
1943                                  */
1944 
1945                                 tmp = sk->wmem_alloc;
1946                                 release_sock(sk);
1947                                 cli();
1948                                 /*
1949                                  *      Again we will try to avoid it. 
                                      *      Sleep only if no write memory was freed
                                      *      since the snapshot taken above and the
                                      *      connection is still usable.
1950                                  */
1951                                 if (tmp <= sk->wmem_alloc &&
1952                                           (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1953                                         && sk->err == 0) 
1954                                 {
1955                                         sk->socket->flags &= ~SO_NOSPACE;
1956                                         interruptible_sleep_on(sk->sleep);
1957                                         if (current->signal & ~current->blocked) 
1958                                         {
1959                                                 sti();
1960                                                 if (copied) 
1961                                                         return(copied);
1962                                                 return(-ERESTARTSYS);
1963                                         }
1964                                 }
1965                                 sk->inuse = 1;
1966                                 sti();
1967                                 continue;
1968                         }
1969 
1970                         skb->sk = sk;
1971                         skb->free = 0;
1972                         skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1973         
1974                         /*
1975                          * FIXME: we need to optimize this.
1976                          * Perhaps some hints here would be good.
1977                          */
1978                 
1979                         tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1980                                  IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1981                         if (tmp < 0 ) 
1982                         {
1983                                 sock_wfree(sk, skb);
1984                                 release_sock(sk);
1985                                 if (copied) 
1986                                         return(copied);
1987                                 return(tmp);
1988                         }
1989 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
1990                         skb->ip_hdr->frag_off |= htons(IP_DF);
1991 #endif
1992                         skb->dev = dev;
1993                         skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
                              /* PSH is requested when this copy exhausts the current iovec entry (seglen-copy == 0). */
1994                         tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
1995                         if (tmp < 0) 
1996                         {
1997                                 sock_wfree(sk, skb);
1998                                 release_sock(sk);
1999                                 if (copied) 
2000                                         return(copied);
2001                                 return(tmp);
2002                         }
2003         
2004                         if (flags & MSG_OOB) 
2005                         {
                                      /* urg_ptr is a host-order length written into the header, hence htons(). */
2006                                 skb->h.th->urg = 1;
2007                                 skb->h.th->urg_ptr = htons(copy);
2008                         }
2009 
2010                         memcpy_fromfs(skb_put(skb,copy), from, copy);
2011                 
2012                         from += copy;
2013                         copied += copy;
2014                         len -= copy;
2015                         seglen -= copy;
2016                         skb->free = 0;
2017                         sk->write_seq += copy;
2018                 
                              /* A short frame is held back as the partial frame while other packets are still in flight. */
2019                         if (send_tmp != NULL && sk->packets_out) 
2020                         {
2021                                 tcp_enqueue_partial(send_tmp, sk);
2022                                 continue;
2023                         }
2024                         tcp_send_skb(sk, skb);
2025                 }
2026         }
2027         sk->err = 0;
2028 
2029 /*
2030  *      Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
2031  *      interactive fast network servers. It's meant to be on and
2032  *      it really improves the throughput though not the echo time
2033  *      on my slow slip link - Alan
2034  */
2035 
2036 /*
2037  *      Avoid possible race on send_tmp - c/o Johannes Stille 
2038  */
2039  
2040         if(sk->partial && ((!sk->packets_out) 
2041      /* If not nagling we can send on the before case too.. */
2042               || (sk->nonagle && before(sk->write_seq , sk->window_seq))
2043         ))
2044                 tcp_send_partial(sk);
2045 
2046         release_sock(sk);
2047         return(copied);
2048 }
2049 
2050 /*
2051  *      Send an ack if one is backlogged at this point. Ought to merge
2052  *      this with tcp_send_ack().
      *
      *      Does nothing when sk->ack_backlog is zero or the socket is in
      *      TCP_CLOSE or TCP_TIME_WAIT. Otherwise builds a bare ACK from
      *      sk->dummy_th carrying sk->sent_seq, sk->acked_seq and a freshly
      *      selected window, clears ack_backlog and bytes_rcv, and queues
      *      the frame for transmission. If no buffer is available a
      *      TIME_WRITE timer of HZ is started instead; if the IP header
      *      cannot be built the buffer is freed and nothing is sent.
      *
      *      NOTE(review): unlike tcp_send_ack() and tcp_build_header(),
      *      this routine neither clears t1->fin nor resets sk->ack_timed.
      *      Presumably dummy_th never has FIN set - confirm.
2053  */
2054  
2055 static void tcp_read_wakeup(struct sock *sk)
     /*  */
2056 {
2057         int tmp;
2058         struct device *dev = NULL;
2059         struct tcphdr *t1;
2060         struct sk_buff *buff;
2061 
2062         if (!sk->ack_backlog) 
2063                 return;
2064 
2065         /*
2066          * If we're closed, don't send an ack, or we'll get a RST
2067          * from the closed destination.
2068          */
2069         if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
2070                 return; 
2071 
2072         /*
2073          * FIXME: we need to put code here to prevent this routine from
2074          * being called.  Being called once in a while is ok, so only check
2075          * if this is the second time in a row.
2076          */
2077 
2078         /*
2079          * We need to grab some memory, and put together an ack,
2080          * and then put it into the queue to be sent.
2081          */
2082 
2083         buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
2084         if (buff == NULL) 
2085         {
2086                 /* Try again real soon. */
2087                 reset_xmit_timer(sk, TIME_WRITE, HZ);
2088                 return;
2089         }
2090 
2091         buff->sk = sk;
2092         buff->localroute = sk->localroute;
2093         
2094         /*
2095          *      Put in the IP header and routing stuff. 
2096          */
2097 
2098         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
2099                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2100         if (tmp < 0) 
2101         {
2102                 buff->free = 1;
2103                 sock_wfree(sk, buff);
2104                 return;
2105         }
2106 
2107         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2108 
             /* Start from the socket's template header, then set the per-segment fields and flags. */
2109         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
2110         t1->seq = htonl(sk->sent_seq);
2111         t1->ack = 1;
2112         t1->res1 = 0;
2113         t1->res2 = 0;
2114         t1->rst = 0;
2115         t1->urg = 0;
2116         t1->syn = 0;
2117         t1->psh = 0;
2118         sk->ack_backlog = 0;
2119         sk->bytes_rcv = 0;
2120         sk->window = tcp_select_window(sk);
2121         t1->window = htons(sk->window);
2122         t1->ack_seq = htonl(sk->acked_seq);
2123         t1->doff = sizeof(*t1)/4;
2124         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2125         sk->prot->queue_xmit(sk, dev, buff, 1);
2126         tcp_statistics.TcpOutSegs++;
2127 }
2128 
2129 
2130 /*
2131  *      FIXME:
2132  *      This routine frees used buffers.
2133  *      It should consider sending an ACK to let the
2134  *      other end know we now have a bigger window.
2135  */
2136 
2137 static void cleanup_rbuf(struct sock *sk)
     /*  */
2138 {
2139         unsigned long flags;
2140         unsigned long left;
2141         struct sk_buff *skb;
2142         unsigned long rspace;
2143 
2144         if(sk->debug)
2145                 printk("cleaning rbuf for sk=%p\n", sk);
2146   
2147         save_flags(flags);
2148         cli();
2149   
2150         left = sock_rspace(sk);
2151  
2152         /*
2153          *      We have to loop through all the buffer headers,
2154          *      and try to free up all the space we can.
2155          */
2156 
2157         while((skb=skb_peek(&sk->receive_queue)) != NULL) 
2158         {
2159                 if (!skb->used || skb->users) 
2160                         break;
2161                 skb_unlink(skb);
2162                 skb->sk = sk;
2163                 kfree_skb(skb, FREE_READ);
2164         }
2165 
2166         restore_flags(flags);
2167 
2168         /*
2169          *      FIXME:
2170          *      At this point we should send an ack if the difference
2171          *      in the window, and the amount of space is bigger than
2172          *      TCP_WINDOW_DIFF.
2173          */
2174 
2175         if(sk->debug)
2176                 printk("sk->rspace = %lu, was %lu\n", sock_rspace(sk),
2177                                             left);
2178         if ((rspace=sock_rspace(sk)) != left) 
2179         {
2180                 /*
2181                  * This area has caused the most trouble.  The current strategy
2182                  * is to simply do nothing if the other end has room to send at
2183                  * least 3 full packets, because the ack from those will auto-
2184                  * matically update the window.  If the other end doesn't think
2185                  * we have much space left, but we have room for at least 1 more
2186                  * complete packet than it thinks we do, we will send an ack
2187                  * immediately.  Otherwise we will wait up to .5 seconds in case
2188                  * the user reads some more.
2189                  */
2190                 sk->ack_backlog++;
2191         /*
2192          * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
2193          * if the other end is offering a window smaller than the agreed on MSS
2194          * (called sk->mtu here).  In theory there's no connection between send
2195          * and receive, and so no reason to think that they're going to send
2196          * small packets.  For the moment I'm using the hack of reducing the mss
2197          * only on the send side, so I'm putting mtu here.
2198          */
2199 
2200                 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
2201                 {
2202                         /* Send an ack right now. */
2203                         tcp_read_wakeup(sk);
2204                 } 
2205                 else 
2206                 {
2207                         /* Force it to send an ack soon. */
2208                         int was_active = del_timer(&sk->retransmit_timer);
2209                         if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires) 
2210                         {
2211                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
2212                         } 
2213                         else
2214                                 add_timer(&sk->retransmit_timer);
2215                 }
2216         }
2217 } 
2218 
2219 
2220 /*
2221  *      Handle reading urgent data. BSD has very simple semantics for
2222  *      this, no blocking and very strange errors 8)
2223  */
2224  
2225 static int tcp_recv_urg(struct sock * sk, int nonblock,
     /*  */
2226              struct msghdr *msg, int len, int flags, int *addr_len)
2227 {
2228         /*
2229          *      No URG data to read
2230          */
2231         if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
2232                 return -EINVAL; /* Yes this is right ! */
2233                 
2234         if (sk->err) 
2235                 return sock_error(sk);
2236                 
2237         if (sk->state == TCP_CLOSE || sk->done) 
2238         {
2239                 if (!sk->done) 
2240                 {
2241                         sk->done = 1;
2242                         return 0;
2243                 }
2244                 return -ENOTCONN;
2245         }
2246 
2247         if (sk->shutdown & RCV_SHUTDOWN) 
2248         {
2249                 sk->done = 1;
2250                 return 0;
2251         }
2252         sk->inuse = 1;
2253         if (sk->urg_data & URG_VALID) 
2254         {
2255                 char c = sk->urg_data;
2256                 if (!(flags & MSG_PEEK))
2257                         sk->urg_data = URG_READ;
2258                 memcpy_toiovec(msg->msg_iov, &c, 1);
2259                 if(msg->msg_name)
2260                 {
2261                         struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
2262                         sin->sin_family=AF_INET;
2263                         sin->sin_addr.s_addr=sk->daddr;
2264                         sin->sin_port=sk->dummy_th.dest;
2265                 }
2266                 if(addr_len)
2267                         *addr_len=sizeof(struct sockaddr_in);
2268                 release_sock(sk);
2269                 return 1;
2270         }
2271         release_sock(sk);
2272         
2273         /*
2274          * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
2275          * the available implementations agree in this case:
2276          * this call should never block, independent of the
2277          * blocking state of the socket.
2278          * Mike <pall@rz.uni-karlsruhe.de>
2279          */
2280         return -EAGAIN;
2281 }
2282 
2283 
2284 /*
2285  *      This routine copies from a sock struct into the user buffer. 
2286  */
2287  
/*
 *      tcp_recvmsg - copy in-sequence received data into the caller's iovec.
 *
 *      sk        socket to read from
 *      msg       msg_iov receives the data; msg_name, if set, is filled with
 *                the peer address when at least one byte was copied
 *      len       maximum number of bytes to copy
 *      nonblock  if nonzero, yield -EAGAIN instead of sleeping for data
 *      flags     MSG_OOB diverts to tcp_recv_urg(); MSG_PEEK reads without
 *                advancing sk->copied_seq or marking buffers used
 *      addr_len  if non-NULL, set to sizeof(struct sockaddr_in)
 *
 *      Returns the number of bytes copied (0 when nothing was copied and the
 *      socket is closed, shut down for receive, or a FIN was reached),
 *      -ENOTCONN, -EAGAIN, -ERESTARTSYS, or the value of sock_error(sk).
 */
2288 static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
2289         int len, int nonblock, int flags, int *addr_len)
2290 {
2291         struct wait_queue wait = { current, NULL };
2292         int copied = 0;
2293         u32 peek_seq;
2294         volatile u32 *seq;      /* So gcc doesn't overoptimise */
2295         unsigned long used;
2296 
2297         /* 
2298          *      This error should be checked. 
2299          */
2300          
2301         if (sk->state == TCP_LISTEN)
2302                 return -ENOTCONN;
2303 
2304         /*
2305          *      Urgent data needs to be handled specially. 
2306          */
2307          
2308         if (flags & MSG_OOB)
2309                 return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
2310 
2311         /*
2312          *      Copying sequence to update. This is volatile to handle
2313          *      the multi-reader case neatly (memcpy_to/fromfs might be 
2314          *      inline and thus not flush cached variables otherwise).
2315          */
2316          
             /* When peeking, advance a private copy so sk->copied_seq stays put. */
2317         peek_seq = sk->copied_seq;
2318         seq = &sk->copied_seq;
2319         if (flags & MSG_PEEK)
2320                 seq = &peek_seq;
2321 
             /* Queue ourselves on the socket before scanning; the entry is removed after the loop. */
2322         add_wait_queue(sk->sleep, &wait);
2323         sk->inuse = 1;
2324         while (len > 0) 
2325         {
2326                 struct sk_buff * skb;
2327                 u32 offset;
2328         
2329                 /*
2330                  * Are we at urgent data? Stop if we have read anything.
2331                  */
2332                  
2333                 if (copied && sk->urg_data && sk->urg_seq == *seq)
2334                         break;
2335 
2336                 /*
2337                  *      Next get a buffer.
2338                  */
2339                  
                     /*
                      * The task state is set before the queue is examined -
                      * presumably so a wakeup arriving during the scan is not
                      * lost before schedule() below; confirm against the wake
                      * side.
                      */
2340                 current->state = TASK_INTERRUPTIBLE;
2341 
                     /* Scan from the head for the buffer that contains sequence number *seq. */
2342                 skb = skb_peek(&sk->receive_queue);
2343                 do 
2344                 {
2345                         if (!skb)
2346                                 break;
                             /* Stop at the first buffer for which before(*seq, skb->seq) holds. */
2347                         if (before(*seq, skb->seq))
2348                                 break;
2349                         offset = *seq - skb->seq;
                             /* A SYN takes up a sequence number but no data byte. */
2350                         if (skb->h.th->syn)
2351                                 offset--;
2352                         if (offset < skb->len)
2353                                 goto found_ok_skb;
2354                         if (skb->h.th->fin)
2355                                 goto found_fin_ok;
                             /* Nothing left to read in this buffer: let cleanup_rbuf() free it. */
2356                         if (!(flags & MSG_PEEK))
2357                                 skb->used = 1;
2358                         skb = skb->next;
2359                 }
2360                 while (skb != (struct sk_buff *)&sk->receive_queue);
2361 
                     /* No readable buffer.  Return what we already have rather than wait. */
2362                 if (copied)
2363                         break;
2364 
                     /* From here on copied is 0, so it can carry an error code. */
2365                 if (sk->err) 
2366                 {
2367                         copied = sock_error(sk);
2368                         break;
2369                 }
2370 
2371                 if (sk->state == TCP_CLOSE) 
2372                 {
                             /* First read on a closed socket returns 0; later ones get -ENOTCONN. */
2373                         if (!sk->done) 
2374                         {
2375                                 sk->done = 1;
2376                                 break;
2377                         }
2378                         copied = -ENOTCONN;
2379                         break;
2380                 }
2381 
2382                 if (sk->shutdown & RCV_SHUTDOWN) 
2383                 {
2384                         sk->done = 1;
2385                         break;
2386                 }
2387                         
2388                 if (nonblock) 
2389                 {
2390                         copied = -EAGAIN;
2391                         break;
2392                 }
2393 
                     /* Free what has been consumed, drop the socket, and sleep until woken. */
2394                 cleanup_rbuf(sk);
2395                 release_sock(sk);
2396                 sk->socket->flags |= SO_WAITDATA;
2397                 schedule();
2398                 sk->socket->flags &= ~SO_WAITDATA;
2399                 sk->inuse = 1;
2400 
                     /* An unblocked signal is pending: abort the read. */
2401                 if (current->signal & ~current->blocked) 
2402                 {
2403                         copied = -ERESTARTSYS;
2404                         break;
2405                 }
2406                 continue;
2407 
2408         found_ok_skb:
2409                 /*
2410                  *      Lock the buffer. We can be fairly relaxed as
2411                  *      an interrupt will never steal a buffer we are 
2412                  *      using unless I've missed something serious in
2413                  *      tcp_data.
2414                  */
2415                 
2416                 skb->users++;
2417                 
2418                 /*
2419                  *      Ok so how much can we use ? 
2420                  */
2421                  
2422                 used = skb->len - offset;
2423                 if (len < used)
2424                         used = len;
2425                 /*
2426                  *      Do we have urgent data here? 
2427                  */
2428                 
2429                 if (sk->urg_data) 
2430                 {
2431                         u32 urg_offset = sk->urg_seq - *seq;
2432                         if (urg_offset < used) 
2433                         {
                                     /*
                                      * Urgent byte is next: step over it unless it is
                                      * delivered inline.  Otherwise copy only up to it.
                                      */
2434                                 if (!urg_offset) 
2435                                 {
2436                                         if (!sk->urginline) 
2437                                         {
2438                                                 ++*seq;
2439                                                 offset++;
2440                                                 used--;
2441                                         }
2442                                 }
2443                                 else
2444                                         used = urg_offset;
2445                         }
2446                 }
2447                 
2448                 /*
2449                  *      Copy it - We _MUST_ update *seq first so that we
2450                  *      don't ever double read when we have dual readers
2451                  */
2452                  
2453                 *seq += used;
2454 
2455                 /*
2456                  *      This memcpy_tofs can sleep. If it sleeps and we
2457                  *      do a second read it relies on the skb->users to avoid
2458                  *      a crash when cleanup_rbuf() gets called.
2459                  */
2460                  
                     /* Payload starts doff*4 bytes past the start of the TCP header. */
2461                 memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
2462                         skb->h.th->doff*4 + offset, used);
2463                 copied += used;
2464                 len -= used;
2465                 
2466                 /*
2467                  *      We now will not sleep again until we are finished
2468                  *      with skb. Sorry if you are doing the SMP port
2469                  *      but you'll just have to fix it neatly ;)
2470                  */
2471                  
2472                 skb->users --;
2473                 
                     /* Once the real read pointer is past the urgent mark, forget the urgent data. */
2474                 if (after(sk->copied_seq,sk->urg_seq))
2475                         sk->urg_data = 0;
                     /* Buffer not exhausted yet: go round again for the rest. */
2476                 if (used + offset < skb->len)
2477                         continue;
2478                 
2479                 /*
2480                  *      Process the FIN.
2481                  */
2482 
2483                 if (skb->h.th->fin)
2484                         goto found_fin_ok;
2485                 if (flags & MSG_PEEK)
2486                         continue;
2487                 skb->used = 1;
2488                 continue;
2489 
2490         found_fin_ok:
                     /* Step the sequence past the FIN; a peek stops here without consuming it. */
2491                 ++*seq;
2492                 if (flags & MSG_PEEK)
2493                         break;
2494                         
2495                 /*
2496                  *      All is done
2497                  */
2498                  
2499                 skb->used = 1;
2500                 sk->shutdown |= RCV_SHUTDOWN;
2501                 break;
2502 
2503         }
2504         
             /* Report the peer's address only when data was actually delivered. */
2505         if(copied>0 && msg->msg_name)
2506         {
2507                 struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
2508                 sin->sin_family=AF_INET;
2509                 sin->sin_addr.s_addr=sk->daddr;
2510                 sin->sin_port=sk->dummy_th.dest;
2511         }
2512         if(addr_len)
2513                 *addr_len=sizeof(struct sockaddr_in);
2514                 
2515         remove_wait_queue(sk->sleep, &wait);
2516         current->state = TASK_RUNNING;
2517 
2518         /* Clean up data we have read: This will do ACK frames */
2519         cleanup_rbuf(sk);
2520         release_sock(sk);
2521         return copied;
2522 }
2523 
2524 
2525 
2526 /*
2527  *      State processing on a close. This implements the state shift for
2528  *      sending our FIN frame. Note that we only send a FIN for some 
2529  *      states. A shutdown() may have already sent the FIN, or we may be
2530  *      closed.
2531  */
2532  
2533 static int tcp_close_state(struct sock *sk, int dead)
     /*  */
2534 {
2535         int ns=TCP_CLOSE;
2536         int send_fin=0;
2537         switch(sk->state)
2538         {
2539                 case TCP_SYN_SENT:      /* No SYN back, no FIN needed */
2540                         break;
2541                 case TCP_SYN_RECV:
2542                 case TCP_ESTABLISHED:   /* Closedown begin */
2543                         ns=TCP_FIN_WAIT1;
2544                         send_fin=1;
2545                         break;
2546                 case TCP_FIN_WAIT1:     /* Already closing, or FIN sent: no change */
2547                 case TCP_FIN_WAIT2:
2548                 case TCP_CLOSING:
2549                         ns=sk->state;
2550                         break;
2551                 case TCP_CLOSE:
2552                 case TCP_LISTEN:
2553                         break;
2554                 case TCP_CLOSE_WAIT:    /* They have FIN'd us. We send our FIN and
2555                                            wait only for the ACK */
2556                         ns=TCP_LAST_ACK;
2557                         send_fin=1;
2558         }
2559         
2560         tcp_set_state(sk,ns);
2561                 
2562         /*
2563          *      This is a (useful) BSD violating of the RFC. There is a
2564          *      problem with TCP as specified in that the other end could
2565          *      keep a socket open forever with no application left this end.
2566          *      We use a 3 minute timeout (about the same as BSD) then kill
2567          *      our end. If they send after that then tough - BUT: long enough
2568          *      that we won't make the old 4*rto = almost no time - whoops
2569          *      reset mistake.
2570          */
2571         if(dead && ns==TCP_FIN_WAIT2)
2572         {
2573                 int timer_active=del_timer(&sk->timer);
2574                 if(timer_active)
2575                         add_timer(&sk->timer);
2576                 else
2577                         reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2578         }
2579         
2580         return send_fin;
2581 }
2582 
2583 /*
2584  *      Send a fin.
2585  */
2586 
2587 static void tcp_send_fin(struct sock *sk)
     /*  */
2588 {
2589         struct proto *prot =(struct proto *)sk->prot;
2590         struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
2591         struct tcphdr *t1;
2592         struct sk_buff *buff;
2593         struct device *dev=NULL;
2594         int tmp;
2595                 
2596         release_sock(sk); /* in case the malloc sleeps. */
2597         
2598         buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2599         sk->inuse = 1;
2600 
2601         if (buff == NULL)
2602         {
2603                 /* This is a disaster if it occurs */
2604                 printk("tcp_send_fin: Impossible malloc failure");
2605                 return;
2606         }
2607 
2608         /*
2609          *      Administrivia
2610          */
2611          
2612         buff->sk = sk;
2613         buff->localroute = sk->localroute;
2614 
2615         /*
2616          *      Put in the IP header and routing stuff. 
2617          */
2618 
2619         tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2620                            IPPROTO_TCP, sk->opt,
2621                            sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2622         if (tmp < 0) 
2623         {
2624                 int t;
2625                 /*
2626                  *      Finish anyway, treat this as a send that got lost. 
2627                  *      (Not good).
2628                  */
2629                  
2630                 buff->free = 1;
2631                 sock_wfree(sk,buff);
2632                 sk->write_seq++;
2633                 t=del_timer(&sk->timer);
2634                 if(t)
2635                         add_timer(&sk->timer);
2636                 else
2637                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2638                 return;
2639         }
2640         
2641         /*
2642          *      We ought to check if the end of the queue is a buffer and
2643          *      if so simply add the fin to that buffer, not send it ahead.
2644          */
2645 
2646         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2647         buff->dev = dev;
2648         memcpy(t1, th, sizeof(*t1));
2649         buff->seq = sk->write_seq;
2650         sk->write_seq++;
2651         buff->end_seq = sk->write_seq;
2652         t1->seq = htonl(buff->seq);
2653         t1->ack = 1;
2654         t1->ack_seq = htonl(sk->acked_seq);
2655         t1->window = htons(sk->window=tcp_select_window(sk));
2656         t1->fin = 1;
2657         t1->rst = 0;
2658         t1->doff = sizeof(*t1)/4;
2659         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2660 
2661         /*
2662          * If there is data in the write queue, the fin must be appended to
2663          * the write queue.
2664          */
2665         
2666         if (skb_peek(&sk->write_queue) != NULL) 
2667         {
2668                 buff->free = 0;
2669                 if (buff->next != NULL) 
2670                 {
2671                         printk("tcp_send_fin: next != NULL\n");
2672                         skb_unlink(buff);
2673                 }
2674                 skb_queue_tail(&sk->write_queue, buff);
2675         } 
2676         else 
2677         {
2678                 sk->sent_seq = sk->write_seq;
2679                 sk->prot->queue_xmit(sk, dev, buff, 0);
2680                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2681         }
2682 }
2683 
2684 /*
2685  *      Shutdown the sending side of a connection. Much like close except
2686  *      that we don't receive shut down or set sk->dead=1.
2687  */
2688 
2689 void tcp_shutdown(struct sock *sk, int how)
     /*  */
2690 {
2691         /*
2692          *      We need to grab some memory, and put together a FIN,
2693          *      and then put it into the queue to be sent.
2694          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2695          */
2696 
2697         if (!(how & SEND_SHUTDOWN)) 
2698                 return;
2699          
2700         /*
2701          *      If we've already sent a FIN, or it's a closed state
2702          */
2703          
2704         if (sk->state == TCP_FIN_WAIT1 ||
2705             sk->state == TCP_FIN_WAIT2 ||
2706             sk->state == TCP_CLOSING ||
2707             sk->state == TCP_LAST_ACK ||
2708             sk->state == TCP_TIME_WAIT || 
2709             sk->state == TCP_CLOSE ||
2710             sk->state == TCP_LISTEN
2711           )
2712         {
2713                 return;
2714         }
2715         sk->inuse = 1;
2716 
2717         /*
2718          * flag that the sender has shutdown
2719          */
2720 
2721         sk->shutdown |= SEND_SHUTDOWN;
2722 
2723         /*
2724          *  Clear out any half completed packets. 
2725          */
2726 
2727         if (sk->partial)
2728                 tcp_send_partial(sk);
2729                 
2730         /*
2731          *      FIN if needed
2732          */
2733          
2734         if(tcp_close_state(sk,0))
2735                 tcp_send_fin(sk);
2736                 
2737         release_sock(sk);
2738 }
2739 
2740 /*
2741  *      This routine will send an RST to the other tcp. 
2742  */
2743  
2744 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
     /*  */
2745           struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2746 {
2747         struct sk_buff *buff;
2748         struct tcphdr *t1;
2749         int tmp;
2750         struct device *ndev=NULL;
2751 
2752         /*
2753          *      Cannot reset a reset (Think about it).
2754          */
2755          
2756         if(th->rst)
2757                 return;
2758   
2759         /*
2760          * We need to grab some memory, and put together an RST,
2761          * and then put it into the queue to be sent.
2762          */
2763 
2764         buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2765         if (buff == NULL) 
2766                 return;
2767 
2768         buff->sk = NULL;
2769         buff->dev = dev;
2770         buff->localroute = 0;
2771 
2772         /*
2773          *      Put in the IP header and routing stuff. 
2774          */
2775 
2776         tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2777                            sizeof(struct tcphdr),tos,ttl,NULL);
2778         if (tmp < 0) 
2779         {
2780                 buff->free = 1;
2781                 sock_wfree(NULL, buff);
2782                 return;
2783         }
2784 
2785         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2786         memcpy(t1, th, sizeof(*t1));
2787 
2788         /*
2789          *      Swap the send and the receive. 
2790          */
2791 
2792         t1->dest = th->source;
2793         t1->source = th->dest;
2794         t1->rst = 1;  
2795         t1->window = 0;
2796   
2797         if(th->ack)
2798         {
2799                 t1->ack = 0;
2800                 t1->seq = th->ack_seq;
2801                 t1->ack_seq = 0;
2802         }
2803         else
2804         {
2805                 t1->ack = 1;
2806                 if(!th->syn)
2807                         t1->ack_seq = th->seq;
2808                 else
2809                         t1->ack_seq = htonl(ntohl(th->seq)+1);
2810                 t1->seq = 0;
2811         }
2812 
2813         t1->syn = 0;
2814         t1->urg = 0;
2815         t1->fin = 0;
2816         t1->psh = 0;
2817         t1->doff = sizeof(*t1)/4;
2818         tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2819         prot->queue_xmit(NULL, ndev, buff, 1);
2820         tcp_statistics.TcpOutSegs++;
2821 }
2822 
2823 
2824 /*
2825  *      Look for tcp options. Parses everything but only knows about MSS.
2826  *      This routine is always called with the packet containing the SYN.
2827  *      However it may also be called with the ack to the SYN.  So you
2828  *      can't assume this is always the SYN.  It's always called after
2829  *      we have set up sk->mtu to our own MTU.
2830  *
2831  *      We need at minimum to add PAWS support here. Possibly large windows
2832  *      as Linux gets deployed on 100Mb/sec networks.
2833  */
2834  
2835 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /*  */
2836 {
2837         unsigned char *ptr;
2838         int length=(th->doff*4)-sizeof(struct tcphdr);
2839         int mss_seen = 0;
2840     
2841         ptr = (unsigned char *)(th + 1);
2842   
2843         while(length>0)
2844         {
2845                 int opcode=*ptr++;
2846                 int opsize=*ptr++;
2847                 switch(opcode)
2848                 {
2849                         case TCPOPT_EOL:
2850                                 return;
2851                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
2852                                 length--;
2853                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
2854                                 continue;
2855                         
2856                         default:
2857                                 if(opsize<=2)   /* Avoid silly options looping forever */
2858                                         return;
2859                                 switch(opcode)
2860                                 {
2861                                         case TCPOPT_MSS:
2862                                                 if(opsize==4 && th->syn)
2863                                                 {
2864                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2865                                                         mss_seen = 1;
2866                                                 }
2867                                                 break;
2868                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2869                                 }
2870                                 ptr+=opsize-2;
2871                                 length-=opsize;
2872                 }
2873         }
2874         if (th->syn) 
2875         {
2876                 if (! mss_seen)
2877                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2878         }
2879 #ifdef CONFIG_INET_PCTCP
2880         sk->mss = min(sk->max_window >> 1, sk->mtu);
2881 #else    
2882         sk->mss = min(sk->max_window, sk->mtu);
2883 #endif  
2884 }
2885 
/*
 *	Classful network mask for an address.
 *
 *	dst:	IPv4 address in network byte order
 *
 *	Returns the class A or class B mask when dst falls in that class,
 *	and the class C mask for every other address, in network byte order.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);
	unsigned long net;

	if (IN_CLASSA(host))
		net = IN_CLASSA_NET;
	else if (IN_CLASSB(host))
		net = IN_CLASSB_NET;
	else
		net = IN_CLASSC_NET;

	return htonl(net);
}
2895 
2896 /*
2897  *      Default sequence number picking algorithm.
2898  *      As close as possible to RFC 793, which
2899  *      suggests using a 250kHz clock.
2900  *      Further reading shows this assumes 2MB/s networks.
2901  *      For 10MB/s ethernet, a 1MHz clock is appropriate.
2902  *      That's funny, Linux has one built in!  Use it!
2903  */
2904 
2905 extern inline u32 tcp_init_seq(void)
     /*  */
2906 {
2907         struct timeval tv;
2908         do_gettimeofday(&tv);
2909         return tv.tv_usec+tv.tv_sec*1000000;
2910 }
2911 
2912 /*
2913  *      This routine handles a connection request.
2914  *      It should make sure we haven't already responded.
2915  *      Because of the way BSD works, we have to send a syn/ack now.
2916  *      This also means it will be harder to close a socket which is
2917  *      listening.
2918  */
2919  
2920 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
     /*  */
2921                  unsigned long daddr, unsigned long saddr,
2922                  struct options *opt, struct device *dev, u32 seq)
2923 {
2924         struct sk_buff *buff;
2925         struct tcphdr *t1;
2926         unsigned char *ptr;
2927         struct sock *newsk;
2928         struct tcphdr *th;
2929         struct device *ndev=NULL;
2930         int tmp;
2931         struct rtable *rt;
2932   
2933         th = skb->h.th;
2934 
2935         /* If the socket is dead, don't accept the connection. */
2936         if (!sk->dead) 
2937         {
2938                 sk->data_ready(sk,0);
2939         }
2940         else 
2941         {
2942                 if(sk->debug)
2943                         printk("Reset on %p: Connect on dead socket.\n",sk);
2944                 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2945                 tcp_statistics.TcpAttemptFails++;
2946                 kfree_skb(skb, FREE_READ);
2947                 return;
2948         }
2949 
2950         /*
2951          * Make sure we can accept more.  This will prevent a
2952          * flurry of syns from eating up all our memory.
2953          */
2954 
2955         if (sk->ack_backlog >= sk->max_ack_backlog) 
2956         {
2957                 tcp_statistics.TcpAttemptFails++;
2958                 kfree_skb(skb, FREE_READ);
2959                 return;
2960         }
2961 
2962         /*
2963          * We need to build a new sock struct.
2964          * It is sort of bad to have a socket without an inode attached
2965          * to it, but the wake_up's will just wake up the listening socket,
2966          * and if the listening socket is destroyed before this is taken
2967          * off of the queue, this will take care of it.
2968          */
2969 
2970         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2971         if (newsk == NULL) 
2972         {
2973                 /* just ignore the syn.  It will get retransmitted. */
2974                 tcp_statistics.TcpAttemptFails++;
2975                 kfree_skb(skb, FREE_READ);
2976                 return;
2977         }
2978 
2979         memcpy(newsk, sk, sizeof(*newsk));
2980         newsk->opt = NULL;
2981         newsk->ip_route_cache  = NULL;
2982         if (opt && opt->optlen) {
2983           sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
2984           if (!sk->opt) {
2985                 kfree_s(newsk, sizeof(struct sock));
2986                 tcp_statistics.TcpAttemptFails++;
2987                 kfree_skb(skb, FREE_READ);
2988                 return;
2989           }
2990           if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {
2991                 kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
2992                 kfree_s(newsk, sizeof(struct sock));
2993                 tcp_statistics.TcpAttemptFails++;
2994                 kfree_skb(skb, FREE_READ);
2995                 return;
2996           }
2997         }
2998         skb_queue_head_init(&newsk->write_queue);
2999         skb_queue_head_init(&newsk->receive_queue);
3000         newsk->send_head = NULL;
3001         newsk->send_tail = NULL;
3002         skb_queue_head_init(&newsk->back_log);
3003         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
3004         newsk->rto = TCP_TIMEOUT_INIT;
3005         newsk->mdev = 0;
3006         newsk->max_window = 0;
3007         newsk->cong_window = 1;
3008         newsk->cong_count = 0;
3009         newsk->ssthresh = 0;
3010         newsk->backoff = 0;
3011         newsk->blog = 0;
3012         newsk->intr = 0;
3013         newsk->proc = 0;
3014         newsk->done = 0;
3015         newsk->partial = NULL;
3016         newsk->pair = NULL;
3017         newsk->wmem_alloc = 0;
3018         newsk->rmem_alloc = 0;
3019         newsk->localroute = sk->localroute;
3020 
3021         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3022 
3023         newsk->err = 0;
3024         newsk->shutdown = 0;
3025         newsk->ack_backlog = 0;
3026         newsk->acked_seq = skb->seq+1;
3027         newsk->copied_seq = skb->seq+1;
3028         newsk->fin_seq = skb->seq;
3029         newsk->state = TCP_SYN_RECV;
3030         newsk->timeout = 0;
3031         newsk->ip_xmit_timeout = 0;
3032         newsk->write_seq = seq; 
3033         newsk->window_seq = newsk->write_seq;
3034         newsk->rcv_ack_seq = newsk->write_seq;
3035         newsk->urg_data = 0;
3036         newsk->retransmits = 0;
3037         newsk->linger=0;
3038         newsk->destroy = 0;
3039         init_timer(&newsk->timer);
3040         newsk->timer.data = (unsigned long)newsk;
3041         newsk->timer.function = &net_timer;
3042         init_timer(&newsk->retransmit_timer);
3043         newsk->retransmit_timer.data = (unsigned long)newsk;
3044         newsk->retransmit_timer.function=&retransmit_timer;
3045         newsk->dummy_th.source = skb->h.th->dest;
3046         newsk->dummy_th.dest = skb->h.th->source;
3047         
3048         /*
3049          *      Swap these two, they are from our point of view. 
3050          */
3051          
3052         newsk->daddr = saddr;
3053         newsk->saddr = daddr;
3054         newsk->rcv_saddr = daddr;
3055 
3056         put_sock(newsk->num,newsk);
3057         newsk->dummy_th.res1 = 0;
3058         newsk->dummy_th.doff = 6;
3059         newsk->dummy_th.fin = 0;
3060         newsk->dummy_th.syn = 0;
3061         newsk->dummy_th.rst = 0;        
3062         newsk->dummy_th.psh = 0;
3063         newsk->dummy_th.ack = 0;
3064         newsk->dummy_th.urg = 0;
3065         newsk->dummy_th.res2 = 0;
3066         newsk->acked_seq = skb->seq + 1;
3067         newsk->copied_seq = skb->seq + 1;
3068         newsk->socket = NULL;
3069 
3070         /*
3071          *      Grab the ttl and tos values and use them 
3072          */
3073 
3074         newsk->ip_ttl=sk->ip_ttl;
3075         newsk->ip_tos=skb->ip_hdr->tos;
3076 
3077         /*
3078          *      Use 512 or whatever user asked for 
3079          */
3080 
3081         /*
3082          *      Note use of sk->user_mss, since user has no direct access to newsk 
3083          */
3084 
3085         rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
3086         newsk->ip_route_cache = rt;
3087         
3088         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3089                 newsk->window_clamp = rt->rt_window;
3090         else
3091                 newsk->window_clamp = 0;
3092                 
3093         if (sk->user_mss)
3094                 newsk->mtu = sk->user_mss;
3095         else if (rt)
3096                 newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
3097         else 
3098                 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
3099 
3100         /*
3101          *      But not bigger than device MTU 
3102          */
3103 
3104         newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
3105 
3106 #ifdef CONFIG_SKIP
3107         
3108         /*
3109          *      SKIP devices set their MTU to 65535. This is so they can take packets
3110          *      unfragmented to security process then fragment. They could lie to the
3111          *      TCP layer about a suitable MTU, but its easier to let skip sort it out
3112          *      simply because the final package we want unfragmented is going to be
3113          *
3114          *      [IPHDR][IPSP][Security data][Modified TCP data][Security data]
3115          */
3116          
3117         if(skip_pick_mtu!=NULL)         /* If SKIP is loaded.. */
3118                 sk->mtu=skip_pick_mtu(sk->mtu,dev);
3119 #endif
3120         /*
3121          *      This will min with what arrived in the packet 
3122          */
3123 
3124         tcp_options(newsk,skb->h.th);
3125         
3126         tcp_cache_zap();
3127 
3128         buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3129         if (buff == NULL) 
3130         {
3131                 sk->err = ENOMEM;
3132                 newsk->dead = 1;
3133                 newsk->state = TCP_CLOSE;
3134                 /* And this will destroy it */
3135                 release_sock(newsk);
3136                 kfree_skb(skb, FREE_READ);
3137                 tcp_statistics.TcpAttemptFails++;
3138                 return;
3139         }
3140   
3141         buff->sk = newsk;
3142         buff->localroute = newsk->localroute;
3143 
3144         /*
3145          *      Put in the IP header and routing stuff. 
3146          */
3147 
3148         tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3149                                IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
3150 
3151         /*
3152          *      Something went wrong. 
3153          */
3154 
3155         if (tmp < 0) 
3156         {
3157                 sk->err = tmp;
3158                 buff->free = 1;
3159                 kfree_skb(buff,FREE_WRITE);
3160                 newsk->dead = 1;
3161                 newsk->state = TCP_CLOSE;
3162                 release_sock(newsk);
3163                 skb->sk = sk;
3164                 kfree_skb(skb, FREE_READ);
3165                 tcp_statistics.TcpAttemptFails++;
3166                 return;
3167         }
3168 
3169         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
3170   
3171         memcpy(t1, skb->h.th, sizeof(*t1));
3172         buff->seq = newsk->write_seq++;
3173         buff->end_seq = newsk->write_seq;
3174         /*
3175          *      Swap the send and the receive. 
3176          */
3177         t1->dest = skb->h.th->source;
3178         t1->source = newsk->dummy_th.source;
3179         t1->seq = ntohl(buff->seq);
3180         t1->ack = 1;
3181         newsk->window = tcp_select_window(newsk);
3182         newsk->sent_seq = newsk->write_seq;
3183         t1->window = ntohs(newsk->window);
3184         t1->res1 = 0;
3185         t1->res2 = 0;
3186         t1->rst = 0;
3187         t1->urg = 0;
3188         t1->psh = 0;
3189         t1->syn = 1;
3190         t1->ack_seq = htonl(newsk->acked_seq);
3191         t1->doff = sizeof(*t1)/4+1;
3192         ptr = skb_put(buff,4);
3193         ptr[0] = 2;
3194         ptr[1] = 4;
3195         ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3196         ptr[3] =(newsk->mtu) & 0xff;
3197 
3198         tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3199         newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3200         reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3201         skb->sk = newsk;
3202 
3203         /*
3204          *      Charge the sock_buff to newsk. 
3205          */
3206          
3207         sk->rmem_alloc -= skb->truesize;
3208         newsk->rmem_alloc += skb->truesize;
3209         
3210         skb_queue_tail(&sk->receive_queue,skb);
3211         sk->ack_backlog++;
3212         release_sock(newsk);
3213         tcp_statistics.TcpOutSegs++;
3214 }
3215 
3216 
3217 static void tcp_close(struct sock *sk, int timeout)
     /*
      *      Close the TCP socket sk.
      *
      *      sk:       socket to close. sk->inuse is set on entry and
      *                release_sock(sk) is called on both exit paths.
      *      timeout:  0 selects the flush-and-FIN path: the receive queue is
      *                emptied and freed, tcp_send_partial() is called if
      *                sk->partial is set, and tcp_send_fin() is called when
      *                tcp_close_state(sk,1) returns 1. Any non-zero value
      *                skips all of that and sets the state to TCP_CLOSE.
      *
      *      A socket in TCP_LISTEN is handled separately: its state is set
      *      to TCP_CLOSE, tcp_close_pending() is called and the function
      *      returns. No value is returned.
      */
3218 {
3219         /*
3220          * We need to grab some memory, and put together a FIN, 
3221          * and then put it into the queue to be sent.
3222          */
3223         
             /* Mark the socket busy; presumably undone by release_sock() below - confirm. */
3224         sk->inuse = 1;
3225         
             /*
              *      If the cached socket pointer th_cache_sk refers to this
              *      socket, call tcp_cache_zap() (presumably to invalidate
              *      that cache - confirm against its definition).
              */
3226         if(th_cache_sk==sk)
3227                 tcp_cache_zap();
             /*
              *      Listening socket: no FIN is sent on this path. The state
              *      goes to TCP_CLOSE and tcp_close_pending() is called
              *      (presumably for connections not yet accepted - confirm).
              */
3228         if(sk->state == TCP_LISTEN)
3229         {
3230                 /* Special case */
3231                 tcp_set_state(sk, TCP_CLOSE);
3232                 tcp_close_pending(sk);
3233                 release_sock(sk);
3234                 return;
3235         }
3236         
             /*
              *      Set keepopen and record the shutdown.
              *      NOTE(review): presumably SHUTDOWN_MASK covers both the
              *      receive and the send direction - confirm.
              */
3237         sk->keepopen = 1;
3238         sk->shutdown = SHUTDOWN_MASK;
3239 
             /* Run the state_change callback unless the socket is already dead. */
3240         if (!sk->dead) 
3241                 sk->state_change(sk);
3242 
3243         if (timeout == 0) 
3244         {
3245                 struct sk_buff *skb;
3246                 
3247                 /*
3248                  *  We need to flush the recv. buffs.  We do this only on the
3249                  *  descriptor close, not protocol-sourced closes, because the
3250                  *  reader process may not have drained the data yet!
3251                  */
3252                  
3253                 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3254                         kfree_skb(skb, FREE_READ);
3255                 /*
3256                  *      Get rid of any half-completed packets. 
3257                  */
3258 
3259                 if (sk->partial) 
3260                         tcp_send_partial(sk);
3261         }
3262 
3263                 
3264         /*
3265          *      Timeout is not the same thing - however the code likes
3266          *      to send both the same way (sigh).
3267          */
3268          
             /*
              *      Non-zero timeout: the state is set to TCP_CLOSE and no FIN
              *      is sent from here. Otherwise tcp_send_fin() is called only
              *      when tcp_close_state(sk,1) returns 1.
              */
3269         if(timeout)
3270         {
3271                 tcp_set_state(sk, TCP_CLOSE);   /* Dead */
3272         }
3273         else
3274         {
3275                 if(tcp_close_state(sk,1)==1)
3276                 {
3277                         tcp_send_fin(sk);
3278                 }
3279         }
3280         release_sock(sk);
3281 }
3282 
3283 
3284 /*
3285  *      This routine takes stuff off of the write queue,
3286  *      and puts it in the xmit queue. This happens as incoming acks
3287  *      open up the remote window for us.
3288  */
3289  
3290 static void tcp_write_xmit(struct sock *sk)
     /*
      *      Process the head of sk->write_queue for as long as the loop
      *      condition below holds.
      *
      *      sk:  socket whose write queue is processed. Nothing is done
      *           when sk->zapped is set.
      *
      *      Each iteration unlinks the head buffer. If its end_seq lies
      *      before sk->rcv_ack_seq + 1 the buffer is freed instead of sent.
      *      Otherwise the ack_seq and window fields of its TCP header are
      *      rewritten, tcp_send_check() is called on it, it is handed to
      *      sk->prot->queue_xmit() and reset_xmit_timer() is called with
      *      TIME_WRITE and sk->rto. No value is returned.
      */
3291 {
3292         struct sk_buff *skb;
3293 
3294         /*
3295          *      The bytes will have to remain here. In time closedown will
3296          *      empty the write queue and all will be happy 
3297          */
3298 
3299         if(sk->zapped)
3300                 return;
3301 
3302         /*
3303          *      Anything on the transmit queue that fits the window can
3304          *      be added providing we are not
3305          *
3306          *      a) retransmitting (Nagle's rule)
3307          *      b) exceeding our congestion window.
3308          */
3309          
             /*
              *      The condition, term by term:
              *        - the write queue is not empty;
              *        - the head buffer ends before sk->window_seq + 1;
              *        - sk->retransmits is zero, or the current timer is not
              *          TIME_WRITE, or the head buffer ends before
              *          sk->rcv_ack_seq + 1;
              *        - sk->packets_out is below sk->cong_window.
              *      before() is presumably a wrap-safe sequence number
              *      comparison - confirm against its definition.
              */
3310         while((skb = skb_peek(&sk->write_queue)) != NULL &&
3311                 before(skb->end_seq, sk->window_seq + 1) &&
3312                 (sk->retransmits == 0 ||
3313                  sk->ip_xmit_timeout != TIME_WRITE ||
3314                  before(skb->end_seq, sk->rcv_ack_seq + 1))
3315                 && sk->packets_out < sk->cong_window) 
3316         {
                     /* IS_SKB() is presumably a debugging sanity check on the buffer - confirm. */
3317                 IS_SKB(skb);
                     /* skb_peek() left the buffer on the queue; take it off now. */
3318                 skb_unlink(skb);
3319                 
3320                 /*
3321                  *      See if we really need to send the packet. 
3322                  */
3323                  
3324                 if (before(skb->end_seq, sk->rcv_ack_seq +1)) 
3325                 {
3326                         /*
3327                          *      This is acked data. We can discard it. This 
3328                          *      cannot currently occur.
3329                          */
3330                          
                             /*
                              *      Clear the retransmit count, free the buffer and
                              *      run the write_space callback unless the socket
                              *      is dead.
                              */
3331                         sk->retransmits = 0;
3332                         kfree_skb(skb, FREE_WRITE);
3333                         if (!sk->dead) 
3334                                 sk->write_space(sk);
3335                 } 
3336                 else
3337                 {
3338                         struct tcphdr *th;
3339                         struct iphdr *iph;
3340                         int size;
3341 /*
3342  * put in the ack seq and window at this point rather than earlier,
3343  * in order to keep them monotonic.  We really want to avoid taking
3344  * back window allocations.  That's legal, but RFC1122 says it's frowned on.
3345  * Ack and window will in general have changed since this packet was put
3346  * on the write queue.
3347  */
                             /*
                              *      The TCP header is located (iph->ihl << 2) bytes
                              *      past the start of the IP header. size is what
                              *      remains of skb->len from the TCP header on
                              *      (assumes skb->len is measured from skb->data -
                              *      TODO confirm).
                              */
3348                         iph = skb->ip_hdr;
3349                         th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
3350                         size = skb->len - (((unsigned char *) th) - skb->data);
3351 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
                             /*
                              *      When size exceeds sk->mtu minus one IP header,
                              *      clear IP_DF in frag_off and call ip_send_check()
                              *      (presumably to redo the IP header checksum after
                              *      the change - confirm).
                              *      NOTE(review): sizeof yields an unsigned type, so
                              *      this comparison is presumably carried out
                              *      unsigned; an sk->mtu below sizeof(struct iphdr)
                              *      would wrap - confirm that cannot happen.
                              */
3352                         if (size > sk->mtu - sizeof(struct iphdr))
3353                         {
3354                                 iph->frag_off &= ~htons(IP_DF);
3355                                 ip_send_check(iph);
3356                         }
3357 #endif
3358                         
                             /* Refresh ack_seq and window (stored via htonl/htons). */
3359                         th->ack_seq = htonl(sk->acked_seq);
3360                         th->window = htons(tcp_select_window(sk));
3361 
                             /* Called after the header fields above have been rewritten. */
3362                         tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
3363 
                             /* Record the end of this buffer as the latest sequence sent. */
3364                         sk->sent_seq = skb->end_seq;
3365                         
3366                         /*
3367                          *      IP manages our queue for some crazy reason
3368                          */
3369                          
3370                         sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3371                         
3372                         /*
3373                          *      Again we slide the timer wrongly
3374                          */
3375                          
3376                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3377                 }
3378         }
3379 }
3380 
3381 
3382 /*
3383  *      This routine deals with incoming acks, but not outgoing ones.
3384  */
3385 
3386 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
     /*  */
3387 {
3388         u32 ack;
3389         int flag = 0;
3390 
3391         /* 
3392          * 1 - there was data in packet as well as ack or new data is sent or 
3393          *     in shutdown state
3394          * 2 - data from retransmit queue was acked and removed
3395          * 4 - window shrunk or data from retransmit queue was acked and removed
3396          */
3397 
3398         if(sk->zapped)
3399                 return(1);      /* Dead, cant ack any more so why bother */
3400 
3401         /*
3402          *      Have we discovered a larger window
3403          */
3404          
3405         ack = ntohl(th->ack_seq);
3406 
3407         if (ntohs(th->window) > sk->max_window) 
3408         {
3409                 sk->max_window = ntohs(th->window);
3410 #ifdef CONFIG_INET_PCTCP
3411                 /* Hack because we don't send partial packets to non SWS
3412                    handling hosts */
3413                 sk->mss = min(sk->max_window>>1, sk->mtu);
3414 #else
3415                 sk->mss = min(sk->max_window, sk->mtu);
3416 #endif  
3417         }
3418 
3419         /*
3420          *      We have dropped back to keepalive timeouts. Thus we have
3421          *      no retransmits pending.
3422          */
3423          
3424         if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3425                 sk->retransmits = 0;
3426 
3427         /*
3428          *      If the ack is newer than sent or older than previous acks
3429          *      then we can probably ignore it.
3430          */
3431          
3432         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
3433         {
3434                 if(sk->debug)
3435                         printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3436                         
3437                 /*
3438                  *      Keepalive processing.
3439                  */
3440                  
3441                 if (after(ack, sk->sent_seq)) 
3442                 {
3443                         return(0);
3444                 }
3445                 
3446                 /*
3447                  *      Restart the keepalive timer.
3448                  */
3449                  
3450                 if (sk->keepopen) 
3451                 {
3452                         if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3453                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3454                 }
3455                 return(1);
3456         }
3457 
3458         /*
3459          *      If there is data set flag 1
3460          */
3461          
3462         if (len != th->doff*4) 
3463                 flag |= 1;
3464 
3465         /*
3466          *      See if our window has been shrunk. 
3467          */
3468 
3469         if (after(sk->window_seq, ack+ntohs(th->window))) 
3470         {
3471                 /*
3472                  * We may need to move packets from the send queue
3473                  * to the write queue, if the window has been shrunk on us.
3474                  * The RFC says you are not allowed to shrink your window
3475                  * like this, but if the other end does, you must be able
3476                  * to deal with it.
3477                  */
3478                 struct sk_buff *skb;
3479                 struct sk_buff *skb2;
3480                 struct sk_buff *wskb = NULL;
3481         
3482                 skb2 = sk->send_head;
3483                 sk->send_head = NULL;
3484                 sk->send_tail = NULL;
3485         
3486                 /*
3487                  *      This is an artifact of a flawed concept. We want one
3488                  *      queue and a smarter send routine when we send all.
3489                  */
3490         
3491                 flag |= 4;      /* Window changed */
3492         
3493                 sk->window_seq = ack + ntohs(th->window);
3494                 cli();
3495                 while (skb2 != NULL) 
3496                 {
3497                         skb = skb2;
3498                         skb2 = skb->link3;
3499                         skb->link3 = NULL;
3500                         if (after(skb->end_seq, sk->window_seq)) 
3501                         {
3502                                 if (sk->packets_out > 0) 
3503                                         sk->packets_out--;
3504                                 /* We may need to remove this from the dev send list. */
3505                                 if (skb->next != NULL) 
3506                                 {
3507                                         skb_unlink(skb);                                
3508                                 }
3509                                 /* Now add it to the write_queue. */
3510                                 if (wskb == NULL)
3511                                         skb_queue_head(&sk->write_queue,skb);
3512                                 else
3513                                         skb_append(wskb,skb);
3514                                 wskb = skb;
3515                         } 
3516                         else 
3517                         {
3518                                 if (sk->send_head == NULL) 
3519                                 {
3520                                         sk->send_head = skb;
3521                                         sk->send_tail = skb;
3522                                 }
3523                                 else
3524                                 {
3525                                         sk->send_tail->link3 = skb;
3526                                         sk->send_tail = skb;
3527                                 }
3528                                 skb->link3 = NULL;
3529                         }
3530                 }
3531                 sti();
3532         }
3533 
3534         /*
3535          *      Pipe has emptied
3536          */
3537          
3538         if (sk->send_tail == NULL || sk->send_head == NULL) 
3539         {
3540                 sk->send_head = NULL;
3541                 sk->send_tail = NULL;
3542                 sk->packets_out= 0;
3543         }
3544 
3545         /*
3546          *      Update the right hand window edge of the host
3547          */
3548          
3549         sk->window_seq = ack + ntohs(th->window);
3550 
3551         /*
3552          *      We don't want too many packets out there. 
3553          */
3554          
3555         if (sk->ip_xmit_timeout == TIME_WRITE && 
3556                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
3557         {
3558                 /* 
3559                  * This is Jacobson's slow start and congestion avoidance. 
3560                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
3561                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
3562                  * counter and increment it once every cwnd times.  It's possible
3563                  * that this should be done only if sk->retransmits == 0.  I'm
3564                  * interpreting "new data is acked" as including data that has
3565                  * been retransmitted but is just now being acked.
3566                  */
3567                 if (sk->cong_window < sk->ssthresh)  
3568                         /* 
3569                          *      In "safe" area, increase
3570                          */
3571                         sk->cong_window++;
3572                 else 
3573                 {
3574                         /*
3575                          *      In dangerous area, increase slowly.  In theory this is
3576                          *      sk->cong_window += 1 / sk->cong_window
3577                          */
3578                         if (sk->cong_count >= sk->cong_window) 
3579                         {
3580                                 sk->cong_window++;
3581                                 sk->cong_count = 0;
3582                         }
3583                         else 
3584                                 sk->cong_count++;
3585                 }
3586         }
3587 
3588         /*
3589          *      Remember the highest ack received.
3590          */
3591          
3592         sk->rcv_ack_seq = ack;
3593 
3594         /*
3595          *      If this ack opens up a zero window, clear backoff.  It was
3596          *      being used to time the probes, and is probably far higher than
3597          *      it needs to be for normal retransmission.
3598          */
3599 
3600         if (sk->ip_xmit_timeout == TIME_PROBE0) 
3601         {
3602                 sk->retransmits = 0;    /* Our probe was answered */
3603                 
3604                 /*
3605                  *      Was it a usable window open ?
3606                  */
3607                  
3608                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
3609                     ! before (sk->window_seq, sk->write_queue.next->end_seq)) 
3610                 {
3611                         sk->backoff = 0;
3612                         
3613                         /*
3614                          *      Recompute rto from rtt.  this eliminates any backoff.
3615                          */
3616 
3617                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3618                         if (sk->rto > 120*HZ)
3619                                 sk->rto = 120*HZ;
3620                         if (sk->rto < 20)       /* Was 1*HZ, then 1 - turns out we must allow about
3621                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3622                                                    .2 of a second is going to need huge windows (SIGH) */
3623                         sk->rto = 20;
3624                 }
3625         }
3626 
3627         /* 
3628          *      See if we can take anything off of the retransmit queue.
3629          */
3630    
3631         while(sk->send_head != NULL) 
3632         {
3633                 /* Check for a bug. */
3634                 if (sk->send_head->link3 &&
3635                     after(sk->send_head->end_seq, sk->send_head->link3->end_seq)) 
3636                         printk("INET: tcp.c: *** bug send_list out of order.\n");
3637                         
3638                 /*
3639                  *      If our packet is before the ack sequence we can
3640                  *      discard it as it's confirmed to have arrived the other end.
3641                  */
3642                  
3643                 if (before(sk->send_head->end_seq, ack+1)) 
3644                 {
3645                         struct sk_buff *oskb;   
3646                         if (sk->retransmits) 
3647                         {       
3648                                 /*
3649                                  *      We were retransmitting.  don't count this in RTT est 
3650                                  */
3651                                 flag |= 2;
3652 
3653                                 /*
3654                                  * even though we've gotten an ack, we're still
3655                                  * retransmitting as long as we're sending from
3656                                  * the retransmit queue.  Keeping retransmits non-zero
3657                                  * prevents us from getting new data interspersed with
3658                                  * retransmissions.
3659                                  */
3660 
3661                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
3662                                         sk->retransmits = 1;
3663                                 else
3664                                         sk->retransmits = 0;
3665                         }
3666                         /*
3667                          * Note that we only reset backoff and rto in the
3668                          * rtt recomputation code.  And that doesn't happen
3669                          * if there were retransmissions in effect.  So the
3670                          * first new packet after the retransmissions is
3671                          * sent with the backoff still in effect.  Not until
3672                          * we get an ack from a non-retransmitted packet do
3673                          * we reset the backoff and rto.  This allows us to deal
3674                          * with a situation where the network delay has increased
3675                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3676                          */
3677 
3678                         /*
3679                          *      We have one less packet out there. 
3680                          */
3681                          
3682                         if (sk->packets_out > 0) 
3683                                 sk->packets_out --;
3684                         /* 
3685                          *      Wake up the process, it can probably write more. 
3686                          */
3687                         if (!sk->dead) 
3688                                 sk->write_space(sk);
3689                         oskb = sk->send_head;
3690 
3691                         if (!(flag&2))  /* Not retransmitting */
3692                         {
3693                                 long m;
3694         
3695                                 /*
3696                                  *      The following amusing code comes from Jacobson's
3697                                  *      article in SIGCOMM '88.  Note that rtt and mdev
3698                                  *      are scaled versions of rtt and mean deviation.
3699                                  *      This is designed to be as fast as possible 
3700                                  *      m stands for "measurement".
3701                                  */
3702         
3703                                 m = jiffies - oskb->when;  /* RTT */
3704                                 if(m<=0)
3705                                         m=1;            /* IS THIS RIGHT FOR <0 ??? */
3706                                 m -= (sk->rtt >> 3);    /* m is now error in rtt est */
3707                                 sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
3708                                 if (m < 0)
3709                                         m = -m;         /* m is now abs(error) */
3710                                 m -= (sk->mdev >> 2);   /* similar update on mdev */
3711                                 sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
3712         
3713                                 /*
3714                                  *      Now update timeout.  Note that this removes any backoff.
3715                                  */
3716                          
3717                                 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3718                                 if (sk->rto > 120*HZ)
3719                                         sk->rto = 120*HZ;
3720                                 if (sk->rto < 20)       /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3721                                         sk->rto = 20;
3722                                 sk->backoff = 0;
3723                         }
3724                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
3725                                            In this case as we just set it up */
3726                         cli();
3727                         oskb = sk->send_head;
3728                         IS_SKB(oskb);
3729                         sk->send_head = oskb->link3;
3730                         if (sk->send_head == NULL) 
3731                         {
3732                                 sk->send_tail = NULL;
3733                         }
3734 
3735                 /*
3736                  *      We may need to remove this from the dev send list. 
3737                  */
3738 
3739                         if (oskb->next)
3740                                 skb_unlink(oskb);
3741                         sti();
3742                         kfree_skb(oskb, FREE_WRITE); /* write. */
3743                         if (!sk->dead) 
3744                                 sk->write_space(sk);
3745                 }
3746                 else
3747                 {
3748                         break;
3749                 }
3750         }
3751 
3752         /*
3753          * XXX someone ought to look at this too.. at the moment, if skb_peek()
3754          * returns non-NULL, we complete ignore the timer stuff in the else
3755          * clause.  We ought to organize the code so that else clause can
3756          * (should) be executed regardless, possibly moving the PROBE timer
3757          * reset over.  The skb_peek() thing should only move stuff to the
3758          * write queue, NOT also manage the timer functions.
3759          */
3760 
3761         /*
3762          * Maybe we can take some stuff off of the write queue,
3763          * and put it onto the xmit queue.
3764          */
3765         if (skb_peek(&sk->write_queue) != NULL) 
3766         {
3767                 if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
3768                         (sk->retransmits == 0 || 
3769                          sk->ip_xmit_timeout != TIME_WRITE ||
3770                          before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
3771                         && sk->packets_out < sk->cong_window) 
3772                 {
3773                         /*
3774                          *      Add more data to the send queue.
3775                          */
3776                         flag |= 1;
3777                         tcp_write_xmit(sk);
3778                 }
3779                 else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
3780                         sk->send_head == NULL &&
3781                         sk->ack_backlog == 0 &&
3782                         sk->state != TCP_TIME_WAIT) 
3783                 {
3784                         /*
3785                          *      Data to queue but no room.
3786                          */
3787                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3788                 }               
3789         }
3790         else
3791         {
3792                 /*
3793                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3794                  * from TCP_CLOSE we don't do anything
3795                  *
3796                  * from anything else, if there is write data (or fin) pending,
3797                  * we use a TIME_WRITE timeout, else if keepalive we reset to
3798                  * a KEEPALIVE timeout, else we delete the timer.
3799                  *
3800                  * We do not set flag for nominal write data, otherwise we may
3801                  * force a state where we start to write itsy bitsy tidbits
3802                  * of data.
3803                  */
3804 
3805                 switch(sk->state) {
3806                 case TCP_TIME_WAIT:
3807                         /*
3808                          * keep us in TIME_WAIT until we stop getting packets,
3809                          * reset the timeout.
3810                          */
3811                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3812                         break;
3813                 case TCP_CLOSE:
3814                         /*
3815                          * don't touch the timer.
3816                          */
3817                         break;
3818                 default:
3819                         /*
3820                          *      Must check send_head, write_queue, and ack_backlog
3821                          *      to determine which timeout to use.
3822                          */
3823                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3824                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3825                         } else if (sk->keepopen) {
3826                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3827                         } else {
3828                                 del_timer(&sk->retransmit_timer);
3829                                 sk->ip_xmit_timeout = 0;
3830                         }
3831                         break;
3832                 }
3833         }
3834 
3835         /*
3836          *      We have nothing queued but space to send. Send any partial
3837          *      packets immediately (end of Nagle rule application).
3838          */
3839          
3840         if (sk->packets_out == 0 && sk->partial != NULL &&
3841                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
3842         {
3843                 flag |= 1;
3844                 tcp_send_partial(sk);
3845         }
3846 
3847         /*
3848          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
3849          * we are now waiting for an acknowledge to our FIN.  The other end is
3850          * already in TIME_WAIT.
3851          *
3852          * Move to TCP_CLOSE on success.
3853          */
3854 
3855         if (sk->state == TCP_LAST_ACK) 
3856         {
3857                 if (!sk->dead)
3858                         sk->state_change(sk);
3859                 if(sk->debug)
3860                         printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3861                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3862                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
3863                 {
3864                         flag |= 1;
3865                         tcp_set_state(sk,TCP_CLOSE);
3866                         sk->shutdown = SHUTDOWN_MASK;
3867                 }
3868         }
3869 
3870         /*
3871          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
3872          *
3873          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
3874          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3875          */
3876 
3877         if (sk->state == TCP_FIN_WAIT1) 
3878         {
3879 
3880                 if (!sk->dead) 
3881                         sk->state_change(sk);
3882                 if (sk->rcv_ack_seq == sk->write_seq) 
3883                 {
3884                         flag |= 1;
3885                         sk->shutdown |= SEND_SHUTDOWN;
3886                         tcp_set_state(sk, TCP_FIN_WAIT2);
3887                 }
3888         }
3889 
3890         /*
3891          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
3892          *
3893          *      Move to TIME_WAIT
3894          */
3895 
3896         if (sk->state == TCP_CLOSING) 
3897         {
3898 
3899                 if (!sk->dead) 
3900                         sk->state_change(sk);
3901                 if (sk->rcv_ack_seq == sk->write_seq) 
3902                 {
3903                         flag |= 1;
3904                         tcp_time_wait(sk);
3905                 }
3906         }
3907         
3908         /*
3909          *      Final ack of a three way shake 
3910          */
3911          
3912         if(sk->state==TCP_SYN_RECV)
3913         {
3914                 tcp_set_state(sk, TCP_ESTABLISHED);
3915                 tcp_options(sk,th);
3916                 sk->dummy_th.dest=th->source;
3917                 sk->copied_seq = sk->acked_seq;
3918                 if(!sk->dead)
3919                         sk->state_change(sk);
3920                 if(sk->max_window==0)
3921                 {
3922                         sk->max_window=32;      /* Sanity check */
3923                         sk->mss=min(sk->max_window,sk->mtu);
3924                 }
3925         }
3926         
3927         /*
3928          * I make no guarantees about the first clause in the following
3929          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
3930          * what conditions "!flag" would be true.  However I think the rest
3931          * of the conditions would prevent that from causing any
3932          * unnecessary retransmission. 
3933          *   Clearly if the first packet has expired it should be 
3934          * retransmitted.  The other alternative, "flag&2 && retransmits", is
3935          * harder to explain:  You have to look carefully at how and when the
3936          * timer is set and with what timeout.  The most recent transmission always
3937          * sets the timer.  So in general if the most recent thing has timed
3938          * out, everything before it has as well.  So we want to go ahead and
3939          * retransmit some more.  If we didn't explicitly test for this
3940          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3941          * would not be true.  If you look at the pattern of timing, you can
3942          * show that rto is increased fast enough that the next packet would
3943          * almost never be retransmitted immediately.  Then you'd end up
3944          * waiting for a timeout to send each packet on the retransmission
3945          * queue.  With my implementation of the Karn sampling algorithm,
3946          * the timeout would double each time.  The net result is that it would
3947          * take a hideous amount of time to recover from a single dropped packet.
3948          * It's possible that there should also be a test for TIME_WRITE, but
3949          * I think as long as "send_head != NULL" and "retransmit" is on, we've
3950          * got to be in real retransmission mode.
3951          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
3952          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3953          * As long as no further losses occur, this seems reasonable.
3954          */
3955         
3956         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3957                (((flag&2) && sk->retransmits) ||
3958                (sk->send_head->when + sk->rto < jiffies))) 
3959         {
3960                 if(sk->send_head->when + sk->rto < jiffies)
3961                         tcp_retransmit(sk,0);   
3962                 else
3963                 {
3964                         tcp_do_retransmit(sk, 1);
3965                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3966                 }
3967         }
3968 
3969         return(1);
3970 }
3971 
3972 
3973 /*
3974  *      Process the FIN bit. This now behaves as it is supposed to work
3975  *      and the FIN takes effect when it is validly part of sequence
3976  *      space. Not before when we get holes.
3977  *
3978  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3979  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
3980  *      TIME-WAIT)
3981  *
3982  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
3983  *      close and we go into CLOSING (and later onto TIME-WAIT)
3984  *
3985  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3986  *
3987  */
3988  
3989 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /*  */
3990 {
3991         sk->fin_seq = skb->end_seq;
3992 
3993         if (!sk->dead) 
3994         {
3995                 sk->state_change(sk);
3996                 sock_wake_async(sk->socket, 1);
3997         }
3998 
3999         switch(sk->state) 
4000         {
4001                 case TCP_SYN_RECV:
4002                 case TCP_SYN_SENT:
4003                 case TCP_ESTABLISHED:
4004                         /*
4005                          * move to CLOSE_WAIT, tcp_data() already handled
4006                          * sending the ack.
4007                          */
4008                         tcp_set_state(sk,TCP_CLOSE_WAIT);
4009                         if (th->rst)
4010                                 sk->shutdown = SHUTDOWN_MASK;
4011                         break;
4012 
4013                 case TCP_CLOSE_WAIT:
4014                 case TCP_CLOSING:
4015                         /*
4016                          * received a retransmission of the FIN, do
4017                          * nothing.
4018                          */
4019                         break;
4020                 case TCP_TIME_WAIT:
4021                         /*
4022                          * received a retransmission of the FIN,
4023                          * restart the TIME_WAIT timer.
4024                          */
4025                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4026                         return(0);
4027                 case TCP_FIN_WAIT1:
4028                         /*
4029                          * This case occurs when a simultaneous close
4030                          * happens, we must ack the received FIN and
4031                          * enter the CLOSING state.
4032                          *
4033                          * This causes a WRITE timeout, which will either
4034                          * move on to TIME_WAIT when we timeout, or resend
4035                          * the FIN properly (maybe we get rid of that annoying
4036                          * FIN lost hang). The TIME_WRITE code is already correct
4037                          * for handling this timeout.
4038                          */
4039 
4040                         if(sk->ip_xmit_timeout != TIME_WRITE)
4041                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4042                         tcp_set_state(sk,TCP_CLOSING);
4043                         break;
4044                 case TCP_FIN_WAIT2:
4045                         /*
4046                          * received a FIN -- send ACK and enter TIME_WAIT
4047                          */
4048                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4049                         sk->shutdown|=SHUTDOWN_MASK;
4050                         tcp_set_state(sk,TCP_TIME_WAIT);
4051                         break;
4052                 case TCP_CLOSE:
4053                         /*
4054                          * already in CLOSE
4055                          */
4056                         break;
4057                 default:
4058                         tcp_set_state(sk,TCP_LAST_ACK);
4059         
4060                         /* Start the timers. */
4061                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4062                         return(0);
4063         }
4064 
4065         return(0);
4066 }
4067 
4068 
4069 
4070 /*
4071  *      This routine handles the data.  If there is room in the buffer,
4072  *      it will be have already been moved into it.  If there is no
4073  *      room, then we will just have to discard the packet.
4074  */
4075 
4076 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /*  */
4077          unsigned long saddr, unsigned short len)
4078 {
4079         struct sk_buff *skb1, *skb2;
4080         struct tcphdr *th;
4081         int dup_dumped=0;
4082         u32 new_seq, shut_seq;
4083 
4084         th = skb->h.th;
4085         skb_pull(skb,th->doff*4);
4086         skb_trim(skb,len-(th->doff*4));
4087 
4088         /*
4089          *      The bytes in the receive read/assembly queue has increased. Needed for the
4090          *      low memory discard algorithm 
4091          */
4092            
4093         sk->bytes_rcv += skb->len;
4094         
4095         if (skb->len == 0 && !th->fin) 
4096         {
4097                 /* 
4098                  *      Don't want to keep passing ack's back and forth. 
4099                  *      (someone sent us dataless, boring frame)
4100                  */
4101                 if (!th->ack)
4102                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4103                 kfree_skb(skb, FREE_READ);
4104                 return(0);
4105         }
4106         
4107         /*
4108          *      We no longer have anyone receiving data on this connection.
4109          */
4110 
4111 #ifndef TCP_DONT_RST_SHUTDOWN            
4112 
4113         if(sk->shutdown & RCV_SHUTDOWN)
4114         {
4115                 /*
4116                  *      FIXME: BSD has some magic to avoid sending resets to
4117                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
4118                  *      BSD stacks still have broken keepalives so we want to
4119                  *      cope with it.
4120                  */
4121 
4122                 if(skb->len)    /* We don't care if it's just an ack or
4123                                    a keepalive/window probe */
4124                 {
4125                         new_seq = skb->seq + skb->len + th->syn;        /* Right edge of _data_ part of frame */
4126                         
4127                         /* Do this the way 4.4BSD treats it. Not what I'd
4128                            regard as the meaning of the spec but it's what BSD
4129                            does and clearly they know everything 8) */
4130 
4131                         /*
4132                          *      This is valid because of two things
4133                          *
4134                          *      a) The way tcp_data behaves at the bottom.
4135                          *      b) A fin takes effect when read not when received.
4136                          */
4137                          
4138                         shut_seq = sk->acked_seq+1;     /* Last byte */
4139                         
4140                         if(after(new_seq,shut_seq))
4141                         {
4142                                 if(sk->debug)
4143                                         printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4144                                                 sk, new_seq, shut_seq, sk->blog);
4145                                 if(sk->dead)
4146                                 {
4147                                         sk->acked_seq = new_seq + th->fin;
4148                                         tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4149                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4150                                         tcp_statistics.TcpEstabResets++;
4151                                         tcp_set_state(sk,TCP_CLOSE);
4152                                         sk->err = EPIPE;
4153                                         sk->shutdown = SHUTDOWN_MASK;
4154                                         kfree_skb(skb, FREE_READ);
4155                                         return 0;
4156                                 }
4157                         }
4158                 }
4159         }
4160 
4161 #endif
4162 
4163         /*
4164          *      Now we have to walk the chain, and figure out where this one
4165          *      goes into it.  This is set up so that the last packet we received
4166          *      will be the first one we look at, that way if everything comes
4167          *      in order, there will be no performance loss, and if they come
4168          *      out of order we will be able to fit things in nicely.
4169          *
4170          *      [AC: This is wrong. We should assume in order first and then walk
4171          *       forwards from the first hole based upon real traffic patterns.]
4172          *      
4173          */
4174 
4175         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
4176         {
4177                 skb_queue_head(&sk->receive_queue,skb);
4178                 skb1= NULL;
4179         } 
4180         else
4181         {
4182                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
4183                 {
4184                         if(sk->debug)
4185                         {
4186                                 printk("skb1=%p :", skb1);
4187                                 printk("skb1->seq = %d: ", skb1->seq);
4188                                 printk("skb->seq = %d\n",skb->seq);
4189                                 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4190                                                 sk->acked_seq);
4191                         }
4192                         
4193                         /*
4194                          *      Optimisation: Duplicate frame or extension of previous frame from
4195                          *      same sequence point (lost ack case).
4196                          *      The frame contains duplicate data or replaces a previous frame
4197                          *      discard the previous frame (safe as sk->inuse is set) and put
4198                          *      the new one in its place.
4199                          */
4200                          
4201                         if (skb->seq==skb1->seq && skb->len>=skb1->len)
4202                         {
4203                                 skb_append(skb1,skb);
4204                                 skb_unlink(skb1);
4205                                 kfree_skb(skb1,FREE_READ);
4206                                 dup_dumped=1;
4207                                 skb1=NULL;
4208                                 break;
4209                         }
4210                         
4211                         /*
4212                          *      Found where it fits
4213                          */
4214                          
4215                         if (after(skb->seq+1, skb1->seq))
4216                         {
4217                                 skb_append(skb1,skb);
4218                                 break;
4219                         }
4220                         
4221                         /*
4222                          *      See if we've hit the start. If so insert.
4223                          */
4224                         if (skb1 == skb_peek(&sk->receive_queue))
4225                         {
4226                                 skb_queue_head(&sk->receive_queue, skb);
4227                                 break;
4228                         }
4229                 }
4230         }
4231 
4232         /*
4233          *      Figure out what the ack value for this frame is
4234          */
4235          
4236         if (before(sk->acked_seq, sk->copied_seq)) 
4237         {
4238                 printk("*** tcp.c:tcp_data bug acked < copied\n");
4239                 sk->acked_seq = sk->copied_seq;
4240         }
4241 
4242         /*
4243          *      Now figure out if we can ack anything. This is very messy because we really want two
4244          *      receive queues, a completed and an assembly queue. We also want only one transmit
4245          *      queue.
4246          */
4247 
4248         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1)) 
4249         {
4250                 if (before(skb->seq, sk->acked_seq+1)) 
4251                 {
4252                         int newwindow;
4253 
4254                         if (after(skb->end_seq, sk->acked_seq)) 
4255                         {
4256                                 newwindow = sk->window - (skb->end_seq - sk->acked_seq);
4257                                 if (newwindow < 0)
4258                                         newwindow = 0;  
4259                                 sk->window = newwindow;
4260                                 sk->acked_seq = skb->end_seq;
4261                         }
4262                         skb->acked = 1;
4263 
4264                         /*
4265                          *      When we ack the fin, we do the FIN 
4266                          *      processing.
4267                          */
4268 
4269                         if (skb->h.th->fin) 
4270                         {
4271                                 tcp_fin(skb,sk,skb->h.th);
4272                         }
4273           
4274                         for(skb2 = skb->next;
4275                             skb2 != (struct sk_buff *)&sk->receive_queue;
4276                             skb2 = skb2->next) 
4277                         {
4278                                 if (before(skb2->seq, sk->acked_seq+1)) 
4279                                 {
4280                                         if (after(skb2->end_seq, sk->acked_seq))
4281                                         {
4282                                                 newwindow = sk->window -
4283                                                  (skb2->end_seq - sk->acked_seq);
4284                                                 if (newwindow < 0)
4285                                                         newwindow = 0;  
4286                                                 sk->window = newwindow;
4287                                                 sk->acked_seq = skb2->end_seq;
4288                                         }
4289                                         skb2->acked = 1;
4290                                         /*
4291                                          *      When we ack the fin, we do
4292                                          *      the fin handling.
4293                                          */
4294                                         if (skb2->h.th->fin) 
4295                                         {
4296                                                 tcp_fin(skb,sk,skb->h.th);
4297                                         }
4298 
4299                                         /*
4300                                          *      Force an immediate ack.
4301                                          */
4302                                          
4303                                         sk->ack_backlog = sk->max_ack_backlog;
4304                                 }
4305                                 else
4306                                 {
4307                                         break;
4308                                 }
4309                         }
4310 
4311                         /*
4312                          *      This also takes care of updating the window.
4313                          *      This if statement needs to be simplified.
4314                          */
4315                         if (!sk->delay_acks ||
4316                             sk->ack_backlog >= sk->max_ack_backlog || 
4317                             sk->bytes_rcv > sk->max_unacked || th->fin) {
4318         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4319                         }
4320                         else 
4321                         {
4322                                 sk->ack_backlog++;
4323                                 if(sk->debug)
4324                                         printk("Ack queued.\n");
4325                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4326                         }
4327                 }
4328         }
4329 
4330         /*
4331          *      If we've missed a packet, send an ack.
4332          *      Also start a timer to send another.
4333          */
4334          
4335         if (!skb->acked) 
4336         {
4337         
4338         /*
4339          *      This is important.  If we don't have much room left,
4340          *      we need to throw out a few packets so we have a good
4341          *      window.  Note that mtu is used, not mss, because mss is really
4342          *      for the send side.  He could be sending us stuff as large as mtu.
4343          */
4344                  
4345                 while (sock_rspace(sk) < sk->mtu) 
4346                 {
4347                         skb1 = skb_peek(&sk->receive_queue);
4348                         if (skb1 == NULL) 
4349                         {
4350                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4351                                 break;
4352                         }
4353 
4354                         /*
4355                          *      Don't throw out something that has been acked. 
4356                          */
4357                  
4358                         if (skb1->acked) 
4359                         {
4360                                 break;
4361                         }
4362                 
4363                         skb_unlink(skb1);
4364                         kfree_skb(skb1, FREE_READ);
4365                 }
4366                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4367                 sk->ack_backlog++;
4368                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4369         }
4370         else
4371         {
4372                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4373         }
4374 
4375         /*
4376          *      Now tell the user we may have some data. 
4377          */
4378          
4379         if (!sk->dead) 
4380         {
4381                 if(sk->debug)
4382                         printk("Data wakeup.\n");
4383                 sk->data_ready(sk,0);
4384         } 
4385         return(0);
4386 }
4387 
4388 
4389 /*
4390  *      This routine is only called when we have urgent data
4391  *      signalled. Its the 'slow' part of tcp_urg. It could be
4392  *      moved inline now as tcp_urg is only called from one
4393  *      place. We handle URGent data wrong. We have to - as
4394  *      BSD still doesn't use the correction from RFC961.
4395  */
4396  
4397 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /*  */
4398 {
4399         u32 ptr = ntohs(th->urg_ptr);
4400 
4401         if (ptr)
4402                 ptr--;
4403         ptr += ntohl(th->seq);
4404 
4405         /* ignore urgent data that we've already seen and read */
4406         if (after(sk->copied_seq, ptr))
4407                 return;
4408 
4409         /* do we already have a newer (or duplicate) urgent pointer? */
4410         if (sk->urg_data && !after(ptr, sk->urg_seq))
4411                 return;
4412 
4413         /* tell the world about our new urgent pointer */
4414         if (sk->proc != 0) {
4415                 if (sk->proc > 0) {
4416                         kill_proc(sk->proc, SIGURG, 1);
4417                 } else {
4418                         kill_pg(-sk->proc, SIGURG, 1);
4419                 }
4420         }
4421         sk->urg_data = URG_NOTYET;
4422         sk->urg_seq = ptr;
4423 }
4424 
4425 /*
4426  *      This is the 'fast' part of urgent handling.
4427  */
4428  
4429 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
     /*  */
4430         unsigned long saddr, unsigned long len)
4431 {
4432         u32 ptr;
4433 
4434         /*
4435          *      Check if we get a new urgent pointer - normally not 
4436          */
4437          
4438         if (th->urg)
4439                 tcp_check_urg(sk,th);
4440 
4441         /*
4442          *      Do we wait for any urgent data? - normally not
4443          */
4444          
4445         if (sk->urg_data != URG_NOTYET)
4446                 return 0;
4447 
4448         /*
4449          *      Is the urgent pointer pointing into this packet? 
4450          */
4451          
4452         ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
4453         if (ptr >= len)
4454                 return 0;
4455 
4456         /*
4457          *      Ok, got the correct packet, update info 
4458          */
4459          
4460         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4461         if (!sk->dead)
4462                 sk->data_ready(sk,0);
4463         return 0;
4464 }
4465 
4466 /*
4467  *      This will accept the next outstanding connection. 
4468  */
4469  
4470 static struct sock *tcp_accept(struct sock *sk, int flags)
     /*  */
4471 {
4472         struct sock *newsk;
4473         struct sk_buff *skb;
4474   
4475   /*
4476    * We need to make sure that this socket is listening,
4477    * and that it has something pending.
4478    */
4479 
4480         if (sk->state != TCP_LISTEN) 
4481         {
4482                 sk->err = EINVAL;
4483                 return(NULL); 
4484         }
4485 
4486         /* Avoid the race. */
4487         cli();
4488         sk->inuse = 1;
4489 
4490         while((skb = tcp_dequeue_established(sk)) == NULL) 
4491         {
4492                 if (flags & O_NONBLOCK) 
4493                 {
4494                         sti();
4495                         release_sock(sk);
4496                         sk->err = EAGAIN;
4497                         return(NULL);
4498                 }
4499 
4500                 release_sock(sk);
4501                 interruptible_sleep_on(sk->sleep);
4502                 if (current->signal & ~current->blocked) 
4503                 {
4504                         sti();
4505                         sk->err = ERESTARTSYS;
4506                         return(NULL);
4507                 }
4508                 sk->inuse = 1;
4509         }
4510         sti();
4511 
4512         /*
4513          *      Now all we need to do is return skb->sk. 
4514          */
4515 
4516         newsk = skb->sk;
4517 
4518         kfree_skb(skb, FREE_READ);
4519         sk->ack_backlog--;
4520         release_sock(sk);
4521         return(newsk);
4522 }
4523 
4524 
4525 /*
4526  *      This will initiate an outgoing connection. 
      *
      *      Validates the socket and the destination in usin, records the
      *      peer address/port and initial sequence numbers in sk, builds a
      *      SYN segment carrying a 4 byte MSS option, moves the socket to
      *      TCP_SYN_SENT, arms the retransmit timer and queues the segment.
      *
      *      If usin holds INADDR_ANY the address in usin is overwritten
      *      with ip_my_addr() before use.
      *
      *      Returns 0 once the SYN has been queued, otherwise:
      *        -EISCONN       the socket is not in TCP_CLOSE
      *        -EINVAL        sk->daddr is already set, or addr_len < 8
      *        -EAFNOSUPPORT  sin_family is non zero and not AF_INET
      *        -ENETUNREACH   broadcast/multicast destination, or the
      *                       protocol build_header() call failed
      *        -ENOMEM        no buffer could be allocated for the SYN
4527  */
4528  
4529 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
     /*  */
4530 {
4531         struct sk_buff *buff;
4532         struct device *dev=NULL;
4533         unsigned char *ptr;
4534         int tmp;
4535         int atype;
4536         struct tcphdr *t1;
4537         struct rtable *rt;
4538 
4539         if (sk->state != TCP_CLOSE) 
4540                 return(-EISCONN);
4541 
4542         /*
4543          *      Don't allow a double connect.
4544          */
4545                 
4546         if(sk->daddr)
4547                 return -EINVAL;
4548         
4549         if (addr_len < 8) 
4550                 return(-EINVAL);
4551 
4552         if (usin->sin_family && usin->sin_family != AF_INET) 
4553                 return(-EAFNOSUPPORT);
4554 
4555         /*
4556          *      connect() to INADDR_ANY means loopback (BSD'ism).
4557          */
4558         
4559         if(usin->sin_addr.s_addr==INADDR_ANY)
4560                 usin->sin_addr.s_addr=ip_my_addr();
4561                   
4562         /*
4563          *      Don't want a TCP connection going to a broadcast address 
4564          */
4565 
4566         if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
4567                 return -ENETUNREACH;
4568   
4569         sk->inuse = 1;
4570         sk->daddr = usin->sin_addr.s_addr;
4571         sk->write_seq = tcp_init_seq();
4572         sk->window_seq = sk->write_seq;
4573         sk->rcv_ack_seq = sk->write_seq -1;
4574         sk->err = 0;
4575         sk->dummy_th.dest = usin->sin_port;
             /*
              *      The socket is released around the GFP_KERNEL allocation
              *      and marked in use again once the buffer is in hand.
              */
4576         release_sock(sk);
4577 
4578         buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4579         if (buff == NULL) 
4580         {
                     /*
                      *      NOTE(review): sk->daddr stays set on this path and
                      *      on the build_header() failure path below, so a
                      *      later connect() on this socket hits the double
                      *      connect check above - confirm this is intended.
                      */
4581                 return(-ENOMEM);
4582         }
4583         sk->inuse = 1;
4584         buff->sk = sk;
4585         buff->free = 0;
4586         buff->localroute = sk->localroute;
4587         
4588 
4589         /*
4590          *      Put in the IP header and routing stuff.
4591          */
4592          
4593         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4594                 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
4595         if (tmp < 0) 
4596         {
4597                 sock_wfree(sk, buff);
4598                 release_sock(sk);
4599                 return(-ENETUNREACH);
4600         }
             /*
              *      If no source address has been chosen yet, take it from
              *      the cached route.
              */
4601         if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
4602                 sk->saddr = rt->rt_src;
4603         sk->rcv_saddr = sk->saddr;
4604 
4605         t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
4606 
4607         memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
             /* The SYN consumes one sequence number. */
4608         buff->seq = sk->write_seq++;
4609         t1->seq = htonl(buff->seq);
4610         sk->sent_seq = sk->write_seq;
4611         buff->end_seq = sk->write_seq;
4612         t1->ack = 0;
             /*
              *      The window field is carried in network byte order (it is
              *      filled through htons() elsewhere in this file). The raw
              *      store of 2 that used to be here went out as 512 on little
              *      endian hosts but as 2 on big endian ones; say 512
              *      explicitly so every host advertises the same value.
              */
4613         t1->window = htons(512);
4614         t1->res1=0;
4615         t1->res2=0;
4616         t1->rst = 0;
4617         t1->urg = 0;
4618         t1->psh = 0;
4619         t1->syn = 1;
4620         t1->urg_ptr = 0;
             /* Six 32 bit words: the basic header plus the 4 byte option added below. */
4621         t1->doff = 6;
4622         /* use 512 or whatever user asked for */
4623         
4624         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4625                 sk->window_clamp=rt->rt_window;
4626         else
4627                 sk->window_clamp=0;
4628 
4629         if (sk->user_mss)
4630                 sk->mtu = sk->user_mss;
4631         else if (rt)
4632                 sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
4633         else 
4634                 sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
4635 
4636         /*
4637          *      but not bigger than device MTU 
4638          */
4639 
4640         if(sk->mtu <32)
4641                 sk->mtu = 32;   /* Sanity limit */
4642                 
4643         sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
4644 
4645 #ifdef CONFIG_SKIP
4646         
4647         /*
4648          *      SKIP devices set their MTU to 65535. This is so they can take packets
4649          *      unfragmented to security process then fragment. They could lie to the
4650          *      TCP layer about a suitable MTU, but its easier to let skip sort it out
4651          *      simply because the final package we want unfragmented is going to be
4652          *
4653          *      [IPHDR][IPSP][Security data][Modified TCP data][Security data]
4654          */
4655          
4656         if(skip_pick_mtu!=NULL)         /* If SKIP is loaded.. */
4657                 sk->mtu=skip_pick_mtu(sk->mtu,dev);
4658 #endif
4659         
4660         /*
4661          *      Put in the TCP options to say MTU. 
              *      Layout: kind 2, length 4, then sk->mtu as two bytes,
              *      high byte first.
4662          */
4663 
4664         ptr = skb_put(buff,4);
4665         ptr[0] = 2;
4666         ptr[1] = 4;
4667         ptr[2] = (sk->mtu) >> 8;
4668         ptr[3] = (sk->mtu) & 0xff;
4669         tcp_send_check(t1, sk->saddr, sk->daddr,
4670                   sizeof(struct tcphdr) + 4, sk);
4671 
4672         /*
4673          *      This must go first otherwise a really quick response will get reset. 
4674          */
4675 
4676         tcp_cache_zap();
4677         tcp_set_state(sk,TCP_SYN_SENT);
             /* Start from the route's initial RTT when it supplies one. */
4678         if(rt&&rt->rt_flags&RTF_IRTT)
4679                 sk->rto = rt->rt_irtt;
4680         else
4681                 sk->rto = TCP_TIMEOUT_INIT;
4682         sk->retransmit_timer.function=&retransmit_timer;
4683         sk->retransmit_timer.data = (unsigned long)sk;
4684         reset_xmit_timer(sk, TIME_WRITE, sk->rto);      /* Timer for repeating the SYN until an answer  */
4685         sk->retransmits = 0;                            /* Now works the right way instead of a hacked 
4686                                                                                         initial setting */
4687 
4688         sk->prot->queue_xmit(sk, dev, buff, 0);  
4689         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4690         tcp_statistics.TcpActiveOpens++;
4691         tcp_statistics.TcpOutSegs++;
4692   
4693         release_sock(sk);
4694         return(0);
4695 }
4696 
4697 
4698 /*
4699  *      This function checks to see if the tcp header is actually acceptable. 
      *
      *      sk     socket the segment was matched to
      *      th     TCP header of the segment
      *      len    length of the TCP header plus data, in bytes
      *      opt    not used here
      *      saddr  address handed to tcp_send_ack() for the resync ACK
      *      dev    device handed to tcp_reset()
      *
      *      Returns 1 when at least part of the segment lies inside the
      *      receive window and processing should continue. Returns 0 when
      *      the segment is to be dropped; unless it carried RST an ACK is
      *      sent first to resynchronise the peer. In TCP_SYN_SENT and
      *      TCP_SYN_RECV an unacceptable non-RST segment is answered with a
      *      reset and 1 is returned.
4700  */
4701  
4702 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /*  */
4703              struct options *opt, unsigned long saddr, struct device *dev)
4704 {
4705         u32 next_seq;
4706 
             /*
              *      The caller holds the length as an unsigned short, but it
              *      arrives here through a signed short parameter. Convert it
              *      back before doing arithmetic, otherwise a segment longer
              *      than 32767 bytes shows up negative and next_seq is off by
              *      65536.
              */
4707         next_seq = (unsigned short)len - 4*th->doff;
             /* A FIN occupies one sequence number. */
4708         if (th->fin)
4709                 next_seq++;
4710         /* if we have a zero window, we can't have any data in the packet.. */
4711         if (next_seq && !sk->window)
4712                 goto ignore_it;
4713         next_seq += ntohl(th->seq);
4714 
4715         /*
4716          * This isn't quite right.  sk->acked_seq could be more recent
4717          * than sk->window.  This is however close enough.  We will accept
4718          * slightly more packets than we should, but it should not cause
4719          * problems unless someone is trying to forge packets.
4720          */
4721 
4722         /* have we already seen all of this packet? */
4723         if (!after(next_seq+1, sk->acked_seq))
4724                 goto ignore_it;
4725         /* or does it start beyond the window? */
4726         if (!before(ntohl(th->seq), sk->acked_seq + sk->window + 1))
4727                 goto ignore_it;
4728 
4729         /* ok, at least part of this packet would seem interesting.. */
4730         return 1;
4731 
4732 ignore_it:
             /* Never answer a reset. */
4733         if (th->rst)
4734                 return 0;
4735 
4736         /*
4737          *      Send a reset if we get something not ours and we are
4738          *      unsynchronized. Note: We don't do anything to our end. We
4739          *      are just killing the bogus remote connection then we will
4740          *      connect again and it will work (with luck).
4741          */
4742          
4743         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
4744         {
4745                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4746                 return 1;
4747         }
4748 
4749         /* Try to resync things. */
4750         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4751         return 0;
4752 }
4753 
4754 /*
4755  *      Common handling for a reset received on a socket.
      *
      *      Marks the socket zapped, records the error that matches the
      *      state the reset arrived in, closes the connection, notifies the
      *      owner if the socket is not dead, frees the segment and releases
      *      the socket. Always returns 0.
4756  */
4757 
4758 static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
     /*  */
4759 {
4760         sk->zapped = 1;
4761         if (sk->state == TCP_SYN_SENT)
4762                 sk->err = ECONNREFUSED;         /* our connect was refused */
4763         else if (sk->state == TCP_CLOSE_WAIT)
4764                 sk->err = EPIPE;
4765         else
4766                 sk->err = ECONNRESET;
4767 
4768 #ifdef TCP_DO_RFC1337
4769         /* Time wait assassination protection [RFC1337]: TIME_WAIT is left as it is */
4770         if (sk->state != TCP_TIME_WAIT)
4771 #endif
4772         {
4773                 tcp_set_state(sk, TCP_CLOSE);
4774                 sk->shutdown = SHUTDOWN_MASK;
4775         }
4776 
4777         /* Tell whoever still owns the socket that its state changed. */
4778         if (!sk->dead)
4779                 sk->state_change(sk);
4780 
4781         kfree_skb(skb, FREE_READ);
4782         release_sock(sk);
4783         return 0;
4784 }
4785 
4786 /*
4787  *      A TCP packet has arrived.
4788  *              skb->h.raw is the TCP header.
      *
      *      skb       the received segment
      *      dev, opt  handed on to tcp_reset(), tcp_conn_request() and
      *                tcp_sequence()
      *      daddr     address the segment was sent to (ours)
      *      len       length of the TCP header plus data, in bytes
      *      saddr     address the segment came from
      *      redo      when non zero the header pull, checksum test and
      *                backlog queueing are skipped (presumably the segment
      *                is being replayed from the backlog - confirm against
      *                the caller)
      *      protocol  not used here
      *
      *      The segment is either queued, handed to the state machine or
      *      freed on every path. Always returns 0.
4789  */
4790  
4791 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /*  */
4792         __u32 daddr, unsigned short len,
4793         __u32 saddr, int redo, struct inet_protocol * protocol)
4794 {
4795         struct tcphdr *th;
4796         struct sock *sk;
4797         int syn_ok=0;
4798         
4799         tcp_statistics.TcpInSegs++;
4800         if(skb->pkt_type!=PACKET_HOST)
4801         {
4802                 kfree_skb(skb,FREE_READ);
4803                 return(0);
4804         }
4805   
4806         th = skb->h.th;
4807 
4808         /*
4809          *      Find the socket, using the last hit cache if applicable.
4810          */
4811 
4812         if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
4813         {
                     struct sock *looked_up;
     
4814                 sk=(struct sock *)th_cache_sk;
4815                 /*
4816                  *      We think this is causing the bug so
                      *      cross check the cache against a full lookup. On a
                      *      mismatch the cached pointer is stale: report it,
                      *      then carry on with the socket the lookup found
                      *      and refresh the cache, rather than processing
                      *      the segment against the stale socket.
4817                  */
                     looked_up=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr);
4818                 if(sk!=looked_up)
                     {
4819                         printk("Cache mismatch on TCP.\n");
                             sk=looked_up;
                             th_cache_sk=sk;
                     }
4820         }
4821         else
4822         {
4823                 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4824                 th_cache_saddr=saddr;
4825                 th_cache_daddr=daddr;
4826                 th_cache_dport=th->dest;
4827                 th_cache_sport=th->source;
4828                 th_cache_sk=sk;
4829         }               
4830 
4831         /*
4832          *      If this socket has got a reset it's to all intents and purposes 
4833          *      really dead. Count closed sockets as dead.
4834          *
4835          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4836          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
4837          *      exist so should cause resets as if the port was unreachable.
4838          */
4839          
4840         if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4841                 sk=NULL;
4842 
4843         if (!redo) 
4844         {
4845                 /*
4846                  *      Pull up the IP header.
4847                  */
4848                 skb_pull(skb, skb->h.raw-skb->data);
4849                 /*
4850                  *      Try to use the device checksum if provided.
4851                  */
4852                 if (
4853                         (skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
4854                         (!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
4855                     )
4856                 {
4857                         skb->sk = NULL;
4858                         kfree_skb(skb,FREE_READ);
4859                         /*
4860                          *      We don't release the socket because it was
4861                          *      never marked in use.
4862                          */
4863                         return(0);
4864                 }
4865 
                     /* SYN and FIN each occupy one sequence number. */
4866                 skb->seq = ntohl(th->seq);
4867                 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
4868                 skb->ack_seq = ntohl(th->ack_seq);
4869 
4870                 /* See if we know about the socket. */
4871                 if (sk == NULL) 
4872                 {
4873                         /*
4874                          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4875                          */
4876                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4877                         skb->sk = NULL;
4878                         /*
4879                          *      Discard frame
4880                          */
4881                         kfree_skb(skb, FREE_READ);
4882                         return(0);
4883                 }
4884 
4885                 skb->acked = 0;
4886                 skb->used = 0;
4887                 skb->free = 0;
4888                 skb->saddr = daddr;
4889                 skb->daddr = saddr;
4890         
4891                 /* We may need to add it to the backlog here. */
4892                 cli();
4893                 if (sk->inuse) 
4894                 {
4895                         skb_queue_tail(&sk->back_log, skb);
4896                         sti();
4897                         return(0);
4898                 }
4899                 sk->inuse = 1;
4900                 sti();
4901         }
4902         else
4903         {
4904                 if (sk==NULL) 
4905                 {
4906                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4907                         skb->sk = NULL;
4908                         kfree_skb(skb, FREE_READ);
4909                         return(0);
4910                 }
4911         }
4912 
4913 
4914         if (!sk->prot) 
4915         {
4916                 printk("IMPOSSIBLE 3\n");
                     /*
                      *      Every other exit from this function frees or
                      *      queues the segment; do the same here instead of
                      *      leaking it. It has not been charged to the
                      *      socket yet, so detach it first.
                      *
                      *      NOTE(review): the socket is still left marked in
                      *      use on this path. release_sock() is not called
                      *      because how it copes with a NULL sk->prot cannot
                      *      be seen from here - confirm.
                      */
                     skb->sk = NULL;
                     kfree_skb(skb, FREE_READ);
4917                 return(0);
4918         }
4919 
4920 
4921         /*
4922          *      Charge the memory to the socket. 
4923          */
4924          
4925         skb->sk=sk;
4926         sk->rmem_alloc += skb->truesize;
4927 
4928         /*
4929          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4930          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4931          *      compatibility. We also set up variables more thoroughly [Karn notes in the
4932          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4933          */
4934 
4935         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
4936         {
4937         
4938                 /*
4939                  *      Now deal with unusual cases.
4940                  */
4941          
4942                 if(sk->state==TCP_LISTEN)
4943                 {
4944                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
4945                                 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4946 
4947                         /*
4948                          *      We don't care for RST, and non SYN are absorbed (old segments)
4949                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
4950                          *      netmask on a running connection it can go broadcast. Even Sun's have
4951                          *      this problem so I'm ignoring it 
4952                          */
4953                            
4954                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4955                         {
4956                                 kfree_skb(skb, FREE_READ);
4957                                 release_sock(sk);
4958                                 return 0;
4959                         }
4960                 
4961                         /*      
4962                          *      Guess we need to make a new socket up 
4963                          */
4964                 
4965                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4966                 
4967                         /*
4968                          *      Now we have several options: In theory there is nothing else
4969                          *      in the frame. KA9Q has an option to send data with the syn,
4970                          *      BSD accepts data with the syn up to the [to be] advertised window
4971                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
4972                          *      it, that fits the spec precisely and avoids incompatibilities. It
4973                          *      would be nice in future to drop through and process the data.
4974                          */
4975                          
4976                         release_sock(sk);
4977                         return 0;
4978                 }
4979         
4980                 /* retransmitted SYN? */
4981                 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
4982                 {
4983                         kfree_skb(skb, FREE_READ);
4984                         release_sock(sk);
4985                         return 0;
4986                 }
4987                 
4988                 /*
4989                  *      SYN sent means we have to look for a suitable ack and either reset
4990                  *      for bad matches or go to connected 
4991                  */
4992            
4993                 if(sk->state==TCP_SYN_SENT)
4994                 {
4995                         /* Crossed SYN or previous junk segment */
4996                         if(th->ack)
4997                         {
4998                                 /* We got an ack, but it's not a good ack */
4999                                 if(!tcp_ack(sk,th,saddr,len))
5000                                 {
5001                                         /* Reset the ack - its an ack from a 
5002                                            different connection  [ th->rst is checked in tcp_reset()] */
5003                                         tcp_statistics.TcpAttemptFails++;
5004                                         tcp_reset(daddr, saddr, th,
5005                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
5006                                         kfree_skb(skb, FREE_READ);
5007                                         release_sock(sk);
5008                                         return(0);
5009                                 }
5010                                 if(th->rst)
5011                                         return tcp_std_reset(sk,skb);
5012                                 if(!th->syn)
5013                                 {
5014                                         /* A valid ack from a different connection
5015                                            start. Shouldn't happen but cover it */
5016                                         kfree_skb(skb, FREE_READ);
5017                                         release_sock(sk);
5018                                         return 0;
5019                                 }
5020                                 /*
5021                                  *      Ok.. it's good. Set up sequence numbers and
5022                                  *      move to established.
5023                                  */
5024                                 syn_ok=1;       /* Don't reset this connection for the syn */
5025                                 sk->acked_seq = skb->seq+1;
5026                                 sk->fin_seq = skb->seq;
5027                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
5028                                 tcp_set_state(sk, TCP_ESTABLISHED);
5029                                 tcp_options(sk,th);
5030                                 sk->dummy_th.dest=th->source;
5031                                 sk->copied_seq = sk->acked_seq;
5032                                 if(!sk->dead)
5033                                 {
5034                                         sk->state_change(sk);
5035                                         sock_wake_async(sk->socket, 0);
5036                                 }
5037                                 if(sk->max_window==0)
5038                                 {
5039                                         sk->max_window = 32;
5040                                         sk->mss = min(sk->max_window, sk->mtu);
5041                                 }
5042                         }
5043                         else
5044                         {
5045                                 /* See if SYN's cross. Drop if boring */
5046                                 if(th->syn && !th->rst)
5047                                 {
5048                                         /* Crossed SYN's are fine - but talking to
5049                                            yourself is right out... */
5050                                         if(sk->saddr==saddr && sk->daddr==daddr &&
5051                                                 sk->dummy_th.source==th->source &&
5052                                                 sk->dummy_th.dest==th->dest)
5053                                         {
5054                                                 tcp_statistics.TcpAttemptFails++;
5055                                                 return tcp_std_reset(sk,skb);
5056                                         }
5057                                         tcp_set_state(sk,TCP_SYN_RECV);
5058                                         
5059                                         /*
5060                                          *      FIXME:
5061                                          *      Must send SYN|ACK here
5062                                          */
5063                                 }               
5064                                 /* Discard junk segment */
5065                                 kfree_skb(skb, FREE_READ);
5066                                 release_sock(sk);
5067                                 return 0;
5068                         }
5069                         /*
5070                          *      SYN_RECV with data maybe.. drop through
                              *      (this jump bypasses the sequence, RST, SYN
                              *      and ACK steps below; the ACK was already
                              *      handled above)
5071                          */
5072                         goto rfc_step6;
5073                 }
5074 
5075         /*
5076          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
5077          *      a more complex suggestion for fixing these reuse issues in RFC1644
5078          *      but not yet ready for general use. Also see RFC1379.
5079          */
5080         
5081 #define BSD_TIME_WAIT
5082 #ifdef BSD_TIME_WAIT
5083                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
5084                         after(skb->seq, sk->acked_seq) && !th->rst)
5085                 {
5086                         u32 seq = sk->write_seq;
5087                         if(sk->debug)
5088                                 printk("Doing a BSD time wait\n");
5089                         tcp_statistics.TcpEstabResets++;           
                             /* Undo the charge made above; the segment moves to another socket or is freed. */
5090                         sk->rmem_alloc -= skb->truesize;
5091                         skb->sk = NULL;
5092                         sk->err=ECONNRESET;
5093                         tcp_set_state(sk, TCP_CLOSE);
5094                         sk->shutdown = SHUTDOWN_MASK;
5095                         release_sock(sk);
                             /* Look again: with the old socket closed a listener may now match. */
5096                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
5097                         if (sk && sk->state==TCP_LISTEN)
5098                         {
5099                                 sk->inuse=1;
5100                                 skb->sk = sk;
5101                                 sk->rmem_alloc += skb->truesize;
5102                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
5103                                 release_sock(sk);
5104                                 return 0;
5105                         }
5106                         kfree_skb(skb, FREE_READ);
5107                         return 0;
5108                 }
5109 #endif  
5110         }
5111 
5112         /*
5113          *      We are now in normal data flow (see the step list in the RFC)
5114          *      Note most of these are inline now. I'll inline the lot when
5115          *      I have time to test it hard and look at what gcc outputs 
5116          */
5117         
5118         if(!tcp_sequence(sk,th,len,opt,saddr,dev))
5119         {
5120                 kfree_skb(skb, FREE_READ);
5121                 release_sock(sk);
5122                 return 0;
5123         }
5124 
5125         if(th->rst)
5126                 return tcp_std_reset(sk,skb);
5127         
5128         /*
5129          *      !syn_ok is effectively the state test in RFC793.
5130          */
5131          
5132         if(th->syn && !syn_ok)
5133         {
5134                 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
5135                 return tcp_std_reset(sk,skb);   
5136         }
5137 
5138         /*
5139          *      Process the ACK
5140          */
5141          
5142 
5143         if(th->ack && !tcp_ack(sk,th,saddr,len))
5144         {
5145                 /*
5146                  *      Our three way handshake failed.
5147                  */
5148                  
5149                 if(sk->state==TCP_SYN_RECV)
5150                 {
5151                         tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
5152                 }
5153                 kfree_skb(skb, FREE_READ);
5154                 release_sock(sk);
5155                 return 0;
5156         }
5157         
5158 rfc_step6:              /* I'll clean this up later */
5159 
5160         /*
5161          *      If the accepted buffer put us over our queue size we
5162          *      now drop it (we must process the ack first to avoid
5163          *      deadlock cases).
5164          */
5165          
5166         if (sk->rmem_alloc  >= sk->rcvbuf) 
5167         {
5168                 kfree_skb(skb, FREE_READ);
5169                 release_sock(sk);
5170                 return(0);
5171         }
5172 
5173 
5174         /*
5175          *      Process urgent data
5176          */
5177                 
5178         if(tcp_urg(sk, th, saddr, len))
5179         {
5180                 kfree_skb(skb, FREE_READ);
5181                 release_sock(sk);
5182                 return 0;
5183         }
5184         
5185         /*
5186          *      Process the encapsulated data
5187          */
5188         
5189         if(tcp_data(skb,sk, saddr, len))
5190         {
5191                 kfree_skb(skb, FREE_READ);
5192                 release_sock(sk);
5193                 return 0;
5194         }
5195 
5196         /*
5197          *      And done
5198          */     
5199         
5200         release_sock(sk);
5201         return 0;
5202 }
5203 
5204 /*
5205  *      This routine sends a packet with an out of date sequence
5206  *      number. It assumes the other end will try to ack it.
5207  */
5208 
5209 static void tcp_write_wakeup(struct sock *sk)
     /*  */
5210 {
5211         struct sk_buff *buff,*skb;
5212         struct tcphdr *t1;
5213         struct device *dev=NULL;
5214         int tmp;
5215 
5216         if (sk->zapped)
5217                 return; /* After a valid reset we can send no more */
5218 
5219         /*
5220          *      Write data can still be transmitted/retransmitted in the
5221          *      following states.  If any other state is encountered, return.
5222          *      [listen/close will never occur here anyway]
5223          */
5224 
5225         if (sk->state != TCP_ESTABLISHED && 
5226             sk->state != TCP_CLOSE_WAIT &&
5227             sk->state != TCP_FIN_WAIT1 && 
5228             sk->state != TCP_LAST_ACK &&
5229             sk->state != TCP_CLOSING
5230         ) 
5231         {
5232                 return;
5233         }
5234         if ( before(sk->sent_seq, sk->window_seq) && 
5235             (skb=skb_peek(&sk->write_queue)))
5236         {
5237                 /*
5238                  * We are probing the opening of a window
5239                  * but the window size is != 0
5240                  * must have been a result of SWS avoidance ( sender )
5241                  */
5242             
5243                 struct iphdr *iph;
5244                 struct tcphdr *th;
5245                 struct tcphdr *nth;
5246                 unsigned long win_size;
5247 #if 0
5248                 unsigned long ow_size;
5249 #endif
5250                 void * tcp_data_start;
5251         
5252                 /*
5253                  *      How many bytes can we send ?
5254                  */
5255                  
5256                 win_size = sk->window_seq - sk->sent_seq;
5257 
5258                 /*
5259                  *      Recover the buffer pointers
5260                  */
5261                  
5262                 iph = (struct iphdr *)skb->ip_hdr;
5263                 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
5264 
5265                 /*
5266                  *      Grab the data for a temporary frame
5267                  */
5268                  
5269                 buff = sock_wmalloc(sk, win_size + th->doff * 4 + 
5270                                      (iph->ihl << 2) +
5271                                      sk->prot->max_header + 15, 
5272                                      1, GFP_ATOMIC);
5273                 if ( buff == NULL )
5274                         return;
5275 
5276                 /* 
5277                  *      If we strip the packet on the write queue we must
5278                  *      be ready to retransmit this one 
5279                  */
5280             
5281                 buff->free = /*0*/1;
5282 
5283                 buff->sk = sk;
5284                 buff->localroute = sk->localroute;
5285                 
5286                 /*
5287                  *      Put headers on the new packet
5288                  */
5289 
5290                 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5291                                          IPPROTO_TCP, sk->opt, buff->truesize,
5292                                          sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
5293                 if (tmp < 0) 
5294                 {
5295                         sock_wfree(sk, buff);
5296                         return;
5297                 }
5298                 
5299                 /*
5300                  *      Move the TCP header over
5301                  */
5302 
5303                 buff->dev = dev;
5304 
5305                 nth = (struct tcphdr *) skb_put(buff,th->doff*4);
5306 
5307                 memcpy(nth, th, th->doff * 4);
5308                 
5309                 /*
5310                  *      Correct the new header
5311                  */
5312                  
5313                 nth->ack = 1; 
5314                 nth->ack_seq = htonl(sk->acked_seq);
5315                 nth->window = htons(tcp_select_window(sk));
5316                 nth->check = 0;
5317 
5318                 /*
5319                  *      Find the first data byte.
5320                  */
5321                  
5322                 tcp_data_start = skb->data + skb->dev->hard_header_len + 
5323                                 (iph->ihl << 2) + th->doff * 4;
5324 
5325                 /*
5326                  *      Add it to our new buffer
5327                  */
5328                 memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
5329                 
5330                 /*
5331                  *      Remember our right edge sequence number.
5332                  */
5333                  
5334                 buff->end_seq = sk->sent_seq + win_size;
5335                 sk->sent_seq = buff->end_seq;           /* Hack */
5336 #if 0
5337 
5338                 /*
5339                  *      now: shrink the queue head segment 
5340                  */
5341                  
5342                 th->check = 0;
5343                 ow_size = skb->len - win_size - 
5344                         ((unsigned long) (tcp_data_start - (void *) skb->data));
5345 
5346                 memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
5347                 skb_trim(skb,skb->len-win_size);
5348                 sk->sent_seq += win_size;
5349                 th->seq = htonl(sk->sent_seq);
5350                 if (th->urg)
5351                 {
5352                         unsigned short urg_ptr;
5353         
5354                         urg_ptr = ntohs(th->urg_ptr);
5355                         if (urg_ptr <= win_size)
5356                                 th->urg = 0;
5357                         else
5358                         {
5359                                 urg_ptr -= win_size;
5360                                 th->urg_ptr = htons(urg_ptr);
5361                                 nth->urg_ptr = htons(win_size);
5362                         }
5363                 }
5364 #else
5365                 if(th->urg && ntohs(th->urg_ptr) < win_size)
5366                         nth->urg = 0;
5367 #endif          
5368 
5369                 /*
5370                  *      Checksum the split buffer
5371                  */
5372                  
5373                 tcp_send_check(nth, sk->saddr, sk->daddr, 
5374                            nth->doff * 4 + win_size , sk);
5375         }
5376         else
5377         {       
5378                 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
5379                 if (buff == NULL) 
5380                         return;
5381 
5382                 buff->free = 1;
5383                 buff->sk = sk;
5384                 buff->localroute = sk->localroute;
5385 
5386                 /*
5387                  *      Put in the IP header and routing stuff. 
5388                  */
5389                  
5390                 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5391                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
5392                 if (tmp < 0) 
5393                 {
5394                         sock_wfree(sk, buff);
5395                         return;
5396                 }
5397 
5398                 t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
5399                 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5400 
5401                 /*
5402                  *      Use a previous sequence.
5403                  *      This should cause the other end to send an ack.
5404                  */
5405          
5406                 t1->seq = htonl(sk->sent_seq-1);
5407                 t1->ack = 1; 
5408                 t1->res1= 0;
5409                 t1->res2= 0;
5410                 t1->rst = 0;
5411                 t1->urg = 0;
5412                 t1->psh = 0;
5413                 t1->fin = 0;    /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
5414                 t1->syn = 0;
5415                 t1->ack_seq = htonl(sk->acked_seq);
5416                 t1->window = htons(tcp_select_window(sk));
5417                 t1->doff = sizeof(*t1)/4;
5418                 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5419 
5420         }               
5421 
5422         /*
5423          *      Send it.
5424          */
5425         
5426         sk->prot->queue_xmit(sk, dev, buff, 1);
5427         tcp_statistics.TcpOutSegs++;
5428 }
5429 
5430 /*
5431  *      A window probe timeout has occurred.
5432  */
5433 
5434 void tcp_send_probe0(struct sock *sk)
     /*  */
5435 {
5436         if (sk->zapped)
5437                 return;         /* After a valid reset we can send no more */
5438 
5439         tcp_write_wakeup(sk);
5440 
5441         sk->backoff++;
5442         sk->rto = min(sk->rto << 1, 120*HZ);
5443         reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5444         sk->retransmits++;
5445         sk->prot->retransmits ++;
5446 }
5447 
5448 /*
5449  *      Socket option code for TCP. 
5450  */
5451   
5452 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /*  */
5453 {
5454         int val,err;
5455 
5456         if(level!=SOL_TCP)
5457                 return ip_setsockopt(sk,level,optname,optval,optlen);
5458 
5459         if (optval == NULL) 
5460                 return(-EINVAL);
5461 
5462         err=verify_area(VERIFY_READ, optval, sizeof(int));
5463         if(err)
5464                 return err;
5465         
5466         val = get_user((int *)optval);
5467 
5468         switch(optname)
5469         {
5470                 case TCP_MAXSEG:
5471 /*
5472  * values greater than interface MTU won't take effect.  however at
5473  * the point when this call is done we typically don't yet know
5474  * which interface is going to be used
5475  */
5476                         if(val<1||val>MAX_WINDOW)
5477                                 return -EINVAL;
5478                         sk->user_mss=val;
5479                         return 0;
5480                 case TCP_NODELAY:
5481                         sk->nonagle=(val==0)?0:1;
5482                         return 0;
5483                 default:
5484                         return(-ENOPROTOOPT);
5485         }
5486 }
5487 
5488 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /*  */
5489 {
5490         int val,err;
5491 
5492         if(level!=SOL_TCP)
5493                 return ip_getsockopt(sk,level,optname,optval,optlen);
5494                         
5495         switch(optname)
5496         {
5497                 case TCP_MAXSEG:
5498                         val=sk->user_mss;
5499                         break;
5500                 case TCP_NODELAY:
5501                         val=sk->nonagle;
5502                         break;
5503                 default:
5504                         return(-ENOPROTOOPT);
5505         }
5506         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5507         if(err)
5508                 return err;
5509         put_user(sizeof(int),(int *) optlen);
5510 
5511         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5512         if(err)
5513                 return err;
5514         put_user(val,(int *)optval);
5515 
5516         return(0);
5517 }       
5518 
5519 
/*
 *      The protocol operations table for TCP.
 *
 *      The initialiser is positional, so the order of the entries must
 *      match the field order of struct proto, which is declared outside
 *      this file.  The notes against the entries are inferred from the
 *      handler names only - NOTE(review): confirm each slot against the
 *      struct proto declaration before relying on them.  Code in this
 *      file reaches the table through sk->prot (build_header,
 *      queue_xmit and the retransmits counter are used that way).
 */
5520 struct proto tcp_prot = {
5521         tcp_close,              /* presumably the close handler */
5522         ip_build_header,        /* header building is delegated to IP */
5523         tcp_connect,            /* presumably the connect handler */
5524         tcp_accept,             /* presumably the accept handler */
5525         ip_queue_xmit,          /* transmit queueing is delegated to IP */
5526         tcp_retransmit,         /* presumably the retransmit handler */
5527         tcp_write_wakeup,       /* presumably the write wakeup handler */
5528         tcp_read_wakeup,        /* presumably the read wakeup handler */
5529         tcp_rcv,                /* presumably the receive entry point */
5530         tcp_select,             /* presumably the select handler */
5531         tcp_ioctl,              /* presumably the ioctl handler */
5532         NULL,                   /* slot left empty - TODO confirm which hook */
5533         tcp_shutdown,           /* presumably the shutdown handler */
5534         tcp_setsockopt,         /* SOL_TCP option setter defined above */
5535         tcp_getsockopt,         /* SOL_TCP option getter defined above */
5536         tcp_sendmsg,            /* presumably the sendmsg handler */
5537         tcp_recvmsg,            /* presumably the recvmsg handler */
5538         NULL,           /* No special bind() */
5539         128,                    /* numeric field - TODO confirm meaning */
5540         0,                      /* numeric field, starts at zero - TODO confirm */
5541         "TCP",                  /* presumably the protocol name */
5542         0, 0,                   /* two numeric fields, start at zero - TODO confirm */
5543         {NULL,}                 /* aggregate field, starts out empty */
5544 };
/* */
root/net/ipv4/tcp.c

DEFINITIONS