net/ipv4/tcp.c

/* */
This source file includes the following definitions.
tcp_cache_zap
min
tcp_set_state
tcp_select_window
tcp_find_established
tcp_dequeue_established
tcp_close_pending
tcp_time_wait
tcp_do_retransmit
reset_xmit_timer
tcp_retransmit_time
tcp_retransmit
tcp_write_timeout
retransmit_timer
tcp_err
tcp_readable
tcp_listen_select
tcp_select
tcp_ioctl
tcp_check
tcp_send_check
tcp_send_skb
tcp_dequeue_partial
tcp_send_partial
tcp_enqueue_partial
tcp_send_ack
tcp_build_header
tcp_sendmsg
tcp_read_wakeup
cleanup_rbuf
tcp_recv_urg
tcp_recvmsg
tcp_close_state
tcp_send_fin
tcp_shutdown
tcp_reset
tcp_options
default_mask
tcp_init_seq
tcp_conn_request
tcp_close
tcp_write_xmit
tcp_ack
tcp_fin
tcp_data
tcp_check_urg
tcp_urg
tcp_accept
tcp_connect
tcp_sequence
tcp_std_reset
tcp_rcv
tcp_write_wakeup
tcp_send_probe0
tcp_setsockopt
tcp_getsockopt
   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:       
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect 
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed were wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It 
  33  *                                      wakes people on errors. select 
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_reset() fixed to work for 
  37  *                                      everything not just packets for 
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had 
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames. 
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst 
  46  *                                      receive otherwise odd bits of prattle 
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug. 
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list 
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential 
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the 
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries. 
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks, 
  69  *                                      so the kernel can layer network 
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised 
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer 
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing 
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties remove from the
 100  *                                      TCP code (Be very nice to this man if 
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics. 
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle select() after URG properly in 
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg() 
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in 
 110  *                                      tcp_readable(), select() after URG 
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the 
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to 
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in selecting before an 
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since 
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on 
 137  *                                      the RFC's for other useful protocol 
 138  *                                      references see: Comer, KA9Q NOS, and 
 139  *                                      for a reference on the difference 
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC 
 147  *                                      and using multiple timers for sanity. 
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       Select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and 
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if stat is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but its a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *              Alan Cox        :       Block double connect().
 180  *              Alan Cox        :       Small hooks for enSKIP.
 181  *              Alexey Kuznetsov:       Path MTU discovery.
 182  *              Alan Cox        :       Support soft errors.
 183  *              Alan Cox        :       Fix MTU discovery pathological case
 184  *                                      when the remote claims no mtu!
 185  *              Marc Tamsky     :       TCP_CLOSE fix.
 186  *
 187  * To Fix:
 188  *              Fast path the code. Two things here - fix the window calculation
 189  *              so it doesn't iterate over the queue, also spot packets with no funny
 190  *              options arriving in order and process directly.
 191  *
 192  *              Rewrite output state machine to use a single queue and do low window
 193  *              situations as per the spec (RFC 1122)
 194  *              Speed up input assembly algorithm.
 195  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 196  *              could do with it working on IPv4
 197  *              User settable/learned rtt/max window/mtu
 198  *              Fix the window handling to use PR's new code.
 199  *
 200  *              Change the fundamental structure to a single send queue maintained
 201  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 202  *              active routes too]). Cut the queue off in tcp_retransmit/
 203  *              tcp_transmit.
 204  *              Change the receive queue to assemble as it goes. This lets us
 205  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 206  *              tcp_data/tcp_read as well as the window shrink crud.
 207  *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 208  *              tcp_queue_skb seem obvious routines to extract.
 209  *      
 210  *              This program is free software; you can redistribute it and/or
 211  *              modify it under the terms of the GNU General Public License
 212  *              as published by the Free Software Foundation; either version
 213  *              2 of the License, or(at your option) any later version.
 214  *
 215  * Description of States:
 216  *
 217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 218  *
 219  *      TCP_SYN_RECV            received a connection request, sent ack,
 220  *                              waiting for final ack in three-way handshake.
 221  *
 222  *      TCP_ESTABLISHED         connection established
 223  *
 224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 225  *                              transmission of remaining buffered data
 226  *
 227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 228  *                              to shutdown
 229  *
 230  *      TCP_CLOSING             both sides have shutdown but we still have
 231  *                              data we have to finish sending
 232  *
 233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 234  *                              closed, can only be entered from FIN_WAIT2
 235  *                              or CLOSING.  Required because the other end
 236  *                              may not have gotten our last ACK causing it
 237  *                              to retransmit the data packet (which we ignore)
 238  *
 239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 240  *                              us to finish writing our data and to shutdown
 241  *                              (we have to close() to move on to LAST_ACK)
 242  *
 243  *      TCP_LAST_ACK            our side has shutdown after remote has
 244  *                              shutdown.  There may still be data in our
 245  *                              buffer that we have to finish sending
 246  *              
 247  *      TCP_CLOSE               socket is finished
 248  */
 249 
 250 /*
 251  * RFC1122 status:
 252  * NOTE: I'm not going to be doing comments in the code for this one except
 253  * for violations and the like.  tcp.c is just too big... If I say something
 254  * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 255  * with Alan. -- MS 950903
 256  * 
 257  * Use of PSH (4.2.2.2)
 258  *   MAY aggregate data sent without the PSH flag. (does)
 259  *   MAY queue data received without the PSH flag. (does)
 260  *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 261  *   MAY implement PSH on send calls. (doesn't, thus:)
 262  *     MUST NOT buffer data indefinitely (doesn't [1 second])
 263  *     MUST set PSH on last segment (does)
 264  *   MAY pass received PSH to application layer (doesn't)
 265  *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 266  * 
 267  * Window Size (4.2.2.3, 4.2.2.16)
 268  *   MUST treat window size as an unsigned number (does)
 269  *   SHOULD treat window size as a 32-bit number (does not)
 270  *   MUST NOT shrink window once it is offered (does not normally)
 271  *   
 272  * Urgent Pointer (4.2.2.4)
 273  * **MUST point urgent pointer to last byte of urgent data (not right
 274  *     after). (doesn't, to be like BSD)
 275  *   MUST inform application layer asynchronously of incoming urgent
 276  *     data. (does)
 277  *   MUST provide application with means of determining the amount of
 278  *     urgent data pending. (does)
 279  * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 280  *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 281  *      [Follows BSD 1 byte of urgent data]
 282  * 
 283  * TCP Options (4.2.2.5)
 284  *   MUST be able to receive TCP options in any segment. (does)
 285  *   MUST ignore unsupported options (does)
 286  *   
 287  * Maximum Segment Size Option (4.2.2.6)
 288  *   MUST implement both sending and receiving MSS. (does)
 289  *   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 290  *     it always). (does, even when MSS == 536, which is legal)
 291  *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 292  *   MUST calculate "effective send MSS" correctly:
 293  *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 294  *     (does - but allows operator override)
 295  *  
 296  * TCP Checksum (4.2.2.7)
 297  *   MUST generate and check TCP checksum. (does)
 298  * 
 299  * Initial Sequence Number Selection (4.2.2.8)
 300  *   MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 301  *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 302  *     necessary for 10Mbps networks - and harder than BSD to spoof!)
 303  * 
 304  * Simultaneous Open Attempts (4.2.2.10)
 305  *   MUST support simultaneous open attempts (does)
 306  * 
 307  * Recovery from Old Duplicate SYN (4.2.2.11)
 308  *   MUST keep track of active vs. passive open (does)
 309  * 
 310  * RST segment (4.2.2.12)
 311  *   SHOULD allow an RST segment to contain data (does, but doesn't do
 312  *     anything with it, which is standard)
 313  * 
 314  * Closing a Connection (4.2.2.13)
 315  *   MUST inform application of whether connection was closed by RST or
 316  *     normal close. (does)
 317  *   MAY allow "half-duplex" close (treat connection as closed for the
 318  *     local app, even before handshake is done). (does)
 319  *   MUST linger in TIME_WAIT for 2 * MSL (does)
 320  * 
 321  * Retransmission Timeout (4.2.2.15)
 322  *   MUST implement Jacobson's slow start and congestion avoidance
 323  *     stuff. (does) 
 324  * 
 325  * Probing Zero Windows (4.2.2.17)
 326  *   MUST support probing of zero windows. (does)
 327  *   MAY keep offered window closed indefinitely. (does)
 328  *   MUST allow remote window to stay closed indefinitely. (does)
 329  * 
 330  * Passive Open Calls (4.2.2.18)
 331  *   MUST NOT let new passive open affect other connections. (doesn't)
 332  *   MUST support passive opens (LISTENs) concurrently. (does)
 333  *   
 334  * Time to Live (4.2.2.19)
 335  *   MUST make TCP TTL configurable. (does - IP_TTL option)
 336  * 
 337  * Event Processing (4.2.2.20)
 338  *   SHOULD queue out-of-order segments. (does)
 339  *   MUST aggregate ACK segments whenever possible. (does but badly)
 340  *   
 341  * Retransmission Timeout Calculation (4.2.3.1)
 342  *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 343  *     calculation. (does, or at least explains them in the comments 8*b)
 344  *   SHOULD initialize RTT to 0 and RTO to 3 seconds. (does)
 345  * 
 346  * When to Send an ACK Segment (4.2.3.2)
 347  *   SHOULD implement delayed ACK. (does not)
 348  *   MUST keep ACK delay < 0.5 sec. (N/A)
 349  * 
 350  * When to Send a Window Update (4.2.3.3)
 351  *   MUST implement receiver-side SWS. (does)
 352  *   
 353  * When to Send Data (4.2.3.4)
 354  *   MUST implement sender-side SWS. (does)
 355  *   SHOULD implement Nagle algorithm. (does)
 356  * 
 357  * TCP Connection Failures (4.2.3.5)
 358  *  MUST handle excessive retransmissions "properly" (see the RFC). (does)
 359  *   SHOULD inform application layer of soft errors. (does)
 360  *   
 361  * TCP Keep-Alives (4.2.3.6)
 362  *   MAY provide keep-alives. (does)
 363  *   MUST make keep-alives configurable on a per-connection basis. (does)
 364  *   MUST default to no keep-alives. (does)
 365  * **MUST make keep-alive interval configurable. (doesn't)
 366  * **MUST make default keep-alive interval > 2 hours. (doesn't)
 367  *   MUST NOT interpret failure to ACK keep-alive packet as dead
 368  *     connection. (doesn't)
 369  *   SHOULD send keep-alive with no data. (does)
 370  * 
 371  * TCP Multihoming (4.2.3.7)
 372  *   MUST get source address from IP layer before sending first
 373  *     SYN. (does)
 374  *   MUST use same local address for all segments of a connection. (does)
 375  * 
 376  * IP Options (4.2.3.8)
 377  *   MUST ignore unsupported IP options. (does)
 378  *   MAY support Time Stamp and Record Route. (does)
 379  *   MUST allow application to specify a source route. (does)
 380  *   MUST allow received Source Route option to set route for all future
 381  *     segments on this connection. (does not (security issues))
 382  * 
 383  * ICMP messages (4.2.3.9)
 384  *   MUST act on ICMP errors. (does)
 385  *   MUST slow transmission upon receipt of a Source Quench. (does)
 386  *   MUST NOT abort connection upon receipt of soft Destination
 387  *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 388  *     Problems. (doesn't)
 389  *   SHOULD report soft Destination Unreachables etc. to the
 390  *     application. (does)
 391  *   SHOULD abort connection upon receipt of hard Destination Unreachable
 392  *     messages (2, 3, 4). (does)
 393  * 
 394  * Remote Address Validation (4.2.3.10)
 395  *   MUST reject as an error OPEN for invalid remote IP address. (does)
 396  *   MUST ignore SYN with invalid source address. (does)
 397  *   MUST silently discard incoming SYN for broadcast/multicast
 398  *     address. (does) 
 399  * 
 400  * Asynchronous Reports (4.2.4.1)
 401  * **MUST provide mechanism for reporting soft errors to application
 402  *     layer. (doesn't)
 403  * 
 404  * Type of Service (4.2.4.2)
 405  *   MUST allow application layer to set Type of Service. (does IP_TOS)
 406  * 
 407  * (Whew. -- MS 950903)
 408  **/
 409 
 410 #include <linux/types.h>
 411 #include <linux/sched.h>
 412 #include <linux/mm.h>
 413 #include <linux/time.h>
 414 #include <linux/string.h>
 415 #include <linux/config.h>
 416 #include <linux/socket.h>
 417 #include <linux/sockios.h>
 418 #include <linux/termios.h>
 419 #include <linux/in.h>
 420 #include <linux/fcntl.h>
 421 #include <linux/inet.h>
 422 #include <linux/netdevice.h>
 423 #include <net/snmp.h>
 424 #include <net/ip.h>
 425 #include <net/protocol.h>
 426 #include <net/icmp.h>
 427 #include <net/tcp.h>
 428 #include <net/arp.h>
 429 #include <linux/skbuff.h>
 430 #include <net/sock.h>
 431 #include <net/route.h>
 432 #include <linux/errno.h>
 433 #include <linux/timer.h>
 434 #include <asm/system.h>
 435 #include <asm/segment.h>
 436 #include <linux/mm.h>
 437 #include <net/checksum.h>
 438 
 439 /*
 440  *      The MSL timer is the 'normal' timer.
 441  */
 442  
 443 #define reset_msl_timer(x,y,z)  reset_timer(x,y,z)
 444 
 445 #define SEQ_TICK 3
 446 unsigned long seq_offset;
 447 struct tcp_mib  tcp_statistics;
 448 
 /*
  *      Cache of the last socket hit: two address words, two port words
  *      (presumably the values the lookup matched on - confirm against the
  *      lookup code) and the socket itself.  tcp_cache_zap() clears all five.
  *
  *      Note that the volatile on th_cache_sk qualifies the sock being
  *      pointed at, not the pointer variable.
  */
 volatile unsigned long  th_cache_saddr;
 volatile unsigned long  th_cache_daddr;
 volatile unsigned short th_cache_dport;
 volatile unsigned short th_cache_sport;
 volatile struct sock    *th_cache_sk;
 456 
 457 void tcp_cache_zap(void)
     /*  */
 458 {
 459         unsigned long flags;
 460         save_flags(flags);
 461         cli();
 462         th_cache_saddr=0;
 463         th_cache_daddr=0;
 464         th_cache_dport=0;
 465         th_cache_sport=0;
 466         th_cache_sk=NULL;
 467         restore_flags(flags);
 468 }
 469 
 470 static void tcp_close(struct sock *sk, int timeout);
     /* Forward declaration so that tcp_close_pending() below can call tcp_close(). */
 471 
 472 
 473 /*
 474  *      The less said about this the better, but it works and will do for 1.2 
 475  */
 476 
 477 static struct wait_queue *master_select_wakeup;
     /* Woken by tcp_set_state() when a socket goes from TCP_SYN_RECV to TCP_ESTABLISHED. */
 478 
 /*
  *      Return the smaller of a and b.
  *
  *      The comparison is done on unsigned values, so a negative int handed
  *      in by a caller compares as a very large number.  The result is
  *      handed back as an int.
  */
 static __inline__ int min(unsigned int a, unsigned int b)
 {
         return (b <= a) ? b : a;
 }
 485 
 /*
  *      State tracing is compiled out: STATE_TRACE is explicitly undefined
  *      here.  Define it instead to get the name table below, which
  *      tcp_set_state() indexes by state number in its debug printk.
  */
 #undef STATE_TRACE

 #ifdef STATE_TRACE
 static char *statename[]={
         "Unused",               /*  0 */
         "Established",          /*  1 */
         "Syn Sent",             /*  2 */
         "Syn Recv",             /*  3 */
         "Fin Wait 1",           /*  4 */
         "Fin Wait 2",           /*  5 */
         "Time Wait",            /*  6 */
         "Close",                /*  7 */
         "Close Wait",           /*  8 */
         "Last ACK",             /*  9 */
         "Listen",               /* 10 */
         "Closing"               /* 11 */
 };
 #endif
 495 
 496 static __inline__ void tcp_set_state(struct sock *sk, int state)
     /*  */
 497 {
 498         if(sk->state==TCP_ESTABLISHED)
 499                 tcp_statistics.TcpCurrEstab--;
 500 #ifdef STATE_TRACE
 501         if(sk->debug)
 502                 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
 503 #endif  
 504         /* This is a hack but it doesn't occur often and it's going to
 505            be a real        to fix nicely */
 506            
 507         if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
 508         {
 509                 wake_up_interruptible(&master_select_wakeup);
 510         }
 511         sk->state=state;
 512         if(state==TCP_ESTABLISHED)
 513                 tcp_statistics.TcpCurrEstab++;
 514         if(sk->state==TCP_CLOSE)
 515                 tcp_cache_zap();
 516 }
 517 
 518 /*
 519  *      This routine picks a TCP windows for a socket based on
 520  *      the following constraints
 521  *  
 522  *      1. The window can never be shrunk once it is offered (RFC 793)
 523  *      2. We limit memory per socket
 524  *   
 525  *      For now we use NET2E3's heuristic of offering half the memory
 526  *      we have handy. All is not as bad as this seems however because
 527  *      of two things. Firstly we will bin packets even within the window
 528  *      in order to get the data we are waiting for into the memory limit.
 529  *      Secondly we bin common duplicate forms at receive time
 530  *      Better heuristics welcome
 531  */
 532    
 533 int tcp_select_window(struct sock *sk)
     /*  */
 534 {
 535         int new_window = sock_rspace(sk);
 536         
 537         if(sk->window_clamp)
 538                 new_window=min(sk->window_clamp,new_window);
 539         /*
 540          *      Two things are going on here.  First, we don't ever offer a
 541          *      window less than min(sk->mss, MAX_WINDOW/2).  This is the
 542          *      receiver side of SWS as specified in RFC1122.
 543          *      Second, we always give them at least the window they
 544          *      had before, in order to avoid retracting window.  This
 545          *      is technically allowed, but RFC1122 advises against it and
 546          *      in practice it causes trouble.
 547          *
 548          *      Fixme: This doesn't correctly handle the case where
 549          *      new_window > sk->window but not by enough to allow for the
 550          *      shift in sequence space. 
 551          */
 552         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
 553                 return(sk->window);
 554         return(new_window);
 555 }
 556 
 557 /*
 558  *      Find someone to 'accept'. Must be called with
 559  *      sk->inuse=1 or cli()
 560  */ 
 561 
 562 static struct sk_buff *tcp_find_established(struct sock *s)
     /*  */
 563 {
 564         struct sk_buff *p=skb_peek(&s->receive_queue);
 565         if(p==NULL)
 566                 return NULL;
 567         do
 568         {
 569                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 570                         return p;
 571                 p=p->next;
 572         }
 573         while(p!=(struct sk_buff *)&s->receive_queue);
 574         return NULL;
 575 }
 576 
 /*
  *      Take a completed connection off the socket's receive queue and
  *      return its buffer, or NULL when none is ready.  tcp_accept() uses
  *      this to collect connections.
  *
  *      The search and the unlink are done with interrupts off (cli()), and
  *      the caller's flags are restored before returning.
  */
 static struct sk_buff *tcp_dequeue_established(struct sock *s)
 {
         unsigned long saved;
         struct sk_buff *ready;

         save_flags(saved);
         cli();

         ready = tcp_find_established(s);
         if (ready)
                 skb_unlink(ready);      /* Take it off the queue */

         restore_flags(saved);

         return ready;
 }
 594 
 595 /* 
 596  *      This routine closes sockets which have been at least partially
 597  *      opened, but not yet accepted. Currently it is only called by
 598  *      tcp_close, and timeout mirrors the value there. 
 599  */
 600 
 601 static void tcp_close_pending (struct sock *sk) 
     /*  */
 602 {
 603         struct sk_buff *skb;
 604 
 605         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) 
 606         {
 607                 skb->sk->dead=1;
 608                 tcp_close(skb->sk, 0);
 609                 kfree_skb(skb, FREE_READ);
 610         }
 611         return;
 612 }
 613 
 614 /*
 615  *      Enter the time wait state. 
 616  */
 617 
 618 static void tcp_time_wait(struct sock *sk)
     /*  */
 619 {
 620         tcp_set_state(sk,TCP_TIME_WAIT);
 621         sk->shutdown = SHUTDOWN_MASK;
 622         if (!sk->dead)
 623                 sk->state_change(sk);
 624         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 625 }
 626 
 627 /*
 628  *      A socket has timed out on its send queue and wants to do a
 629  *      little retransmitting. Currently this means TCP.
 630  */
 631 
 632 void tcp_do_retransmit(struct sock *sk, int all)
     /*  */
 633 {
 634         struct sk_buff * skb;
 635         struct proto *prot;
 636         struct device *dev;
 637         int ct=0;
 638         struct rtable *rt;
 639 
 640         prot = sk->prot;
 641         skb = sk->send_head;
 642 
 643         while (skb != NULL)
 644         {
 645                 struct tcphdr *th;
 646                 struct iphdr *iph;
 647                 int size;
 648 
 649                 dev = skb->dev;
 650                 IS_SKB(skb);
 651                 skb->when = jiffies;
 652 
 653                 /*
 654                  *      Discard the surplus MAC header
 655                  */
 656                  
 657                 skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);
 658 
 659                 /*
 660                  * In general it's OK just to use the old packet.  However we
 661                  * need to use the current ack and window fields.  Urg and
 662                  * urg_ptr could possibly stand to be updated as well, but we
 663                  * don't keep the necessary data.  That shouldn't be a problem,
 664                  * if the other end is doing the right thing.  Since we're
 665                  * changing the packet, we have to issue a new IP identifier.
 666                  */
 667 
 668                 iph = (struct iphdr *)skb->data;
 669                 th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
 670                 size = ntohs(iph->tot_len) - (iph->ihl<<2);
 671                 
 672                 /*
 673                  *      Note: We ought to check for window limits here but
 674                  *      currently this is done (less efficiently) elsewhere.
 675                  */
 676 
 677                 /*
 678                  *      Put a MAC header back on (may cause ARPing)
 679                  */
 680                  
 681                 {
 682                         /* ANK: UGLY, but the bug, that was here, should be fixed.
 683                          */
 684                         struct options *  opt = (struct options*)skb->proto_priv;
 685                         rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute);
 686                 }
 687 
 688                 iph->id = htons(ip_id_count++);
 689 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
 690                 if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
 691                         iph->frag_off &= ~htons(IP_DF);
 692 #endif
 693                 ip_send_check(iph);
 694                         
 695                 if (rt==NULL)   /* Deep poo */
 696                 {
 697                         if(skb->sk)
 698                         {
 699                                 skb->sk->err_soft=ENETUNREACH;
 700                                 skb->sk->error_report(skb->sk);
 701                         }
 702                 }
 703                 else
 704                 {
 705                         dev=rt->rt_dev;
 706                         skb->raddr=rt->rt_gateway;
 707                         skb->dev=dev;
 708                         skb->arp=1;
 709                         if (rt->rt_hh)
 710                         {
 711                                 memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
 712                                 if (!rt->rt_hh->hh_uptodate)
 713                                 {
 714                                         skb->arp = 0;
 715 #if RT_CACHE_DEBUG >= 2
 716                                         printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
 717 #endif
 718                                 }
 719                         }
 720                         else if (dev->hard_header)
 721                         {
 722                                 if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
 723                                         skb->arp=0;
 724                         }
 725                 
 726                         /*
 727                          *      This is not the right way to handle this. We have to
 728                          *      issue an up to date window and ack report with this 
 729                          *      retransmit to keep the odd buggy tcp that relies on 
 730                          *      the fact BSD does this happy. 
 731                          *      We don't however need to recalculate the entire 
 732                          *      checksum, so someone wanting a small problem to play
 733                          *      with might like to implement RFC1141/RFC1624 and speed
 734                          *      this up by avoiding a full checksum.
 735                          */
 736                  
 737                         th->ack_seq = htonl(sk->acked_seq);
 738                         th->window = ntohs(tcp_select_window(sk));
 739                         tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
 740                 
 741                         /*
 742                          *      If the interface is (still) up and running, kick it.
 743                          */
 744         
 745                         if (dev->flags & IFF_UP)
 746                         {
 747                                 /*
 748                                  *      If the packet is still being sent by the device/protocol
 749                                  *      below then don't retransmit. This is both needed, and good -
 750                                  *      especially with connected mode AX.25 where it stops resends
 751                                  *      occurring of an as yet unsent anyway frame!
 752                                  *      We still add up the counts as the round trip time wants
 753                                  *      adjusting.
 754                                  */
 755                                 if (sk && !skb_device_locked(skb))
 756                                 {
 757                                         /* Remove it from any existing driver queue first! */
 758                                         skb_unlink(skb);
 759                                         /* Now queue it */
 760                                         ip_statistics.IpOutRequests++;
 761                                         dev_queue_xmit(skb, dev, sk->priority);
 762                                 }
 763                         }
 764                 }
 765                 
 766                 /*
 767                  *      Count retransmissions
 768                  */
 769                  
 770                 ct++;
 771                 sk->retransmits++;
 772                 sk->prot->retransmits ++;
 773                 tcp_statistics.TcpRetransSegs++;
 774                 
 775 
 776                 /*
 777                  *      Only one retransmit requested.
 778                  */
 779         
 780                 if (!all)
 781                         break;
 782 
 783                 /*
 784                  *      This should cut it off before we send too many packets.
 785                  */
 786 
 787                 if (ct >= sk->cong_window)
 788                         break;
 789                 skb = skb->link3;
 790         }
 791 }
 792 
 793 /*
 794  *      Reset the retransmission timer
 795  */
 796  
 797 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
     /*  */
 798 {
 799         del_timer(&sk->retransmit_timer);
 800         sk->ip_xmit_timeout = why;
 801         if((long)when < 0)
 802         {
 803                 when=3;
 804                 printk("Error: Negative timer in xmit_timer\n");
 805         }
 806         sk->retransmit_timer.expires=jiffies+when;
 807         add_timer(&sk->retransmit_timer);
 808 }
 809 
 810 /*
 811  *      This is the normal code called for timeouts.  It does the retransmission
 812  *      and then does backoff.  tcp_do_retransmit is separated out because
 813  *      tcp_ack needs to send stuff from the retransmit queue without
 814  *      initiating a backoff.
 815  */
 816 
 817 
 818 void tcp_retransmit_time(struct sock *sk, int all)
     /*  */
 819 {
 820         tcp_do_retransmit(sk, all);
 821 
 822         /*
 823          * Increase the timeout each time we retransmit.  Note that
 824          * we do not increase the rtt estimate.  rto is initialized
 825          * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
 826          * that doubling rto each time is the least we can get away with.
 827          * In KA9Q, Karn uses this for the first few times, and then
 828          * goes to quadratic.  netBSD doubles, but only goes up to *64,
 829          * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
 830          * defined in the protocol as the maximum possible RTT.  I guess
 831          * we'll have to use something other than TCP to talk to the
 832          * University of Mars.
 833          *
 834          * PAWS allows us longer timeouts and large windows, so once
 835          * implemented ftp to mars will work nicely. We will have to fix
 836          * the 120 second clamps though!
 837          */
 838 
 839         sk->retransmits++;
 840         sk->prot->retransmits++;
 841         sk->backoff++;
 842         sk->rto = min(sk->rto << 1, 120*HZ);
 843         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
 844 }
 845 
 846 
 847 /*
 848  *      A timer event has trigger a tcp retransmit timeout. The
 849  *      socket xmit queue is ready and set up to send. Because
 850  *      the ack receive code keeps the queue straight we do
 851  *      nothing clever here.
 852  */
 853 
 854 static void tcp_retransmit(struct sock *sk, int all)
     /*  */
 855 {
 856         if (all) 
 857         {
 858                 tcp_retransmit_time(sk, all);
 859                 return;
 860         }
 861 
 862         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 863         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 864         sk->cong_count = 0;
 865 
 866         sk->cong_window = 1;
 867 
 868         /* Do the actual retransmit. */
 869         tcp_retransmit_time(sk, all);
 870 }
 871 
 872 /*
 873  *      A write timeout has occurred. Process the after effects.
 874  */
 875 
 876 static int tcp_write_timeout(struct sock *sk)
     /*  */
 877 {
 878         /*
 879          *      Look for a 'soft' timeout.
 880          */
 881         if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
 882                 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
 883         {
 884                 /*
 885                  *      Attempt to recover if arp has changed (unlikely!) or
 886                  *      a route has shifted (not supported prior to 1.3).
 887                  */
 888                 ip_rt_advice(&sk->ip_route_cache, 0);
 889         }
 890         
 891         /*
 892          *      Have we tried to SYN too many times (repent repent 8))
 893          */
 894          
 895         if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
 896         {
 897                 if(sk->err_soft)
 898                         sk->err=sk->err_soft;
 899                 else
 900                         sk->err=ETIMEDOUT;
 901                 sk->error_report(sk);
 902                 del_timer(&sk->retransmit_timer);
 903                 tcp_statistics.TcpAttemptFails++;       /* Is this right ??? - FIXME - */
 904                 tcp_set_state(sk,TCP_CLOSE);
 905                 /* Don't FIN, we got nothing back */
 906                 release_sock(sk);
 907                 return 0;
 908         }
 909         /*
 910          *      Has it gone just too far ?
 911          */
 912         if (sk->retransmits > TCP_RETR2) 
 913         {
 914                 if(sk->err_soft)
 915                         sk->err = sk->err_soft;
 916                 else
 917                         sk->err = ETIMEDOUT;
 918                 sk->error_report(sk);
 919                 del_timer(&sk->retransmit_timer);
 920                 /*
 921                  *      Time wait the socket 
 922                  */
 923                 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) 
 924                 {
 925                         tcp_set_state(sk,TCP_TIME_WAIT);
 926                         reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 927                 }
 928                 else
 929                 {
 930                         /*
 931                          *      Clean up time.
 932                          */
 933                         tcp_set_state(sk, TCP_CLOSE);
 934                         release_sock(sk);
 935                         return 0;
 936                 }
 937         }
 938         return 1;
 939 }
 940 
 941 /*
 942  *      The TCP retransmit timer. This lacks a few small details.
 943  *
 944  *      1.      An initial rtt timeout on the probe0 should cause what we can
 945  *              of the first write queue buffer to be split and sent.
 946  *      2.      On a 'major timeout' as defined by RFC1122 we shouldn't report
 947  *              ETIMEDOUT if we know an additional 'soft' error caused this.
 948  *              tcp_err should save a 'soft error' for us.
 949  */
 950 
 951 static void retransmit_timer(unsigned long data)
     /*  */
 952 {
 953         struct sock *sk = (struct sock*)data;
 954         int why = sk->ip_xmit_timeout;
 955 
 956         /*
 957          *      We are reset. We will send no more retransmits.
 958          */
 959          
 960         if(sk->zapped)
 961                 return;
 962                 
 963         /* 
 964          *      Only process if socket is not in use
 965          */
 966 
 967         cli();
 968         if (sk->inuse || in_bh) 
 969         {
 970                 /* Try again in 1 second */
 971                 sk->retransmit_timer.expires = jiffies+HZ;
 972                 add_timer(&sk->retransmit_timer);
 973                 sti();
 974                 return;
 975         }
 976 
 977         sk->inuse = 1;
 978         sti();
 979 
 980         /* Always see if we need to send an ack. */
 981 
 982         if (sk->ack_backlog) 
 983         {
 984                 sk->prot->read_wakeup (sk);
 985                 if (! sk->dead)
 986                         sk->data_ready(sk,0);
 987         }
 988 
 989         /* Now we need to figure out why the socket was on the timer. */
 990 
 991         switch (why) 
 992         {
 993                 /* Window probing */
 994                 case TIME_PROBE0:
 995                         tcp_send_probe0(sk);
 996                         tcp_write_timeout(sk);
 997                         break;
 998                 /* Retransmitting */
 999                 case TIME_WRITE:
1000                         /* It could be we got here because we needed to send an ack.
1001                          * So we need to check for that.
1002                          */
1003                 {
1004                         struct sk_buff *skb;
1005                         unsigned long flags;
1006 
1007                         save_flags(flags);
1008                         cli();
1009                         skb = sk->send_head;
1010                         if (!skb) 
1011                         {
1012                                 restore_flags(flags);
1013                         } 
1014                         else 
1015                         {
1016                                 /*
1017                                  *      Kicked by a delayed ack. Reset timer
1018                                  *      correctly now
1019                                  */
1020                                 if (jiffies < skb->when + sk->rto) 
1021                                 {
1022                                         reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
1023                                         restore_flags(flags);
1024                                         break;
1025                                 }
1026                                 restore_flags(flags);
1027                                 /*
1028                                  *      Retransmission
1029                                  */
1030                                 sk->retransmits++;
1031                                 sk->prot->retransmits++;
1032                                 sk->prot->retransmit (sk, 0);
1033                                 tcp_write_timeout(sk);
1034                         }
1035                         break;
1036                 }
1037                 /* Sending Keepalives */
1038                 case TIME_KEEPOPEN:
1039                         /* 
1040                          * this reset_timer() call is a hack, this is not
1041                          * how KEEPOPEN is supposed to work.
1042                          */
1043                         reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1044 
1045                         /* Send something to keep the connection open. */
1046                         if (sk->prot->write_wakeup)
1047                                   sk->prot->write_wakeup (sk);
1048                         sk->retransmits++;
1049                         sk->prot->retransmits++;
1050                         tcp_write_timeout(sk);
1051                         break;
1052                 default:
1053                         printk ("rexmit_timer: timer expired - reason unknown\n");
1054                         break;
1055         }
1056         release_sock(sk);
1057 }
1058 
1059 /*
1060  * This routine is called by the ICMP module when it gets some
1061  * sort of error condition.  If err < 0 then the socket should
1062  * be closed and the error returned to the user.  If err > 0
1063  * it's just the icmp type << 8 | icmp code.  After adjustment
1064  * header points to the first 8 bytes of the tcp header.  We need
1065  * to find the appropriate port.
1066  */
1067 
1068 void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
     /*  */
1069         __u32 saddr, struct inet_protocol *protocol)
1070 {
1071         struct tcphdr *th = (struct tcphdr *)header;
1072         struct sock *sk;
1073         
1074         /*
1075          *      This one is _WRONG_. FIXME urgently.
1076          */
1077 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY     
1078         struct iphdr *iph=(struct iphdr *)(header-sizeof(struct iphdr));
1079 #endif  
1080         th =(struct tcphdr *)header;
1081         sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
1082 
1083         if (sk == NULL) 
1084                 return;
1085   
1086         if (type == ICMP_SOURCE_QUENCH) 
1087         {
1088                 /*
1089                  * FIXME:
1090                  * For now we will just trigger a linear backoff.
1091                  * The slow start code should cause a real backoff here.
1092                  */
1093                 if (sk->cong_window > 4)
1094                         sk->cong_window--;
1095                 return;
1096         }
1097         
1098         if (type == ICMP_PARAMETERPROB)
1099         {
1100                 sk->err=EPROTO;
1101                 sk->error_report(sk);
1102         }
1103 
1104 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
1105         if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
1106         {
1107                 struct rtable * rt;
1108                 /*
1109                  * Ugly trick to pass MTU to protocol layer.
1110                  * Really we should add argument "info" to error handler.
1111                  */
1112                 unsigned short new_mtu = ntohs(iph->id);
1113 
1114                 if ((rt = sk->ip_route_cache) != NULL)
1115                         if (rt->rt_mtu > new_mtu)
1116                                 rt->rt_mtu = new_mtu;
1117 
1118                 if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr)
1119                         && new_mtu > sizeof(struct iphdr)+sizeof(struct tcphdr))
1120                         sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
1121 
1122                 return;
1123         }
1124 #endif
1125 
1126         /*
1127          * If we've already connected we will keep trying
1128          * until we time out, or the user gives up.
1129          */
1130 
1131         if (code < 13)
1132         {       
1133                 if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1134                 {
1135                         sk->err = icmp_err_convert[code].errno;
1136                         if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) 
1137                         {
1138                                 tcp_statistics.TcpAttemptFails++;
1139                                 tcp_set_state(sk,TCP_CLOSE);
1140                                 sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
1141                         }
1142                 }
1143                 else    /* Only an error on timeout */
1144                         sk->err_soft = icmp_err_convert[code].errno;
1145         }
1146 }
1147 
1148 
1149 /*
1150  *      Walk down the receive queue counting readable data until we hit the end or we find a gap
1151  *      in the received data queue (ie a frame missing that needs sending to us). Not
1152  *      sorting using two queues as data arrives makes life so much harder.
1153  */
1154 
1155 static int tcp_readable(struct sock *sk)
     /*  */
1156 {
1157         unsigned long counted;
1158         unsigned long amount;
1159         struct sk_buff *skb;
1160         int sum;
1161         unsigned long flags;
1162 
1163         if(sk && sk->debug)
1164                 printk("tcp_readable: %p - ",sk);
1165 
1166         save_flags(flags);
1167         cli();
1168         if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
1169         {
1170                 restore_flags(flags);
1171                 if(sk && sk->debug) 
1172                         printk("empty\n");
1173                 return(0);
1174         }
1175   
1176         counted = sk->copied_seq;       /* Where we are at the moment */
1177         amount = 0;
1178   
1179         /* 
1180          *      Do until a push or until we are out of data. 
1181          */
1182          
1183         do 
1184         {
1185                 if (before(counted, skb->seq))          /* Found a hole so stops here */
1186                         break;
1187                 sum = skb->len - (counted - skb->seq);  /* Length - header but start from where we are up to (avoid overlaps) */
1188                 if (skb->h.th->syn)
1189                         sum++;
1190                 if (sum > 0) 
1191                 {                                       /* Add it up, move on */
1192                         amount += sum;
1193                         if (skb->h.th->syn) 
1194                                 amount--;
1195                         counted += sum;
1196                 }
1197                 /*
1198                  * Don't count urg data ... but do it in the right place!
1199                  * Consider: "old_data (ptr is here) URG PUSH data"
1200                  * The old code would stop at the first push because
1201                  * it counted the urg (amount==1) and then does amount--
1202                  * *after* the loop.  This means tcp_readable() always
1203                  * returned zero if any URG PUSH was in the queue, even
1204                  * though there was normal data available. If we subtract
1205                  * the urg data right here, we even get it to work for more
1206                  * than one URG PUSH skb without normal data.
1207                  * This means that select() finally works now with urg data
1208                  * in the queue.  Note that rlogin was never affected
1209                  * because it doesn't use select(); it uses two processes
1210                  * and a blocking read().  And the queue scan in tcp_read()
1211                  * was correct.  Mike <pall@rz.uni-karlsruhe.de>
1212                  */
1213                 if (skb->h.th->urg)
1214                         amount--;       /* don't count urg data */
1215                 if (amount && skb->h.th->psh) break;
1216                 skb = skb->next;
1217         }
1218         while(skb != (struct sk_buff *)&sk->receive_queue);
1219 
1220         restore_flags(flags);
1221         if(sk->debug)
1222                 printk("got %lu bytes.\n",amount);
1223         return(amount);
1224 }
1225 
1226 /*
1227  * LISTEN is a special case for select..
1228  */
1229 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /*  */
1230 {
1231         if (sel_type == SEL_IN) {
1232                 int retval;
1233 
1234                 sk->inuse = 1;
1235                 retval = (tcp_find_established(sk) != NULL);
1236                 release_sock(sk);
1237                 if (!retval)
1238                         select_wait(&master_select_wakeup,wait);
1239                 return retval;
1240         }
1241         return 0;
1242 }
1243 
1244 
1245 /*
1246  *      Wait for a TCP event.
1247  *
1248  *      Note that we don't need to set "sk->inuse", as the upper select layers
1249  *      take care of normal races (between the test and the event) and we don't
1250  *      go look at any of the socket buffers directly.
1251  */
1252 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
     /*  */
1253 {
1254         if (sk->state == TCP_LISTEN)
1255                 return tcp_listen_select(sk, sel_type, wait);
1256 
1257         switch(sel_type) {
1258         case SEL_IN:
1259                 if (sk->err)
1260                         return 1;
1261                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1262                         break;
1263 
1264                 if (sk->shutdown & RCV_SHUTDOWN)
1265                         return 1;
1266                         
1267                 if (sk->acked_seq == sk->copied_seq)
1268                         break;
1269 
1270                 if (sk->urg_seq != sk->copied_seq ||
1271                     sk->acked_seq != sk->copied_seq+1 ||
1272                     sk->urginline || !sk->urg_data)
1273                         return 1;
1274                 break;
1275 
1276         case SEL_OUT:
1277                 if (sk->err)
1278                         return 1;
1279                 if (sk->shutdown & SEND_SHUTDOWN) 
1280                         return 0;
1281                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1282                         break;
1283                 /*
1284                  * This is now right thanks to a small fix
1285                  * by Matt Dillon.
1286                  */
1287 
1288                 if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
1289                         break;
1290                 return 1;
1291 
1292         case SEL_EX:
1293                 if (sk->urg_data)
1294                         return 1;
1295                 break;
1296         }
1297         select_wait(sk->sleep, wait);
1298         return 0;
1299 }
1300 
1301 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
     /*  */
1302 {
1303         int err;
1304         switch(cmd) 
1305         {
1306 
1307                 case TIOCINQ:
1308 #ifdef FIXME    /* FIXME: */
1309                 case FIONREAD:
1310 #endif
1311                 {
1312                         unsigned long amount;
1313 
1314                         if (sk->state == TCP_LISTEN) 
1315                                 return(-EINVAL);
1316 
1317                         sk->inuse = 1;
1318                         amount = tcp_readable(sk);
1319                         release_sock(sk);
1320                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1321                         if(err)
1322                                 return err;
1323                         put_user(amount, (int *)arg);
1324                         return(0);
1325                 }
1326                 case SIOCATMARK:
1327                 {
1328                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
1329 
1330                         err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1331                         if (err)
1332                                 return err;
1333                         put_user(answ,(int *) arg);
1334                         return(0);
1335                 }
1336                 case TIOCOUTQ:
1337                 {
1338                         unsigned long amount;
1339 
1340                         if (sk->state == TCP_LISTEN) return(-EINVAL);
1341                         amount = sock_wspace(sk);
1342                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1343                         if(err)
1344                                 return err;
1345                         put_user(amount, (int *)arg);
1346                         return(0);
1347                 }
1348                 default:
1349                         return(-EINVAL);
1350         }
1351 }
1352 
1353 
1354 /*
1355  *      This routine computes a TCP checksum. 
1356  *
1357  *      Modified January 1995 from a go-faster DOS routine by
1358  *      Jorge Cwik <jorge@laser.satlink.net>
1359  */
1360  
1361 unsigned short tcp_check(struct tcphdr *th, int len,
     /*  */
1362           unsigned long saddr, unsigned long daddr, unsigned long base)
1363 {     
1364         return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
1365 }
1366 
1367 
1368 
1369 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
     /*  */
1370                 unsigned long daddr, int len, struct sock *sk)
1371 {
1372         th->check = 0;
1373         th->check = tcp_check(th, len, saddr, daddr,
1374                 csum_partial((char *)th,len,0));
1375         return;
1376 }
1377 
1378 /*
1379  *      This is the main buffer sending routine. We queue the buffer
1380  *      having checked it is sane seeming.
1381  */
1382  
1383 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
     /*  */
1384 {
1385         int size;
1386         struct tcphdr * th = skb->h.th;
1387 
1388         /*
1389          *      length of packet (not counting length of pre-tcp headers) 
1390          */
1391          
1392         size = skb->len - ((unsigned char *) th - skb->data);
1393 
1394         /*
1395          *      Sanity check it.. 
1396          */
1397          
1398         if (size < sizeof(struct tcphdr) || size > skb->len) 
1399         {
1400                 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1401                         skb, skb->data, th, skb->len);
1402                 kfree_skb(skb, FREE_WRITE);
1403                 return;
1404         }
1405 
1406         /*
1407          *      If we have queued a header size packet.. (these crash a few
1408          *      tcp stacks if ack is not set)
1409          */
1410          
1411         if (size == sizeof(struct tcphdr)) 
1412         {
1413                 /* If it's got a syn or fin it's notionally included in the size..*/
1414                 if(!th->syn && !th->fin) 
1415                 {
1416                         printk("tcp_send_skb: attempt to queue a bogon.\n");
1417                         kfree_skb(skb,FREE_WRITE);
1418                         return;
1419                 }
1420         }
1421 
1422         /*
1423          *      Actual processing.
1424          */
1425          
1426         tcp_statistics.TcpOutSegs++;  
1427         skb->seq = ntohl(th->seq);
1428         skb->end_seq = skb->seq + size - 4*th->doff;
1429         
1430         /*
1431          *      We must queue if
1432          *
1433          *      a) The right edge of this frame exceeds the window
1434          *      b) We are retransmitting (Nagle's rule)
1435          *      c) We have too many packets 'in flight'
1436          */
1437          
1438         if (after(skb->end_seq, sk->window_seq) ||
1439             (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1440              sk->packets_out >= sk->cong_window) 
1441         {
1442                 /* checksum will be supplied by tcp_write_xmit.  So
1443                  * we shouldn't need to set it at all.  I'm being paranoid */
1444                 th->check = 0;
1445                 if (skb->next != NULL) 
1446                 {
1447                         printk("tcp_send_partial: next != NULL\n");
1448                         skb_unlink(skb);
1449                 }
1450                 skb_queue_tail(&sk->write_queue, skb);
1451                 
1452                 /*
1453                  *      If we don't fit we have to start the zero window
1454                  *      probes. This is broken - we really need to do a partial
1455                  *      send _first_ (This is what causes the Cisco and PC/TCP
1456                  *      grief).
1457                  */
1458                  
1459                 if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
1460                     sk->send_head == NULL && sk->ack_backlog == 0)
1461                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1462         } 
1463         else 
1464         {
1465                 /*
1466                  *      This is going straight out
1467                  */
1468                  
1469                 th->ack_seq = htonl(sk->acked_seq);
1470                 th->window = htons(tcp_select_window(sk));
1471 
1472                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1473 
1474                 sk->sent_seq = sk->write_seq;
1475                 
1476                 /*
1477                  *      This is mad. The tcp retransmit queue is put together
1478                  *      by the ip layer. This causes half the problems with
1479                  *      unroutable FIN's and other things.
1480                  */
1481                  
1482                 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1483                 
1484                 /*
1485                  *      Set for next retransmit based on expected ACK time.
1486                  *      FIXME: We set this every time which means our 
1487                  *      retransmits are really about a window behind.
1488                  */
1489 
1490                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1491         }
1492 }
1493 
1494 /*
1495  *      Locking problems lead us to a messy situation where we can have
1496  *      multiple partially complete buffers queued up. This is really bad
1497  *      as we don't want to be sending partial buffers. Fix this with
1498  *      a semaphore or similar to lock tcp_write per socket.
1499  *
1500  *      These routines are pretty self descriptive.
1501  */
1502  
1503 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
     /*  */
1504 {
1505         struct sk_buff * skb;
1506         unsigned long flags;
1507 
1508         save_flags(flags);
1509         cli();
1510         skb = sk->partial;
1511         if (skb) {
1512                 sk->partial = NULL;
1513                 del_timer(&sk->partial_timer);
1514         }
1515         restore_flags(flags);
1516         return skb;
1517 }
1518 
1519 /*
1520  *      Empty the partial queue
1521  */
1522  
1523 static void tcp_send_partial(struct sock *sk)
     /*  */
1524 {
1525         struct sk_buff *skb;
1526 
1527         if (sk == NULL)
1528                 return;
1529         while ((skb = tcp_dequeue_partial(sk)) != NULL)
1530                 tcp_send_skb(sk, skb);
1531 }
1532 
1533 /*
1534  *      Queue a partial frame
1535  */
1536  
1537 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
     /*  */
1538 {
1539         struct sk_buff * tmp;
1540         unsigned long flags;
1541 
1542         save_flags(flags);
1543         cli();
1544         tmp = sk->partial;
1545         if (tmp)
1546                 del_timer(&sk->partial_timer);
1547         sk->partial = skb;
1548         init_timer(&sk->partial_timer);
1549         /*
1550          *      Wait up to 1 second for the buffer to fill.
1551          */
1552         sk->partial_timer.expires = jiffies+HZ;
1553         sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
1554         sk->partial_timer.data = (unsigned long) sk;
1555         add_timer(&sk->partial_timer);
1556         restore_flags(flags);
1557         if (tmp)
1558                 tcp_send_skb(sk, tmp);
1559 }
1560 
1561 
1562 /*
1563  *      This routine sends an ack and also updates the window. 
1564  */
1565  
1566 static void tcp_send_ack(u32 sequence, u32 ack,
     /*  */
1567              struct sock *sk,
1568              struct tcphdr *th, unsigned long daddr)
1569 {
1570         struct sk_buff *buff;
1571         struct tcphdr *t1;
1572         struct device *dev = NULL;
1573         int tmp;
1574 
1575         if(sk->zapped)
1576                 return;         /* We have been reset, we may not send again */
1577                 
1578         /*
1579          * We need to grab some memory, and put together an ack,
1580          * and then put it into the queue to be sent.
1581          */
1582 
1583         buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1584         if (buff == NULL) 
1585         {
1586                 /* 
1587                  *      Force it to send an ack. We don't have to do this
1588                  *      (ACK is unreliable) but it's much better use of 
1589                  *      bandwidth on slow links to send a spare ack than
1590                  *      resend packets. 
1591                  */
1592                  
1593                 sk->ack_backlog++;
1594                 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) 
1595                 {
1596                         reset_xmit_timer(sk, TIME_WRITE, HZ);
1597                 }
1598                 return;
1599         }
1600 
1601         /*
1602          *      Assemble a suitable TCP frame
1603          */
1604          
1605         buff->sk = sk;
1606         buff->localroute = sk->localroute;
1607 
1608         /* 
1609          *      Put in the IP header and routing stuff. 
1610          */
1611          
1612         tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1613                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1614         if (tmp < 0) 
1615         {
1616                 buff->free = 1;
1617                 sock_wfree(sk, buff);
1618                 return;
1619         }
1620         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
1621 
1622         memcpy(t1, th, sizeof(*t1));
1623 
1624         /*
1625          *      Swap the send and the receive. 
1626          */
1627          
1628         t1->dest = th->source;
1629         t1->source = th->dest;
1630         t1->seq = ntohl(sequence);
1631         t1->ack = 1;
1632         sk->window = tcp_select_window(sk);
1633         t1->window = ntohs(sk->window);
1634         t1->res1 = 0;
1635         t1->res2 = 0;
1636         t1->rst = 0;
1637         t1->urg = 0;
1638         t1->syn = 0;
1639         t1->psh = 0;
1640         t1->fin = 0;
1641         
1642         /*
1643          *      If we have nothing queued for transmit and the transmit timer
1644          *      is on we are just doing an ACK timeout and need to switch
1645          *      to a keepalive.
1646          */
1647          
1648         if (ack == sk->acked_seq) 
1649         {
1650                 sk->ack_backlog = 0;
1651                 sk->bytes_rcv = 0;
1652                 sk->ack_timed = 0;
1653                 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
1654                                   && sk->ip_xmit_timeout == TIME_WRITE) 
1655                 {
1656                         if(sk->keepopen) {
1657                                 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1658                         } else {
1659                                 delete_timer(sk);
1660                         }
1661                 }
1662         }
1663         
1664         /*
1665          *      Fill in the packet and send it
1666          */
1667          
1668         t1->ack_seq = htonl(ack);
1669         t1->doff = sizeof(*t1)/4;
1670         tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1671         if (sk->debug)
1672                  printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1673         tcp_statistics.TcpOutSegs++;
1674         sk->prot->queue_xmit(sk, dev, buff, 1);
1675 }
1676 
1677 
1678 /* 
1679  *      This routine builds a generic TCP header. 
1680  */
1681  
1682 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
     /*  */
1683 {
1684 
1685         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1686         th->seq = htonl(sk->write_seq);
1687         th->psh =(push == 0) ? 1 : 0;
1688         th->doff = sizeof(*th)/4;
1689         th->ack = 1;
1690         th->fin = 0;
1691         sk->ack_backlog = 0;
1692         sk->bytes_rcv = 0;
1693         sk->ack_timed = 0;
1694         th->ack_seq = htonl(sk->acked_seq);
1695         sk->window = tcp_select_window(sk);
1696         th->window = htons(sk->window);
1697 
1698         return(sizeof(*th));
1699 }
1700 
1701 /*
1702  *      This routine copies from a user buffer into a socket,
1703  *      and starts the transmit system.
1704  */
1705 
1706 static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
     /*  */
1707           int len, int nonblock, int flags)
1708 {
1709         int copied = 0;
1710         int copy;
1711         int tmp;
1712         int seglen;
1713         int iovct=0;
1714         struct sk_buff *skb;
1715         struct sk_buff *send_tmp;
1716         struct proto *prot;
1717         struct device *dev = NULL;
1718         unsigned char *from;
1719         
1720         /*
1721          *      Do sanity checking for sendmsg/sendto/send
1722          */
1723          
1724         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1725                 return -EINVAL;
1726         if (msg->msg_name)
1727         {
1728                 struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
1729                 if(sk->state == TCP_CLOSE)
1730                         return -ENOTCONN;
1731                 if (msg->msg_namelen < sizeof(*addr))
1732                         return -EINVAL;
1733                 if (addr->sin_family && addr->sin_family != AF_INET) 
1734                         return -EINVAL;
1735                 if (addr->sin_port != sk->dummy_th.dest) 
1736                         return -EISCONN;
1737                 if (addr->sin_addr.s_addr != sk->daddr) 
1738                         return -EISCONN;
1739         }
1740         
1741         /*
1742          *      Ok commence sending
1743          */
1744         
1745         while(iovct<msg->msg_iovlen)
1746         {
1747                 seglen=msg->msg_iov[iovct].iov_len;
1748                 from=msg->msg_iov[iovct++].iov_base;
1749                 sk->inuse=1;
1750                 prot = sk->prot;
1751                 while(seglen > 0) 
1752                 {
1753                         if (sk->err) 
1754                         {                       /* Stop on an error */
1755                                 release_sock(sk);
1756                                 if (copied) 
1757                                         return(copied);
1758                                 return sock_error(sk);
1759                         }
1760 
1761                         /*
1762                          *      First thing we do is make sure that we are established. 
1763                          */
1764         
1765                         if (sk->shutdown & SEND_SHUTDOWN) 
1766                         {
1767                                 release_sock(sk);
1768                                 sk->err = EPIPE;
1769                                 if (copied) 
1770                                         return(copied);
1771                                 sk->err = 0;
1772                                 return(-EPIPE);
1773                         }
1774 
1775                         /* 
1776                          *      Wait for a connection to finish.
1777                          */
1778                 
1779                         while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
1780                         {
1781                                 if (sk->err) 
1782                                 {
1783                                         release_sock(sk);
1784                                         if (copied) 
1785                                                 return(copied);
1786                                         return sock_error(sk);
1787                                 }               
1788         
1789                                 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
1790                                 {
1791                                         release_sock(sk);
1792                                         if (copied) 
1793                                                 return(copied);
1794         
1795                                         if (sk->err) 
1796                                                 return sock_error(sk);
1797 
1798                                         if (sk->keepopen) 
1799                                         {
1800                                                 send_sig(SIGPIPE, current, 0);
1801                                         }
1802                                         return(-EPIPE);
1803                                 }
1804         
1805                                 if (nonblock || copied) 
1806                                 {
1807                                         release_sock(sk);
1808                                         if (copied) 
1809                                                 return(copied);
1810                                         return(-EAGAIN);
1811                                 }
1812         
1813                                 release_sock(sk);
1814                                 cli();
1815                         
1816                                 if (sk->state != TCP_ESTABLISHED &&
1817                                         sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
1818                                 {
1819                                         interruptible_sleep_on(sk->sleep);      
1820                                         if (current->signal & ~current->blocked)
1821                                         {
1822                                                 sti();
1823                                                 if (copied) 
1824                                                         return(copied);
1825                                                 return(-ERESTARTSYS);
1826                                         }
1827                                 }
1828                                 sk->inuse = 1;
1829                                 sti();
1830                         }
1831         
1832                 /*
1833                  * The following code can result in copy <= if sk->mss is ever
1834                  * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
1835                  * sk->mtu is constant once SYN processing is finished.  I.e. we
1836                  * had better not get here until we've seen his SYN and at least one
1837                  * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
1838                  * But ESTABLISHED should guarantee that.  sk->max_window is by definition
1839                  * non-decreasing.  Note that any ioctl to set user_mss must be done
1840                  * before the exchange of SYN's.  If the initial ack from the other
1841                  * end has a window of 0, max_window and thus mss will both be 0.
1842                  */
1843         
1844                 /* 
1845                  *      Now we need to check if we have a half built packet. 
1846                  */
1847 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
1848                 /*
1849                  *      FIXME:  I'm almost sure that this fragment is BUG,
1850                  *              but it works... I do not know why 8) --ANK
1851                  *
1852                  *      Really, we should rebuild all the queues...
1853                  *      It's difficult. Temprorary hack is to send all
1854                  *      queued segments with allowed fragmentation.
1855                  */
1856                 {
1857                         int new_mss = min(sk->mtu, sk->max_window);
1858                         if (new_mss < sk->mss)
1859                         {
1860                                 tcp_send_partial(sk);
1861                                 sk->mss = new_mss;
1862                         }
1863                 }
1864 #endif
1865         
1866                         if ((skb = tcp_dequeue_partial(sk)) != NULL) 
1867                         {
1868                                 int hdrlen;
1869 
1870                                  /* IP header + TCP header */
1871                                 hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1872                                          + sizeof(struct tcphdr);
1873         
1874                                 /* Add more stuff to the end of skb->len */
1875                                 if (!(flags & MSG_OOB)) 
1876                                 {
1877                                         copy = min(sk->mss - (skb->len - hdrlen), seglen);
1878                                         if (copy <= 0) 
1879                                         {
1880                                                 printk("TCP: **bug**: \"copy\" <= 0\n");
1881                                                 return -EFAULT;
1882                                         }                 
1883                                         memcpy_fromfs(skb_put(skb,copy), from, copy);
1884                                         from += copy;
1885                                         copied += copy;
1886                                         len -= copy;
1887                                         sk->write_seq += copy;
1888                                         seglen -= copy;
1889                                 }
1890                                 if ((skb->len - hdrlen) >= sk->mss ||
1891                                         (flags & MSG_OOB) || !sk->packets_out)
1892                                         tcp_send_skb(sk, skb);
1893                                 else
1894                                         tcp_enqueue_partial(skb, sk);
1895                                 continue;
1896                         }
1897 
1898                 /*
1899                  * We also need to worry about the window.
1900                  * If window < 1/2 the maximum window we've seen from this
1901                  *   host, don't use it.  This is sender side
1902                  *   silly window prevention, as specified in RFC1122.
1903                  *   (Note that this is different than earlier versions of
1904                  *   SWS prevention, e.g. RFC813.).  What we actually do is 
1905                  *   use the whole MSS.  Since the results in the right
1906                  *   edge of the packet being outside the window, it will
1907                  *   be queued for later rather than sent.
1908                  */
1909 
1910                         copy = sk->window_seq - sk->write_seq;
1911                         if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1912                                 copy = sk->mss;
1913                         if (copy > seglen)
1914                                 copy = seglen;
1915 
1916                 /*
1917                  *      We should really check the window here also. 
1918                  */
1919                  
1920                         send_tmp = NULL;
1921                         if (copy < sk->mss && !(flags & MSG_OOB)) 
1922                         {
1923                                 /*
1924                                  *      We will release the socket in case we sleep here. 
1925                                  */
1926                                 release_sock(sk);
1927                                 /*
1928                                  *      NB: following must be mtu, because mss can be increased.
1929                                  *      mss is always <= mtu 
1930                                  */
1931                                 skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1932                                 sk->inuse = 1;
1933                                 send_tmp = skb;
1934                         } 
1935                         else 
1936                         {
1937                                 /*
1938                                  *      We will release the socket in case we sleep here. 
1939                                  */
1940                                 release_sock(sk);
1941                                 skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1942                                 sk->inuse = 1;
1943                         }
1944         
1945                         /*
1946                          *      If we didn't get any memory, we need to sleep. 
1947                          */
1948         
1949                         if (skb == NULL) 
1950                         {
1951                                 sk->socket->flags |= SO_NOSPACE;
1952                                 if (nonblock) 
1953                                 {
1954                                         release_sock(sk);
1955                                         if (copied) 
1956                                                 return(copied);
1957                                         return(-EAGAIN);
1958                                 }
1959 
1960                                 /*
1961                                  *      FIXME: here is another race condition. 
1962                                  */
1963 
1964                                 tmp = sk->wmem_alloc;
1965                                 release_sock(sk);
1966                                 cli();
1967                                 /*
1968                                  *      Again we will try to avoid it. 
1969                                  */
1970                                 if (tmp <= sk->wmem_alloc &&
1971                                           (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1972                                         && sk->err == 0) 
1973                                 {
1974                                         sk->socket->flags &= ~SO_NOSPACE;
1975                                         interruptible_sleep_on(sk->sleep);
1976                                         if (current->signal & ~current->blocked) 
1977                                         {
1978                                                 sti();
1979                                                 if (copied) 
1980                                                         return(copied);
1981                                                 return(-ERESTARTSYS);
1982                                         }
1983                                 }
1984                                 sk->inuse = 1;
1985                                 sti();
1986                                 continue;
1987                         }
1988 
1989                         skb->sk = sk;
1990                         skb->free = 0;
1991                         skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1992         
1993                         /*
1994                          * FIXME: we need to optimize this.
1995                          * Perhaps some hints here would be good.
1996                          */
1997                 
1998                         tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1999                                  IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2000                         if (tmp < 0 ) 
2001                         {
2002                                 sock_wfree(sk, skb);
2003                                 release_sock(sk);
2004                                 if (copied) 
2005                                         return(copied);
2006                                 return(tmp);
2007                         }
2008 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
2009                         skb->ip_hdr->frag_off |= htons(IP_DF);
2010 #endif
2011                         skb->dev = dev;
2012                         skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
2013                         tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
2014                         if (tmp < 0) 
2015                         {
2016                                 sock_wfree(sk, skb);
2017                                 release_sock(sk);
2018                                 if (copied) 
2019                                         return(copied);
2020                                 return(tmp);
2021                         }
2022         
2023                         if (flags & MSG_OOB) 
2024                         {
2025                                 skb->h.th->urg = 1;
2026                                 skb->h.th->urg_ptr = ntohs(copy);
2027                         }
2028 
2029                         memcpy_fromfs(skb_put(skb,copy), from, copy);
2030                 
2031                         from += copy;
2032                         copied += copy;
2033                         len -= copy;
2034                         seglen -= copy;
2035                         skb->free = 0;
2036                         sk->write_seq += copy;
2037                 
2038                         if (send_tmp != NULL && sk->packets_out) 
2039                         {
2040                                 tcp_enqueue_partial(send_tmp, sk);
2041                                 continue;
2042                         }
2043                         tcp_send_skb(sk, skb);
2044                 }
2045         }
2046         sk->err = 0;
2047 
2048 /*
2049  *      Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
2050  *      interactive fast network servers. It's meant to be on and
2051  *      it really improves the throughput though not the echo time
2052  *      on my slow slip link - Alan
2053  */
2054 
2055 /*
2056  *      Avoid possible race on send_tmp - c/o Johannes Stille 
2057  */
2058  
2059         if(sk->partial && ((!sk->packets_out) 
2060      /* If not nagling we can send on the before case too.. */
2061               || (sk->nonagle && before(sk->write_seq , sk->window_seq))
2062         ))
2063                 tcp_send_partial(sk);
2064 
2065         release_sock(sk);
2066         return(copied);
2067 }
2068 
2069 /*
2070  *      Send an ack if one is backlogged at this point. Ought to merge
2071  *      this with tcp_send_ack().
2072  */
2073  
2074 static void tcp_read_wakeup(struct sock *sk)
     /*  */
2075 {
2076         int tmp;
2077         struct device *dev = NULL;
2078         struct tcphdr *t1;
2079         struct sk_buff *buff;
2080 
2081         if (!sk->ack_backlog) 
2082                 return;
2083 
2084         /*
2085          * If we're closed, don't send an ack, or we'll get a RST
2086          * from the closed destination.
2087          */
2088         if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
2089                 return; 
2090 
2091         /*
2092          * FIXME: we need to put code here to prevent this routine from
2093          * being called.  Being called once in a while is ok, so only check
2094          * if this is the second time in a row.
2095          */
2096 
2097         /*
2098          * We need to grab some memory, and put together an ack,
2099          * and then put it into the queue to be sent.
2100          */
2101 
2102         buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
2103         if (buff == NULL) 
2104         {
2105                 /* Try again real soon. */
2106                 reset_xmit_timer(sk, TIME_WRITE, HZ);
2107                 return;
2108         }
2109 
2110         buff->sk = sk;
2111         buff->localroute = sk->localroute;
2112         
2113         /*
2114          *      Put in the IP header and routing stuff. 
2115          */
2116 
2117         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
2118                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2119         if (tmp < 0) 
2120         {
2121                 buff->free = 1;
2122                 sock_wfree(sk, buff);
2123                 return;
2124         }
2125 
2126         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2127 
2128         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
2129         t1->seq = htonl(sk->sent_seq);
2130         t1->ack = 1;
2131         t1->res1 = 0;
2132         t1->res2 = 0;
2133         t1->rst = 0;
2134         t1->urg = 0;
2135         t1->syn = 0;
2136         t1->psh = 0;
2137         sk->ack_backlog = 0;
2138         sk->bytes_rcv = 0;
2139         sk->window = tcp_select_window(sk);
2140         t1->window = htons(sk->window);
2141         t1->ack_seq = htonl(sk->acked_seq);
2142         t1->doff = sizeof(*t1)/4;
2143         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2144         sk->prot->queue_xmit(sk, dev, buff, 1);
2145         tcp_statistics.TcpOutSegs++;
2146 }
2147 
2148 
2149 /*
2150  *      FIXME:
2151  *      This routine frees used buffers.
2152  *      It should consider sending an ACK to let the
2153  *      other end know we now have a bigger window.
2154  */
2155 
2156 static void cleanup_rbuf(struct sock *sk)
     /*  */
2157 {
2158         unsigned long flags;
2159         unsigned long left;
2160         struct sk_buff *skb;
2161         unsigned long rspace;
2162 
2163         if(sk->debug)
2164                 printk("cleaning rbuf for sk=%p\n", sk);
2165   
2166         save_flags(flags);
2167         cli();
2168   
2169         left = sock_rspace(sk);
2170  
2171         /*
2172          *      We have to loop through all the buffer headers,
2173          *      and try to free up all the space we can.
2174          */
2175 
2176         while((skb=skb_peek(&sk->receive_queue)) != NULL) 
2177         {
2178                 if (!skb->used || skb->users) 
2179                         break;
2180                 skb_unlink(skb);
2181                 skb->sk = sk;
2182                 kfree_skb(skb, FREE_READ);
2183         }
2184 
2185         restore_flags(flags);
2186 
2187         /*
2188          *      FIXME:
2189          *      At this point we should send an ack if the difference
2190          *      in the window, and the amount of space is bigger than
2191          *      TCP_WINDOW_DIFF.
2192          */
2193 
2194         if(sk->debug)
2195                 printk("sk->rspace = %lu, was %lu\n", sock_rspace(sk),
2196                                             left);
2197         if ((rspace=sock_rspace(sk)) != left) 
2198         {
2199                 /*
2200                  * This area has caused the most trouble.  The current strategy
2201                  * is to simply do nothing if the other end has room to send at
2202                  * least 3 full packets, because the ack from those will auto-
2203                  * matically update the window.  If the other end doesn't think
2204                  * we have much space left, but we have room for at least 1 more
2205                  * complete packet than it thinks we do, we will send an ack
2206                  * immediately.  Otherwise we will wait up to .5 seconds in case
2207                  * the user reads some more.
2208                  */
2209                 sk->ack_backlog++;
2210         /*
2211          * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
2212          * if the other end is offering a window smaller than the agreed on MSS
2213          * (called sk->mtu here).  In theory there's no connection between send
2214          * and receive, and so no reason to think that they're going to send
2215          * small packets.  For the moment I'm using the hack of reducing the mss
2216          * only on the send side, so I'm putting mtu here.
2217          */
2218 
2219                 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
2220                 {
2221                         /* Send an ack right now. */
2222                         tcp_read_wakeup(sk);
2223                 } 
2224                 else 
2225                 {
2226                         /* Force it to send an ack soon. */
2227                         int was_active = del_timer(&sk->retransmit_timer);
2228                         if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires) 
2229                         {
2230                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
2231                         } 
2232                         else
2233                                 add_timer(&sk->retransmit_timer);
2234                 }
2235         }
2236 } 
2237 
2238 
2239 /*
2240  *      Handle reading urgent data. BSD has very simple semantics for
2241  *      this, no blocking and very strange errors 8)
2242  */
2243  
2244 static int tcp_recv_urg(struct sock * sk, int nonblock,
     /*  */
2245              struct msghdr *msg, int len, int flags, int *addr_len)
2246 {
2247         /*
2248          *      No URG data to read
2249          */
2250         if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
2251                 return -EINVAL; /* Yes this is right ! */
2252                 
2253         if (sk->err) 
2254                 return sock_error(sk);
2255                 
2256         if (sk->state == TCP_CLOSE || sk->done) 
2257         {
2258                 if (!sk->done) 
2259                 {
2260                         sk->done = 1;
2261                         return 0;
2262                 }
2263                 return -ENOTCONN;
2264         }
2265 
2266         if (sk->shutdown & RCV_SHUTDOWN) 
2267         {
2268                 sk->done = 1;
2269                 return 0;
2270         }
2271         sk->inuse = 1;
2272         if (sk->urg_data & URG_VALID) 
2273         {
2274                 char c = sk->urg_data;
2275                 if (!(flags & MSG_PEEK))
2276                         sk->urg_data = URG_READ;
2277                 memcpy_toiovec(msg->msg_iov, &c, 1);
2278                 if(msg->msg_name)
2279                 {
2280                         struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
2281                         sin->sin_family=AF_INET;
2282                         sin->sin_addr.s_addr=sk->daddr;
2283                         sin->sin_port=sk->dummy_th.dest;
2284                 }
2285                 if(addr_len)
2286                         *addr_len=sizeof(struct sockaddr_in);
2287                 release_sock(sk);
2288                 return 1;
2289         }
2290         release_sock(sk);
2291         
2292         /*
2293          * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
2294          * the available implementations agree in this case:
2295          * this call should never block, independent of the
2296          * blocking state of the socket.
2297          * Mike <pall@rz.uni-karlsruhe.de>
2298          */
2299         return -EAGAIN;
2300 }
2301 
2302 
2303 /*
2304  *      This routine copies from a sock struct into the user buffer. 
      *
      *      Up to len bytes of in-sequence data are copied from the buffers
      *      on sk->receive_queue into msg->msg_iov.  MSG_OOB requests are
      *      handed to tcp_recv_urg().  With MSG_PEEK a private copy of the
      *      sequence position is advanced, so sk->copied_seq is not moved
      *      and buffers are not marked used.
      *
      *      Returns the number of bytes copied.  When nothing was copied the
      *      result is 0 at end of stream, or a negative errno: -ENOTCONN
      *      (listening socket, or a closed socket whose end of stream was
      *      already reported), the pending socket error, -EAGAIN when
      *      nonblock is set, or -ERESTARTSYS when a signal arrives while
      *      waiting.
      *
      *      On the way out the peer address is stored in msg->msg_name (if
      *      supplied and something was copied), *addr_len is set, consumed
      *      buffers are released via cleanup_rbuf() and the socket is
      *      released.
2305  */
2306  
2307 static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
     /*  */
2308         int len, int nonblock, int flags, int *addr_len)
2309 {
2310         struct wait_queue wait = { current, NULL };
2311         int copied = 0;
2312         u32 peek_seq;
2313         volatile u32 *seq;      /* So gcc doesn't overoptimise */
2314         unsigned long used;
2315 
2316         /* 
2317          *      This error should be checked. 
2318          */
2319          
2320         if (sk->state == TCP_LISTEN)
2321                 return -ENOTCONN;
2322 
2323         /*
2324          *      Urgent data needs to be handled specially. 
2325          */
2326          
2327         if (flags & MSG_OOB)
2328                 return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
2329 
2330         /*
2331          *      Copying sequence to update. This is volatile to handle
2332          *      the multi-reader case neatly (memcpy_to/fromfs might be 
2333          *      inline and thus not flush cached variables otherwise).
2334          */
2335          
2336         peek_seq = sk->copied_seq;
2337         seq = &sk->copied_seq;
             /* When peeking, advance the local copy instead of the socket's. */
2338         if (flags & MSG_PEEK)
2339                 seq = &peek_seq;
2340 
             /* Go on the socket's wait queue before looking for data; taken off again at the end. */
2341         add_wait_queue(sk->sleep, &wait);
2342         sk->inuse = 1;
2343         while (len > 0) 
2344         {
2345                 struct sk_buff * skb;
2346                 u32 offset;
2347         
2348                 /*
2349                  * Are we at urgent data? Stop if we have read anything.
2350                  */
2351                  
2352                 if (copied && sk->urg_data && sk->urg_seq == *seq)
2353                         break;
2354 
2355                 /*
2356                  *      Next get a buffer.
2357                  */
2358                  
2359                 current->state = TASK_INTERRUPTIBLE;
2360 
                     /*
                      *      Walk the receive queue looking for the buffer that
                      *      holds the byte at *seq.  Buffers lying wholly before
                      *      *seq are marked used on the way (unless peeking).
                      */
2361                 skb = skb_peek(&sk->receive_queue);
2362                 do 
2363                 {
2364                         if (!skb)
2365                                 break;
2366                         if (before(*seq, skb->seq))
2367                                 break;
                             /* Position of *seq within this buffer; a SYN takes a sequence number but no data byte. */
2368                         offset = *seq - skb->seq;
2369                         if (skb->h.th->syn)
2370                                 offset--;
2371                         if (offset < skb->len)
2372                                 goto found_ok_skb;
2373                         if (skb->h.th->fin)
2374                                 goto found_fin_ok;
2375                         if (!(flags & MSG_PEEK))
2376                                 skb->used = 1;
2377                         skb = skb->next;
2378                 }
2379                 while (skb != (struct sk_buff *)&sk->receive_queue);
2380 
                     /* No buffer holds the next byte.  Hand back what we have rather than wait. */
2381                 if (copied)
2382                         break;
2383 
2384                 if (sk->err) 
2385                 {
2386                         copied = sock_error(sk);
2387                         break;
2388                 }
2389 
                     /* A closed socket reports end of stream (0) once, then -ENOTCONN. */
2390                 if (sk->state == TCP_CLOSE) 
2391                 {
2392                         if (!sk->done) 
2393                         {
2394                                 sk->done = 1;
2395                                 break;
2396                         }
2397                         copied = -ENOTCONN;
2398                         break;
2399                 }
2400 
2401                 if (sk->shutdown & RCV_SHUTDOWN) 
2402                 {
2403                         sk->done = 1;
2404                         break;
2405                 }
2406                         
2407                 if (nonblock) 
2408                 {
2409                         copied = -EAGAIN;
2410                         break;
2411                 }
2412 
                     /*
                      *      We may wait: free what has been consumed, release
                      *      the socket and sleep (the task state was set to
                      *      TASK_INTERRUPTIBLE above).  SO_WAITDATA is set on
                      *      the socket only for the duration of the sleep.
                      */
2413                 cleanup_rbuf(sk);
2414                 release_sock(sk);
2415                 sk->socket->flags |= SO_WAITDATA;
2416                 schedule();
2417                 sk->socket->flags &= ~SO_WAITDATA;
2418                 sk->inuse = 1;
2419 
2420                 if (current->signal & ~current->blocked) 
2421                 {
2422                         copied = -ERESTARTSYS;
2423                         break;
2424                 }
2425                 continue;
2426 
2427         found_ok_skb:
2428                 /*
2429                  *      Lock the buffer. We can be fairly relaxed as
2430                  *      an interrupt will never steal a buffer we are 
2431                  *      using unless I've missed something serious in
2432                  *      tcp_data.
2433                  */
2434                 
2435                 skb->users++;
2436                 
2437                 /*
2438                  *      Ok so how much can we use ? 
2439                  */
2440                  
2441                 used = skb->len - offset;
2442                 if (len < used)
2443                         used = len;
2444                 /*
2445                  *      Do we have urgent data here? 
2446                  */
2447                 
                     /*
                      *      If the urgent byte lies inside the span about to be
                      *      copied: when it is the very next byte and is not
                      *      delivered inline, step over it; otherwise copy only
                      *      up to it.
                      */
2448                 if (sk->urg_data) 
2449                 {
2450                         u32 urg_offset = sk->urg_seq - *seq;
2451                         if (urg_offset < used) 
2452                         {
2453                                 if (!urg_offset) 
2454                                 {
2455                                         if (!sk->urginline) 
2456                                         {
2457                                                 ++*seq;
2458                                                 offset++;
2459                                                 used--;
2460                                         }
2461                                 }
2462                                 else
2463                                         used = urg_offset;
2464                         }
2465                 }
2466                 
2467                 /*
2468                  *      Copy it - We _MUST_ update *seq first so that we
2469                  *      don't ever double read when we have dual readers
2470                  */
2471                  
2472                 *seq += used;
2473 
2474                 /*
2475                  *      This memcpy_tofs can sleep. If it sleeps and we
2476                  *      do a second read it relies on the skb->users to avoid
2477                  *      a crash when cleanup_rbuf() gets called.
2478                  */
2479                  
                     /* The data starts doff*4 bytes past the start of the TCP header. */
2480                 memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
2481                         skb->h.th->doff*4 + offset, used);
2482                 copied += used;
2483                 len -= used;
2484                 
2485                 /*
2486                  *      We now will not sleep again until we are finished
2487                  *      with skb. Sorry if you are doing the SMP port
2488                  *      but you'll just have to fix it neatly ;)
2489                  */
2490                  
2491                 skb->users --;
2492                 
                     /* Once the real read position is past the urgent mark, forget the urgent data. */
2493                 if (after(sk->copied_seq,sk->urg_seq))
2494                         sk->urg_data = 0;
                     /* More left in this buffer: go round again without marking it used. */
2495                 if (used + offset < skb->len)
2496                         continue;
2497                 
2498                 /*
2499                  *      Process the FIN.
2500                  */
2501 
2502                 if (skb->h.th->fin)
2503                         goto found_fin_ok;
2504                 if (flags & MSG_PEEK)
2505                         continue;
2506                 skb->used = 1;
2507                 continue;
2508 
2509         found_fin_ok:
                     /* Step over the sequence number taken by the FIN. */
2510                 ++*seq;
2511                 if (flags & MSG_PEEK)
2512                         break;
2513                         
2514                 /*
2515                  *      All is done
2516                  */
2517                  
2518                 skb->used = 1;
2519                 sk->shutdown |= RCV_SHUTDOWN;
2520                 break;
2521 
2522         }
2523         
             /* Report the peer's address, but only if data was actually returned. */
2524         if(copied>0 && msg->msg_name)
2525         {
2526                 struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
2527                 sin->sin_family=AF_INET;
2528                 sin->sin_addr.s_addr=sk->daddr;
2529                 sin->sin_port=sk->dummy_th.dest;
2530         }
2531         if(addr_len)
2532                 *addr_len=sizeof(struct sockaddr_in);
2533                 
2534         remove_wait_queue(sk->sleep, &wait);
2535         current->state = TASK_RUNNING;
2536 
2537         /* Clean up data we have read: This will do ACK frames */
2538         cleanup_rbuf(sk);
2539         release_sock(sk);
2540         return copied;
2541 }
2542 
2543 
2544 
2545 /*
2546  *      State processing on a close. This implements the state shift for
2547  *      sending our FIN frame. Note that we only send a FIN for some 
2548  *      states. A shutdown() may have already sent the FIN, or we may be
2549  *      closed.
2550  */
2551  
2552 static int tcp_close_state(struct sock *sk, int dead)
     /*  */
2553 {
2554         int ns=TCP_CLOSE;
2555         int send_fin=0;
2556         switch(sk->state)
2557         {
2558                 case TCP_SYN_SENT:      /* No SYN back, no FIN needed */
2559                         break;
2560                 case TCP_SYN_RECV:
2561                 case TCP_ESTABLISHED:   /* Closedown begin */
2562                         ns=TCP_FIN_WAIT1;
2563                         send_fin=1;
2564                         break;
2565                 case TCP_FIN_WAIT1:     /* Already closing, or FIN sent: no change */
2566                 case TCP_FIN_WAIT2:
2567                 case TCP_CLOSING:
2568                         ns=sk->state;
2569                         break;
2570                 case TCP_CLOSE:
2571                 case TCP_LISTEN:
2572                         break;
2573                 case TCP_CLOSE_WAIT:    /* They have FIN'd us. We send our FIN and
2574                                            wait only for the ACK */
2575                         ns=TCP_LAST_ACK;
2576                         send_fin=1;
2577         }
2578         
2579         tcp_set_state(sk,ns);
2580                 
2581         /*
2582          *      This is a (useful) BSD violating of the RFC. There is a
2583          *      problem with TCP as specified in that the other end could
2584          *      keep a socket open forever with no application left this end.
2585          *      We use a 3 minute timeout (about the same as BSD) then kill
2586          *      our end. If they send after that then tough - BUT: long enough
2587          *      that we won't make the old 4*rto = almost no time - whoops
2588          *      reset mistake.
2589          */
2590         if(dead && ns==TCP_FIN_WAIT2)
2591         {
2592                 int timer_active=del_timer(&sk->timer);
2593                 if(timer_active)
2594                         add_timer(&sk->timer);
2595                 else
2596                         reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2597         }
2598         
2599         return send_fin;
2600 }
2601 
2602 /*
2603  *      Send a fin.
2604  */
2605 
2606 static void tcp_send_fin(struct sock *sk)
     /*  */
2607 {
2608         struct proto *prot =(struct proto *)sk->prot;
2609         struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
2610         struct tcphdr *t1;
2611         struct sk_buff *buff;
2612         struct device *dev=NULL;
2613         int tmp;
2614                 
2615         release_sock(sk); /* in case the malloc sleeps. */
2616         
2617         buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2618         sk->inuse = 1;
2619 
2620         if (buff == NULL)
2621         {
2622                 /* This is a disaster if it occurs */
2623                 printk("tcp_send_fin: Impossible malloc failure");
2624                 return;
2625         }
2626 
2627         /*
2628          *      Administrivia
2629          */
2630          
2631         buff->sk = sk;
2632         buff->localroute = sk->localroute;
2633 
2634         /*
2635          *      Put in the IP header and routing stuff. 
2636          */
2637 
2638         tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2639                            IPPROTO_TCP, sk->opt,
2640                            sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2641         if (tmp < 0) 
2642         {
2643                 int t;
2644                 /*
2645                  *      Finish anyway, treat this as a send that got lost. 
2646                  *      (Not good).
2647                  */
2648                  
2649                 buff->free = 1;
2650                 sock_wfree(sk,buff);
2651                 sk->write_seq++;
2652                 t=del_timer(&sk->timer);
2653                 if(t)
2654                         add_timer(&sk->timer);
2655                 else
2656                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2657                 return;
2658         }
2659         
2660         /*
2661          *      We ought to check if the end of the queue is a buffer and
2662          *      if so simply add the fin to that buffer, not send it ahead.
2663          */
2664 
2665         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2666         buff->dev = dev;
2667         memcpy(t1, th, sizeof(*t1));
2668         buff->seq = sk->write_seq;
2669         sk->write_seq++;
2670         buff->end_seq = sk->write_seq;
2671         t1->seq = htonl(buff->seq);
2672         t1->ack = 1;
2673         t1->ack_seq = htonl(sk->acked_seq);
2674         t1->window = htons(sk->window=tcp_select_window(sk));
2675         t1->fin = 1;
2676         t1->rst = 0;
2677         t1->doff = sizeof(*t1)/4;
2678         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2679 
2680         /*
2681          * If there is data in the write queue, the fin must be appended to
2682          * the write queue.
2683          */
2684         
2685         if (skb_peek(&sk->write_queue) != NULL) 
2686         {
2687                 buff->free = 0;
2688                 if (buff->next != NULL) 
2689                 {
2690                         printk("tcp_send_fin: next != NULL\n");
2691                         skb_unlink(buff);
2692                 }
2693                 skb_queue_tail(&sk->write_queue, buff);
2694         } 
2695         else 
2696         {
2697                 sk->sent_seq = sk->write_seq;
2698                 sk->prot->queue_xmit(sk, dev, buff, 0);
2699                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2700         }
2701 }
2702 
2703 /*
2704  *      Shutdown the sending side of a connection. Much like close except
2705  *      that we don't receive shut down or set sk->dead=1.
2706  */
2707 
2708 void tcp_shutdown(struct sock *sk, int how)
     /*  */
2709 {
2710         /*
2711          *      We need to grab some memory, and put together a FIN,
2712          *      and then put it into the queue to be sent.
2713          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2714          */
2715 
2716         if (!(how & SEND_SHUTDOWN)) 
2717                 return;
2718          
2719         /*
2720          *      If we've already sent a FIN, or it's a closed state
2721          */
2722          
2723         if (sk->state == TCP_FIN_WAIT1 ||
2724             sk->state == TCP_FIN_WAIT2 ||
2725             sk->state == TCP_CLOSING ||
2726             sk->state == TCP_LAST_ACK ||
2727             sk->state == TCP_TIME_WAIT || 
2728             sk->state == TCP_CLOSE ||
2729             sk->state == TCP_LISTEN
2730           )
2731         {
2732                 return;
2733         }
2734         sk->inuse = 1;
2735 
2736         /*
2737          * flag that the sender has shutdown
2738          */
2739 
2740         sk->shutdown |= SEND_SHUTDOWN;
2741 
2742         /*
2743          *  Clear out any half completed packets. 
2744          */
2745 
2746         if (sk->partial)
2747                 tcp_send_partial(sk);
2748                 
2749         /*
2750          *      FIN if needed
2751          */
2752          
2753         if(tcp_close_state(sk,0))
2754                 tcp_send_fin(sk);
2755                 
2756         release_sock(sk);
2757 }
2758 
2759 /*
2760  *      This routine will send an RST to the other tcp. 
2761  */
2762  
2763 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
     /*  */
2764           struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2765 {
2766         struct sk_buff *buff;
2767         struct tcphdr *t1;
2768         int tmp;
2769         struct device *ndev=NULL;
2770 
2771         /*
2772          *      Cannot reset a reset (Think about it).
2773          */
2774          
2775         if(th->rst)
2776                 return;
2777   
2778         /*
2779          * We need to grab some memory, and put together an RST,
2780          * and then put it into the queue to be sent.
2781          */
2782 
2783         buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2784         if (buff == NULL) 
2785                 return;
2786 
2787         buff->sk = NULL;
2788         buff->dev = dev;
2789         buff->localroute = 0;
2790 
2791         /*
2792          *      Put in the IP header and routing stuff. 
2793          */
2794 
2795         tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2796                            sizeof(struct tcphdr),tos,ttl,NULL);
2797         if (tmp < 0) 
2798         {
2799                 buff->free = 1;
2800                 sock_wfree(NULL, buff);
2801                 return;
2802         }
2803 
2804         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2805         memcpy(t1, th, sizeof(*t1));
2806 
2807         /*
2808          *      Swap the send and the receive. 
2809          */
2810 
2811         t1->dest = th->source;
2812         t1->source = th->dest;
2813         t1->rst = 1;  
2814         t1->window = 0;
2815   
2816         if(th->ack)
2817         {
2818                 t1->ack = 0;
2819                 t1->seq = th->ack_seq;
2820                 t1->ack_seq = 0;
2821         }
2822         else
2823         {
2824                 t1->ack = 1;
2825                 if(!th->syn)
2826                         t1->ack_seq = th->seq;
2827                 else
2828                         t1->ack_seq = htonl(ntohl(th->seq)+1);
2829                 t1->seq = 0;
2830         }
2831 
2832         t1->syn = 0;
2833         t1->urg = 0;
2834         t1->fin = 0;
2835         t1->psh = 0;
2836         t1->doff = sizeof(*t1)/4;
2837         tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2838         prot->queue_xmit(NULL, ndev, buff, 1);
2839         tcp_statistics.TcpOutSegs++;
2840 }
2841 
2842 
2843 /*
2844  *      Look for tcp options. Parses everything but only knows about MSS.
2845  *      This routine is always called with the packet containing the SYN.
2846  *      However it may also be called with the ack to the SYN.  So you
2847  *      can't assume this is always the SYN.  It's always called after
2848  *      we have set up sk->mtu to our own MTU.
2849  *
2850  *      We need at minimum to add PAWS support here. Possibly large windows
2851  *      as Linux gets deployed on 100Mb/sec networks.
2852  */
2853  
2854 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /*  */
2855 {
2856         unsigned char *ptr;
2857         int length=(th->doff*4)-sizeof(struct tcphdr);
2858         int mss_seen = 0;
2859     
2860         ptr = (unsigned char *)(th + 1);
2861   
2862         while(length>0)
2863         {
2864                 int opcode=*ptr++;
2865                 int opsize=*ptr++;
2866                 switch(opcode)
2867                 {
2868                         case TCPOPT_EOL:
2869                                 return;
2870                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
2871                                 length--;
2872                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
2873                                 continue;
2874                         
2875                         default:
2876                                 if(opsize<=2)   /* Avoid silly options looping forever */
2877                                         return;
2878                                 switch(opcode)
2879                                 {
2880                                         case TCPOPT_MSS:
2881                                                 if(opsize==4 && th->syn)
2882                                                 {
2883                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2884                                                         mss_seen = 1;
2885                                                 }
2886                                                 break;
2887                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2888                                 }
2889                                 ptr+=opsize-2;
2890                                 length-=opsize;
2891                 }
2892         }
2893         if (th->syn) 
2894         {
2895                 if (! mss_seen)
2896                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2897         }
2898 #ifdef CONFIG_INET_PCTCP
2899         sk->mss = min(sk->max_window >> 1, sk->mtu);
2900 #else    
2901         sk->mss = min(sk->max_window, sk->mtu);
2902 #endif  
2903 }
2904 
/*
 *      Classful network mask for an address.
 *
 *      dst is an IPv4 address in network byte order.  The mask returned,
 *      also in network byte order, is the class A or class B mask when
 *      the address tests as such, and the class C mask for everything
 *      else.
 */
static inline unsigned long default_mask(unsigned long dst)
{
        unsigned long host_addr = ntohl(dst);
        unsigned long net_mask = IN_CLASSC_NET;

        if (IN_CLASSA(host_addr))
                net_mask = IN_CLASSA_NET;
        else if (IN_CLASSB(host_addr))
                net_mask = IN_CLASSB_NET;

        return htonl(net_mask);
}
2914 
2915 /*
2916  *      Default sequence number picking algorithm.
2917  *      As close as possible to RFC 793, which
2918  *      suggests using a 250kHz clock.
2919  *      Further reading shows this assumes 2MB/s networks.
2920  *      For 10MB/s ethernet, a 1MHz clock is appropriate.
2921  *      That's funny, Linux has one built in!  Use it!
2922  */
2923 
2924 extern inline u32 tcp_init_seq(void)
     /*  */
2925 {
2926         struct timeval tv;
2927         do_gettimeofday(&tv);
2928         return tv.tv_usec+tv.tv_sec*1000000;
2929 }
2930 
2931 /*
2932  *      This routine handles a connection request.
2933  *      It should make sure we haven't already responded.
2934  *      Because of the way BSD works, we have to send a syn/ack now.
2935  *      This also means it will be harder to close a socket which is
2936  *      listening.
2937  */
2938  
2939 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
     /*  */
2940                  unsigned long daddr, unsigned long saddr,
2941                  struct options *opt, struct device *dev, u32 seq)
2942 {
2943         struct sk_buff *buff;
2944         struct tcphdr *t1;
2945         unsigned char *ptr;
2946         struct sock *newsk;
2947         struct tcphdr *th;
2948         struct device *ndev=NULL;
2949         int tmp;
2950         struct rtable *rt;
2951   
2952         th = skb->h.th;
2953 
2954         /* If the socket is dead, don't accept the connection. */
2955         if (!sk->dead) 
2956         {
2957                 sk->data_ready(sk,0);
2958         }
2959         else 
2960         {
2961                 if(sk->debug)
2962                         printk("Reset on %p: Connect on dead socket.\n",sk);
2963                 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2964                 tcp_statistics.TcpAttemptFails++;
2965                 kfree_skb(skb, FREE_READ);
2966                 return;
2967         }
2968 
2969         /*
2970          * Make sure we can accept more.  This will prevent a
2971          * flurry of syns from eating up all our memory.
2972          */
2973 
2974         if (sk->ack_backlog >= sk->max_ack_backlog) 
2975         {
2976                 tcp_statistics.TcpAttemptFails++;
2977                 kfree_skb(skb, FREE_READ);
2978                 return;
2979         }
2980 
2981         /*
2982          * We need to build a new sock struct.
2983          * It is sort of bad to have a socket without an inode attached
2984          * to it, but the wake_up's will just wake up the listening socket,
2985          * and if the listening socket is destroyed before this is taken
2986          * off of the queue, this will take care of it.
2987          */
2988 
2989         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2990         if (newsk == NULL) 
2991         {
2992                 /* just ignore the syn.  It will get retransmitted. */
2993                 tcp_statistics.TcpAttemptFails++;
2994                 kfree_skb(skb, FREE_READ);
2995                 return;
2996         }
2997 
2998         memcpy(newsk, sk, sizeof(*newsk));
2999         newsk->opt = NULL;
3000         newsk->ip_route_cache  = NULL;
3001         if (opt && opt->optlen) {
3002           sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
3003           if (!sk->opt) {
3004                 kfree_s(newsk, sizeof(struct sock));
3005                 tcp_statistics.TcpAttemptFails++;
3006                 kfree_skb(skb, FREE_READ);
3007                 return;
3008           }
3009           if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {
3010                 kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
3011                 kfree_s(newsk, sizeof(struct sock));
3012                 tcp_statistics.TcpAttemptFails++;
3013                 kfree_skb(skb, FREE_READ);
3014                 return;
3015           }
3016         }
3017         skb_queue_head_init(&newsk->write_queue);
3018         skb_queue_head_init(&newsk->receive_queue);
3019         newsk->send_head = NULL;
3020         newsk->send_tail = NULL;
3021         skb_queue_head_init(&newsk->back_log);
3022         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
3023         newsk->rto = TCP_TIMEOUT_INIT;
3024         newsk->mdev = 0;
3025         newsk->max_window = 0;
3026         newsk->cong_window = 1;
3027         newsk->cong_count = 0;
3028         newsk->ssthresh = 0;
3029         newsk->backoff = 0;
3030         newsk->blog = 0;
3031         newsk->intr = 0;
3032         newsk->proc = 0;
3033         newsk->done = 0;
3034         newsk->partial = NULL;
3035         newsk->pair = NULL;
3036         newsk->wmem_alloc = 0;
3037         newsk->rmem_alloc = 0;
3038         newsk->localroute = sk->localroute;
3039 
3040         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3041 
3042         newsk->err = 0;
3043         newsk->shutdown = 0;
3044         newsk->ack_backlog = 0;
3045         newsk->acked_seq = skb->seq+1;
3046         newsk->copied_seq = skb->seq+1;
3047         newsk->fin_seq = skb->seq;
3048         newsk->state = TCP_SYN_RECV;
3049         newsk->timeout = 0;
3050         newsk->ip_xmit_timeout = 0;
3051         newsk->write_seq = seq; 
3052         newsk->window_seq = newsk->write_seq;
3053         newsk->rcv_ack_seq = newsk->write_seq;
3054         newsk->urg_data = 0;
3055         newsk->retransmits = 0;
3056         newsk->linger=0;
3057         newsk->destroy = 0;
3058         init_timer(&newsk->timer);
3059         newsk->timer.data = (unsigned long)newsk;
3060         newsk->timer.function = &net_timer;
3061         init_timer(&newsk->retransmit_timer);
3062         newsk->retransmit_timer.data = (unsigned long)newsk;
3063         newsk->retransmit_timer.function=&retransmit_timer;
3064         newsk->dummy_th.source = skb->h.th->dest;
3065         newsk->dummy_th.dest = skb->h.th->source;
3066         
3067         /*
3068          *      Swap these two, they are from our point of view. 
3069          */
3070          
3071         newsk->daddr = saddr;
3072         newsk->saddr = daddr;
3073         newsk->rcv_saddr = daddr;
3074 
3075         put_sock(newsk->num,newsk);
3076         newsk->dummy_th.res1 = 0;
3077         newsk->dummy_th.doff = 6;
3078         newsk->dummy_th.fin = 0;
3079         newsk->dummy_th.syn = 0;
3080         newsk->dummy_th.rst = 0;        
3081         newsk->dummy_th.psh = 0;
3082         newsk->dummy_th.ack = 0;
3083         newsk->dummy_th.urg = 0;
3084         newsk->dummy_th.res2 = 0;
3085         newsk->acked_seq = skb->seq + 1;
3086         newsk->copied_seq = skb->seq + 1;
3087         newsk->socket = NULL;
3088 
3089         /*
3090          *      Grab the ttl and tos values and use them 
3091          */
3092 
3093         newsk->ip_ttl=sk->ip_ttl;
3094         newsk->ip_tos=skb->ip_hdr->tos;
3095 
3096         /*
3097          *      Use 512 or whatever user asked for 
3098          */
3099 
3100         /*
3101          *      Note use of sk->user_mss, since user has no direct access to newsk 
3102          */
3103 
3104         rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
3105         newsk->ip_route_cache = rt;
3106         
3107         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3108                 newsk->window_clamp = rt->rt_window;
3109         else
3110                 newsk->window_clamp = 0;
3111                 
3112         if (sk->user_mss)
3113                 newsk->mtu = sk->user_mss;
3114         else if (rt)
3115                 newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
3116         else 
3117                 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
3118 
3119         /*
3120          *      But not bigger than device MTU 
3121          */
3122 
3123         newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
3124 
3125 #ifdef CONFIG_SKIP
3126         
3127         /*
3128          *      SKIP devices set their MTU to 65535. This is so they can take packets
3129          *      unfragmented to security process then fragment. They could lie to the
3130          *      TCP layer about a suitable MTU, but its easier to let skip sort it out
3131          *      simply because the final package we want unfragmented is going to be
3132          *
3133          *      [IPHDR][IPSP][Security data][Modified TCP data][Security data]
3134          */
3135          
3136         if(skip_pick_mtu!=NULL)         /* If SKIP is loaded.. */
3137                 sk->mtu=skip_pick_mtu(sk->mtu,dev);
3138 #endif
3139         /*
3140          *      This will min with what arrived in the packet 
3141          */
3142 
3143         tcp_options(newsk,skb->h.th);
3144         
3145         tcp_cache_zap();
3146 
3147         buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3148         if (buff == NULL) 
3149         {
3150                 sk->err = ENOMEM;
3151                 newsk->dead = 1;
3152                 newsk->state = TCP_CLOSE;
3153                 /* And this will destroy it */
3154                 release_sock(newsk);
3155                 kfree_skb(skb, FREE_READ);
3156                 tcp_statistics.TcpAttemptFails++;
3157                 return;
3158         }
3159   
3160         buff->sk = newsk;
3161         buff->localroute = newsk->localroute;
3162 
3163         /*
3164          *      Put in the IP header and routing stuff. 
3165          */
3166 
3167         tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3168                                IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
3169 
3170         /*
3171          *      Something went wrong. 
3172          */
3173 
3174         if (tmp < 0) 
3175         {
3176                 sk->err = tmp;
3177                 buff->free = 1;
3178                 kfree_skb(buff,FREE_WRITE);
3179                 newsk->dead = 1;
3180                 newsk->state = TCP_CLOSE;
3181                 release_sock(newsk);
3182                 skb->sk = sk;
3183                 kfree_skb(skb, FREE_READ);
3184                 tcp_statistics.TcpAttemptFails++;
3185                 return;
3186         }
3187 
3188         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
3189   
3190         memcpy(t1, skb->h.th, sizeof(*t1));
3191         buff->seq = newsk->write_seq++;
3192         buff->end_seq = newsk->write_seq;
3193         /*
3194          *      Swap the send and the receive. 
3195          */
3196         t1->dest = skb->h.th->source;
3197         t1->source = newsk->dummy_th.source;
3198         t1->seq = ntohl(buff->seq);
3199         t1->ack = 1;
3200         newsk->window = tcp_select_window(newsk);
3201         newsk->sent_seq = newsk->write_seq;
3202         t1->window = ntohs(newsk->window);
3203         t1->res1 = 0;
3204         t1->res2 = 0;
3205         t1->rst = 0;
3206         t1->urg = 0;
3207         t1->psh = 0;
3208         t1->syn = 1;
3209         t1->ack_seq = htonl(newsk->acked_seq);
3210         t1->doff = sizeof(*t1)/4+1;
3211         ptr = skb_put(buff,4);
3212         ptr[0] = 2;
3213         ptr[1] = 4;
3214         ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3215         ptr[3] =(newsk->mtu) & 0xff;
3216 
3217         tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3218         newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3219         reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3220         skb->sk = newsk;
3221 
3222         /*
3223          *      Charge the sock_buff to newsk. 
3224          */
3225          
3226         sk->rmem_alloc -= skb->truesize;
3227         newsk->rmem_alloc += skb->truesize;
3228         
3229         skb_queue_tail(&sk->receive_queue,skb);
3230         sk->ack_backlog++;
3231         release_sock(newsk);
3232         tcp_statistics.TcpOutSegs++;
3233 }
3234 
3235 
3236 static void tcp_close(struct sock *sk, int timeout)
     /*  */
3237 {
3238         /*
3239          * We need to grab some memory, and put together a FIN, 
3240          * and then put it into the queue to be sent.
3241          */
3242         
3243         sk->inuse = 1;
3244         
3245         if(th_cache_sk==sk)
3246                 tcp_cache_zap();
3247         if(sk->state == TCP_LISTEN)
3248         {
3249                 /* Special case */
3250                 tcp_set_state(sk, TCP_CLOSE);
3251                 tcp_close_pending(sk);
3252                 release_sock(sk);
3253                 return;
3254         }
3255         
3256         sk->keepopen = 1;
3257         sk->shutdown = SHUTDOWN_MASK;
3258 
3259         if (!sk->dead) 
3260                 sk->state_change(sk);
3261 
3262         if (timeout == 0) 
3263         {
3264                 struct sk_buff *skb;
3265                 
3266                 /*
3267                  *  We need to flush the recv. buffs.  We do this only on the
3268                  *  descriptor close, not protocol-sourced closes, because the
3269                  *  reader process may not have drained the data yet!
3270                  */
3271                  
3272                 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3273                         kfree_skb(skb, FREE_READ);
3274                 /*
3275                  *      Get rid off any half-completed packets. 
3276                  */
3277 
3278                 if (sk->partial) 
3279                         tcp_send_partial(sk);
3280         }
3281 
3282                 
3283         /*
3284          *      Timeout is not the same thing - however the code likes
3285          *      to send both the same way (sigh).
3286          */
3287          
3288         if(timeout)
3289         {
3290                 tcp_set_state(sk, TCP_CLOSE);   /* Dead */
3291         }
3292         else
3293         {
3294                 if(tcp_close_state(sk,1)==1)
3295                 {
3296                         tcp_send_fin(sk);
3297                 }
3298         }
3299         release_sock(sk);
3300 }
3301 
3302 
3303 /*
3304  *      This routine takes stuff off of the write queue,
3305  *      and puts it in the xmit queue. This happens as incoming acks
3306  *      open up the remote window for us.
3307  */
3308  
3309 static void tcp_write_xmit(struct sock *sk)
     /*  */
3310 {
3311         struct sk_buff *skb;
3312 
3313         /*
3314          *      The bytes will have to remain here. In time closedown will
3315          *      empty the write queue and all will be happy 
3316          */
3317 
3318         if(sk->zapped)
3319                 return;
3320 
3321         /*
3322          *      Anything on the transmit queue that fits the window can
3323          *      be added providing we are not
3324          *
3325          *      a) retransmitting (Nagle's rule)
3326          *      b) exceeding our congestion window.
3327          */
3328          
3329         while((skb = skb_peek(&sk->write_queue)) != NULL &&
3330                 before(skb->end_seq, sk->window_seq + 1) &&
3331                 (sk->retransmits == 0 ||
3332                  sk->ip_xmit_timeout != TIME_WRITE ||
3333                  before(skb->end_seq, sk->rcv_ack_seq + 1))
3334                 && sk->packets_out < sk->cong_window) 
3335         {
3336                 IS_SKB(skb);
3337                 skb_unlink(skb);
3338                 
3339                 /*
3340                  *      See if we really need to send the packet. 
3341                  */
3342                  
3343                 if (before(skb->end_seq, sk->rcv_ack_seq +1)) 
3344                 {
3345                         /*
3346                          *      This is acked data. We can discard it. This 
3347                          *      cannot currently occur.
3348                          */
3349                          
3350                         sk->retransmits = 0;
3351                         kfree_skb(skb, FREE_WRITE);
3352                         if (!sk->dead) 
3353                                 sk->write_space(sk);
3354                 } 
3355                 else
3356                 {
3357                         struct tcphdr *th;
3358                         struct iphdr *iph;
3359                         int size;
3360 /*
3361  * put in the ack seq and window at this point rather than earlier,
3362  * in order to keep them monotonic.  We really want to avoid taking
3363  * back window allocations.  That's legal, but RFC1122 says it's frowned on.
3364  * Ack and window will in general have changed since this packet was put
3365  * on the write queue.
3366  */
3367                         iph = skb->ip_hdr;
3368                         th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
3369                         size = skb->len - (((unsigned char *) th) - skb->data);
3370 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
3371                         if (size > sk->mtu - sizeof(struct iphdr))
3372                         {
3373                                 iph->frag_off &= ~htons(IP_DF);
3374                                 ip_send_check(iph);
3375                         }
3376 #endif
3377                         
3378                         th->ack_seq = htonl(sk->acked_seq);
3379                         th->window = htons(tcp_select_window(sk));
3380 
3381                         tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
3382 
3383                         sk->sent_seq = skb->end_seq;
3384                         
3385                         /*
3386                          *      IP manages our queue for some crazy reason
3387                          */
3388                          
3389                         sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3390                         
3391                         /*
3392                          *      Again we slide the timer wrongly
3393                          */
3394                          
3395                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3396                 }
3397         }
3398 }
3399 
3400 
3401 /*
3402  *      This routine deals with incoming acks, but not outgoing ones.
3403  */
3404 
3405 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
     /*  */
3406 {
3407         u32 ack;
3408         int flag = 0;
3409 
3410         /* 
3411          * 1 - there was data in packet as well as ack or new data is sent or 
3412          *     in shutdown state
3413          * 2 - data from retransmit queue was acked and removed
3414          * 4 - window shrunk or data from retransmit queue was acked and removed
3415          */
3416 
3417         if(sk->zapped)
3418                 return(1);      /* Dead, cant ack any more so why bother */
3419 
3420         /*
3421          *      Have we discovered a larger window
3422          */
3423          
3424         ack = ntohl(th->ack_seq);
3425 
3426         if (ntohs(th->window) > sk->max_window) 
3427         {
3428                 sk->max_window = ntohs(th->window);
3429 #ifdef CONFIG_INET_PCTCP
3430                 /* Hack because we don't send partial packets to non SWS
3431                    handling hosts */
3432                 sk->mss = min(sk->max_window>>1, sk->mtu);
3433 #else
3434                 sk->mss = min(sk->max_window, sk->mtu);
3435 #endif  
3436         }
3437 
3438         /*
3439          *      We have dropped back to keepalive timeouts. Thus we have
3440          *      no retransmits pending.
3441          */
3442          
3443         if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3444                 sk->retransmits = 0;
3445 
3446         /*
3447          *      If the ack is newer than sent or older than previous acks
3448          *      then we can probably ignore it.
3449          */
3450          
3451         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
3452         {
3453                 if(sk->debug)
3454                         printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3455                         
3456                 /*
3457                  *      Keepalive processing.
3458                  */
3459                  
3460                 if (after(ack, sk->sent_seq)) 
3461                 {
3462                         return(0);
3463                 }
3464                 
3465                 /*
3466                  *      Restart the keepalive timer.
3467                  */
3468                  
3469                 if (sk->keepopen) 
3470                 {
3471                         if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3472                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3473                 }
3474                 return(1);
3475         }
3476 
3477         /*
3478          *      If there is data set flag 1
3479          */
3480          
3481         if (len != th->doff*4) 
3482                 flag |= 1;
3483 
3484         /*
3485          *      See if our window has been shrunk. 
3486          */
3487 
3488         if (after(sk->window_seq, ack+ntohs(th->window))) 
3489         {
3490                 /*
3491                  * We may need to move packets from the send queue
3492                  * to the write queue, if the window has been shrunk on us.
3493                  * The RFC says you are not allowed to shrink your window
3494                  * like this, but if the other end does, you must be able
3495                  * to deal with it.
3496                  */
3497                 struct sk_buff *skb;
3498                 struct sk_buff *skb2;
3499                 struct sk_buff *wskb = NULL;
3500         
3501                 skb2 = sk->send_head;
3502                 sk->send_head = NULL;
3503                 sk->send_tail = NULL;
3504         
3505                 /*
3506                  *      This is an artifact of a flawed concept. We want one
3507                  *      queue and a smarter send routine when we send all.
3508                  */
3509         
3510                 flag |= 4;      /* Window changed */
3511         
3512                 sk->window_seq = ack + ntohs(th->window);
3513                 cli();
3514                 while (skb2 != NULL) 
3515                 {
3516                         skb = skb2;
3517                         skb2 = skb->link3;
3518                         skb->link3 = NULL;
3519                         if (after(skb->end_seq, sk->window_seq)) 
3520                         {
3521                                 if (sk->packets_out > 0) 
3522                                         sk->packets_out--;
3523                                 /* We may need to remove this from the dev send list. */
3524                                 if (skb->next != NULL) 
3525                                 {
3526                                         skb_unlink(skb);                                
3527                                 }
3528                                 /* Now add it to the write_queue. */
3529                                 if (wskb == NULL)
3530                                         skb_queue_head(&sk->write_queue,skb);
3531                                 else
3532                                         skb_append(wskb,skb);
3533                                 wskb = skb;
3534                         } 
3535                         else 
3536                         {
3537                                 if (sk->send_head == NULL) 
3538                                 {
3539                                         sk->send_head = skb;
3540                                         sk->send_tail = skb;
3541                                 }
3542                                 else
3543                                 {
3544                                         sk->send_tail->link3 = skb;
3545                                         sk->send_tail = skb;
3546                                 }
3547                                 skb->link3 = NULL;
3548                         }
3549                 }
3550                 sti();
3551         }
3552 
3553         /*
3554          *      Pipe has emptied
3555          */
3556          
3557         if (sk->send_tail == NULL || sk->send_head == NULL) 
3558         {
3559                 sk->send_head = NULL;
3560                 sk->send_tail = NULL;
3561                 sk->packets_out= 0;
3562         }
3563 
3564         /*
3565          *      Update the right hand window edge of the host
3566          */
3567          
3568         sk->window_seq = ack + ntohs(th->window);
3569 
3570         /*
3571          *      We don't want too many packets out there. 
3572          */
3573          
3574         if (sk->ip_xmit_timeout == TIME_WRITE && 
3575                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
3576         {
3577                 /* 
3578                  * This is Jacobson's slow start and congestion avoidance. 
3579                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
3580                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
3581                  * counter and increment it once every cwnd times.  It's possible
3582                  * that this should be done only if sk->retransmits == 0.  I'm
3583                  * interpreting "new data is acked" as including data that has
3584                  * been retransmitted but is just now being acked.
3585                  */
3586                 if (sk->cong_window < sk->ssthresh)  
3587                         /* 
3588                          *      In "safe" area, increase
3589                          */
3590                         sk->cong_window++;
3591                 else 
3592                 {
3593                         /*
3594                          *      In dangerous area, increase slowly.  In theory this is
3595                          *      sk->cong_window += 1 / sk->cong_window
3596                          */
3597                         if (sk->cong_count >= sk->cong_window) 
3598                         {
3599                                 sk->cong_window++;
3600                                 sk->cong_count = 0;
3601                         }
3602                         else 
3603                                 sk->cong_count++;
3604                 }
3605         }
3606 
3607         /*
3608          *      Remember the highest ack received.
3609          */
3610          
3611         sk->rcv_ack_seq = ack;
3612         
3613         /*
3614          *      We passed data and got it acked, remove any soft error
3615          *      log. Something worked...
3616          */
3617          
3618         sk->err_soft = 0;
3619 
3620         /*
3621          *      If this ack opens up a zero window, clear backoff.  It was
3622          *      being used to time the probes, and is probably far higher than
3623          *      it needs to be for normal retransmission.
3624          */
3625 
3626         if (sk->ip_xmit_timeout == TIME_PROBE0) 
3627         {
3628                 sk->retransmits = 0;    /* Our probe was answered */
3629                 
3630                 /*
3631                  *      Was it a usable window open ?
3632                  */
3633                  
3634                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
3635                     ! before (sk->window_seq, sk->write_queue.next->end_seq)) 
3636                 {
3637                         sk->backoff = 0;
3638                         
3639                         /*
3640                          *      Recompute rto from rtt.  this eliminates any backoff.
3641                          */
3642 
3643                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3644                         if (sk->rto > 120*HZ)
3645                                 sk->rto = 120*HZ;
3646                         if (sk->rto < HZ/5)     /* Was 1*HZ, then 1 - turns out we must allow about
3647                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3648                                                    .2 of a second is going to need huge windows (SIGH) */
3649                         sk->rto = HZ/5;
3650                 }
3651         }
3652 
3653         /* 
3654          *      See if we can take anything off of the retransmit queue.
3655          */
3656    
3657         while(sk->send_head != NULL) 
3658         {
3659                 /* Check for a bug. */
3660                 if (sk->send_head->link3 &&
3661                     after(sk->send_head->end_seq, sk->send_head->link3->end_seq)) 
3662                         printk("INET: tcp.c: *** bug send_list out of order.\n");
3663                         
3664                 /*
3665                  *      If our packet is before the ack sequence we can
3666                  *      discard it as it's confirmed to have arrived the other end.
3667                  */
3668                  
3669                 if (before(sk->send_head->end_seq, ack+1)) 
3670                 {
3671                         struct sk_buff *oskb;   
3672                         if (sk->retransmits) 
3673                         {       
3674                                 /*
3675                                  *      We were retransmitting.  don't count this in RTT est 
3676                                  */
3677                                 flag |= 2;
3678 
3679                                 /*
3680                                  * even though we've gotten an ack, we're still
3681                                  * retransmitting as long as we're sending from
3682                                  * the retransmit queue.  Keeping retransmits non-zero
3683                                  * prevents us from getting new data interspersed with
3684                                  * retransmissions.
3685                                  */
3686 
3687                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
3688                                         sk->retransmits = 1;
3689                                 else
3690                                         sk->retransmits = 0;
3691                         }
3692                         /*
3693                          * Note that we only reset backoff and rto in the
3694                          * rtt recomputation code.  And that doesn't happen
3695                          * if there were retransmissions in effect.  So the
3696                          * first new packet after the retransmissions is
3697                          * sent with the backoff still in effect.  Not until
3698                          * we get an ack from a non-retransmitted packet do
3699                          * we reset the backoff and rto.  This allows us to deal
3700                          * with a situation where the network delay has increased
3701                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3702                          */
3703 
3704                         /*
3705                          *      We have one less packet out there. 
3706                          */
3707                          
3708                         if (sk->packets_out > 0) 
3709                                 sk->packets_out --;
3710                         /* 
3711                          *      Wake up the process, it can probably write more. 
3712                          */
3713                         if (!sk->dead) 
3714                                 sk->write_space(sk);
3715                         oskb = sk->send_head;
3716 
3717                         if (!(flag&2))  /* Not retransmitting */
3718                         {
3719                                 long m;
3720         
3721                                 /*
3722                                  *      The following amusing code comes from Jacobson's
3723                                  *      article in SIGCOMM '88.  Note that rtt and mdev
3724                                  *      are scaled versions of rtt and mean deviation.
3725                                  *      This is designed to be as fast as possible 
3726                                  *      m stands for "measurement".
3727                                  */
3728         
3729                                 m = jiffies - oskb->when;  /* RTT */
3730                                 if(m<=0)
3731                                         m=1;            /* IS THIS RIGHT FOR <0 ??? */
3732                                 m -= (sk->rtt >> 3);    /* m is now error in rtt est */
3733                                 sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
3734                                 if (m < 0)
3735                                         m = -m;         /* m is now abs(error) */
3736                                 m -= (sk->mdev >> 2);   /* similar update on mdev */
3737                                 sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
3738         
3739                                 /*
3740                                  *      Now update timeout.  Note that this removes any backoff.
3741                                  */
3742                          
3743                                 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3744                                 if (sk->rto > 120*HZ)
3745                                         sk->rto = 120*HZ;
3746                                 if (sk->rto < HZ/5)     /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3747                                         sk->rto = HZ/5;
3748                                 sk->backoff = 0;
3749                         }
3750                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
3751                                            In this case as we just set it up */
3752                         cli();
3753                         oskb = sk->send_head;
3754                         IS_SKB(oskb);
3755                         sk->send_head = oskb->link3;
3756                         if (sk->send_head == NULL) 
3757                         {
3758                                 sk->send_tail = NULL;
3759                         }
3760 
3761                 /*
3762                  *      We may need to remove this from the dev send list. 
3763                  */
3764 
3765                         if (oskb->next)
3766                                 skb_unlink(oskb);
3767                         sti();
3768                         kfree_skb(oskb, FREE_WRITE); /* write. */
3769                         if (!sk->dead) 
3770                                 sk->write_space(sk);
3771                 }
3772                 else
3773                 {
3774                         break;
3775                 }
3776         }
3777 
3778         /*
3779          * XXX someone ought to look at this too.. at the moment, if skb_peek()
3780          * returns non-NULL, we complete ignore the timer stuff in the else
3781          * clause.  We ought to organize the code so that else clause can
3782          * (should) be executed regardless, possibly moving the PROBE timer
3783          * reset over.  The skb_peek() thing should only move stuff to the
3784          * write queue, NOT also manage the timer functions.
3785          */
3786 
3787         /*
3788          * Maybe we can take some stuff off of the write queue,
3789          * and put it onto the xmit queue.
3790          */
3791         if (skb_peek(&sk->write_queue) != NULL) 
3792         {
3793                 if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
3794                         (sk->retransmits == 0 || 
3795                          sk->ip_xmit_timeout != TIME_WRITE ||
3796                          before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
3797                         && sk->packets_out < sk->cong_window) 
3798                 {
3799                         /*
3800                          *      Add more data to the send queue.
3801                          */
3802                         flag |= 1;
3803                         tcp_write_xmit(sk);
3804                 }
3805                 else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
3806                         sk->send_head == NULL &&
3807                         sk->ack_backlog == 0 &&
3808                         sk->state != TCP_TIME_WAIT) 
3809                 {
3810                         /*
3811                          *      Data to queue but no room.
3812                          */
3813                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3814                 }               
3815         }
3816         else
3817         {
3818                 /*
3819                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3820                  * from TCP_CLOSE we don't do anything
3821                  *
3822                  * from anything else, if there is write data (or fin) pending,
3823                  * we use a TIME_WRITE timeout, else if keepalive we reset to
3824                  * a KEEPALIVE timeout, else we delete the timer.
3825                  *
3826                  * We do not set flag for nominal write data, otherwise we may
3827                  * force a state where we start to write itsy bitsy tidbits
3828                  * of data.
3829                  */
3830 
3831                 switch(sk->state) {
3832                 case TCP_TIME_WAIT:
3833                         /*
3834                          * keep us in TIME_WAIT until we stop getting packets,
3835                          * reset the timeout.
3836                          */
3837                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3838                         break;
3839                 case TCP_CLOSE:
3840                         /*
3841                          * don't touch the timer.
3842                          */
3843                         break;
3844                 default:
3845                         /*
3846                          *      Must check send_head, write_queue, and ack_backlog
3847                          *      to determine which timeout to use.
3848                          */
3849                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3850                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3851                         } else if (sk->keepopen) {
3852                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3853                         } else {
3854                                 del_timer(&sk->retransmit_timer);
3855                                 sk->ip_xmit_timeout = 0;
3856                         }
3857                         break;
3858                 }
3859         }
3860 
3861         /*
3862          *      We have nothing queued but space to send. Send any partial
3863          *      packets immediately (end of Nagle rule application).
3864          */
3865          
3866         if (sk->packets_out == 0 && sk->partial != NULL &&
3867                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
3868         {
3869                 flag |= 1;
3870                 tcp_send_partial(sk);
3871         }
3872 
3873         /*
3874          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
3875          * we are now waiting for an acknowledgement of our FIN.  The other end is
3876          * already in TIME_WAIT.
3877          *
3878          * Move to TCP_CLOSE on success.
3879          */
3880 
3881         if (sk->state == TCP_LAST_ACK) 
3882         {
3883                 if (!sk->dead)
3884                         sk->state_change(sk);
3885                 if(sk->debug)
3886                         printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3887                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3888                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
3889                 {
3890                         flag |= 1;
3891                         sk->shutdown = SHUTDOWN_MASK;
3892                         tcp_set_state(sk,TCP_CLOSE);
3893                         return 1;
3894                 }
3895         }
3896 
3897         /*
3898          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
3899          *
3900          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
3901          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3902          */
3903 
3904         if (sk->state == TCP_FIN_WAIT1) 
3905         {
3906 
3907                 if (!sk->dead) 
3908                         sk->state_change(sk);
3909                 if (sk->rcv_ack_seq == sk->write_seq) 
3910                 {
3911                         flag |= 1;
3912                         sk->shutdown |= SEND_SHUTDOWN;
3913                         tcp_set_state(sk, TCP_FIN_WAIT2);
3914                 }
3915         }
3916 
3917         /*
3918          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
3919          *
3920          *      Move to TIME_WAIT
3921          */
3922 
3923         if (sk->state == TCP_CLOSING) 
3924         {
3925 
3926                 if (!sk->dead) 
3927                         sk->state_change(sk);
3928                 if (sk->rcv_ack_seq == sk->write_seq) 
3929                 {
3930                         flag |= 1;
3931                         tcp_time_wait(sk);
3932                 }
3933         }
3934         
3935         /*
3936          *      Final ack of a three way shake 
3937          */
3938          
3939         if(sk->state==TCP_SYN_RECV)
3940         {
3941                 tcp_set_state(sk, TCP_ESTABLISHED);
3942                 tcp_options(sk,th);
3943                 sk->dummy_th.dest=th->source;
3944                 sk->copied_seq = sk->acked_seq;
3945                 if(!sk->dead)
3946                         sk->state_change(sk);
3947                 if(sk->max_window==0)
3948                 {
3949                         sk->max_window=32;      /* Sanity check */
3950                         sk->mss=min(sk->max_window,sk->mtu);
3951                 }
3952         }
3953         
3954         /*
3955          * I make no guarantees about the first clause in the following
3956          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
3957          * what conditions "!flag" would be true.  However I think the rest
3958          * of the conditions would prevent that from causing any
3959          * unnecessary retransmission. 
3960          *   Clearly if the first packet has expired it should be 
3961          * retransmitted.  The other alternative, "flag&2 && retransmits", is
3962          * harder to explain:  You have to look carefully at how and when the
3963          * timer is set and with what timeout.  The most recent transmission always
3964          * sets the timer.  So in general if the most recent thing has timed
3965          * out, everything before it has as well.  So we want to go ahead and
3966          * retransmit some more.  If we didn't explicitly test for this
3967          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3968          * would not be true.  If you look at the pattern of timing, you can
3969          * show that rto is increased fast enough that the next packet would
3970          * almost never be retransmitted immediately.  Then you'd end up
3971          * waiting for a timeout to send each packet on the retransmission
3972          * queue.  With my implementation of the Karn sampling algorithm,
3973          * the timeout would double each time.  The net result is that it would
3974          * take a hideous amount of time to recover from a single dropped packet.
3975          * It's possible that there should also be a test for TIME_WRITE, but
3976          * I think as long as "send_head != NULL" and "retransmit" is on, we've
3977          * got to be in real retransmission mode.
3978          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
3979          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3980          * As long as no further losses occur, this seems reasonable.
3981          */
3982         
3983         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3984                (((flag&2) && sk->retransmits) ||
3985                (sk->send_head->when + sk->rto < jiffies))) 
3986         {
3987                 if(sk->send_head->when + sk->rto < jiffies)
3988                         tcp_retransmit(sk,0);   
3989                 else
3990                 {
3991                         tcp_do_retransmit(sk, 1);
3992                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3993                 }
3994         }
3995 
3996         return(1);
3997 }
3998 
3999 
4000 /*
4001  *      Process the FIN bit of a received segment. This behaves as it is
4002  *      supposed to: the FIN takes effect when it is validly part of
4003  *      sequence space, not before, while we still have holes.
4004  *
4005  *      If we are ESTABLISHED, a received FIN moves us to CLOSE-WAIT
4006  *      (and thence on to LAST-ACK and finally CLOSE; we never enter
4007  *      TIME-WAIT).
4008  *
4009  *      If we are in FINWAIT-1, a received FIN indicates a simultaneous
4010  *      close and we go into CLOSING (and later on to TIME-WAIT).
4011  *
4012  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
4013  *
4014  */
4015  
4016 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /* Every path returns 0.  th is consulted only for its rst bit. */
4017 {
4018         sk->fin_seq = skb->end_seq;     /* taken from the buffer handed in as carrying the FIN */
4019 
4020         if (!sk->dead) 
4021         {
4022                 /* callbacks are only invoked while sk->dead is clear */
4023                 sk->state_change(sk);
4024                 sock_wake_async(sk->socket, 1);
4025         }
4026 
4027         switch(sk->state) 
4028         {
4029                 case TCP_SYN_RECV:
4030                 case TCP_SYN_SENT:
4031                 case TCP_ESTABLISHED:
4032                         /*
4033                          * move to CLOSE_WAIT, tcp_data() already handled
4034                          * sending the ack.  The two SYN states share this path.
4035                          */
4036                         tcp_set_state(sk,TCP_CLOSE_WAIT);
4037                         if (th->rst)    /* a segment with RST set as well shuts both directions */
4038                                 sk->shutdown = SHUTDOWN_MASK;
4039                         break;
4040 
4041                 case TCP_CLOSE_WAIT:
4042                 case TCP_CLOSING:
4043                         /*
4044                          * received a retransmission of the FIN, do
4045                          * nothing.
4046                          */
4047                         break;
4048                 case TCP_TIME_WAIT:
4049                         /*
4050                          * received a retransmission of the FIN,
4051                          * restart the TIME_WAIT timer.
4052                          */
4053                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4054                         return(0);
4055                 case TCP_FIN_WAIT1:
4056                         /*
4057                          * This case occurs when a simultaneous close
4058                          * happens, we must ack the received FIN and
4059                          * enter the CLOSING state.
4060                          *
4061                          * This causes a WRITE timeout, which will either
4062                          * move on to TIME_WAIT when we timeout, or resend
4063                          * the FIN properly (maybe we get rid of that annoying
4064                          * FIN lost hang). The TIME_WRITE code is already correct
4065                          * for handling this timeout.
4066                          */
4067 
4068                         if(sk->ip_xmit_timeout != TIME_WRITE)   /* leave an already running TIME_WRITE timer alone */
4069                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4070                         tcp_set_state(sk,TCP_CLOSING);
4071                         break;
4072                 case TCP_FIN_WAIT2:
4073                         /*
4074                          * received a FIN -- send ACK and enter TIME_WAIT
4075                          */
4076                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4077                         sk->shutdown|=SHUTDOWN_MASK;
4078                         tcp_set_state(sk,TCP_TIME_WAIT);
4079                         break;
4080                 case TCP_CLOSE:
4081                         /*
4082                          * already in CLOSE
4083                          */
4084                         break;
4085                 default:
4086                         /* every state not listed above goes to LAST_ACK */
4087                         tcp_set_state(sk,TCP_LAST_ACK);
4088         
4089                         /* Start the timers. */
4090                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4091                         return(0);
4092         }
4093 
4094         return(0);
4095 }
4094 
4095 
4096 
4097 /*
4098  *      This routine handles the data.  If there is room in the buffer,
4099  *      it will have already been moved into it.  If there is no
4100  *      room, then we will just have to discard the packet.
4101  *      Always returns 0; skb is either linked into sk->receive_queue or freed here. */
4102 
4103 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /* saddr is only handed on to tcp_send_ack(); len still includes the th->doff*4 header bytes */
4104          unsigned long saddr, unsigned short len)
4105 {
4106         struct sk_buff *skb1, *skb2;
4107         struct tcphdr *th;
4108         int dup_dumped=0;       /* set when skb replaced a queued frame with the same seq */
4109         u32 new_seq, shut_seq;
4110 
4111         th = skb->h.th;
4112         skb_pull(skb,th->doff*4);               /* drop the TCP header ... */
4113         skb_trim(skb,len-(th->doff*4));         /* ... and keep only the payload length */
4114 
4115         /*
4116          *      The bytes in the receive read/assembly queue has increased. Needed for the
4117          *      low memory discard algorithm 
4118          */
4119            
4120         sk->bytes_rcv += skb->len;
4121         
4122         if (skb->len == 0 && !th->fin) 
4123         {
4124                 /* 
4125                  *      Don't want to keep passing ack's back and forth. 
4126                  *      (someone sent us dataless, boring frame)
4127                  */
4128                 if (!th->ack)
4129                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4130                 kfree_skb(skb, FREE_READ);
4131                 return(0);
4132         }
4133         
4134         /*
4135          *      We no longer have anyone receiving data on this connection.
4136          */
4137 
4138 #ifndef TCP_DONT_RST_SHUTDOWN            
4139 
4140         if(sk->shutdown & RCV_SHUTDOWN)
4141         {
4142                 /*
4143                  *      FIXME: BSD has some magic to avoid sending resets to
4144                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
4145                  *      BSD stacks still have broken keepalives so we want to
4146                  *      cope with it.
4147                  */
4148 
4149                 if(skb->len)    /* We don't care if it's just an ack or
4150                                    a keepalive/window probe */
4151                 {
4152                         new_seq = skb->seq + skb->len + th->syn;        /* Right edge of _data_ part of frame */
4153                         
4154                         /* Do this the way 4.4BSD treats it. Not what I'd
4155                            regard as the meaning of the spec but it's what BSD
4156                            does and clearly they know everything 8) */
4157 
4158                         /*
4159                          *      This is valid because of two things
4160                          *
4161                          *      a) The way tcp_data behaves at the bottom.
4162                          *      b) A fin takes effect when read not when received.
4163                          */
4164                          
4165                         shut_seq = sk->acked_seq+1;     /* Last byte */
4166                         
4167                         if(after(new_seq,shut_seq))
4168                         {
4169                                 if(sk->debug)
4170                                         printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4171                                                 sk, new_seq, shut_seq, sk->blog);
4172                                 if(sk->dead)
4173                                 {
4174                                         /* dead socket: reset the peer and close, dropping the frame */
4175                                         sk->acked_seq = new_seq + th->fin;
4176                                         tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4177                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4178                                         tcp_statistics.TcpEstabResets++;
4179                                         sk->err = EPIPE;
4180                                         sk->error_report(sk);
4181                                         sk->shutdown = SHUTDOWN_MASK;
4182                                         tcp_set_state(sk,TCP_CLOSE);
4183                                         kfree_skb(skb, FREE_READ);
4184                                         return 0;
4185                                 }
4186                         }
4187                 }
4188         }
4189 
4190 #endif
4191 
4192         /*
4193          *      Now we have to walk the chain, and figure out where this one
4194          *      goes into it.  This is set up so that the last packet we received
4195          *      will be the first one we look at, that way if everything comes
4196          *      in order, there will be no performance loss, and if they come
4197          *      out of order we will be able to fit things in nicely.
4198          *
4199          *      [AC: This is wrong. We should assume in order first and then walk
4200          *       forwards from the first hole based upon real traffic patterns.]
4201          *      
4202          */
4203 
4204         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
4205         {
4206                 skb_queue_head(&sk->receive_queue,skb);
4207                 skb1= NULL;
4208         } 
4209         else
4210         {
4211                 /* walk backwards from the tail; every exit from this loop has linked skb in */
4212                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
4213                 {
4214                         if(sk->debug)
4215                         {
4216                                 printk("skb1=%p :", skb1);
4217                                 printk("skb1->seq = %d: ", skb1->seq);
4218                                 printk("skb->seq = %d\n",skb->seq);
4219                                 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4220                                                 sk->acked_seq);
4221                         }
4222                         
4223                         /*
4224                          *      Optimisation: Duplicate frame or extension of previous frame from
4225                          *      same sequence point (lost ack case).
4226                          *      The frame contains duplicate data or replaces a previous frame
4227                          *      discard the previous frame (safe as sk->inuse is set) and put
4228                          *      the new one in its place.
4229                          */
4230                          
4231                         if (skb->seq==skb1->seq && skb->len>=skb1->len)
4232                         {
4233                                 skb_append(skb1,skb);
4234                                 skb_unlink(skb1);
4235                                 kfree_skb(skb1,FREE_READ);
4236                                 dup_dumped=1;
4237                                 skb1=NULL;
4238                                 break;
4239                         }
4240                         
4241                         /*
4242                          *      Found where it fits
4243                          */
4244                          
4245                         if (after(skb->seq+1, skb1->seq))
4246                         {
4247                                 skb_append(skb1,skb);
4248                                 break;
4249                         }
4250                         
4251                         /*
4252                          *      See if we've hit the start. If so insert.
4253                          */
4254                         if (skb1 == skb_peek(&sk->receive_queue))
4255                         {
4256                                 skb_queue_head(&sk->receive_queue, skb);
4257                                 break;
4258                         }
4259                 }
4260         }
4261 
4262         /*
4263          *      Figure out what the ack value for this frame is
4264          */
4265          
4266         if (before(sk->acked_seq, sk->copied_seq)) 
4267         {
4268                 printk("*** tcp.c:tcp_data bug acked < copied\n");
4269                 sk->acked_seq = sk->copied_seq;
4270         }
4271 
4272         /*
4273          *      Now figure out if we can ack anything. This is very messy because we really want two
4274          *      receive queues, a completed and an assembly queue. We also want only one transmit
4275          *      queue.
4276          */
4277 
4278         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1)) 
4279         {
4280                 if (before(skb->seq, sk->acked_seq+1)) 
4281                 {
4282                         int newwindow;
4283 
4284                         if (after(skb->end_seq, sk->acked_seq)) 
4285                         {
4286                                 /* shrink the offered window by the newly acked span, never below zero */
4287                                 newwindow = sk->window - (skb->end_seq - sk->acked_seq);
4288                                 if (newwindow < 0)
4289                                         newwindow = 0;  
4290                                 sk->window = newwindow;
4291                                 sk->acked_seq = skb->end_seq;
4292                         }
4293                         skb->acked = 1;
4294 
4295                         /*
4296                          *      When we ack the fin, we do the FIN 
4297                          *      processing.
4298                          */
4299 
4300                         if (skb->h.th->fin) 
4301                         {
4302                                 tcp_fin(skb,sk,skb->h.th);
4303                         }
4304           
4305                         /* skb may have filled a hole: sweep the frames queued after it */
4306                         for(skb2 = skb->next;
4307                             skb2 != (struct sk_buff *)&sk->receive_queue;
4308                             skb2 = skb2->next) 
4309                         {
4310                                 if (before(skb2->seq, sk->acked_seq+1)) 
4311                                 {
4312                                         if (after(skb2->end_seq, sk->acked_seq))
4313                                         {
4314                                                 newwindow = sk->window -
4315                                                  (skb2->end_seq - sk->acked_seq);
4316                                                 if (newwindow < 0)
4317                                                         newwindow = 0;  
4318                                                 sk->window = newwindow;
4319                                                 sk->acked_seq = skb2->end_seq;
4320                                         }
4321                                         skb2->acked = 1;
4322                                         /*
4323                                          *      When we ack the fin, we do
4324                                          *      the fin handling.
4325                                          */
4326                                         if (skb2->h.th->fin) 
4327                                         {
4328                                                 tcp_fin(skb2,sk,skb2->h.th);    /* the FIN rides on skb2, so pass skb2 and its header, not skb */
4329                                         }
4330 
4331                                         /*
4332                                          *      Force an immediate ack.
4333                                          */
4334                                          
4335                                         sk->ack_backlog = sk->max_ack_backlog;
4336                                 }
4337                                 else
4338                                 {
4339                                         break;
4340                                 }
4341                         }
4342 
4343                         /*
4344                          *      This also takes care of updating the window.
4345                          *      This if statement needs to be simplified.
4346                          *      The first branch is intentionally empty: both arms of the
4347                          *      "if (!skb->acked)" test further down call tcp_send_ack().
4348                          */
4349                         if (!sk->delay_acks ||
4350                             sk->ack_backlog >= sk->max_ack_backlog || 
4351                             sk->bytes_rcv > sk->max_unacked || th->fin) {
4352         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4353                         }
4354                         else 
4355                         {
4356                                 sk->ack_backlog++;
4357                                 if(sk->debug)
4358                                         printk("Ack queued.\n");
4359                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4360                         }
4361                 }
4362         }
4363 
4364         /*
4365          *      If we've missed a packet, send an ack.
4366          *      Also start a timer to send another.
4367          */
4368          
4369         if (!skb->acked) 
4370         {
4371         
4372         /*
4373          *      This is important.  If we don't have much room left,
4374          *      we need to throw out a few packets so we have a good
4375          *      window.  Note that mtu is used, not mss, because mss is really
4376          *      for the send side.  He could be sending us stuff as large as mtu.
4377          */
4378                  
4379                 while (sock_rspace(sk) < sk->mtu) 
4380                 {
4381                         skb1 = skb_peek(&sk->receive_queue);
4382                         if (skb1 == NULL) 
4383                         {
4384                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4385                                 break;
4386                         }
4387 
4388                         /*
4389                          *      Don't throw out something that has been acked. 
4390                          */
4391                  
4392                         if (skb1->acked) 
4393                         {
4394                                 break;
4395                         }
4396                 
4397                         skb_unlink(skb1);
4398                         kfree_skb(skb1, FREE_READ);
4399                 }
4400                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4401                 sk->ack_backlog++;
4402                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4403         }
4404         else
4405         {
4406                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4407         }
4408 
4409         /*
4410          *      Now tell the user we may have some data. 
4411          */
4412          
4413         if (!sk->dead) 
4414         {
4415                 if(sk->debug)
4416                         printk("Data wakeup.\n");
4417                 sk->data_ready(sk,0);
4418         } 
4419         return(0);
4420 }
4415 
4416 
4417 /*
4418  *      This routine is only called when we have urgent data
4419  *      signalled. It's the 'slow' part of tcp_urg. It could be
4420  *      moved inline now as tcp_urg is only called from one
4421  *      place. We handle URGent data wrong. We have to - as
4422  *      BSD still doesn't use the correction from RFC961.
4423  */
4424  
4425 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* Stores a new urgent pointer in sk->urg_seq, marks sk->urg_data URG_NOTYET and sends SIGURG to sk->proc when one is set. */
4426 {
4427         u32 ptr = ntohs(th->urg_ptr);
4428 
4429         if (ptr)
4430                 ptr--;          /* step back one byte unless the pointer is zero */
4431         ptr += ntohl(th->seq);  /* from here on ptr is a sequence number, not an offset */
4432 
4433         /* ignore urgent data that we've already seen and read */
4434         if (after(sk->copied_seq, ptr))
4435                 return;
4436 
4437         /* do we already have a newer (or duplicate) urgent pointer? */
4438         if (sk->urg_data && !after(ptr, sk->urg_seq))
4439                 return;
4440 
4441         /* tell the world about our new urgent pointer */
4442         if (sk->proc != 0) {
4443                 if (sk->proc > 0) {
4444                         kill_proc(sk->proc, SIGURG, 1);
4445                 } else {
4446                         kill_pg(-sk->proc, SIGURG, 1);  /* negative proc: sign flipped and sent via kill_pg */
4447                 }
4448         }
4449         sk->urg_data = URG_NOTYET;      /* tcp_urg() waits for this value before reading the byte */
4450         sk->urg_seq = ptr;
4451 }
4452 
4453 /*
4454  *      This is the 'fast' part of urgent handling.
4455  */
4456  
4457 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
     /* Always returns 0.  saddr is not used in the body; len presumably counts bytes from the start of th - TODO confirm against the caller. */
4458         unsigned long saddr, unsigned long len)
4459 {
4460         u32 ptr;
4461 
4462         /*
4463          *      Check if we get a new urgent pointer - normally not 
4464          */
4465          
4466         if (th->urg)
4467                 tcp_check_urg(sk,th);
4468 
4469         /*
4470          *      Do we wait for any urgent data? - normally not
4471          */
4472          
4473         if (sk->urg_data != URG_NOTYET)
4474                 return 0;
4475 
4476         /*
4477          *      Is the urgent pointer pointing into this packet? 
4478          *      ptr becomes the byte offset of the urgent byte measured from th.
4479          */
4480         ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
4481         if (ptr >= len)         /* ptr is unsigned, so an urg_seq before th->seq wraps and also lands here */
4482                 return 0;
4483 
4484         /*
4485          *      Ok, got the correct packet, update info 
4486          */
4487          
4488         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);      /* flag plus the urgent byte itself */
4489         if (!sk->dead)
4490                 sk->data_ready(sk,0);
4491         return 0;
4492 }
4493 
4494 /*
4495  *      This will accept the next outstanding connection. 
4496  */
4497  
4498 static struct sock *tcp_accept(struct sock *sk, int flags)
     /* Returns the sock taken from the dequeued skb, or NULL with sk->err set to EINVAL, EAGAIN or ERESTARTSYS. */
4499 {
4500         struct sock *newsk;
4501         struct sk_buff *skb;
4502   
4503   /*
4504    * We need to make sure that this socket is listening,
4505    * and that it has something pending.
4506    */
4507 
4508         if (sk->state != TCP_LISTEN) 
4509         {
4510                 sk->err = EINVAL;
4511                 return(NULL); 
4512         }
4513 
4514         /* Avoid the race. */
4515         cli();
4516         sk->inuse = 1;
4517 
4518         while((skb = tcp_dequeue_established(sk)) == NULL) 
4519         {
4520                 if (flags & O_NONBLOCK) 
4521                 {
4522                         /* nothing to dequeue and the caller asked not to block */
4523                         sti();
4524                         release_sock(sk);
4525                         sk->err = EAGAIN;
4526                         return(NULL);
4527                 }
4528 
4529                 release_sock(sk);       /* the socket is released for the duration of the sleep */
4530                 interruptible_sleep_on(sk->sleep);
4531                 if (current->signal & ~current->blocked) 
4532                 {
4533                         /* unblocked signal pending: return without setting sk->inuse again */
4534                         sti();
4535                         sk->err = ERESTARTSYS;
4536                         return(NULL);
4537                 }
4538                 sk->inuse = 1;          /* set again before retrying the dequeue */
4539         }
4540         sti();
4541 
4542         /*
4543          *      Now all we need to do is return skb->sk. 
4544          */
4545         newsk = skb->sk;
4546         kfree_skb(skb, FREE_READ);      /* newsk was read out first; the skb itself is no longer needed */
4547         sk->ack_backlog--;
4548         release_sock(sk);
4549         return(newsk);
4550 }
4551 
4552 
4553 /*
4554  *      This will initiate an outgoing connection. 
4555  */
4556  
4557 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
     /*  */
4558 {
4559         struct sk_buff *buff;
4560         struct device *dev=NULL;
4561         unsigned char *ptr;
4562         int tmp;
4563         int atype;
4564         struct tcphdr *t1;
4565         struct rtable *rt;
4566 
4567         if (sk->state != TCP_CLOSE) 
4568                 return(-EISCONN);
4569 
4570         /*
4571          *      Don't allow a double connect.
4572          */
4573                 
4574         if(sk->daddr)
4575                 return -EINVAL;
4576         
4577         if (addr_len < 8) 
4578                 return(-EINVAL);
4579 
4580         if (usin->sin_family && usin->sin_family != AF_INET) 
4581                 return(-EAFNOSUPPORT);
4582 
4583         /*
4584          *      connect() to INADDR_ANY means loopback (BSD'ism).
4585          */
4586         
4587         if(usin->sin_addr.s_addr==INADDR_ANY)
4588                 usin->sin_addr.s_addr=ip_my_addr();
4589                   
4590         /*
4591          *      Don't want a TCP connection going to a broadcast address 
4592          */
4593 
4594         if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
4595                 return -ENETUNREACH;
4596   
4597         sk->inuse = 1;
4598         sk->daddr = usin->sin_addr.s_addr;
4599         sk->write_seq = tcp_init_seq();
4600         sk->window_seq = sk->write_seq;
4601         sk->rcv_ack_seq = sk->write_seq -1;
4602         sk->err = 0;
4603         sk->dummy_th.dest = usin->sin_port;
4604         release_sock(sk);
4605 
4606         buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4607         if (buff == NULL) 
4608         {
4609                 return(-ENOMEM);
4610         }
4611         sk->inuse = 1;
4612         buff->sk = sk;
4613         buff->free = 0;
4614         buff->localroute = sk->localroute;
4615         
4616 
4617         /*
4618          *      Put in the IP header and routing stuff.
4619          */
4620          
4621         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4622                 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
4623         if (tmp < 0) 
4624         {
4625                 sock_wfree(sk, buff);
4626                 release_sock(sk);
4627                 return(-ENETUNREACH);
4628         }
4629         if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
4630                 sk->saddr = rt->rt_src;
4631         sk->rcv_saddr = sk->saddr;
4632 
4633         t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
4634 
4635         memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4636         buff->seq = sk->write_seq++;
4637         t1->seq = htonl(buff->seq);
4638         sk->sent_seq = sk->write_seq;
4639         buff->end_seq = sk->write_seq;
4640         t1->ack = 0;
4641         t1->window = 2;
4642         t1->res1=0;
4643         t1->res2=0;
4644         t1->rst = 0;
4645         t1->urg = 0;
4646         t1->psh = 0;
4647         t1->syn = 1;
4648         t1->urg_ptr = 0;
4649         t1->doff = 6;
4650         /* use 512 or whatever user asked for */
4651         
4652         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4653                 sk->window_clamp=rt->rt_window;
4654         else
4655                 sk->window_clamp=0;
4656 
4657         if (sk->user_mss)
4658                 sk->mtu = sk->user_mss;
4659         else if (rt)
4660                 sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
4661         else 
4662                 sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
4663 
4664         /*
4665          *      but not bigger than device MTU 
4666          */
4667 
4668         if(sk->mtu <32)
4669                 sk->mtu = 32;   /* Sanity limit */
4670                 
4671         sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
4672 
4673 #ifdef CONFIG_SKIP
4674         
4675         /*
4676          *      SKIP devices set their MTU to 65535. This is so they can take packets
4677          *      unfragmented to security process then fragment. They could lie to the
4678          *      TCP layer about a suitable MTU, but its easier to let skip sort it out
4679          *      simply because the final package we want unfragmented is going to be
4680          *
4681          *      [IPHDR][IPSP][Security data][Modified TCP data][Security data]
4682          */
4683          
4684         if(skip_pick_mtu!=NULL)         /* If SKIP is loaded.. */
4685                 sk->mtu=skip_pick_mtu(sk->mtu,dev);
4686 #endif
4687         
4688         /*
4689          *      Put in the TCP options to say MTU. 
4690          */
4691 
4692         ptr = skb_put(buff,4);
4693         ptr[0] = 2;
4694         ptr[1] = 4;
4695         ptr[2] = (sk->mtu) >> 8;
4696         ptr[3] = (sk->mtu) & 0xff;
4697         tcp_send_check(t1, sk->saddr, sk->daddr,
4698                   sizeof(struct tcphdr) + 4, sk);
4699 
4700         /*
4701          *      This must go first otherwise a really quick response will get reset. 
4702          */
4703 
4704         tcp_cache_zap();
4705         tcp_set_state(sk,TCP_SYN_SENT);
4706         if(rt&&rt->rt_flags&RTF_IRTT)
4707                 sk->rto = rt->rt_irtt;
4708         else
4709                 sk->rto = TCP_TIMEOUT_INIT;
4710         sk->retransmit_timer.function=&retransmit_timer;
4711         sk->retransmit_timer.data = (unsigned long)sk;
4712         reset_xmit_timer(sk, TIME_WRITE, sk->rto);      /* Timer for repeating the SYN until an answer  */
4713         sk->retransmits = 0;                            /* Now works the right way instead of a hacked 
4714                                                                                         initial setting */
4715 
4716         sk->prot->queue_xmit(sk, dev, buff, 0);  
4717         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4718         tcp_statistics.TcpActiveOpens++;
4719         tcp_statistics.TcpOutSegs++;
4720   
4721         release_sock(sk);
4722         return(0);
4723 }
4724 
4725 
4726 /*
4727  *      This functions checks to see if the tcp header is actually acceptable. 
4728  */
4729  
4730 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /*  */
4731              struct options *opt, unsigned long saddr, struct device *dev)
4732 {
4733         u32 next_seq;
4734 
4735         next_seq = len - 4*th->doff;
4736         if (th->fin)
4737                 next_seq++;
4738         /* if we have a zero window, we can't have any data in the packet.. */
4739         if (next_seq && !sk->window)
4740                 goto ignore_it;
4741         next_seq += ntohl(th->seq);
4742 
4743         /*
4744          * This isn't quite right.  sk->acked_seq could be more recent
4745          * than sk->window.  This is however close enough.  We will accept
4746          * slightly more packets than we should, but it should not cause
4747          * problems unless someone is trying to forge packets.
4748          */
4749 
4750         /* have we already seen all of this packet? */
4751         if (!after(next_seq+1, sk->acked_seq))
4752                 goto ignore_it;
4753         /* or does it start beyond the window? */
4754         if (!before(ntohl(th->seq), sk->acked_seq + sk->window + 1))
4755                 goto ignore_it;
4756 
4757         /* ok, at least part of this packet would seem interesting.. */
4758         return 1;
4759 
4760 ignore_it:
4761         if (th->rst)
4762                 return 0;
4763 
4764         /*
4765          *      Send a reset if we get something not ours and we are
4766          *      unsynchronized. Note: We don't do anything to our end. We
4767          *      are just killing the bogus remote connection then we will
4768          *      connect again and it will work (with luck).
4769          */
4770          
4771         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
4772         {
4773                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4774                 return 1;
4775         }
4776 
4777         /* Try to resync things. */
4778         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4779         return 0;
4780 }
4781 
/*
 *	Standard handling of a connection reset: mark the socket zapped,
 *	record the error the user will see, close the connection, wake the
 *	owner, free the segment and release the socket.  Always returns 0.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* The reported error depends on the state we were reset in. */
	if (sk->state == TCP_CLOSE_WAIT) {
		sk->err = EPIPE;
	} else if (sk->state == TCP_SYN_SENT) {
		sk->err = ECONNREFUSED;
	} else {
		sk->err = ECONNRESET;
	}

#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]: a socket in
	 *	TIME_WAIT keeps its state.
	 */
	if (sk->state != TCP_TIME_WAIT)
#endif
	{
		tcp_set_state(sk, TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}

	if (!sk->dead)
		sk->state_change(sk);

	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return 0;
}
4813 
4814 /*
4815  *      A TCP packet has arrived.
4816  *              skb->h.raw is the TCP header.
      *
      *      skb       the received frame
      *      dev       device handed on to tcp_reset()/tcp_conn_request()/tcp_sequence()
      *      opt       IP options, handed on to the same helpers
      *      daddr     destination address of the frame
      *      len       length of the TCP segment; used for the checksum and for
      *                skb->end_seq together with th->doff
      *      saddr     source address of the frame
      *      redo      when zero this is the first pass over the frame: the IP
      *                header is pulled, the checksum verified, the sequence
      *                fields of the skb filled in and the frame is put on
      *                sk->back_log if the socket is in use. When non-zero all
      *                of that is skipped.
      *                NOTE(review): presumably non-zero means the frame is
      *                being replayed from the backlog - confirm against the
      *                caller.
      *      protocol  not referenced in the body
      *
      *      Every return path visible here yields 0, including the ones
      *      that go through tcp_std_reset().
4817  */
4818  
4819 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /*  */
4820         __u32 daddr, unsigned short len,
4821         __u32 saddr, int redo, struct inet_protocol * protocol)
4822 {
4823         struct tcphdr *th;
4824         struct sock *sk;
             /*
              *      syn_ok is set once a SYN+ACK has been accepted in SYN_SENT so
              *      that the generic "unexpected SYN" test further down does not
              *      reset the connection we have just established.
              */
4825         int syn_ok=0;
4826         
4827         tcp_statistics.TcpInSegs++;
             /* Frames not addressed to this host are dropped straight away. */
4828         if(skb->pkt_type!=PACKET_HOST)
4829         {
4830                 kfree_skb(skb,FREE_READ);
4831                 return(0);
4832         }
4833   
4834         th = skb->h.th;
4835 
4836         /*
4837          *      Find the socket, using the last hit cache if applicable.
4838          */
4839 
4840         if(!redo && saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
4841         {
4842                 sk=(struct sock *)th_cache_sk;
4843                 /*
4844                  *      We think this is causing the bug so
4845                  */
4846                  if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
4847                         printk("Cache mismatch on TCP.\n");
                     /*
                      *      NOTE(review): a mismatch is only logged; the cached
                      *      sk is still the one used below.
                      */
4848         }
4849         else
4850         {
                     /* Cache miss (or redo): do the full lookup and remember it. */
4851                 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4852                 th_cache_saddr=saddr;
4853                 th_cache_daddr=daddr;
4854                 th_cache_dport=th->dest;
4855                 th_cache_sport=th->source;
4856                 th_cache_sk=sk;
4857         }               
4858 
4859         /*
4860          *      If this socket has got a reset it's to all intents and purposes 
4861          *      really dead. Count closed sockets as dead.
4862          *
4863          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4864          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
4865          *      exist so should cause resets as if the port was unreachable.
4866          */
4867          
4868         if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4869                 sk=NULL;
4870 
4871         if (!redo) 
4872         {
4873                 /*
4874                  *      Pull up the IP header.
4875                  */
4876                 skb_pull(skb, skb->h.raw-skb->data);
4877                 /*
4878                  *      Try to use the device checksum if provided.
4879                  */
                     /*
                      *      A non-zero result from tcp_check() means the segment
                      *      is discarded. skb->csum is used when skb->ip_summed
                      *      is set, otherwise the sum is computed here.
                      */
4880                 if (
4881                         (skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
4882                         (!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
4883                     )
4884                 {
4885                         skb->sk = NULL;
4886                         kfree_skb(skb,FREE_READ);
4887                         /*
4888                          *      We don't release the socket because it was
4889                          *      never marked in use.
4890                          */
4891                         return(0);
4892                 }
4893 
                     /*
                      *      Cache the sequence numbers in host order. SYN and
                      *      FIN each take up one sequence number.
                      */
4894                 skb->seq = ntohl(th->seq);
4895                 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
4896                 skb->ack_seq = ntohl(th->ack_seq);
4897 
4898                 /* See if we know about the socket. */
4899                 if (sk == NULL) 
4900                 {
4901                         /*
4902                          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4903                          */
4904                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4905                         skb->sk = NULL;
4906                         /*
4907                          *      Discard frame
4908                          */
4909                         kfree_skb(skb, FREE_READ);
4910                         return(0);
4911                 }
4912 
4913                 skb->acked = 0;
4914                 skb->used = 0;
4915                 skb->free = 0;
                     /*
                      *      Stored swapped: skb->saddr gets the frame's
                      *      destination and skb->daddr the frame's source.
                      */
4916                 skb->saddr = daddr;
4917                 skb->daddr = saddr;
4918         
4919                 /* We may need to add it to the backlog here. */
                     /*
                      *      With interrupts off: if the socket is already in use
                      *      the frame is parked on sk->back_log, otherwise we
                      *      mark the socket in use ourselves and carry on.
                      */
4920                 cli();
4921                 if (sk->inuse) 
4922                 {
4923                         skb_queue_tail(&sk->back_log, skb);
4924                         sti();
4925                         return(0);
4926                 }
4927                 sk->inuse = 1;
4928                 sti();
4929         }
4930         else
4931         {
4932                 if (sk==NULL) 
4933                 {
4934                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4935                         skb->sk = NULL;
4936                         kfree_skb(skb, FREE_READ);
4937                         return(0);
4938                 }
4939         }
4940 
4941 
             /*
              *      NOTE(review): this path returns without freeing skb and
              *      without calling release_sock(sk), although sk->inuse was
              *      set to 1 above on the !redo path. Confirm whether that is
              *      acceptable for a "cannot happen" case.
              */
4942         if (!sk->prot) 
4943         {
4944                 printk("IMPOSSIBLE 3\n");
4945                 return(0);
4946         }
4947 
4948 
4949         /*
4950          *      Charge the memory to the socket. 
4951          */
4952          
4953         skb->sk=sk;
4954         sk->rmem_alloc += skb->truesize;
4955 
4956         /*
4957          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4958          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4959          *      compatibility. We also set up variables more thoroughly [Karn notes in the
4960          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4961          */
4962 
4963         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
4964         {
4965         
4966                 /*
4967                  *      Now deal with unusual cases.
4968                  */
4969          
4970                 if(sk->state==TCP_LISTEN)
4971                 {
4972                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
4973                                 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4974 
4975                         /*
4976                          *      We don't care for RST, and non SYN are absorbed (old segments)
4977                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
4978                          *      netmask on a running connection it can go broadcast. Even Sun's have
4979                          *      this problem so I'm ignoring it 
4980                          */
4981                            
4982                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4983                         {
4984                                 kfree_skb(skb, FREE_READ);
4985                                 release_sock(sk);
4986                                 return 0;
4987                         }
4988                 
4989                         /*      
4990                          *      Guess we need to make a new socket up 
4991                          */
4992                 
4993                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4994                 
4995                         /*
4996                          *      Now we have several options: In theory there is nothing else
4997                          *      in the frame. KA9Q has an option to send data with the syn,
4998                          *      BSD accepts data with the syn up to the [to be] advertised window
4999                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
5000                          *      it, that fits the spec precisely and avoids incompatibilities. It
5001                          *      would be nice in future to drop through and process the data.
5002                          */
5003                          
                             /*
                              *      The skb is not freed on this path.
                              *      NOTE(review): presumably tcp_conn_request()
                              *      takes ownership of it - confirm.
                              */
5004                         release_sock(sk);
5005                         return 0;
5006                 }
5007         
5008                 /* retransmitted SYN? */
5009                 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
5010                 {
5011                         kfree_skb(skb, FREE_READ);
5012                         release_sock(sk);
5013                         return 0;
5014                 }
5015                 
5016                 /*
5017                  *      SYN sent means we have to look for a suitable ack and either reset
5018                  *      for bad matches or go to connected 
5019                  */
5020            
5021                 if(sk->state==TCP_SYN_SENT)
5022                 {
5023                         /* Crossed SYN or previous junk segment */
5024                         if(th->ack)
5025                         {
5026                                 /* We got an ack, but it's not a good ack */
5027                                 if(!tcp_ack(sk,th,saddr,len))
5028                                 {
5029                                         /* Reset the ack - its an ack from a 
5030                                            different connection  [ th->rst is checked in tcp_reset()] */
5031                                         tcp_statistics.TcpAttemptFails++;
5032                                         tcp_reset(daddr, saddr, th,
5033                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
5034                                         kfree_skb(skb, FREE_READ);
5035                                         release_sock(sk);
5036                                         return(0);
5037                                 }
5038                                 if(th->rst)
5039                                         return tcp_std_reset(sk,skb);
5040                                 if(!th->syn)
5041                                 {
5042                                         /* A valid ack from a different connection
5043                                            start. Shouldn't happen but cover it */
5044                                         kfree_skb(skb, FREE_READ);
5045                                         release_sock(sk);
5046                                         return 0;
5047                                 }
5048                                 /*
5049                                  *      Ok.. it's good. Set up sequence numbers and
5050                                  *      move to established.
5051                                  */
5052                                 syn_ok=1;       /* Don't reset this connection for the syn */
5053                                 sk->acked_seq = skb->seq+1;
5054                                 sk->fin_seq = skb->seq;
5055                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
5056                                 tcp_set_state(sk, TCP_ESTABLISHED);
5057                                 tcp_options(sk,th);
5058                                 sk->dummy_th.dest=th->source;
5059                                 sk->copied_seq = sk->acked_seq;
5060                                 if(!sk->dead)
5061                                 {
5062                                         sk->state_change(sk);
5063                                         sock_wake_async(sk->socket, 0);
5064                                 }
                                     /*
                                      *      No window seen yet: fall back to 32 and
                                      *      cap the mss by it and by sk->mtu.
                                      */
5065                                 if(sk->max_window==0)
5066                                 {
5067                                         sk->max_window = 32;
5068                                         sk->mss = min(sk->max_window, sk->mtu);
5069                                 }
5070                         }
5071                         else
5072                         {
5073                                 /* See if SYN's cross. Drop if boring */
5074                                 if(th->syn && !th->rst)
5075                                 {
5076                                         /* Crossed SYN's are fine - but talking to
5077                                            yourself is right out... */
5078                                         if(sk->saddr==saddr && sk->daddr==daddr &&
5079                                                 sk->dummy_th.source==th->source &&
5080                                                 sk->dummy_th.dest==th->dest)
5081                                         {
5082                                                 tcp_statistics.TcpAttemptFails++;
5083                                                 return tcp_std_reset(sk,skb);
5084                                         }
5085                                         tcp_set_state(sk,TCP_SYN_RECV);
5086                                         
5087                                         /*
5088                                          *      FIXME:
5089                                          *      Must send SYN|ACK here
5090                                          */
5091                                 }               
5092                                 /* Discard junk segment */
5093                                 kfree_skb(skb, FREE_READ);
5094                                 release_sock(sk);
5095                                 return 0;
5096                         }
5097                         /*
5098                          *      SYN_RECV with data maybe.. drop through
5099                          */
                             /*
                              *      Only the accepted SYN+ACK case gets here. The
                              *      jump bypasses tcp_sequence(), the RST and SYN
                              *      tests and the second tcp_ack() call below.
                              */
5100                         goto rfc_step6;
5101                 }
5102 
5103         /*
5104          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
5105          *      a more complex suggestion for fixing these reuse issues in RFC1644
5106          *      but not yet ready for general use. Also see RFC1379.
5107          */
5108         
             /*
              *      What the code below does: for a SYN that lies after
              *      sk->acked_seq on a dead TIME_WAIT socket, the old socket is
              *      uncharged, closed and released; the lookup is then repeated
              *      and, if it now yields a listening socket, the SYN is handed
              *      to tcp_conn_request() with an initial sequence number 128000
              *      above the old connection's write_seq. Otherwise the frame is
              *      simply freed.
              */
5109 #define BSD_TIME_WAIT
5110 #ifdef BSD_TIME_WAIT
5111                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
5112                         after(skb->seq, sk->acked_seq) && !th->rst)
5113                 {
5114                         u32 seq = sk->write_seq;
5115                         if(sk->debug)
5116                                 printk("Doing a BSD time wait\n");
5117                         tcp_statistics.TcpEstabResets++;           
5118                         sk->rmem_alloc -= skb->truesize;
5119                         skb->sk = NULL;
5120                         sk->err=ECONNRESET;
5121                         tcp_set_state(sk, TCP_CLOSE);
5122                         sk->shutdown = SHUTDOWN_MASK;
5123                         release_sock(sk);
5124                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
5125                         if (sk && sk->state==TCP_LISTEN)
5126                         {
5127                                 sk->inuse=1;
5128                                 skb->sk = sk;
5129                                 sk->rmem_alloc += skb->truesize;
5130                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
5131                                 release_sock(sk);
5132                                 return 0;
5133                         }
5134                         kfree_skb(skb, FREE_READ);
5135                         return 0;
5136                 }
5137 #endif  
5138         }
5139 
5140         /*
5141          *      We are now in normal data flow (see the step list in the RFC)
5142          *      Note most of these are inline now. I'll inline the lot when
5143          *      I have time to test it hard and look at what gcc outputs 
5144          */
5145         
             /* Sequence number check: a zero return means drop the segment. */
5146         if(!tcp_sequence(sk,th,len,opt,saddr,dev))
5147         {
5148                 kfree_skb(skb, FREE_READ);
5149                 release_sock(sk);
5150                 return 0;
5151         }
5152 
5153         if(th->rst)
5154                 return tcp_std_reset(sk,skb);
5155         
5156         /*
5157          *      !syn_ok is effectively the state test in RFC793.
5158          */
5159          
             /*
              *      A SYN we did not just accept: send a reset to the peer and
              *      tear our own end down as well.
              */
5160         if(th->syn && !syn_ok)
5161         {
5162                 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
5163                 return tcp_std_reset(sk,skb);   
5164         }
5165 
5166         /*
5167          *      Process the ACK
5168          */
5169          
5170 
5171         if(th->ack && !tcp_ack(sk,th,saddr,len))
5172         {
5173                 /*
5174                  *      Our three way handshake failed.
5175                  */
5176                  
5177                 if(sk->state==TCP_SYN_RECV)
5178                 {
5179                         tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
5180                 }
5181                 kfree_skb(skb, FREE_READ);
5182                 release_sock(sk);
5183                 return 0;
5184         }
5185         
5186 rfc_step6:              /* I'll clean this up later */
5187 
5188         /*
5189          *      If the accepted buffer put us over our queue size we
5190          *      now drop it (we must process the ack first to avoid
5191          *      deadlock cases).
5192          */
5193          
5194         if (sk->rmem_alloc  >= sk->rcvbuf) 
5195         {
5196                 kfree_skb(skb, FREE_READ);
5197                 release_sock(sk);
5198                 return(0);
5199         }
5200 
5201 
5202         /*
5203          *      Process urgent data
5204          */
5205                 
             /* A non-zero return from tcp_urg() means the segment is dropped. */
5206         if(tcp_urg(sk, th, saddr, len))
5207         {
5208                 kfree_skb(skb, FREE_READ);
5209                 release_sock(sk);
5210                 return 0;
5211         }
5212         
5213         /*
5214          *      Process the encapsulated data
5215          */
5216         
             /*
              *      On a zero return the skb is not freed here.
              *      NOTE(review): presumably tcp_data() has queued it on the
              *      socket - confirm.
              */
5217         if(tcp_data(skb,sk, saddr, len))
5218         {
5219                 kfree_skb(skb, FREE_READ);
5220                 release_sock(sk);
5221                 return 0;
5222         }
5223 
5224         /*
5225          *      And done
5226          */     
5227         
5228         release_sock(sk);
5229         return 0;
5230 }
5231 
5232 /*
5233  *      This routine sends a packet with an out of date sequence
5234  *      number. It assumes the other end will try to ack it.
5235  */
5236 
5237 static void tcp_write_wakeup(struct sock *sk)
     /*  */
5238 {
5239         struct sk_buff *buff,*skb;
5240         struct tcphdr *t1;
5241         struct device *dev=NULL;
5242         int tmp;
5243 
5244         if (sk->zapped)
5245                 return; /* After a valid reset we can send no more */
5246 
5247         /*
5248          *      Write data can still be transmitted/retransmitted in the
5249          *      following states.  If any other state is encountered, return.
5250          *      [listen/close will never occur here anyway]
5251          */
5252 
5253         if (sk->state != TCP_ESTABLISHED && 
5254             sk->state != TCP_CLOSE_WAIT &&
5255             sk->state != TCP_FIN_WAIT1 && 
5256             sk->state != TCP_LAST_ACK &&
5257             sk->state != TCP_CLOSING
5258         ) 
5259         {
5260                 return;
5261         }
5262         if ( before(sk->sent_seq, sk->window_seq) && 
5263             (skb=skb_peek(&sk->write_queue)))
5264         {
5265                 /*
5266                  * We are probing the opening of a window
5267                  * but the window size is != 0
5268                  * must have been a result of SWS avoidance ( sender )
5269                  */
5270             
5271                 struct iphdr *iph;
5272                 struct tcphdr *th;
5273                 struct tcphdr *nth;
5274                 unsigned long win_size;
5275 #if 0
5276                 unsigned long ow_size;
5277 #endif
5278                 void * tcp_data_start;
5279         
5280                 /*
5281                  *      How many bytes can we send ?
5282                  */
5283                  
5284                 win_size = sk->window_seq - sk->sent_seq;
5285 
5286                 /*
5287                  *      Recover the buffer pointers
5288                  */
5289                  
5290                 iph = (struct iphdr *)skb->ip_hdr;
5291                 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
5292 
5293                 /*
5294                  *      Grab the data for a temporary frame
5295                  */
5296                  
5297                 buff = sock_wmalloc(sk, win_size + th->doff * 4 + 
5298                                      (iph->ihl << 2) +
5299                                      sk->prot->max_header + 15, 
5300                                      1, GFP_ATOMIC);
5301                 if ( buff == NULL )
5302                         return;
5303 
5304                 /* 
5305                  *      If we strip the packet on the write queue we must
5306                  *      be ready to retransmit this one 
5307                  */
5308             
5309                 buff->free = /*0*/1;
5310 
5311                 buff->sk = sk;
5312                 buff->localroute = sk->localroute;
5313                 
5314                 /*
5315                  *      Put headers on the new packet
5316                  */
5317 
5318                 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5319                                          IPPROTO_TCP, sk->opt, buff->truesize,
5320                                          sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
5321                 if (tmp < 0) 
5322                 {
5323                         sock_wfree(sk, buff);
5324                         return;
5325                 }
5326                 
5327                 /*
5328                  *      Move the TCP header over
5329                  */
5330 
5331                 buff->dev = dev;
5332 
5333                 nth = (struct tcphdr *) skb_put(buff,th->doff*4);
5334 
5335                 memcpy(nth, th, th->doff * 4);
5336                 
5337                 /*
5338                  *      Correct the new header
5339                  */
5340                  
5341                 nth->ack = 1; 
5342                 nth->ack_seq = htonl(sk->acked_seq);
5343                 nth->window = htons(tcp_select_window(sk));
5344                 nth->check = 0;
5345 
5346                 /*
5347                  *      Find the first data byte.
5348                  */
5349                  
5350                 tcp_data_start = (char *) th + (th->doff << 2);
5351 
5352                 /*
5353                  *      Add it to our new buffer
5354                  */
5355                  
5356                 memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
5357                 
5358                 /*
5359                  *      Remember our right edge sequence number.
5360                  */
5361                  
5362                 buff->end_seq = sk->sent_seq + win_size;
5363                 sk->sent_seq = buff->end_seq;           /* Hack */
5364                 if(th->urg && ntohs(th->urg_ptr) < win_size)
5365                         nth->urg = 0;
5366 
5367                 /*
5368                  *      Checksum the split buffer
5369                  */
5370                  
5371                 tcp_send_check(nth, sk->saddr, sk->daddr, 
5372                            nth->doff * 4 + win_size , sk);
5373         }
5374         else
5375         {       
5376                 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
5377                 if (buff == NULL) 
5378                         return;
5379 
5380                 buff->free = 1;
5381                 buff->sk = sk;
5382                 buff->localroute = sk->localroute;
5383 
5384                 /*
5385                  *      Put in the IP header and routing stuff. 
5386                  */
5387                  
5388                 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5389                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
5390                 if (tmp < 0) 
5391                 {
5392                         sock_wfree(sk, buff);
5393                         return;
5394                 }
5395 
5396                 t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
5397                 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5398 
5399                 /*
5400                  *      Use a previous sequence.
5401                  *      This should cause the other end to send an ack.
5402                  */
5403          
5404                 t1->seq = htonl(sk->sent_seq-1);
5405                 t1->ack = 1; 
5406                 t1->res1= 0;
5407                 t1->res2= 0;
5408                 t1->rst = 0;
5409                 t1->urg = 0;
5410                 t1->psh = 0;
5411                 t1->fin = 0;    /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
5412                 t1->syn = 0;
5413                 t1->ack_seq = htonl(sk->acked_seq);
5414                 t1->window = htons(tcp_select_window(sk));
5415                 t1->doff = sizeof(*t1)/4;
5416                 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5417 
5418         }               
5419 
5420         /*
5421          *      Send it.
5422          */
5423         
5424         sk->prot->queue_xmit(sk, dev, buff, 1);
5425         tcp_statistics.TcpOutSegs++;
5426 }
5427 
5428 /*
5429  *      A window probe timeout has occurred.
5430  */
5431 
5432 void tcp_send_probe0(struct sock *sk)
     /*  */
5433 {
5434         if (sk->zapped)
5435                 return;         /* After a valid reset we can send no more */
5436 
5437         tcp_write_wakeup(sk);
5438 
5439         sk->backoff++;
5440         sk->rto = min(sk->rto << 1, 120*HZ);
5441         sk->retransmits++;
5442         sk->prot->retransmits ++;
5443         reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5444 }
5445 
5446 /*
5447  *      Socket option code for TCP. 
5448  */
5449   
5450 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /*  */
5451 {
5452         int val,err;
5453 
5454         if(level!=SOL_TCP)
5455                 return ip_setsockopt(sk,level,optname,optval,optlen);
5456 
5457         if (optval == NULL) 
5458                 return(-EINVAL);
5459 
5460         err=verify_area(VERIFY_READ, optval, sizeof(int));
5461         if(err)
5462                 return err;
5463         
5464         val = get_user((int *)optval);
5465 
5466         switch(optname)
5467         {
5468                 case TCP_MAXSEG:
5469 /*
5470  * values greater than interface MTU won't take effect.  however at
5471  * the point when this call is done we typically don't yet know
5472  * which interface is going to be used
5473  */
5474                         if(val<1||val>MAX_WINDOW)
5475                                 return -EINVAL;
5476                         sk->user_mss=val;
5477                         return 0;
5478                 case TCP_NODELAY:
5479                         sk->nonagle=(val==0)?0:1;
5480                         return 0;
5481                 default:
5482                         return(-ENOPROTOOPT);
5483         }
5484 }
5485 
5486 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /*  */
5487 {
5488         int val,err;
5489 
5490         if(level!=SOL_TCP)
5491                 return ip_getsockopt(sk,level,optname,optval,optlen);
5492                         
5493         switch(optname)
5494         {
5495                 case TCP_MAXSEG:
5496                         val=sk->user_mss;
5497                         break;
5498                 case TCP_NODELAY:
5499                         val=sk->nonagle;
5500                         break;
5501                 default:
5502                         return(-ENOPROTOOPT);
5503         }
5504         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5505         if(err)
5506                 return err;
5507         put_user(sizeof(int),(int *) optlen);
5508 
5509         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5510         if(err)
5511                 return err;
5512         put_user(val,(int *)optval);
5513 
5514         return(0);
5515 }       
5516 
5517 
/*
 *      TCP's instance of struct proto.
 *
 *      This is a positional initializer: each entry is bound to a
 *      member of struct proto purely by its position, so the list
 *      must stay in the same order as the struct proto declaration
 *      (declared elsewhere, not visible in this part of the file).
 *
 *      Code in this file reaches some of these entries through
 *      sk->prot, e.g. sk->prot->build_header(), sk->prot->queue_xmit(),
 *      sk->prot->max_header and sk->prot->retransmits.
 *      NOTE(review): the per-slot notes below are inferred from those
 *      uses and from the entry names; confirm against struct proto.
 */
5518 struct proto tcp_prot = {
5519         tcp_close,
5520         ip_build_header,        /* presumably the build_header hook - confirm */
5521         tcp_connect,
5522         tcp_accept,
5523         ip_queue_xmit,          /* presumably the queue_xmit hook - confirm */
5524         tcp_retransmit,
5525         tcp_write_wakeup,
5526         tcp_read_wakeup,
5527         tcp_rcv,
5528         tcp_select,
5529         tcp_ioctl,
5530         NULL,                   /* slot left empty for TCP */
5531         tcp_shutdown,
5532         tcp_setsockopt,
5533         tcp_getsockopt,
5534         tcp_sendmsg,
5535         tcp_recvmsg,
5536         NULL,           /* No special bind() */
5537         128,                    /* presumably max_header - confirm */
5538         0,                      /* presumably the retransmits counter, starting at zero - confirm */
5539         "TCP",                  /* protocol name string */
5540         0, 0,
5541         {NULL,}
5542 };
/* */
root/net/ipv4/tcp.c

DEFINITIONS