root/net/ipv4/tcp.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. tcp_cache_zap
  2. min
  3. tcp_set_state
  4. tcp_select_window
  5. tcp_find_established
  6. tcp_dequeue_established
  7. tcp_close_pending
  8. tcp_time_wait
  9. tcp_do_retransmit
  10. reset_xmit_timer
  11. tcp_retransmit_time
  12. tcp_retransmit
  13. tcp_write_timeout
  14. retransmit_timer
  15. tcp_err
  16. tcp_readable
  17. tcp_listen_select
  18. tcp_select
  19. tcp_ioctl
  20. tcp_check
  21. tcp_send_check
  22. tcp_send_skb
  23. tcp_dequeue_partial
  24. tcp_send_partial
  25. tcp_enqueue_partial
  26. tcp_send_ack
  27. tcp_build_header
  28. tcp_sendmsg
  29. tcp_sendto
  30. tcp_write
  31. tcp_read_wakeup
  32. cleanup_rbuf
  33. tcp_recv_urg
  34. tcp_recvmsg
  35. tcp_recvfrom
  36. tcp_read
  37. tcp_close_state
  38. tcp_send_fin
  39. tcp_shutdown
  40. tcp_reset
  41. tcp_options
  42. default_mask
  43. tcp_init_seq
  44. tcp_conn_request
  45. tcp_close
  46. tcp_write_xmit
  47. tcp_ack
  48. tcp_fin
  49. tcp_data
  50. tcp_check_urg
  51. tcp_urg
  52. tcp_accept
  53. tcp_connect
  54. tcp_sequence
  55. tcp_std_reset
  56. tcp_rcv
  57. tcp_write_wakeup
  58. tcp_send_probe0
  59. tcp_setsockopt
  60. tcp_getsockopt

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:       
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect 
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed where wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It 
  33  *                                      wakes people on errors. select 
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_reset() fixed to work for 
  37  *                                      everything not just packets for 
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had 
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames. 
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst 
  46  *                                      receive otherwise odd bits of prattle 
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug. 
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list 
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential 
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the 
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries. 
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks, 
  69  *                                      so the kernel can layer network 
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised 
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer 
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing 
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties remove from the
 100  *                                      TCP code (Be very nice to this man if 
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics. 
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle select() after URG properly in 
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg() 
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in 
 110  *                                      tcp_readable(), select() after URG 
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the 
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to 
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in selecting before an 
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since 
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on 
 137  *                                      the RFC's for other useful protocol 
 138  *                                      references see: Comer, KA9Q NOS, and 
 139  *                                      for a reference on the difference 
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC 
 147  *                                      and using multiple timers for sanity. 
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       Select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and 
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if stat is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but its a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications
 178  *
 179  *
 180  * To Fix:
 181  *              Fast path the code. Two things here - fix the window calculation
 182  *              so it doesn't iterate over the queue, also spot packets with no funny
 183  *              options arriving in order and process directly.
 184  *
 185  *              Implement RFC 1191 [Path MTU discovery]
 186  *              Look at the effect of implementing RFC 1337 suggestions and their impact.
 187  *              Rewrite output state machine to use a single queue and do low window
 188  *              situations as per the spec (RFC 1122)
 189  *              Speed up input assembly algorithm.
 190  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 191  *              could do with it working on IPv4
 192  *              User settable/learned rtt/max window/mtu
 193  *              Cope with MTU/device switches when retransmitting in tcp.
 194  *              Fix the window handling to use PR's new code.
 195  *
 196  *              Change the fundamental structure to a single send queue maintained
 197  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 198  *              active routes too]). Cut the queue off in tcp_retransmit/
 199  *              tcp_transmit.
 200  *              Change the receive queue to assemble as it goes. This lets us
 201  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 202  *              tcp_data/tcp_read as well as the window shrink crud.
 203  *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 204  *              tcp_queue_skb seem obvious routines to extract.
 205  *      
 206  *              This program is free software; you can redistribute it and/or
 207  *              modify it under the terms of the GNU General Public License
 208  *              as published by the Free Software Foundation; either version
 209  *              2 of the License, or(at your option) any later version.
 210  *
 211  * Description of States:
 212  *
 213  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 214  *
 215  *      TCP_SYN_RECV            received a connection request, sent ack,
 216  *                              waiting for final ack in three-way handshake.
 217  *
 218  *      TCP_ESTABLISHED         connection established
 219  *
 220  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 221  *                              transmission of remaining buffered data
 222  *
 223  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 224  *                              to shutdown
 225  *
 226  *      TCP_CLOSING             both sides have shutdown but we still have
 227  *                              data we have to finish sending
 228  *
 229  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 230  *                              closed, can only be entered from FIN_WAIT2
 231  *                              or CLOSING.  Required because the other end
 232  *                              may not have gotten our last ACK causing it
 233  *                              to retransmit the data packet (which we ignore)
 234  *
 235  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 236  *                              us to finish writing our data and to shutdown
 237  *                              (we have to close() to move on to LAST_ACK)
 238  *
  239  *      TCP_LAST_ACK            our side has shutdown after remote has
 240  *                              shutdown.  There may still be data in our
 241  *                              buffer that we have to finish sending
 242  *              
 243  *      TCP_CLOSE               socket is finished
 244  */
 245 
 246 /*
 247  * RFC1122 status:
 248  * NOTE: I'm not going to be doing comments in the code for this one except
 249  * for violations and the like.  tcp.c is just too big... If I say something
 250  * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 251  * with Alan. -- MS 950903
 252  * 
 253  * Use of PSH (4.2.2.2)
 254  *   MAY aggregate data sent without the PSH flag. (does)
 255  *   MAY queue data recieved without the PSH flag. (does)
 256  *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 257  *   MAY implement PSH on send calls. (doesn't, thus:)
 258  *     MUST NOT buffer data indefinitely (doesn't [1 second])
 259  *     MUST set PSH on last segment (does)
 260  *   MAY pass received PSH to application layer (doesn't)
 261  *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 262  * 
 263  * Window Size (4.2.2.3, 4.2.2.16)
 264  *   MUST treat window size as an unsigned number (does)
 265  *   SHOULD treat window size as a 32-bit number (does not)
 266  *   MUST NOT shrink window once it is offered (does not normally)
 267  *   
 268  * Urgent Pointer (4.2.2.4)
 269  * **MUST point urgent pointer to last byte of urgent data (not right
 270  *     after). (doesn't, to be like BSD)
 271  *   MUST inform application layer asynchronously of incoming urgent
 272  *     data. (does)
 273  *   MUST provide application with means of determining the amount of
 274  *     urgent data pending. (does)
 275  * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 276  *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 277  *      [Follows BSD 1 byte of urgent data]
 278  * 
 279  * TCP Options (4.2.2.5)
  280  *   MUST be able to receive TCP options in any segment. (does)
 281  *   MUST ignore unsupported options (does)
 282  *   
 283  * Maximum Segment Size Option (4.2.2.6)
 284  *   MUST implement both sending and receiving MSS. (does)
 285  *   SHOULD send an MSS with every SYN where recieve MSS != 536 (MAY send
 286  *     it always). (does, even when MSS == 536, which is legal)
 287  *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 288  *   MUST calculate "effective send MSS" correctly:
 289  *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 290  *     (does - but allows operator override)
 291  *  
 292  * TCP Checksum (4.2.2.7)
 293  *   MUST generate and check TCP checksum. (does)
 294  * 
 295  * Initial Sequence Number Selection (4.2.2.8)
 296  *   MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 297  *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 298  *     necessary for 10Mbps networks - and harder than BSD to spoof!)
 299  * 
 300  * Simultaneous Open Attempts (4.2.2.10)
 301  *   MUST support simultaneous open attempts (does)
 302  * 
 303  * Recovery from Old Duplicate SYN (4.2.2.11)
 304  *   MUST keep track of active vs. passive open (does)
 305  * 
 306  * RST segment (4.2.2.12)
 307  *   SHOULD allow an RST segment to contain data (does, but doesn't do
 308  *     anything with it, which is standard)
 309  * 
 310  * Closing a Connection (4.2.2.13)
  311  *   MUST inform application of whether connection was closed by RST or
 312  *     normal close. (does)
 313  *   MAY allow "half-duplex" close (treat connection as closed for the
 314  *     local app, even before handshake is done). (does)
 315  *   MUST linger in TIME_WAIT for 2 * MSL (does)
 316  * 
 317  * Retransmission Timeout (4.2.2.15)
 318  *   MUST implement Jacobson's slow start and congestion avoidance
 319  *     stuff. (does) 
 320  * 
 321  * Probing Zero Windows (4.2.2.17)
 322  *   MUST support probing of zero windows. (does)
 323  *   MAY keep offered window closed indefinitely. (does)
 324  *   MUST allow remote window to stay closed indefinitely. (does)
 325  * 
 326  * Passive Open Calls (4.2.2.18)
 327  *   MUST NOT let new passive open affect other connections. (doesn't)
 328  *   MUST support passive opens (LISTENs) concurrently. (does)
 329  *   
 330  * Time to Live (4.2.2.19)
 331  *   MUST make TCP TTL configurable. (does - IP_TTL option)
 332  * 
 333  * Event Processing (4.2.2.20)
 334  *   SHOULD queue out-of-order segments. (does)
 335  *   MUST aggregate ACK segments whenever possible. (does but badly)
 336  *   
 337  * Retransmission Timeout Calculation (4.2.3.1)
 338  *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 339  *     calculation. (does, or at least explains them in the comments 8*b)
 340  *  SHOULD initialize RTO to 0 and RTT to 3. (does)
 341  * 
 342  * When to Send an ACK Segment (4.2.3.2)
 343  *   SHOULD implement delayed ACK. (does not)
 344  *   MUST keep ACK delay < 0.5 sec. (N/A)
 345  * 
 346  * When to Send a Window Update (4.2.3.3)
 347  *   MUST implement receiver-side SWS. (does)
 348  *   
 349  * When to Send Data (4.2.3.4)
 350  *   MUST implement sender-side SWS. (does - imperfectly)
 351  *   SHOULD implement Nagle algorithm. (does)
 352  * 
 353  * TCP Connection Failures (4.2.3.5)
 354  *  MUST handle excessive retransmissions "properly" (see the RFC). (does)
 355  *   SHOULD inform application layer of soft errors. (doesn't)
 356  *   
 357  * TCP Keep-Alives (4.2.3.6)
 358  *   MAY provide keep-alives. (does)
 359  *   MUST make keep-alives configurable on a per-connection basis. (does)
 360  *   MUST default to no keep-alives. (does)
 361  * **MUST make keep-alive interval configurable. (doesn't)
 362  * **MUST make default keep-alive interval > 2 hours. (doesn't)
 363  *   MUST NOT interpret failure to ACK keep-alive packet as dead
 364  *     connection. (doesn't)
 365  *   SHOULD send keep-alive with no data. (does)
 366  * 
 367  * TCP Multihoming (4.2.3.7)
 368  *   MUST get source address from IP layer before sending first
 369  *     SYN. (does)
 370  *   MUST use same local address for all segments of a connection. (does)
 371  * 
 372  * IP Options (4.2.3.8)
 373  *   (I don't think the IP layer sees the IP options, yet.)
 374  *   MUST ignore unsupported IP options. (does, I guess 8*b)
 375  *   MAY support Time Stamp and Record Route. (doesn't)
 376  * **MUST allow application to specify a source route. (doesn't?)
  377  * **MUST allow received Source Route option to set route for all future
 378  *     segments on this connection. (doesn't, not that I think it's a
 379  *     huge problem)
 380  * 
 381  * ICMP messages (4.2.3.9)
 382  *   MUST act on ICMP errors. (does)
 383  *   MUST slow transmission upon receipt of a Source Quench. (does)
 384  *   MUST NOT abort connection upon receipt of soft Destination
 385  *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 386  *     Problems. (doesn't)
 387  *   SHOULD report soft Destination Unreachables etc. to the
 388  *     application. (doesn't)
 389  *   SHOULD abort connection upon receipt of hard Destination Unreachable
 390  *     messages (2, 3, 4). (does)
 391  * 
 392  * Remote Address Validation (4.2.3.10)
 393  *   MUST reject as an error OPEN for invalid remote IP address. (does)
 394  *   MUST ignore SYN with invalid source address. (does)
 395  *   MUST silently discard incoming SYN for broadcast/multicast
 396  *     address. (does) 
 397  * 
 398  * Asynchronous Reports (4.2.4.1)
 399  * **MUST provide mechanism for reporting soft errors to application
 400  *     layer. (doesn't)
 401  * 
 402  * Type of Service (4.2.4.2)
 403  *   MUST allow application layer to set Type of Service. (does IP_TOS)
 404  * 
 405  * (Whew. -- MS 950903)
 406  **/
 407 
 408 #include <linux/types.h>
 409 #include <linux/sched.h>
 410 #include <linux/mm.h>
 411 #include <linux/time.h>
 412 #include <linux/string.h>
 413 #include <linux/config.h>
 414 #include <linux/socket.h>
 415 #include <linux/sockios.h>
 416 #include <linux/termios.h>
 417 #include <linux/in.h>
 418 #include <linux/fcntl.h>
 419 #include <linux/inet.h>
 420 #include <linux/netdevice.h>
 421 #include <net/snmp.h>
 422 #include <net/ip.h>
 423 #include <net/protocol.h>
 424 #include <net/icmp.h>
 425 #include <net/tcp.h>
 426 #include <net/arp.h>
 427 #include <linux/skbuff.h>
 428 #include <net/sock.h>
 429 #include <net/route.h>
 430 #include <linux/errno.h>
 431 #include <linux/timer.h>
 432 #include <asm/system.h>
 433 #include <asm/segment.h>
 434 #include <linux/mm.h>
 435 #include <net/checksum.h>
 436 
 437 /*
 438  *      The MSL timer is the 'normal' timer.
 439  */
 440  
 441 #define reset_msl_timer(x,y,z)  reset_timer(x,y,z)
 442 
 443 #define SEQ_TICK 3
 444 unsigned long seq_offset;
 445 struct tcp_mib  tcp_statistics;
 446 
 447 /*
 448  *      Cached last hit socket
 449  */
 450  
 451 volatile unsigned long  th_cache_saddr,th_cache_daddr;
 452 volatile unsigned short  th_cache_dport, th_cache_sport;
 453 volatile struct sock *th_cache_sk;
 454 
 455 void tcp_cache_zap(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 456 {
 457         unsigned long flags;
 458         save_flags(flags);
 459         cli();
 460         th_cache_saddr=0;
 461         th_cache_daddr=0;
 462         th_cache_dport=0;
 463         th_cache_sport=0;
 464         th_cache_sk=NULL;
 465         restore_flags(flags);
 466 }
 467 
 468 static void tcp_close(struct sock *sk, int timeout);
 469 
 470 
 471 /*
 472  *      The less said about this the better, but it works and will do for 1.2 
 473  */
 474 
 475 static struct wait_queue *master_select_wakeup;
 476 
 477 static __inline__ int min(unsigned int a, unsigned int b)
     /* [previous][next][first][last][top][bottom][index][help] */
 478 {
 479         if (a < b) 
 480                 return(a);
 481         return(b);
 482 }
 483 
 484 #undef STATE_TRACE
 485 
 486 #ifdef STATE_TRACE
 487 static char *statename[]={
 488         "Unused","Established","Syn Sent","Syn Recv",
 489         "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
 490         "Close Wait","Last ACK","Listen","Closing"
 491 };
 492 #endif
 493 
 494 static __inline__ void tcp_set_state(struct sock *sk, int state)
     /* [previous][next][first][last][top][bottom][index][help] */
 495 {
 496         if(sk->state==TCP_ESTABLISHED)
 497                 tcp_statistics.TcpCurrEstab--;
 498 #ifdef STATE_TRACE
 499         if(sk->debug)
 500                 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
 501 #endif  
 502         /* This is a hack but it doesn't occur often and it's going to
 503            be a real        to fix nicely */
 504            
 505         if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
 506         {
 507                 wake_up_interruptible(&master_select_wakeup);
 508         }
 509         sk->state=state;
 510         if(state==TCP_ESTABLISHED)
 511                 tcp_statistics.TcpCurrEstab++;
 512         if(sk->state==TCP_CLOSE)
 513                 tcp_cache_zap();
 514 }
 515 
 516 /*
 517  *      This routine picks a TCP windows for a socket based on
 518  *      the following constraints
 519  *  
 520  *      1. The window can never be shrunk once it is offered (RFC 793)
 521  *      2. We limit memory per socket
 522  *   
 523  *      For now we use NET2E3's heuristic of offering half the memory
 524  *      we have handy. All is not as bad as this seems however because
 525  *      of two things. Firstly we will bin packets even within the window
 526  *      in order to get the data we are waiting for into the memory limit.
 527  *      Secondly we bin common duplicate forms at receive time
 528  *      Better heuristics welcome
 529  */
 530    
 531 int tcp_select_window(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 532 {
 533         int new_window = sock_rspace(sk);
 534         
 535         if(sk->window_clamp)
 536                 new_window=min(sk->window_clamp,new_window);
 537         /*
 538          *      Two things are going on here.  First, we don't ever offer a
 539          *      window less than min(sk->mss, MAX_WINDOW/2).  This is the
 540          *      receiver side of SWS as specified in RFC1122.
 541          *      Second, we always give them at least the window they
 542          *      had before, in order to avoid retracting window.  This
 543          *      is technically allowed, but RFC1122 advises against it and
 544          *      in practice it causes trouble.
 545          *
 546          *      Fixme: This doesn't correctly handle the case where
 547          *      new_window > sk->window but not by enough to allow for the
 548          *      shift in sequence space. 
 549          */
 550         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
 551                 return(sk->window);
 552         return(new_window);
 553 }
 554 
 555 /*
 556  *      Find someone to 'accept'. Must be called with
 557  *      sk->inuse=1 or cli()
 558  */ 
 559 
 560 static struct sk_buff *tcp_find_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 561 {
 562         struct sk_buff *p=skb_peek(&s->receive_queue);
 563         if(p==NULL)
 564                 return NULL;
 565         do
 566         {
 567                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 568                         return p;
 569                 p=p->next;
 570         }
 571         while(p!=(struct sk_buff *)&s->receive_queue);
 572         return NULL;
 573 }
 574 
 575 /*
 576  *      Remove a completed connection and return it. This is used by
 577  *      tcp_accept() to get connections from the queue.
 578  */
 579 
 580 static struct sk_buff *tcp_dequeue_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 581 {
 582         struct sk_buff *skb;
 583         unsigned long flags;
 584         save_flags(flags);
 585         cli(); 
 586         skb=tcp_find_established(s);
 587         if(skb!=NULL)
 588                 skb_unlink(skb);        /* Take it off the queue */
 589         restore_flags(flags);
 590         return skb;
 591 }
 592 
 593 /* 
 594  *      This routine closes sockets which have been at least partially
 595  *      opened, but not yet accepted. Currently it is only called by
 596  *      tcp_close, and timeout mirrors the value there. 
 597  */
 598 
 599 static void tcp_close_pending (struct sock *sk) 
     /* [previous][next][first][last][top][bottom][index][help] */
 600 {
 601         struct sk_buff *skb;
 602 
 603         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) 
 604         {
 605                 skb->sk->dead=1;
 606                 tcp_close(skb->sk, 0);
 607                 kfree_skb(skb, FREE_READ);
 608         }
 609         return;
 610 }
 611 
 612 /*
 613  *      Enter the time wait state. 
 614  */
 615 
 616 static void tcp_time_wait(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 617 {
 618         tcp_set_state(sk,TCP_TIME_WAIT);
 619         sk->shutdown = SHUTDOWN_MASK;
 620         if (!sk->dead)
 621                 sk->state_change(sk);
 622         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 623 }
 624 
 625 /*
 626  *      A socket has timed out on its send queue and wants to do a
 627  *      little retransmitting. Currently this means TCP.
 628  */
 629 
 630 void tcp_do_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 631 {
 632         struct sk_buff * skb;
 633         struct proto *prot;
 634         struct device *dev;
 635         int ct=0;
 636         struct rtable *rt;
 637 
 638         prot = sk->prot;
 639         skb = sk->send_head;
 640 
 641         while (skb != NULL)
 642         {
 643                 struct tcphdr *th;
 644                 struct iphdr *iph;
 645                 int size;
 646 
 647                 dev = skb->dev;
 648                 IS_SKB(skb);
 649                 skb->when = jiffies;
 650 
 651                 /*
 652                  *      Discard the surplus MAC header
 653                  */
 654                  
 655                 skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);
 656 
 657                 /*
 658                  * In general it's OK just to use the old packet.  However we
 659                  * need to use the current ack and window fields.  Urg and
 660                  * urg_ptr could possibly stand to be updated as well, but we
 661                  * don't keep the necessary data.  That shouldn't be a problem,
 662                  * if the other end is doing the right thing.  Since we're
 663                  * changing the packet, we have to issue a new IP identifier.
 664                  */
 665 
 666                 iph = (struct iphdr *)skb->data;
 667                 th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
 668                 size = ntohs(iph->tot_len) - (iph->ihl<<2);
 669                 
 670                 /*
 671                  *      Note: We ought to check for window limits here but
 672                  *      currently this is done (less efficiently) elsewhere.
 673                  */
 674 
 675                 iph->id = htons(ip_id_count++);
 676                 ip_send_check(iph);
 677                 
 678                 /*
 679                  *      Put a MAC header back on (may cause ARPing)
 680                  */
 681                  
 682                 if(skb->localroute)
 683                         rt=ip_rt_local(iph->daddr,NULL,NULL);
 684                 else
 685                         rt=ip_rt_route(iph->daddr,NULL,NULL);
 686                         
 687                 if(rt==NULL)    /* Deep poo */
 688                 {
 689                         if(skb->sk)
 690                         {
 691                                 skb->sk->err=ENETUNREACH;
 692                                 skb->sk->error_report(skb->sk);
 693                         }
 694                 }
 695                 else
 696                 {
 697                         dev=rt->rt_dev;
 698                         skb->raddr=rt->rt_gateway;
 699                         if(skb->raddr==0)
 700                                 skb->raddr=iph->daddr;
 701                         skb->dev=dev;
 702                         skb->arp=1;
 703                         if(dev->hard_header)
 704                         {
 705                                 if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
 706                                         skb->arp=0;
 707                         }
 708                 
 709                         /*
 710                          *      This is not the right way to handle this. We have to
 711                          *      issue an up to date window and ack report with this 
 712                          *      retransmit to keep the odd buggy tcp that relies on 
 713                          *      the fact BSD does this happy. 
 714                          *      We don't however need to recalculate the entire 
 715                          *      checksum, so someone wanting a small problem to play
 716                          *      with might like to implement RFC1141/RFC1624 and speed
 717                          *      this up by avoiding a full checksum.
 718                          */
 719                  
 720                         th->ack_seq = ntohl(sk->acked_seq);
 721                         th->window = ntohs(tcp_select_window(sk));
 722                         tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
 723                 
 724                         /*
 725                          *      If the interface is (still) up and running, kick it.
 726                          */
 727         
 728                         if (dev->flags & IFF_UP)
 729                         {
 730                                 /*
 731                                  *      If the packet is still being sent by the device/protocol
 732                                  *      below then don't retransmit. This is both needed, and good -
 733                                  *      especially with connected mode AX.25 where it stops resends
 734                                  *      occurring of an as yet unsent anyway frame!
 735                                  *      We still add up the counts as the round trip time wants
 736                                  *      adjusting.
 737                                  */
 738                                 if (sk && !skb_device_locked(skb))
 739                                 {
 740                                         /* Remove it from any existing driver queue first! */
 741                                         skb_unlink(skb);
 742                                         /* Now queue it */
 743                                         ip_statistics.IpOutRequests++;
 744                                         dev_queue_xmit(skb, dev, sk->priority);
 745                                 }
 746                         }
 747                 }
 748                 
 749                 /*
 750                  *      Count retransmissions
 751                  */
 752                  
 753                 ct++;
 754                 sk->prot->retransmits ++;
 755                 tcp_statistics.TcpRetransSegs++;
 756                 
 757 
 758                 /*
 759                  *      Only one retransmit requested.
 760                  */
 761         
 762                 if (!all)
 763                         break;
 764 
 765                 /*
 766                  *      This should cut it off before we send too many packets.
 767                  */
 768 
 769                 if (ct >= sk->cong_window)
 770                         break;
 771                 skb = skb->link3;
 772         }
 773 }
 774 
 775 /*
 776  *      Reset the retransmission timer
 777  */
 778  
 779 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
     /* [previous][next][first][last][top][bottom][index][help] */
 780 {
 781         del_timer(&sk->retransmit_timer);
 782         sk->ip_xmit_timeout = why;
 783         if((int)when < 0)
 784         {
 785                 when=3;
 786                 printk("Error: Negative timer in xmit_timer\n");
 787         }
 788         sk->retransmit_timer.expires=jiffies+when;
 789         add_timer(&sk->retransmit_timer);
 790 }
 791 
 792 /*
 793  *      This is the normal code called for timeouts.  It does the retransmission
 794  *      and then does backoff.  tcp_do_retransmit is separated out because
 795  *      tcp_ack needs to send stuff from the retransmit queue without
 796  *      initiating a backoff.
 797  */
 798 
 799 
 800 void tcp_retransmit_time(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 801 {
 802         tcp_do_retransmit(sk, all);
 803 
 804         /*
 805          * Increase the timeout each time we retransmit.  Note that
 806          * we do not increase the rtt estimate.  rto is initialized
 807          * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
 808          * that doubling rto each time is the least we can get away with.
 809          * In KA9Q, Karn uses this for the first few times, and then
 810          * goes to quadratic.  netBSD doubles, but only goes up to *64,
 811          * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
 812          * defined in the protocol as the maximum possible RTT.  I guess
 813          * we'll have to use something other than TCP to talk to the
 814          * University of Mars.
 815          *
 816          * PAWS allows us longer timeouts and large windows, so once
 817          * implemented ftp to mars will work nicely. We will have to fix
 818          * the 120 second clamps though!
 819          */
 820 
 821         sk->retransmits++;
 822         sk->prot->retransmits++;
 823         sk->backoff++;
 824         sk->rto = min(sk->rto << 1, 120*HZ);
 825         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
 826 }
 827 
 828 
 829 /*
 830  *      A timer event has trigger a tcp retransmit timeout. The
 831  *      socket xmit queue is ready and set up to send. Because
 832  *      the ack receive code keeps the queue straight we do
 833  *      nothing clever here.
 834  */
 835 
 836 static void tcp_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 837 {
 838         if (all) 
 839         {
 840                 tcp_retransmit_time(sk, all);
 841                 return;
 842         }
 843 
 844         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 845         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 846         sk->cong_count = 0;
 847 
 848         sk->cong_window = 1;
 849 
 850         /* Do the actual retransmit. */
 851         tcp_retransmit_time(sk, all);
 852 }
 853 
 854 /*
 855  *      A write timeout has occurred. Process the after effects.
 856  */
 857 
 858 static int tcp_write_timeout(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 859 {
 860         /*
 861          *      Look for a 'soft' timeout.
 862          */
 863         if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
 864                 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
 865         {
 866                 /*
 867                  *      Attempt to recover if arp has changed (unlikely!) or
 868                  *      a route has shifted (not supported prior to 1.3).
 869                  */
 870                 arp_destroy (sk->daddr, 0);
 871                 /*ip_route_check (sk->daddr);*/
 872         }
 873         
 874         /*
 875          *      Have we tried to SYN too many times (repent repent 8))
 876          */
 877          
 878         if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
 879         {
 880                 sk->err=ETIMEDOUT;
 881                 sk->error_report(sk);
 882                 del_timer(&sk->retransmit_timer);
 883                 tcp_statistics.TcpAttemptFails++;       /* Is this right ??? - FIXME - */
 884                 tcp_set_state(sk,TCP_CLOSE);
 885                 /* Don't FIN, we got nothing back */
 886                 release_sock(sk);
 887                 return 0;
 888         }
 889         /*
 890          *      Has it gone just too far ?
 891          */
 892         if (sk->retransmits > TCP_RETR2) 
 893         {
 894                 sk->err = ETIMEDOUT;
 895                 sk->error_report(sk);
 896                 del_timer(&sk->retransmit_timer);
 897                 /*
 898                  *      Time wait the socket 
 899                  */
 900                 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) 
 901                 {
 902                         tcp_set_state(sk,TCP_TIME_WAIT);
 903                         reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 904                 }
 905                 else
 906                 {
 907                         /*
 908                          *      Clean up time.
 909                          */
 910                         tcp_set_state(sk, TCP_CLOSE);
 911                         release_sock(sk);
 912                         return 0;
 913                 }
 914         }
 915         return 1;
 916 }
 917 
 918 /*
 919  *      The TCP retransmit timer. This lacks a few small details.
 920  *
 921  *      1.      An initial rtt timeout on the probe0 should cause what we can
 922  *              of the first write queue buffer to be split and sent.
 923  *      2.      On a 'major timeout' as defined by RFC1122 we shouldn't report
 924  *              ETIMEDOUT if we know an additional 'soft' error caused this.
 925  *              tcp_err should save a 'soft error' for us.
 926  */
 927 
 928 static void retransmit_timer(unsigned long data)
     /* [previous][next][first][last][top][bottom][index][help] */
 929 {
 930         struct sock *sk = (struct sock*)data;
 931         int why = sk->ip_xmit_timeout;
 932 
 933         /* 
 934          * only process if socket is not in use
 935          */
 936 
 937         cli();
 938         if (sk->inuse || in_bh) 
 939         {
 940                 /* Try again in 1 second */
 941                 sk->retransmit_timer.expires = jiffies+HZ;
 942                 add_timer(&sk->retransmit_timer);
 943                 sti();
 944                 return;
 945         }
 946 
 947         sk->inuse = 1;
 948         sti();
 949 
 950         /* Always see if we need to send an ack. */
 951 
 952         if (sk->ack_backlog && !sk->zapped) 
 953         {
 954                 sk->prot->read_wakeup (sk);
 955                 if (! sk->dead)
 956                         sk->data_ready(sk,0);
 957         }
 958 
 959         /* Now we need to figure out why the socket was on the timer. */
 960 
 961         switch (why) 
 962         {
 963                 /* Window probing */
 964                 case TIME_PROBE0:
 965                         tcp_send_probe0(sk);
 966                         tcp_write_timeout(sk);
 967                         break;
 968                 /* Retransmitting */
 969                 case TIME_WRITE:
 970                         /* It could be we got here because we needed to send an ack.
 971                          * So we need to check for that.
 972                          */
 973                 {
 974                         struct sk_buff *skb;
 975                         unsigned long flags;
 976 
 977                         save_flags(flags);
 978                         cli();
 979                         skb = sk->send_head;
 980                         if (!skb) 
 981                         {
 982                                 restore_flags(flags);
 983                         } 
 984                         else 
 985                         {
 986                                 /*
 987                                  *      Kicked by a delayed ack. Reset timer
 988                                  *      correctly now
 989                                  */
 990                                 if (jiffies < skb->when + sk->rto) 
 991                                 {
 992                                         reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
 993                                         restore_flags(flags);
 994                                         break;
 995                                 }
 996                                 restore_flags(flags);
 997                                 /*
 998                                  *      Retransmission
 999                                  */
1000                                 sk->retransmits++;
1001                                 sk->prot->retransmits++;
1002                                 sk->prot->retransmit (sk, 0);
1003                                 tcp_write_timeout(sk);
1004                         }
1005                         break;
1006                 }
1007                 /* Sending Keepalives */
1008                 case TIME_KEEPOPEN:
1009                         /* 
1010                          * this reset_timer() call is a hack, this is not
1011                          * how KEEPOPEN is supposed to work.
1012                          */
1013                         reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1014 
1015                         /* Send something to keep the connection open. */
1016                         if (sk->prot->write_wakeup)
1017                                   sk->prot->write_wakeup (sk);
1018                         sk->retransmits++;
1019                         sk->prot->retransmits++;
1020                         tcp_write_timeout(sk);
1021                         break;
1022                 default:
1023                         printk ("rexmit_timer: timer expired - reason unknown\n");
1024                         break;
1025         }
1026         release_sock(sk);
1027 }
1028 
1029 /*
1030  * This routine is called by the ICMP module when it gets some
1031  * sort of error condition.  If err < 0 then the socket should
1032  * be closed and the error returned to the user.  If err > 0
1033  * it's just the icmp type << 8 | icmp code.  After adjustment
1034  * header points to the first 8 bytes of the tcp header.  We need
1035  * to find the appropriate port.
1036  */
1037 
1038 void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
     /* [previous][next][first][last][top][bottom][index][help] */
1039         __u32 saddr, struct inet_protocol *protocol)
1040 {
1041         struct tcphdr *th;
1042         struct sock *sk;
1043         struct iphdr *iph=(struct iphdr *)header;
1044   
1045         header+=4*iph->ihl;
1046    
1047 
1048         th =(struct tcphdr *)header;
1049         sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
1050 
1051         if (sk == NULL) 
1052                 return;
1053   
1054         if (type == ICMP_SOURCE_QUENCH) 
1055         {
1056                 /*
1057                  * FIXME:
1058                  * For now we will just trigger a linear backoff.
1059                  * The slow start code should cause a real backoff here.
1060                  */
1061                 if (sk->cong_window > 4)
1062                         sk->cong_window--;
1063                 return;
1064         }
1065         
1066         if (type == ICMP_PARAMETERPROB)
1067         {
1068                 sk->err=EPROTO;
1069                 sk->error_report(sk);
1070         }
1071 
1072         /*
1073          * If we've already connected we will keep trying
1074          * until we time out, or the user gives up.
1075          */
1076 
1077         if (code < 13 && (icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV))
1078         {
1079                 sk->err = icmp_err_convert[code].errno;
1080                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) 
1081                 {
1082                         tcp_statistics.TcpAttemptFails++;
1083                         tcp_set_state(sk,TCP_CLOSE);
1084                         sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
1085                 }
1086         }
1087         return;
1088 }
1089 
1090 
1091 /*
1092  *      Walk down the receive queue counting readable data until we hit the end or we find a gap
1093  *      in the received data queue (ie a frame missing that needs sending to us). Not
1094  *      sorting using two queues as data arrives makes life so much harder.
1095  */
1096 
1097 static int tcp_readable(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1098 {
1099         unsigned long counted;
1100         unsigned long amount;
1101         struct sk_buff *skb;
1102         int sum;
1103         unsigned long flags;
1104 
1105         if(sk && sk->debug)
1106                 printk("tcp_readable: %p - ",sk);
1107 
1108         save_flags(flags);
1109         cli();
1110         if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
1111         {
1112                 restore_flags(flags);
1113                 if(sk && sk->debug) 
1114                         printk("empty\n");
1115                 return(0);
1116         }
1117   
1118         counted = sk->copied_seq;       /* Where we are at the moment */
1119         amount = 0;
1120   
1121         /* 
1122          *      Do until a push or until we are out of data. 
1123          */
1124          
1125         do 
1126         {
1127                 if (before(counted, skb->h.th->seq))    /* Found a hole so stops here */
1128                         break;
1129                 sum = skb->len -(counted - skb->h.th->seq);     /* Length - header but start from where we are up to (avoid overlaps) */
1130                 if (skb->h.th->syn)
1131                         sum++;
1132                 if (sum > 0) 
1133                 {                                       /* Add it up, move on */
1134                         amount += sum;
1135                         if (skb->h.th->syn) 
1136                                 amount--;
1137                         counted += sum;
1138                 }
1139                 /*
1140                  * Don't count urg data ... but do it in the right place!
1141                  * Consider: "old_data (ptr is here) URG PUSH data"
1142                  * The old code would stop at the first push because
1143                  * it counted the urg (amount==1) and then does amount--
1144                  * *after* the loop.  This means tcp_readable() always
1145                  * returned zero if any URG PUSH was in the queue, even
1146                  * though there was normal data available. If we subtract
1147                  * the urg data right here, we even get it to work for more
1148                  * than one URG PUSH skb without normal data.
1149                  * This means that select() finally works now with urg data
1150                  * in the queue.  Note that rlogin was never affected
1151                  * because it doesn't use select(); it uses two processes
1152                  * and a blocking read().  And the queue scan in tcp_read()
1153                  * was correct.  Mike <pall@rz.uni-karlsruhe.de>
1154                  */
1155                 if (skb->h.th->urg)
1156                         amount--;       /* don't count urg data */
1157                 if (amount && skb->h.th->psh) break;
1158                 skb = skb->next;
1159         }
1160         while(skb != (struct sk_buff *)&sk->receive_queue);
1161 
1162         restore_flags(flags);
1163         if(sk->debug)
1164                 printk("got %lu bytes.\n",amount);
1165         return(amount);
1166 }
1167 
1168 /*
1169  * LISTEN is a special case for select..
1170  */
1171 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
1172 {
1173         if (sel_type == SEL_IN) {
1174                 int retval;
1175 
1176                 sk->inuse = 1;
1177                 retval = (tcp_find_established(sk) != NULL);
1178                 release_sock(sk);
1179                 if (!retval)
1180                         select_wait(&master_select_wakeup,wait);
1181                 return retval;
1182         }
1183         return 0;
1184 }
1185 
1186 
1187 /*
1188  *      Wait for a TCP event.
1189  *
1190  *      Note that we don't need to set "sk->inuse", as the upper select layers
1191  *      take care of normal races (between the test and the event) and we don't
1192  *      go look at any of the socket buffers directly.
1193  */
1194 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
1195 {
1196         if (sk->state == TCP_LISTEN)
1197                 return tcp_listen_select(sk, sel_type, wait);
1198 
1199         switch(sel_type) {
1200         case SEL_IN:
1201                 if (sk->err)
1202                         return 1;
1203                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1204                         break;
1205 
1206                 if (sk->shutdown & RCV_SHUTDOWN)
1207                         return 1;
1208                         
1209                 if (sk->acked_seq == sk->copied_seq)
1210                         break;
1211 
1212                 if (sk->urg_seq != sk->copied_seq ||
1213                     sk->acked_seq != sk->copied_seq+1 ||
1214                     sk->urginline || !sk->urg_data)
1215                         return 1;
1216                 break;
1217 
1218         case SEL_OUT:
1219                 if (sk->err)
1220                         return 1;
1221                 if (sk->shutdown & SEND_SHUTDOWN) 
1222                         return 0;
1223                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1224                         break;
1225                 /*
1226                  * This is now right thanks to a small fix
1227                  * by Matt Dillon.
1228                  */
1229 
1230                 if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
1231                         break;
1232                 return 1;
1233 
1234         case SEL_EX:
1235                 if (sk->urg_data)
1236                         return 1;
1237                 break;
1238         }
1239         select_wait(sk->sleep, wait);
1240         return 0;
1241 }
1242 
1243 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
     /* [previous][next][first][last][top][bottom][index][help] */
1244 {
1245         int err;
1246         switch(cmd) 
1247         {
1248 
1249                 case TIOCINQ:
1250 #ifdef FIXME    /* FIXME: */
1251                 case FIONREAD:
1252 #endif
1253                 {
1254                         unsigned long amount;
1255 
1256                         if (sk->state == TCP_LISTEN) 
1257                                 return(-EINVAL);
1258 
1259                         sk->inuse = 1;
1260                         amount = tcp_readable(sk);
1261                         release_sock(sk);
1262                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1263                         if(err)
1264                                 return err;
1265                         put_user(amount, (int *)arg);
1266                         return(0);
1267                 }
1268                 case SIOCATMARK:
1269                 {
1270                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
1271 
1272                         err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1273                         if (err)
1274                                 return err;
1275                         put_user(answ,(int *) arg);
1276                         return(0);
1277                 }
1278                 case TIOCOUTQ:
1279                 {
1280                         unsigned long amount;
1281 
1282                         if (sk->state == TCP_LISTEN) return(-EINVAL);
1283                         amount = sock_wspace(sk);
1284                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1285                         if(err)
1286                                 return err;
1287                         put_user(amount, (int *)arg);
1288                         return(0);
1289                 }
1290                 default:
1291                         return(-EINVAL);
1292         }
1293 }
1294 
1295 
1296 /*
1297  *      This routine computes a TCP checksum. 
1298  *
1299  *      Modified January 1995 from a go-faster DOS routine by
1300  *      Jorge Cwik <jorge@laser.satlink.net>
1301  */
1302  
1303 unsigned short tcp_check(struct tcphdr *th, int len,
     /* [previous][next][first][last][top][bottom][index][help] */
1304           unsigned long saddr, unsigned long daddr, unsigned long base)
1305 {     
1306         return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
1307 }
1308 
1309 
1310 
1311 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
     /* [previous][next][first][last][top][bottom][index][help] */
1312                 unsigned long daddr, int len, struct sock *sk)
1313 {
1314         th->check = 0;
1315         th->check = tcp_check(th, len, saddr, daddr,
1316                 csum_partial((char *)th,len,0));
1317         return;
1318 }
1319 
1320 /*
1321  *      This is the main buffer sending routine. We queue the buffer
1322  *      having checked it is sane seeming.
1323  */
1324  
1325 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
     /* [previous][next][first][last][top][bottom][index][help] */
1326 {
1327         int size;
1328         struct tcphdr * th = skb->h.th;
1329 
1330         /*
1331          *      length of packet (not counting length of pre-tcp headers) 
1332          */
1333          
1334         size = skb->len - ((unsigned char *) th - skb->data);
1335 
1336         /*
1337          *      Sanity check it.. 
1338          */
1339          
1340         if (size < sizeof(struct tcphdr) || size > skb->len) 
1341         {
1342                 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1343                         skb, skb->data, th, skb->len);
1344                 kfree_skb(skb, FREE_WRITE);
1345                 return;
1346         }
1347 
1348         /*
1349          *      If we have queued a header size packet.. (these crash a few
1350          *      tcp stacks if ack is not set)
1351          */
1352          
1353         if (size == sizeof(struct tcphdr)) 
1354         {
1355                 /* If it's got a syn or fin it's notionally included in the size..*/
1356                 if(!th->syn && !th->fin) 
1357                 {
1358                         printk("tcp_send_skb: attempt to queue a bogon.\n");
1359                         kfree_skb(skb,FREE_WRITE);
1360                         return;
1361                 }
1362         }
1363 
1364         /*
1365          *      Actual processing.
1366          */
1367          
1368         tcp_statistics.TcpOutSegs++;  
1369         skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1370         
1371         /*
1372          *      We must queue if
1373          *
1374          *      a) The right edge of this frame exceeds the window
1375          *      b) We are retransmitting (Nagle's rule)
1376          *      c) We have too many packets 'in flight'
1377          */
1378          
1379         if (after(skb->h.seq, sk->window_seq) ||
1380             (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1381              sk->packets_out >= sk->cong_window) 
1382         {
1383                 /* checksum will be supplied by tcp_write_xmit.  So
1384                  * we shouldn't need to set it at all.  I'm being paranoid */
1385                 th->check = 0;
1386                 if (skb->next != NULL) 
1387                 {
1388                         printk("tcp_send_partial: next != NULL\n");
1389                         skb_unlink(skb);
1390                 }
1391                 skb_queue_tail(&sk->write_queue, skb);
1392                 
1393                 /*
1394                  *      If we don't fit we have to start the zero window
1395                  *      probes. This is broken - we really need to do a partial
1396                  *      send _first_ (This is what causes the Cisco and PC/TCP
1397                  *      grief).
1398                  */
1399                  
1400                 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1401                     sk->send_head == NULL && sk->ack_backlog == 0)
1402                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1403         } 
1404         else 
1405         {
1406                 /*
1407                  *      This is going straight out
1408                  */
1409                  
1410                 th->ack_seq = ntohl(sk->acked_seq);
1411                 th->window = ntohs(tcp_select_window(sk));
1412 
1413                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1414 
1415                 sk->sent_seq = sk->write_seq;
1416                 
1417                 /*
1418                  *      This is mad. The tcp retransmit queue is put together
1419                  *      by the ip layer. This causes half the problems with
1420                  *      unroutable FIN's and other things.
1421                  */
1422                  
1423                 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1424                 
1425                 /*
1426                  *      Set for next retransmit based on expected ACK time.
1427                  *      FIXME: We set this every time which means our 
1428                  *      retransmits are really about a window behind.
1429                  */
1430 
1431                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1432         }
1433 }
1434 
1435 /*
1436  *      Locking problems lead us to a messy situation where we can have
1437  *      multiple partially complete buffers queued up. This is really bad
1438  *      as we don't want to be sending partial buffers. Fix this with
1439  *      a semaphore or similar to lock tcp_write per socket.
1440  *
1441  *      These routines are pretty self descriptive.
1442  */
1443  
1444 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1445 {
1446         struct sk_buff * skb;
1447         unsigned long flags;
1448 
1449         save_flags(flags);
1450         cli();
1451         skb = sk->partial;
1452         if (skb) {
1453                 sk->partial = NULL;
1454                 del_timer(&sk->partial_timer);
1455         }
1456         restore_flags(flags);
1457         return skb;
1458 }
1459 
1460 /*
1461  *      Empty the partial queue
1462  */
1463  
1464 static void tcp_send_partial(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1465 {
1466         struct sk_buff *skb;
1467 
1468         if (sk == NULL)
1469                 return;
1470         while ((skb = tcp_dequeue_partial(sk)) != NULL)
1471                 tcp_send_skb(sk, skb);
1472 }
1473 
1474 /*
1475  *      Queue a partial frame
1476  */
1477  
1478 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1479 {
1480         struct sk_buff * tmp;
1481         unsigned long flags;
1482 
1483         save_flags(flags);
1484         cli();
1485         tmp = sk->partial;
1486         if (tmp)
1487                 del_timer(&sk->partial_timer);
1488         sk->partial = skb;
1489         init_timer(&sk->partial_timer);
1490         /*
1491          *      Wait up to 1 second for the buffer to fill.
1492          */
1493         sk->partial_timer.expires = jiffies+HZ;
1494         sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
1495         sk->partial_timer.data = (unsigned long) sk;
1496         add_timer(&sk->partial_timer);
1497         restore_flags(flags);
1498         if (tmp)
1499                 tcp_send_skb(sk, tmp);
1500 }
1501 
1502 
1503 /*
1504  *      This routine sends an ack and also updates the window. 
1505  */
1506  
1507 static void tcp_send_ack(u32 sequence, u32 ack,
     /* [previous][next][first][last][top][bottom][index][help] */
1508              struct sock *sk,
1509              struct tcphdr *th, unsigned long daddr)
1510 {
1511         struct sk_buff *buff;
1512         struct tcphdr *t1;
1513         struct device *dev = NULL;
1514         int tmp;
1515 
1516         if(sk->zapped)
1517                 return;         /* We have been reset, we may not send again */
1518                 
1519         /*
1520          * We need to grab some memory, and put together an ack,
1521          * and then put it into the queue to be sent.
1522          */
1523 
1524         buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1525         if (buff == NULL) 
1526         {
1527                 /* 
1528                  *      Force it to send an ack. We don't have to do this
1529                  *      (ACK is unreliable) but it's much better use of 
1530                  *      bandwidth on slow links to send a spare ack than
1531                  *      resend packets. 
1532                  */
1533                  
1534                 sk->ack_backlog++;
1535                 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) 
1536                 {
1537                         reset_xmit_timer(sk, TIME_WRITE, HZ);
1538                 }
1539                 return;
1540         }
1541 
1542         /*
1543          *      Assemble a suitable TCP frame
1544          */
1545          
1546         buff->sk = sk;
1547         buff->localroute = sk->localroute;
1548 
1549         /* 
1550          *      Put in the IP header and routing stuff. 
1551          */
1552          
1553         tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1554                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1555         if (tmp < 0) 
1556         {
1557                 buff->free = 1;
1558                 sock_wfree(sk, buff);
1559                 return;
1560         }
1561         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
1562 
1563         memcpy(t1, th, sizeof(*t1));
1564 
1565         /*
1566          *      Swap the send and the receive. 
1567          */
1568          
1569         t1->dest = th->source;
1570         t1->source = th->dest;
1571         t1->seq = ntohl(sequence);
1572         t1->ack = 1;
1573         sk->window = tcp_select_window(sk);
1574         t1->window = ntohs(sk->window);
1575         t1->res1 = 0;
1576         t1->res2 = 0;
1577         t1->rst = 0;
1578         t1->urg = 0;
1579         t1->syn = 0;
1580         t1->psh = 0;
1581         t1->fin = 0;
1582         
1583         /*
1584          *      If we have nothing queued for transmit and the transmit timer
1585          *      is on we are just doing an ACK timeout and need to switch
1586          *      to a keepalive.
1587          */
1588          
1589         if (ack == sk->acked_seq) 
1590         {
1591                 sk->ack_backlog = 0;
1592                 sk->bytes_rcv = 0;
1593                 sk->ack_timed = 0;
1594                 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
1595                                   && sk->ip_xmit_timeout == TIME_WRITE) 
1596                 {
1597                         if(sk->keepopen) {
1598                                 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1599                         } else {
1600                                 delete_timer(sk);
1601                         }
1602                 }
1603         }
1604         
1605         /*
1606          *      Fill in the packet and send it
1607          */
1608          
1609         t1->ack_seq = ntohl(ack);
1610         t1->doff = sizeof(*t1)/4;
1611         tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1612         if (sk->debug)
1613                  printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1614         tcp_statistics.TcpOutSegs++;
1615         sk->prot->queue_xmit(sk, dev, buff, 1);
1616 }
1617 
1618 
1619 /* 
1620  *      This routine builds a generic TCP header. 
1621  */
1622  
1623 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
     /* [previous][next][first][last][top][bottom][index][help] */
1624 {
1625 
1626         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1627         th->seq = htonl(sk->write_seq);
1628         th->psh =(push == 0) ? 1 : 0;
1629         th->doff = sizeof(*th)/4;
1630         th->ack = 1;
1631         th->fin = 0;
1632         sk->ack_backlog = 0;
1633         sk->bytes_rcv = 0;
1634         sk->ack_timed = 0;
1635         th->ack_seq = htonl(sk->acked_seq);
1636         sk->window = tcp_select_window(sk);
1637         th->window = htons(sk->window);
1638 
1639         return(sizeof(*th));
1640 }
1641 
1642 /*
1643  *      This routine copies from a user buffer into a socket,
1644  *      and starts the transmit system.
1645  */
1646 
1647 static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
     /* [previous][next][first][last][top][bottom][index][help] */
1648           int len, int nonblock, int flags)
1649 {
1650         int copied = 0;
1651         int copy;
1652         int tmp;
1653         int seglen;
1654         int iovct=0;
1655         struct sk_buff *skb;
1656         struct sk_buff *send_tmp;
1657         struct proto *prot;
1658         struct device *dev = NULL;
1659         unsigned char *from;
1660         
1661         /*
1662          *      Do sanity checking for sendmsg/sendto/send
1663          */
1664          
1665         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1666                 return -EINVAL;
1667         if (msg->msg_name)
1668         {
1669                 struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
1670                 if(sk->state == TCP_CLOSE)
1671                         return -ENOTCONN;
1672                 if (msg->msg_namelen < sizeof(*addr))
1673                         return -EINVAL;
1674                 if (addr->sin_family && addr->sin_family != AF_INET) 
1675                         return -EINVAL;
1676                 if (addr->sin_port != sk->dummy_th.dest) 
1677                         return -EISCONN;
1678                 if (addr->sin_addr.s_addr != sk->daddr) 
1679                         return -EISCONN;
1680         }
1681         
1682         /*
1683          *      Ok commence sending
1684          */
1685         
1686         while(iovct<msg->msg_iovlen)
1687         {
1688                 seglen=msg->msg_iov[iovct].iov_len;
1689                 from=msg->msg_iov[iovct++].iov_base;
1690                 sk->inuse=1;
1691                 prot = sk->prot;
1692                 while(seglen > 0) 
1693                 {
1694                         if (sk->err) 
1695                         {                       /* Stop on an error */
1696                                 release_sock(sk);
1697                                 if (copied) 
1698                                         return(copied);
1699                                 tmp = -sk->err;
1700                                 sk->err = 0;
1701                                 return(tmp);
1702                         }
1703 
1704                         /*
1705                          *      First thing we do is make sure that we are established. 
1706                          */
1707         
1708                         if (sk->shutdown & SEND_SHUTDOWN) 
1709                         {
1710                                 release_sock(sk);
1711                                 sk->err = EPIPE;
1712                                 if (copied) 
1713                                         return(copied);
1714                                 sk->err = 0;
1715                                 return(-EPIPE);
1716                         }
1717 
1718                         /* 
1719                          *      Wait for a connection to finish.
1720                          */
1721                 
1722                         while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
1723                         {
1724                                 if (sk->err) 
1725                                 {
1726                                         release_sock(sk);
1727                                         if (copied) 
1728                                                 return(copied);
1729                                         tmp = -sk->err;
1730                                         sk->err = 0;
1731                                         return(tmp);
1732                                 }               
1733         
1734                                 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
1735                                 {
1736                                         release_sock(sk);
1737                                         if (copied) 
1738                                                 return(copied);
1739         
1740                                         if (sk->err) 
1741                                         {       
1742                                                 tmp = -sk->err;
1743                                                 sk->err = 0;
1744                                                 return(tmp);
1745                                         }
1746 
1747                                         if (sk->keepopen) 
1748                                         {
1749                                                 send_sig(SIGPIPE, current, 0);
1750                                         }
1751                                         return(-EPIPE);
1752                                 }
1753         
1754                                 if (nonblock || copied) 
1755                                 {
1756                                         release_sock(sk);
1757                                         if (copied) 
1758                                                 return(copied);
1759                                         return(-EAGAIN);
1760                                 }
1761         
1762                                 release_sock(sk);
1763                                 cli();
1764                         
1765                                 if (sk->state != TCP_ESTABLISHED &&
1766                                         sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
1767                                 {
1768                                         interruptible_sleep_on(sk->sleep);      
1769                                         if (current->signal & ~current->blocked)
1770                                         {
1771                                                 sti();
1772                                                 if (copied) 
1773                                                         return(copied);
1774                                                 return(-ERESTARTSYS);
1775                                         }
1776                                 }
1777                                 sk->inuse = 1;
1778                                 sti();
1779                         }
1780         
1781                 /*
1782                  * The following code can result in copy <= if sk->mss is ever
1783                  * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
1784                  * sk->mtu is constant once SYN processing is finished.  I.e. we
1785                  * had better not get here until we've seen his SYN and at least one
1786                  * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
1787                  * But ESTABLISHED should guarantee that.  sk->max_window is by definition
1788                  * non-decreasing.  Note that any ioctl to set user_mss must be done
1789                  * before the exchange of SYN's.  If the initial ack from the other
1790                  * end has a window of 0, max_window and thus mss will both be 0.
1791                  */
1792         
1793                 /* 
1794                  *      Now we need to check if we have a half built packet. 
1795                  */
1796         
1797                         if ((skb = tcp_dequeue_partial(sk)) != NULL) 
1798                         {
1799                                 int hdrlen;
1800 
1801                                  /* IP header + TCP header */
1802                                 hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1803                                          + sizeof(struct tcphdr);
1804         
1805                                 /* Add more stuff to the end of skb->len */
1806                                 if (!(flags & MSG_OOB)) 
1807                                 {
1808                                         copy = min(sk->mss - (skb->len - hdrlen), len);
1809                                         /* FIXME: this is really a bug. */
1810                                         if (copy <= 0) 
1811                                         {
1812                                                 printk("TCP: **bug**: \"copy\" <= 0!!\n");
1813                                                 copy = 0;
1814                                         }                 
1815                                         memcpy_fromfs(skb_put(skb,copy), from, copy);
1816                                         from += copy;
1817                                         copied += copy;
1818                                         len -= copy;
1819                                         seglen -= copy;
1820                                         sk->write_seq += copy;
1821                                         seglen -= copy;
1822                                 }
1823                                 if ((skb->len - hdrlen) >= sk->mss ||
1824                                         (flags & MSG_OOB) || !sk->packets_out)
1825                                         tcp_send_skb(sk, skb);
1826                                 else
1827                                         tcp_enqueue_partial(skb, sk);
1828                                 continue;
1829                         }
1830 
1831                 /*
1832                  * We also need to worry about the window.
1833                  * If window < 1/2 the maximum window we've seen from this
1834                  *   host, don't use it.  This is sender side
1835                  *   silly window prevention, as specified in RFC1122.
1836                  *   (Note that this is different than earlier versions of
1837                  *   SWS prevention, e.g. RFC813.).  What we actually do is 
1838                  *   use the whole MSS.  Since the results in the right
1839                  *   edge of the packet being outside the window, it will
1840                  *   be queued for later rather than sent.
1841                  */
1842 
1843                         copy = sk->window_seq - sk->write_seq;
1844                         if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1845                                 copy = sk->mss;
1846                         if (copy > len)
1847                                 copy = len;
1848 
1849                 /*
1850                  *      We should really check the window here also. 
1851                  */
1852                  
1853                         send_tmp = NULL;
1854                         if (copy < sk->mss && !(flags & MSG_OOB)) 
1855                         {
1856                                 /*
1857                                  *      We will release the socket in case we sleep here. 
1858                                  */
1859                                 release_sock(sk);
1860                                 /*
1861                                  *      NB: following must be mtu, because mss can be increased.
1862                                  *      mss is always <= mtu 
1863                                  */
1864                                 skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1865                                 sk->inuse = 1;
1866                                 send_tmp = skb;
1867                         } 
1868                         else 
1869                         {
1870                                 /*
1871                                  *      We will release the socket in case we sleep here. 
1872                                  */
1873                                 release_sock(sk);
1874                                 skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1875                                 sk->inuse = 1;
1876                         }
1877         
1878                         /*
1879                          *      If we didn't get any memory, we need to sleep. 
1880                          */
1881         
1882                         if (skb == NULL) 
1883                         {
1884                                 sk->socket->flags |= SO_NOSPACE;
1885                                 if (nonblock) 
1886                                 {
1887                                         release_sock(sk);
1888                                         if (copied) 
1889                                                 return(copied);
1890                                         return(-EAGAIN);
1891                                 }
1892 
1893                                 /*
1894                                  *      FIXME: here is another race condition. 
1895                                  */
1896 
1897                                 tmp = sk->wmem_alloc;
1898                                 release_sock(sk);
1899                                 cli();
1900                                 /*
1901                                  *      Again we will try to avoid it. 
1902                                  */
1903                                 if (tmp <= sk->wmem_alloc &&
1904                                           (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1905                                         && sk->err == 0) 
1906                                 {
1907                                         sk->socket->flags &= ~SO_NOSPACE;
1908                                         interruptible_sleep_on(sk->sleep);
1909                                         if (current->signal & ~current->blocked) 
1910                                         {
1911                                                 sti();
1912                                                 if (copied) 
1913                                                         return(copied);
1914                                                 return(-ERESTARTSYS);
1915                                         }
1916                                 }
1917                                 sk->inuse = 1;
1918                                 sti();
1919                                 continue;
1920                         }
1921 
1922                         skb->sk = sk;
1923                         skb->free = 0;
1924                         skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1925         
1926                         /*
1927                          * FIXME: we need to optimize this.
1928                          * Perhaps some hints here would be good.
1929                          */
1930                 
1931                         tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1932                                  IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
1933                         if (tmp < 0 ) 
1934                         {
1935                                 sock_wfree(sk, skb);
1936                                 release_sock(sk);
1937                                 if (copied) 
1938                                         return(copied);
1939                                 return(tmp);
1940                         }
1941                         skb->dev = dev;
1942                         skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
1943                         tmp = tcp_build_header(skb->h.th, sk, len-copy);
1944                         if (tmp < 0) 
1945                         {
1946                                 sock_wfree(sk, skb);
1947                                 release_sock(sk);
1948                                 if (copied) 
1949                                         return(copied);
1950                                 return(tmp);
1951                         }
1952         
1953                         if (flags & MSG_OOB) 
1954                         {
1955                                 skb->h.th->urg = 1;
1956                                 skb->h.th->urg_ptr = ntohs(copy);
1957                         }
1958 
1959                         memcpy_fromfs(skb_put(skb,copy), from, copy);
1960                 
1961                         from += copy;
1962                         copied += copy;
1963                         len -= copy;
1964                         seglen -= copy;
1965                         skb->free = 0;
1966                         sk->write_seq += copy;
1967                 
1968                         if (send_tmp != NULL && sk->packets_out) 
1969                         {
1970                                 tcp_enqueue_partial(send_tmp, sk);
1971                                 continue;
1972                         }
1973                         tcp_send_skb(sk, skb);
1974                 }
1975         }
1976         sk->err = 0;
1977 
1978 /*
1979  *      Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
1980  *      interactive fast network servers. It's meant to be on and
1981  *      it really improves the throughput though not the echo time
1982  *      on my slow slip link - Alan
1983  */
1984 
1985 /*
1986  *      Avoid possible race on send_tmp - c/o Johannes Stille 
1987  */
1988  
1989         if(sk->partial && ((!sk->packets_out) 
1990      /* If not nagling we can send on the before case too.. */
1991               || (sk->nonagle && before(sk->write_seq , sk->window_seq))
1992         ))
1993                 tcp_send_partial(sk);
1994 
1995         release_sock(sk);
1996         return(copied);
1997 }
1998 
1999 static int tcp_sendto(struct sock *sk, const unsigned char *ubuf, int size, int noblock, unsigned flags,
     /* [previous][next][first][last][top][bottom][index][help] */
2000                 struct sockaddr_in *sin, int addr_len)
2001 {
2002         struct iovec iov;
2003         struct msghdr msg;
2004 
2005         iov.iov_base = (void *)ubuf;
2006         iov.iov_len  = size;
2007 
2008         msg.msg_name      = (void *)sin;
2009         msg.msg_namelen   = addr_len;
2010         msg.msg_accrights = NULL;
2011         msg.msg_iov       = &iov;
2012         msg.msg_iovlen    = 1;
2013 
2014         return tcp_sendmsg(sk, &msg, size, noblock, flags);
2015 }
2016 
/*
 *	Plain write() on a TCP socket: a sendto() with no destination
 *	address (the socket must already be connected).
 */
static int tcp_write(struct sock *sk, const unsigned char *ubuf, int size, int noblock, unsigned flags)
{
        return tcp_sendto(sk,ubuf,size,noblock,flags,NULL,0);
}
2021 
2022 
2023 /*
2024  *      Send an ack if one is backlogged at this point. Ought to merge
2025  *      this with tcp_send_ack().
2026  */
2027  
2028 static void tcp_read_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
2029 {
2030         int tmp;
2031         struct device *dev = NULL;
2032         struct tcphdr *t1;
2033         struct sk_buff *buff;
2034 
2035         if (!sk->ack_backlog) 
2036                 return;
2037 
2038         /*
2039          * If we're closed, don't send an ack, or we'll get a RST
2040          * from the closed destination.
2041          */
2042         if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
2043                 return; 
2044 
2045         /*
2046          * FIXME: we need to put code here to prevent this routine from
2047          * being called.  Being called once in a while is ok, so only check
2048          * if this is the second time in a row.
2049          */
2050 
2051         /*
2052          * We need to grab some memory, and put together an ack,
2053          * and then put it into the queue to be sent.
2054          */
2055 
2056         buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
2057         if (buff == NULL) 
2058         {
2059                 /* Try again real soon. */
2060                 reset_xmit_timer(sk, TIME_WRITE, HZ);
2061                 return;
2062         }
2063 
2064         buff->sk = sk;
2065         buff->localroute = sk->localroute;
2066         
2067         /*
2068          *      Put in the IP header and routing stuff. 
2069          */
2070 
2071         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
2072                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
2073         if (tmp < 0) 
2074         {
2075                 buff->free = 1;
2076                 sock_wfree(sk, buff);
2077                 return;
2078         }
2079 
2080         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2081 
2082         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
2083         t1->seq = htonl(sk->sent_seq);
2084         t1->ack = 1;
2085         t1->res1 = 0;
2086         t1->res2 = 0;
2087         t1->rst = 0;
2088         t1->urg = 0;
2089         t1->syn = 0;
2090         t1->psh = 0;
2091         sk->ack_backlog = 0;
2092         sk->bytes_rcv = 0;
2093         sk->window = tcp_select_window(sk);
2094         t1->window = ntohs(sk->window);
2095         t1->ack_seq = ntohl(sk->acked_seq);
2096         t1->doff = sizeof(*t1)/4;
2097         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2098         sk->prot->queue_xmit(sk, dev, buff, 1);
2099         tcp_statistics.TcpOutSegs++;
2100 }
2101 
2102 
2103 /*
2104  *      FIXME:
2105  *      This routine frees used buffers.
2106  *      It should consider sending an ACK to let the
2107  *      other end know we now have a bigger window.
2108  */
2109 
/*
 *      Free fully-consumed buffers on the receive queue, and if that opened
 *      up receive window, arrange for an ACK to advertise it: immediately
 *      when the window gain is large, otherwise within TCP_ACK_TIME so a
 *      further read can widen it first.
 */
static void cleanup_rbuf(struct sock *sk)
{
        unsigned long flags;
        unsigned long left;
        struct sk_buff *skb;
        unsigned long rspace;

        if(sk->debug)
                printk("cleaning rbuf for sk=%p\n", sk);
  
        save_flags(flags);
        cli();          /* receive queue is also manipulated at interrupt time */
  
        left = sock_rspace(sk);         /* space before we free anything */
 
        /*
         *      We have to loop through all the buffer headers,
         *      and try to free up all the space we can.
         */

        while((skb=skb_peek(&sk->receive_queue)) != NULL) 
        {
                /* Stop at the first buffer not yet fully read, or one a
                   reader is still copying out of (skb->users held). */
                if (!skb->used || skb->users) 
                        break;
                skb_unlink(skb);
                skb->sk = sk;   /* ensure the free is charged back to this socket */
                kfree_skb(skb, FREE_READ);
        }

        restore_flags(flags);

        /*
         *      FIXME:
         *      At this point we should send an ack if the difference
         *      in the window, and the amount of space is bigger than
         *      TCP_WINDOW_DIFF.
         */

        if(sk->debug)
                printk("sk->rspace = %lu, was %lu\n", sock_rspace(sk),
                                            left);
        if ((rspace=sock_rspace(sk)) != left) 
        {
                /*
                 * This area has caused the most trouble.  The current strategy
                 * is to simply do nothing if the other end has room to send at
                 * least 3 full packets, because the ack from those will auto-
                 * matically update the window.  If the other end doesn't think
                 * we have much space left, but we have room for at least 1 more
                 * complete packet than it thinks we do, we will send an ack
                 * immediately.  Otherwise we will wait up to .5 seconds in case
                 * the user reads some more.
                 */
                sk->ack_backlog++;
        /*
         * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
         * if the other end is offering a window smaller than the agreed on MSS
         * (called sk->mtu here).  In theory there's no connection between send
         * and receive, and so no reason to think that they're going to send
         * small packets.  For the moment I'm using the hack of reducing the mss
         * only on the send side, so I'm putting mtu here.
         */

                if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
                {
                        /* Send an ack right now. */
                        tcp_read_wakeup(sk);
                } 
                else 
                {
                        /* Force it to send an ack soon. */
                        int was_active = del_timer(&sk->retransmit_timer);
                        /* NOTE(review): the timer deleted above is
                           sk->retransmit_timer but the expiry compared here is
                           sk->timer.expires — looks inconsistent; confirm
                           against the timer layout before changing. */
                        if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires) 
                        {
                                reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
                        } 
                        else
                                add_timer(&sk->retransmit_timer);
                }
        }
} 
2191 
2192 
2193 /*
2194  *      Handle reading urgent data. BSD has very simple semantics for
2195  *      this, no blocking and very strange errors 8)
2196  */
2197  
2198 static int tcp_recv_urg(struct sock * sk, int nonblock,
     /* [previous][next][first][last][top][bottom][index][help] */
2199              struct msghdr *msg, int len, int flags, int *addr_len)
2200 {
2201         /*
2202          *      No URG data to read
2203          */
2204         if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
2205                 return -EINVAL; /* Yes this is right ! */
2206                 
2207         if (sk->err) 
2208         {
2209                 int tmp = -sk->err;
2210                 sk->err = 0;
2211                 return tmp;
2212         }
2213 
2214         if (sk->state == TCP_CLOSE || sk->done) 
2215         {
2216                 if (!sk->done) 
2217                 {
2218                         sk->done = 1;
2219                         return 0;
2220                 }
2221                 return -ENOTCONN;
2222         }
2223 
2224         if (sk->shutdown & RCV_SHUTDOWN) 
2225         {
2226                 sk->done = 1;
2227                 return 0;
2228         }
2229         sk->inuse = 1;
2230         if (sk->urg_data & URG_VALID) 
2231         {
2232                 char c = sk->urg_data;
2233                 if (!(flags & MSG_PEEK))
2234                         sk->urg_data = URG_READ;
2235                 memcpy_toiovec(msg->msg_iov, &c, 1);
2236                 if(msg->msg_name)
2237                 {
2238                         struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
2239                         sin->sin_family=AF_INET;
2240                         sin->sin_addr.s_addr=sk->daddr;
2241                         sin->sin_port=sk->dummy_th.dest;
2242                 }
2243                 if(addr_len)
2244                         *addr_len=sizeof(struct sockaddr_in);
2245                 release_sock(sk);
2246                 return 1;
2247         }
2248         release_sock(sk);
2249         
2250         /*
2251          * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
2252          * the available implementations agree in this case:
2253          * this call should never block, independent of the
2254          * blocking state of the socket.
2255          * Mike <pall@rz.uni-karlsruhe.de>
2256          */
2257         return -EAGAIN;
2258 }
2259 
2260 
2261 /*
2262  *      This routine copies from a sock struct into the user buffer. 
2263  */
2264  
/*
 *      Copy data from the receive queue into the user's iovec.
 *      Handles MSG_PEEK (non-destructive read), urgent-data boundaries,
 *      multiple concurrent readers (via the volatile sequence pointer),
 *      blocking and FIN processing.  Returns bytes copied or a -errno.
 */
static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
        int len, int nonblock, int flags, int *addr_len)
{
        struct wait_queue wait = { current, NULL };
        int copied = 0;
        u32 peek_seq;
        volatile u32 *seq;      /* So gcc doesn't overoptimise */
        unsigned long used;

        /* 
         *      This error should be checked. 
         */
         
        if (sk->state == TCP_LISTEN)
                return -ENOTCONN;

        /*
         *      Urgent data needs to be handled specially. 
         */
         
        if (flags & MSG_OOB)
                return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

        /*
         *      Copying sequence to update. This is volatile to handle
         *      the multi-reader case neatly (memcpy_to/fromfs might be 
         *      inline and thus not flush cached variables otherwise).
         *      A PEEK advances only the local copy, leaving copied_seq
         *      untouched so the data stays readable.
         */
         
        peek_seq = sk->copied_seq;
        seq = &sk->copied_seq;
        if (flags & MSG_PEEK)
                seq = &peek_seq;

        add_wait_queue(sk->sleep, &wait);
        sk->inuse = 1;
        while (len > 0) 
        {
                struct sk_buff * skb;
                u32 offset;
        
                /*
                 * Are we at urgent data? Stop if we have read anything.
                 */
                 
                if (copied && sk->urg_data && sk->urg_seq == *seq)
                        break;

                /*
                 *      Next get a buffer.  Walk the queue until we find the
                 *      skb containing sequence *seq (SYN occupies a sequence
                 *      number but no data byte, hence the offset adjustment).
                 */
                 
                current->state = TASK_INTERRUPTIBLE;

                skb = skb_peek(&sk->receive_queue);
                do 
                {
                        if (!skb)
                                break;
                        if (before(*seq, skb->h.th->seq))
                                break;
                        offset = *seq - skb->h.th->seq;
                        if (skb->h.th->syn)
                                offset--;
                        if (offset < skb->len)
                                goto found_ok_skb;
                        if (skb->h.th->fin)
                                goto found_fin_ok;
                        if (!(flags & MSG_PEEK))
                                skb->used = 1;  /* fully consumed: cleanup_rbuf may free it */
                        skb = skb->next;
                }
                while (skb != (struct sk_buff *)&sk->receive_queue);

                /* Nothing (more) to copy right now. */
                if (copied)
                        break;

                if (sk->err) 
                {
                        copied = -sk->err;
                        sk->err = 0;
                        break;
                }

                if (sk->state == TCP_CLOSE) 
                {
                        if (!sk->done) 
                        {
                                sk->done = 1;   /* report EOF once */
                                break;
                        }
                        copied = -ENOTCONN;
                        break;
                }

                if (sk->shutdown & RCV_SHUTDOWN) 
                {
                        sk->done = 1;
                        break;
                }
                        
                if (nonblock) 
                {
                        copied = -EAGAIN;
                        break;
                }

                /* Block until more data arrives; ACK what we already read. */
                cleanup_rbuf(sk);
                release_sock(sk);
                sk->socket->flags |= SO_WAITDATA;
                schedule();
                sk->socket->flags &= ~SO_WAITDATA;
                sk->inuse = 1;

                if (current->signal & ~current->blocked) 
                {
                        copied = -ERESTARTSYS;
                        break;
                }
                continue;

        found_ok_skb:
                /*
                 *      Lock the buffer. We can be fairly relaxed as
                 *      an interrupt will never steal a buffer we are 
                 *      using unless I've missed something serious in
                 *      tcp_data.
                 */
                
                skb->users++;
                
                /*
                 *      Ok so how much can we use ? 
                 */
                 
                used = skb->len - offset;
                if (len < used)
                        used = len;
                /*
                 *      Do we have urgent data here?  If so, truncate the copy
                 *      at the urgent byte, and skip the byte itself unless
                 *      the socket wants urgent data inline.
                 */
                
                if (sk->urg_data) 
                {
                        u32 urg_offset = sk->urg_seq - *seq;
                        if (urg_offset < used) 
                        {
                                if (!urg_offset) 
                                {
                                        if (!sk->urginline) 
                                        {
                                                ++*seq;
                                                offset++;
                                                used--;
                                        }
                                }
                                else
                                        used = urg_offset;
                        }
                }
                
                /*
                 *      Copy it - We _MUST_ update *seq first so that we
                 *      don't ever double read when we have dual readers
                 */
                 
                *seq += used;

                /*
                 *      This memcpy_tofs can sleep. If it sleeps and we
                 *      do a second read it relies on the skb->users to avoid
                 *      a crash when cleanup_rbuf() gets called.
                 */
                 
                memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
                        skb->h.th->doff*4 + offset, used);
                copied += used;
                len -= used;
                
                /*
                 *      We now will not sleep again until we are finished
                 *      with skb. Sorry if you are doing the SMP port
                 *      but you'll just have to fix it neatly ;)
                 */
                 
                skb->users --;
                
                if (after(sk->copied_seq,sk->urg_seq))
                        sk->urg_data = 0;       /* urgent byte is behind us now */
                if (used + offset < skb->len)
                        continue;
                
                /*
                 *      Process the FIN.
                 */

                if (skb->h.th->fin)
                        goto found_fin_ok;
                if (flags & MSG_PEEK)
                        continue;
                skb->used = 1;
                continue;

        found_fin_ok:
                ++*seq;         /* FIN consumes one sequence number */
                if (flags & MSG_PEEK)
                        break;
                        
                /*
                 *      All is done
                 */
                 
                skb->used = 1;
                sk->shutdown |= RCV_SHUTDOWN;
                break;

        }
        
        /* Fill in the peer's address if the caller asked for it. */
        if(copied>0 && msg->msg_name)
        {
                struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
                sin->sin_family=AF_INET;
                sin->sin_addr.s_addr=sk->daddr;
                sin->sin_port=sk->dummy_th.dest;
        }
        if(addr_len)
                *addr_len=sizeof(struct sockaddr_in);
                
        remove_wait_queue(sk->sleep, &wait);
        current->state = TASK_RUNNING;

        /* Clean up data we have read: This will do ACK frames */
        cleanup_rbuf(sk);
        release_sock(sk);
        return copied;
}
2501 
2502 
2503 static int tcp_recvfrom(struct sock *sk, unsigned char *ubuf, int size, int noblock, unsigned flags,
     /* [previous][next][first][last][top][bottom][index][help] */
2504                 struct sockaddr_in *sa, int *addr_len)
2505 {
2506         struct iovec iov;
2507         struct msghdr msg;
2508 
2509         iov.iov_base = (void *)ubuf;
2510         iov.iov_len  = size;
2511 
2512         msg.msg_name      = (void *)sa;
2513         msg.msg_namelen   = 0;
2514         if (addr_len)
2515                 msg.msg_namelen = *addr_len;
2516         msg.msg_accrights = NULL;
2517         msg.msg_iov       = &iov;
2518         msg.msg_iovlen    = 1;
2519 
2520         return tcp_recvmsg(sk, &msg, size, noblock, flags, addr_len);
2521 }
2522 
2523 int tcp_read(struct sock *sk, unsigned char *buff, int len, int noblock,
     /* [previous][next][first][last][top][bottom][index][help] */
2524          unsigned flags)
2525 {
2526         return(tcp_recvfrom(sk, buff, len, noblock, flags, NULL, NULL));
2527 }
2528 
2529 
2530 /*
2531  *      State processing on a close. This implements the state shift for
2532  *      sending our FIN frame. Note that we only send a FIN for some 
2533  *      states. A shutdown() may have already sent the FIN, or we may be
2534  *      closed.
2535  */
2536  
2537 static int tcp_close_state(struct sock *sk, int dead)
     /* [previous][next][first][last][top][bottom][index][help] */
2538 {
2539         int ns=TCP_CLOSE;
2540         int send_fin=0;
2541         switch(sk->state)
2542         {
2543                 case TCP_SYN_SENT:      /* No SYN back, no FIN needed */
2544                         break;
2545                 case TCP_SYN_RECV:
2546                 case TCP_ESTABLISHED:   /* Closedown begin */
2547                         ns=TCP_FIN_WAIT1;
2548                         send_fin=1;
2549                         break;
2550                 case TCP_FIN_WAIT1:     /* Already closing, or FIN sent: no change */
2551                 case TCP_FIN_WAIT2:
2552                 case TCP_CLOSING:
2553                         ns=sk->state;
2554                         break;
2555                 case TCP_CLOSE:
2556                 case TCP_LISTEN:
2557                         break;
2558                 case TCP_CLOSE_WAIT:    /* They have FIN'd us. We send our FIN and
2559                                            wait only for the ACK */
2560                         ns=TCP_LAST_ACK;
2561                         send_fin=1;
2562         }
2563         
2564         tcp_set_state(sk,ns);
2565                 
2566         /*
2567          *      This is a (useful) BSD violating of the RFC. There is a
2568          *      problem with TCP as specified in that the other end could
2569          *      keep a socket open forever with no application left this end.
2570          *      We use a 3 minute timeout (about the same as BSD) then kill
2571          *      our end. If they send after that then tough - BUT: long enough
2572          *      that we won't make the old 4*rto = almost no time - whoops
2573          *      reset mistake.
2574          */
2575         if(dead && ns==TCP_FIN_WAIT2)
2576         {
2577                 int timer_active=del_timer(&sk->timer);
2578                 if(timer_active)
2579                         add_timer(&sk->timer);
2580                 else
2581                         reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2582         }
2583         
2584         return send_fin;
2585 }
2586 
2587 /*
2588  *      Send a fin.
2589  */
2590 
/*
 *      Build and transmit (or queue) our FIN segment.  The FIN carries the
 *      current write sequence and is either sent immediately or appended
 *      to the write queue if data is still pending ahead of it.
 */
static void tcp_send_fin(struct sock *sk)
{
        struct proto *prot =(struct proto *)sk->prot;
        struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
        struct tcphdr *t1;
        struct sk_buff *buff;
        struct device *dev=NULL;
        int tmp;
                
        release_sock(sk); /* in case the malloc sleeps. */
        
        buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
        sk->inuse = 1;

        if (buff == NULL)
        {
                /* This is a disaster if it occurs */
                printk("tcp_send_fin: Impossible malloc failure");
                return;
        }

        /*
         *      Administrivia
         */
         
        buff->sk = sk;
        buff->localroute = sk->localroute;

        /*
         *      Put in the IP header and routing stuff. 
         */

        tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
                           IPPROTO_TCP, sk->opt,
                           sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                int t;
                /*
                 *      Finish anyway, treat this as a send that got lost. 
                 *      (Not good).  Advance write_seq as if the FIN went out
                 *      and make sure a close timer is running.
                 */
                 
                buff->free = 1;
                sock_wfree(sk,buff);
                sk->write_seq++;
                t=del_timer(&sk->timer);
                if(t)
                        add_timer(&sk->timer);
                else
                        reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                return;
        }
        
        /*
         *      We ought to check if the end of the queue is a buffer and
         *      if so simply add the fin to that buffer, not send it ahead.
         */

        t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
        buff->dev = dev;
        memcpy(t1, th, sizeof(*t1));
        /* NOTE(review): ntohl/ntohs used below where htonl/htons is meant;
           the operations are identical on every supported byte order, so
           this is cosmetic only. */
        t1->seq = ntohl(sk->write_seq);
        sk->write_seq++;                /* FIN consumes one sequence number */
        buff->h.seq = sk->write_seq;
        t1->ack = 1;
        t1->ack_seq = ntohl(sk->acked_seq);
        t1->window = ntohs(sk->window=tcp_select_window(sk));
        t1->fin = 1;
        t1->rst = 0;
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

        /*
         * If there is data in the write queue, the fin must be appended to
         * the write queue (it must not overtake pending data).
         */
        
        if (skb_peek(&sk->write_queue) != NULL) 
        {
                buff->free = 0;
                if (buff->next != NULL) 
                {
                        printk("tcp_send_fin: next != NULL\n");
                        skb_unlink(buff);
                }
                skb_queue_tail(&sk->write_queue, buff);
        } 
        else 
        {
                /* Queue empty: transmit now and start the retransmit timer. */
                sk->sent_seq = sk->write_seq;
                sk->prot->queue_xmit(sk, dev, buff, 0);
                reset_xmit_timer(sk, TIME_WRITE, sk->rto);
        }
}
2686 
2687 /*
2688  *      Shutdown the sending side of a connection. Much like close except
2689  *      that we don't receive shut down or set sk->dead=1.
2690  */
2691 
2692 void tcp_shutdown(struct sock *sk, int how)
     /* [previous][next][first][last][top][bottom][index][help] */
2693 {
2694         /*
2695          *      We need to grab some memory, and put together a FIN,
2696          *      and then put it into the queue to be sent.
2697          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2698          */
2699 
2700         if (!(how & SEND_SHUTDOWN)) 
2701                 return;
2702          
2703         /*
2704          *      If we've already sent a FIN, or it's a closed state
2705          */
2706          
2707         if (sk->state == TCP_FIN_WAIT1 ||
2708             sk->state == TCP_FIN_WAIT2 ||
2709             sk->state == TCP_CLOSING ||
2710             sk->state == TCP_LAST_ACK ||
2711             sk->state == TCP_TIME_WAIT || 
2712             sk->state == TCP_CLOSE ||
2713             sk->state == TCP_LISTEN
2714           )
2715         {
2716                 return;
2717         }
2718         sk->inuse = 1;
2719 
2720         /*
2721          * flag that the sender has shutdown
2722          */
2723 
2724         sk->shutdown |= SEND_SHUTDOWN;
2725 
2726         /*
2727          *  Clear out any half completed packets. 
2728          */
2729 
2730         if (sk->partial)
2731                 tcp_send_partial(sk);
2732                 
2733         /*
2734          *      FIN if needed
2735          */
2736          
2737         if(tcp_close_state(sk,0))
2738                 tcp_send_fin(sk);
2739                 
2740         release_sock(sk);
2741 }
2742 
2743 /*
2744  *      This routine will send an RST to the other tcp. 
2745  */
2746  
/*
 *      This routine will send an RST to the other tcp, built per RFC 793:
 *      if the offending segment carried an ACK, the RST takes its sequence
 *      from that ack field; otherwise the RST carries seq 0 and ACKs the
 *      offending segment instead.
 */
static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
          struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
        struct sk_buff *buff;
        struct tcphdr *t1;
        int tmp;
        struct device *ndev=NULL;

        /*
         *      Cannot reset a reset (Think about it).
         */
         
        if(th->rst)
                return;
  
        /*
         * We need to grab some memory, and put together an RST,
         * and then put it into the queue to be sent.
         */

        buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
        if (buff == NULL) 
                return;         /* out of memory: silently drop, peer will retry */

        buff->sk = NULL;        /* this frame belongs to no socket */
        buff->dev = dev;
        buff->localroute = 0;

        /*
         *      Put in the IP header and routing stuff. 
         */

        tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
                           sizeof(struct tcphdr),tos,ttl);
        if (tmp < 0) 
        {
                buff->free = 1;
                sock_wfree(NULL, buff);
                return;
        }

        t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
        memcpy(t1, th, sizeof(*t1));

        /*
         *      Swap the send and the receive. 
         */

        t1->dest = th->source;
        t1->source = th->dest;
        t1->rst = 1;  
        t1->window = 0;
  
        if(th->ack)
        {
                /* Peer ACKed something: RST with seq = their ack, no ACK bit. */
                t1->ack = 0;
                t1->seq = th->ack_seq;
                t1->ack_seq = 0;
        }
        else
        {
                /* No ACK on the offending segment: seq 0, ACK their data
                   (SYN occupies one sequence number, hence the +1). */
                t1->ack = 1;
                if(!th->syn)
                        t1->ack_seq=htonl(th->seq);
                else
                        t1->ack_seq=htonl(th->seq+1);
                t1->seq=0;
        }

        t1->syn = 0;
        t1->urg = 0;
        t1->fin = 0;
        t1->psh = 0;
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
        prot->queue_xmit(NULL, ndev, buff, 1);
        tcp_statistics.TcpOutSegs++;
}
2825 
2826 
2827 /*
2828  *      Look for tcp options. Parses everything but only knows about MSS.
2829  *      This routine is always called with the packet containing the SYN.
2830  *      However it may also be called with the ack to the SYN.  So you
2831  *      can't assume this is always the SYN.  It's always called after
2832  *      we have set up sk->mtu to our own MTU.
2833  *
2834  *      We need at minimum to add PAWS support here. Possibly large windows
2835  *      as Linux gets deployed on 100Mb/sec networks.
2836  */
2837  
2838 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
2839 {
2840         unsigned char *ptr;
2841         int length=(th->doff*4)-sizeof(struct tcphdr);
2842         int mss_seen = 0;
2843     
2844         ptr = (unsigned char *)(th + 1);
2845   
2846         while(length>0)
2847         {
2848                 int opcode=*ptr++;
2849                 int opsize=*ptr++;
2850                 switch(opcode)
2851                 {
2852                         case TCPOPT_EOL:
2853                                 return;
2854                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
2855                                 length--;
2856                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
2857                                 continue;
2858                         
2859                         default:
2860                                 if(opsize<=2)   /* Avoid silly options looping forever */
2861                                         return;
2862                                 switch(opcode)
2863                                 {
2864                                         case TCPOPT_MSS:
2865                                                 if(opsize==4 && th->syn)
2866                                                 {
2867                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2868                                                         mss_seen = 1;
2869                                                 }
2870                                                 break;
2871                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2872                                 }
2873                                 ptr+=opsize-2;
2874                                 length-=opsize;
2875                 }
2876         }
2877         if (th->syn) 
2878         {
2879                 if (! mss_seen)
2880                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2881         }
2882 #ifdef CONFIG_INET_PCTCP
2883         sk->mss = min(sk->max_window >> 1, sk->mtu);
2884 #else    
2885         sk->mss = min(sk->max_window, sk->mtu);
2886 #endif  
2887 }
2888 
/*
 *      Return the classful (A/B/C) network mask for a destination
 *      address, both given and returned in network byte order.
 */
static inline unsigned long default_mask(unsigned long dst)
{
        unsigned long host = ntohl(dst);

        if (IN_CLASSA(host))
                return htonl(IN_CLASSA_NET);
        return htonl(IN_CLASSB(host) ? IN_CLASSB_NET : IN_CLASSC_NET);
}
2898 
2899 /*
2900  *      Default sequence number picking algorithm.
2901  *      As close as possible to RFC 793, which
2902  *      suggests using a 250kHz clock.
2903  *      Further reading shows this assumes 2MB/s networks.
2904  *      For 10MB/s ethernet, a 1MHz clock is appropriate.
2905  *      That's funny, Linux has one built in!  Use it!
2906  */
2907 
2908 extern inline u32 tcp_init_seq(void)
     /* [previous][next][first][last][top][bottom][index][help] */
2909 {
2910         struct timeval tv;
2911         do_gettimeofday(&tv);
2912         return tv.tv_usec+tv.tv_sec*1000000;
2913 }
2914 
2915 /*
2916  *      This routine handles a connection request.
2917  *      It should make sure we haven't already responded.
2918  *      Because of the way BSD works, we have to send a syn/ack now.
2919  *      This also means it will be harder to close a socket which is
2920  *      listening.
2921  */
2922  
2923 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
     /* [previous][next][first][last][top][bottom][index][help] */
2924                  unsigned long daddr, unsigned long saddr,
2925                  struct options *opt, struct device *dev, u32 seq)
2926 {
2927         struct sk_buff *buff;
2928         struct tcphdr *t1;
2929         unsigned char *ptr;
2930         struct sock *newsk;
2931         struct tcphdr *th;
2932         struct device *ndev=NULL;
2933         int tmp;
2934         struct rtable *rt;
2935   
2936         th = skb->h.th;
2937 
2938         /* If the socket is dead, don't accept the connection. */
2939         if (!sk->dead) 
2940         {
2941                 sk->data_ready(sk,0);
2942         }
2943         else 
2944         {
2945                 if(sk->debug)
2946                         printk("Reset on %p: Connect on dead socket.\n",sk);
2947                 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2948                 tcp_statistics.TcpAttemptFails++;
2949                 kfree_skb(skb, FREE_READ);
2950                 return;
2951         }
2952 
2953         /*
2954          * Make sure we can accept more.  This will prevent a
2955          * flurry of syns from eating up all our memory.
2956          */
2957 
2958         if (sk->ack_backlog >= sk->max_ack_backlog) 
2959         {
2960                 tcp_statistics.TcpAttemptFails++;
2961                 kfree_skb(skb, FREE_READ);
2962                 return;
2963         }
2964 
2965         /*
2966          * We need to build a new sock struct.
2967          * It is sort of bad to have a socket without an inode attached
2968          * to it, but the wake_up's will just wake up the listening socket,
2969          * and if the listening socket is destroyed before this is taken
2970          * off of the queue, this will take care of it.
2971          */
2972 
2973         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2974         if (newsk == NULL) 
2975         {
2976                 /* just ignore the syn.  It will get retransmitted. */
2977                 tcp_statistics.TcpAttemptFails++;
2978                 kfree_skb(skb, FREE_READ);
2979                 return;
2980         }
2981 
2982         memcpy(newsk, sk, sizeof(*newsk));
2983         newsk->opt = NULL;
2984         if (opt && opt->optlen) {
2985           sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
2986           if (!sk->opt) {
2987                 kfree_s(newsk, sizeof(struct sock));
2988                 tcp_statistics.TcpAttemptFails++;
2989                 kfree_skb(skb, FREE_READ);
2990                 return;
2991           }
2992           if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {
2993                 kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
2994                 kfree_s(newsk, sizeof(struct sock));
2995                 tcp_statistics.TcpAttemptFails++;
2996                 kfree_skb(skb, FREE_READ);
2997                 return;
2998           }
2999         }
3000         skb_queue_head_init(&newsk->write_queue);
3001         skb_queue_head_init(&newsk->receive_queue);
3002         newsk->send_head = NULL;
3003         newsk->send_tail = NULL;
3004         skb_queue_head_init(&newsk->back_log);
3005         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
3006         newsk->rto = TCP_TIMEOUT_INIT;
3007         newsk->mdev = 0;
3008         newsk->max_window = 0;
3009         newsk->cong_window = 1;
3010         newsk->cong_count = 0;
3011         newsk->ssthresh = 0;
3012         newsk->backoff = 0;
3013         newsk->blog = 0;
3014         newsk->intr = 0;
3015         newsk->proc = 0;
3016         newsk->done = 0;
3017         newsk->partial = NULL;
3018         newsk->pair = NULL;
3019         newsk->wmem_alloc = 0;
3020         newsk->rmem_alloc = 0;
3021         newsk->localroute = sk->localroute;
3022 
3023         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3024 
3025         newsk->err = 0;
3026         newsk->shutdown = 0;
3027         newsk->ack_backlog = 0;
3028         newsk->acked_seq = skb->h.th->seq+1;
3029         newsk->copied_seq = skb->h.th->seq+1;
3030         newsk->fin_seq = skb->h.th->seq;
3031         newsk->state = TCP_SYN_RECV;
3032         newsk->timeout = 0;
3033         newsk->ip_xmit_timeout = 0;
3034         newsk->write_seq = seq; 
3035         newsk->window_seq = newsk->write_seq;
3036         newsk->rcv_ack_seq = newsk->write_seq;
3037         newsk->urg_data = 0;
3038         newsk->retransmits = 0;
3039         newsk->linger=0;
3040         newsk->destroy = 0;
3041         init_timer(&newsk->timer);
3042         newsk->timer.data = (unsigned long)newsk;
3043         newsk->timer.function = &net_timer;
3044         init_timer(&newsk->retransmit_timer);
3045         newsk->retransmit_timer.data = (unsigned long)newsk;
3046         newsk->retransmit_timer.function=&retransmit_timer;
3047         newsk->dummy_th.source = skb->h.th->dest;
3048         newsk->dummy_th.dest = skb->h.th->source;
3049         
3050         /*
3051          *      Swap these two, they are from our point of view. 
3052          */
3053          
3054         newsk->daddr = saddr;
3055         newsk->saddr = daddr;
3056 
3057         put_sock(newsk->num,newsk);
3058         newsk->dummy_th.res1 = 0;
3059         newsk->dummy_th.doff = 6;
3060         newsk->dummy_th.fin = 0;
3061         newsk->dummy_th.syn = 0;
3062         newsk->dummy_th.rst = 0;        
3063         newsk->dummy_th.psh = 0;
3064         newsk->dummy_th.ack = 0;
3065         newsk->dummy_th.urg = 0;
3066         newsk->dummy_th.res2 = 0;
3067         newsk->acked_seq = skb->h.th->seq + 1;
3068         newsk->copied_seq = skb->h.th->seq + 1;
3069         newsk->socket = NULL;
3070 
3071         /*
3072          *      Grab the ttl and tos values and use them 
3073          */
3074 
3075         newsk->ip_ttl=sk->ip_ttl;
3076         newsk->ip_tos=skb->ip_hdr->tos;
3077 
3078         /*
3079          *      Use 512 or whatever user asked for 
3080          */
3081 
3082         /*
3083          *      Note use of sk->user_mss, since user has no direct access to newsk 
3084          */
3085 
3086         rt=ip_rt_route(saddr, NULL,NULL);
3087         
3088         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3089                 newsk->window_clamp = rt->rt_window;
3090         else
3091                 newsk->window_clamp = 0;
3092                 
3093         if (sk->user_mss)
3094                 newsk->mtu = sk->user_mss;
3095         else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
3096                 newsk->mtu = rt->rt_mss - sizeof(struct iphdr) - sizeof(struct tcphdr);
3097         else 
3098         {
3099 #ifdef CONFIG_INET_SNARL        /* Sub Nets Are Local */
3100                 if ((saddr ^ daddr) & default_mask(saddr))
3101 #else
3102                 if ((saddr ^ daddr) & dev->pa_mask)
3103 #endif
3104                         newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
3105                 else
3106                         newsk->mtu = MAX_WINDOW;
3107         }
3108 
3109         /*
3110          *      But not bigger than device MTU 
3111          */
3112 
3113         newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
3114 
3115         /*
3116          *      This will min with what arrived in the packet 
3117          */
3118 
3119         tcp_options(newsk,skb->h.th);
3120         
3121         tcp_cache_zap();
3122 
3123         buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3124         if (buff == NULL) 
3125         {
3126                 sk->err = ENOMEM;
3127                 newsk->dead = 1;
3128                 newsk->state = TCP_CLOSE;
3129                 /* And this will destroy it */
3130                 release_sock(newsk);
3131                 kfree_skb(skb, FREE_READ);
3132                 tcp_statistics.TcpAttemptFails++;
3133                 return;
3134         }
3135   
3136         buff->sk = newsk;
3137         buff->localroute = newsk->localroute;
3138 
3139         /*
3140          *      Put in the IP header and routing stuff. 
3141          */
3142 
3143         tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3144                                IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
3145 
3146         /*
3147          *      Something went wrong. 
3148          */
3149 
3150         if (tmp < 0) 
3151         {
3152                 sk->err = tmp;
3153                 buff->free = 1;
3154                 kfree_skb(buff,FREE_WRITE);
3155                 newsk->dead = 1;
3156                 newsk->state = TCP_CLOSE;
3157                 release_sock(newsk);
3158                 skb->sk = sk;
3159                 kfree_skb(skb, FREE_READ);
3160                 tcp_statistics.TcpAttemptFails++;
3161                 return;
3162         }
3163 
3164         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
3165   
3166         memcpy(t1, skb->h.th, sizeof(*t1));
3167         buff->h.seq = newsk->write_seq;
3168         /*
3169          *      Swap the send and the receive. 
3170          */
3171         t1->dest = skb->h.th->source;
3172         t1->source = newsk->dummy_th.source;
3173         t1->seq = ntohl(newsk->write_seq++);
3174         t1->ack = 1;
3175         newsk->window = tcp_select_window(newsk);
3176         newsk->sent_seq = newsk->write_seq;
3177         t1->window = ntohs(newsk->window);
3178         t1->res1 = 0;
3179         t1->res2 = 0;
3180         t1->rst = 0;
3181         t1->urg = 0;
3182         t1->psh = 0;
3183         t1->syn = 1;
3184         t1->ack_seq = ntohl(skb->h.th->seq+1);
3185         t1->doff = sizeof(*t1)/4+1;
3186         ptr = skb_put(buff,4);
3187         ptr[0] = 2;
3188         ptr[1] = 4;
3189         ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3190         ptr[3] =(newsk->mtu) & 0xff;
3191 
3192         tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3193         newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3194         reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3195         skb->sk = newsk;
3196 
3197         /*
3198          *      Charge the sock_buff to newsk. 
3199          */
3200          
3201         sk->rmem_alloc -= skb->truesize;
3202         newsk->rmem_alloc += skb->truesize;
3203         
3204         skb_queue_tail(&sk->receive_queue,skb);
3205         sk->ack_backlog++;
3206         release_sock(newsk);
3207         tcp_statistics.TcpOutSegs++;
3208 }
3209 
3210 
3211 static void tcp_close(struct sock *sk, int timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
3212 {
3213         /*
3214          * Close down a TCP socket.  We need to grab some memory, and put
3215          * together a FIN, and then put it into the queue to be sent.
             *
             *      sk      - socket being closed.
             *      timeout - non-zero means close at once: jump straight to
             *                TCP_CLOSE without attempting the FIN handshake.
             *                Zero means an orderly descriptor close.
3216          */
3217         
             /* Mark the socket busy; paired with release_sock() at the end,
                which also replays any packets queued on the back log. */
3218         sk->inuse = 1;
3219         
             /* Drop any cached demultiplex reference to this socket. */
3220         if(th_cache_sk==sk)
3221                 tcp_cache_zap();
3222         if(sk->state == TCP_LISTEN)
3223         {
3224                 /* Special case: a listening socket has no peer to FIN.
                        Just mark it closed and reap the embryonic
                        connections still queued on it. */
3225                 tcp_set_state(sk, TCP_CLOSE);
3226                 tcp_close_pending(sk);
3227                 release_sock(sk);
3228                 return;
3229         }
3230         
             /* Turn keepalives on so the dying connection keeps being timed
                (NOTE(review): presumably so it cannot linger forever if the
                peer vanishes - confirm against retransmit_timer). */
3231         sk->keepopen = 1;
             /* No further sends or receives on this socket. */
3232         sk->shutdown = SHUTDOWN_MASK;
3233 
             /* Tell anyone waiting on the socket that its state is changing. */
3234         if (!sk->dead) 
3235                 sk->state_change(sk);
3236 
3237         if (timeout == 0) 
3238         {
3239                 struct sk_buff *skb;
3240                 
3241                 /*
3242                  *  We need to flush the recv. buffs.  We do this only on the
3243                  *  descriptor close, not protocol-sourced closes, because the
3244                  *  reader process may not have drained the data yet!
3245                  */
3246                  
3247                 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3248                         kfree_skb(skb, FREE_READ);
3249                 /*
3250                  *      Get rid of any half-completed packets: flush the
                      *      partially filled transmit buffer onto the wire
                      *      before we FIN.
3251                  */
3252 
3253                 if (sk->partial) 
3254                         tcp_send_partial(sk);
3255         }
3256 
3257                 
3258         /*
3259          *      Timeout is not the same thing - however the code likes
3260          *      to send both the same way (sigh).
3261          */
3262          
3263         if(timeout)
3264         {
3265                 tcp_set_state(sk, TCP_CLOSE);   /* Dead */
3266         }
3267         else
3268         {
                     /* Advance the close state machine; a return of 1 means
                        a FIN must actually be transmitted now. */
3269                 if(tcp_close_state(sk,1)==1)
3270                 {
3271                         tcp_send_fin(sk);
3272                 }
3273         }
3274         release_sock(sk);
3275 }
3276 
3277 
3278 /*
3279  *      This routine takes stuff off of the write queue,
3280  *      and puts it in the xmit queue. This happens as incoming acks
3281  *      open up the remote window for us.
3282  */
3283  
3284 static void tcp_write_xmit(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
3285 {
3286         struct sk_buff *skb;
3287 
3288         /*
3289          *      The bytes will have to remain here. In time closedown will
3290          *      empty the write queue and all will be happy 
3291          */
3292 
             /* A zapped (reset) socket may not transmit at all. */
3293         if(sk->zapped)
3294                 return;
3295 
3296         /*
3297          *      Anything on the transmit queue that fits the window can
3298          *      be added providing we are not
3299          *
3300          *      a) retransmitting (Nagle's rule)
3301          *      b) exceeding our congestion window.
3302          */
3303          
             /* Gate each queued segment on three conditions:
                1. its sequence fits inside the advertised window,
                2. we are not mid-retransmit (unless the segment is
                   already-acked data, which is discarded below),
                3. the congestion window has room for another packet. */
3304         while((skb = skb_peek(&sk->write_queue)) != NULL &&
3305                 before(skb->h.seq, sk->window_seq + 1) &&
3306                 (sk->retransmits == 0 ||
3307                  sk->ip_xmit_timeout != TIME_WRITE ||
3308                  before(skb->h.seq, sk->rcv_ack_seq + 1))
3309                 && sk->packets_out < sk->cong_window) 
3310         {
3311                 IS_SKB(skb);
                     /* Remove from the write queue before handing to IP. */
3312                 skb_unlink(skb);
3313                 
3314                 /*
3315                  *      See if we really need to send the packet. 
3316                  */
3317                  
3318                 if (before(skb->h.seq, sk->rcv_ack_seq +1)) 
3319                 {
3320                         /*
3321                          *      This is acked data. We can discard it. This 
3322                          *      cannot currently occur.
3323                          */
3324                          
3325                         sk->retransmits = 0;
3326                         kfree_skb(skb, FREE_WRITE);
                             /* Freeing write memory may unblock a writer. */
3327                         if (!sk->dead) 
3328                                 sk->write_space(sk);
3329                 } 
3330                 else
3331                 {
3332                         struct tcphdr *th;
3333                         struct iphdr *iph;
3334                         int size;
3335 /*
3336  * put in the ack seq and window at this point rather than earlier,
3337  * in order to keep them monotonic.  We really want to avoid taking
3338  * back window allocations.  That's legal, but RFC1122 says it's frowned on.
3339  * Ack and window will in general have changed since this packet was put
3340  * on the write queue.
3341  */
                             /* Locate the TCP header behind the IP header
                                (ihl is in 32-bit words), and compute the
                                TCP segment length (header + data) for the
                                checksum. */
3342                         iph = skb->ip_hdr;
3343                         th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
3344                         size = skb->len - (((unsigned char *) th) - skb->data);
3345                         
                             /* NOTE(review): ntohl/ntohs used where htonl/htons
                                is meant - the operations are identical, but the
                                host-to-network spelling would be clearer. */
3346                         th->ack_seq = ntohl(sk->acked_seq);
3347                         th->window = ntohs(tcp_select_window(sk));
3348 
3349                         tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
3350 
                             /* Record the highest sequence handed to IP. */
3351                         sk->sent_seq = skb->h.seq;
3352                         
3353                         /*
3354                          *      IP manages our queue for some crazy reason
3355                          */
3356                          
3357                         sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3358                         
3359                         /*
3360                          *      Again we slide the timer wrongly
3361                          */
3362                          
3363                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3364                 }
3365         }
3366 }
3367 
3368 
3369 /*
3370  *      This routine deals with incoming acks, but not outgoing ones.
3371  */
3372 
3373 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
     /* [previous][next][first][last][top][bottom][index][help] */
3374 {
3375         u32 ack;
3376         int flag = 0;
3377 
3378         /* 
3379          * 1 - there was data in packet as well as ack or new data is sent or 
3380          *     in shutdown state
3381          * 2 - data from retransmit queue was acked and removed
3382          * 4 - window shrunk or data from retransmit queue was acked and removed
3383          */
3384 
3385         if(sk->zapped)
3386                 return(1);      /* Dead, cant ack any more so why bother */
3387 
3388         /*
3389          *      Have we discovered a larger window
3390          */
3391          
3392         ack = ntohl(th->ack_seq);
3393 
3394         if (ntohs(th->window) > sk->max_window) 
3395         {
3396                 sk->max_window = ntohs(th->window);
3397 #ifdef CONFIG_INET_PCTCP
3398                 /* Hack because we don't send partial packets to non SWS
3399                    handling hosts */
3400                 sk->mss = min(sk->max_window>>1, sk->mtu);
3401 #else
3402                 sk->mss = min(sk->max_window, sk->mtu);
3403 #endif  
3404         }
3405 
3406         /*
3407          *      We have dropped back to keepalive timeouts. Thus we have
3408          *      no retransmits pending.
3409          */
3410          
3411         if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3412                 sk->retransmits = 0;
3413 
3414         /*
3415          *      If the ack is newer than sent or older than previous acks
3416          *      then we can probably ignore it.
3417          */
3418          
3419         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
3420         {
3421                 if(sk->debug)
3422                         printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3423                         
3424                 /*
3425                  *      Keepalive processing.
3426                  */
3427                  
3428                 if (after(ack, sk->sent_seq)) 
3429                 {
3430                         return(0);
3431                 }
3432                 
3433                 /*
3434                  *      Restart the keepalive timer.
3435                  */
3436                  
3437                 if (sk->keepopen) 
3438                 {
3439                         if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3440                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3441                 }
3442                 return(1);
3443         }
3444 
3445         /*
3446          *      If there is data set flag 1
3447          */
3448          
3449         if (len != th->doff*4) 
3450                 flag |= 1;
3451 
3452         /*
3453          *      See if our window has been shrunk. 
3454          */
3455 
3456         if (after(sk->window_seq, ack+ntohs(th->window))) 
3457         {
3458                 /*
3459                  * We may need to move packets from the send queue
3460                  * to the write queue, if the window has been shrunk on us.
3461                  * The RFC says you are not allowed to shrink your window
3462                  * like this, but if the other end does, you must be able
3463                  * to deal with it.
3464                  */
3465                 struct sk_buff *skb;
3466                 struct sk_buff *skb2;
3467                 struct sk_buff *wskb = NULL;
3468         
3469                 skb2 = sk->send_head;
3470                 sk->send_head = NULL;
3471                 sk->send_tail = NULL;
3472         
3473                 /*
3474                  *      This is an artifact of a flawed concept. We want one
3475                  *      queue and a smarter send routine when we send all.
3476                  */
3477         
3478                 flag |= 4;      /* Window changed */
3479         
3480                 sk->window_seq = ack + ntohs(th->window);
3481                 cli();
3482                 while (skb2 != NULL) 
3483                 {
3484                         skb = skb2;
3485                         skb2 = skb->link3;
3486                         skb->link3 = NULL;
3487                         if (after(skb->h.seq, sk->window_seq)) 
3488                         {
3489                                 if (sk->packets_out > 0) 
3490                                         sk->packets_out--;
3491                                 /* We may need to remove this from the dev send list. */
3492                                 if (skb->next != NULL) 
3493                                 {
3494                                         skb_unlink(skb);                                
3495                                 }
3496                                 /* Now add it to the write_queue. */
3497                                 if (wskb == NULL)
3498                                         skb_queue_head(&sk->write_queue,skb);
3499                                 else
3500                                         skb_append(wskb,skb);
3501                                 wskb = skb;
3502                         } 
3503                         else 
3504                         {
3505                                 if (sk->send_head == NULL) 
3506                                 {
3507                                         sk->send_head = skb;
3508                                         sk->send_tail = skb;
3509                                 }
3510                                 else
3511                                 {
3512                                         sk->send_tail->link3 = skb;
3513                                         sk->send_tail = skb;
3514                                 }
3515                                 skb->link3 = NULL;
3516                         }
3517                 }
3518                 sti();
3519         }
3520 
3521         /*
3522          *      Pipe has emptied
3523          */
3524          
3525         if (sk->send_tail == NULL || sk->send_head == NULL) 
3526         {
3527                 sk->send_head = NULL;
3528                 sk->send_tail = NULL;
3529                 sk->packets_out= 0;
3530         }
3531 
3532         /*
3533          *      Update the right hand window edge of the host
3534          */
3535          
3536         sk->window_seq = ack + ntohs(th->window);
3537 
3538         /*
3539          *      We don't want too many packets out there. 
3540          */
3541          
3542         if (sk->ip_xmit_timeout == TIME_WRITE && 
3543                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
3544         {
3545                 /* 
3546                  * This is Jacobson's slow start and congestion avoidance. 
3547                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
3548                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
3549                  * counter and increment it once every cwnd times.  It's possible
3550                  * that this should be done only if sk->retransmits == 0.  I'm
3551                  * interpreting "new data is acked" as including data that has
3552                  * been retransmitted but is just now being acked.
3553                  */
3554                 if (sk->cong_window < sk->ssthresh)  
3555                         /* 
3556                          *      In "safe" area, increase
3557                          */
3558                         sk->cong_window++;
3559                 else 
3560                 {
3561                         /*
3562                          *      In dangerous area, increase slowly.  In theory this is
3563                          *      sk->cong_window += 1 / sk->cong_window
3564                          */
3565                         if (sk->cong_count >= sk->cong_window) 
3566                         {
3567                                 sk->cong_window++;
3568                                 sk->cong_count = 0;
3569                         }
3570                         else 
3571                                 sk->cong_count++;
3572                 }
3573         }
3574 
3575         /*
3576          *      Remember the highest ack received.
3577          */
3578          
3579         sk->rcv_ack_seq = ack;
3580 
3581         /*
3582          *      If this ack opens up a zero window, clear backoff.  It was
3583          *      being used to time the probes, and is probably far higher than
3584          *      it needs to be for normal retransmission.
3585          */
3586 
3587         if (sk->ip_xmit_timeout == TIME_PROBE0) 
3588         {
3589                 sk->retransmits = 0;    /* Our probe was answered */
3590                 
3591                 /*
3592                  *      Was it a usable window open ?
3593                  */
3594                  
3595                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
3596                     ! before (sk->window_seq, sk->write_queue.next->h.seq)) 
3597                 {
3598                         sk->backoff = 0;
3599                         
3600                         /*
3601                          *      Recompute rto from rtt.  this eliminates any backoff.
3602                          */
3603 
3604                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3605                         if (sk->rto > 120*HZ)
3606                                 sk->rto = 120*HZ;
3607                         if (sk->rto < 20)       /* Was 1*HZ, then 1 - turns out we must allow about
3608                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3609                                                    .2 of a second is going to need huge windows (SIGH) */
3610                         sk->rto = 20;
3611                 }
3612         }
3613 
3614         /* 
3615          *      See if we can take anything off of the retransmit queue.
3616          */
3617    
3618         while(sk->send_head != NULL) 
3619         {
3620                 /* Check for a bug. */
3621                 if (sk->send_head->link3 &&
3622                     after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) 
3623                         printk("INET: tcp.c: *** bug send_list out of order.\n");
3624                         
3625                 /*
3626                  *      If our packet is before the ack sequence we can
3627                  *      discard it as it's confirmed to have arrived the other end.
3628                  */
3629                  
3630                 if (before(sk->send_head->h.seq, ack+1)) 
3631                 {
3632                         struct sk_buff *oskb;   
3633                         if (sk->retransmits) 
3634                         {       
3635                                 /*
3636                                  *      We were retransmitting.  don't count this in RTT est 
3637                                  */
3638                                 flag |= 2;
3639 
3640                                 /*
3641                                  * even though we've gotten an ack, we're still
3642                                  * retransmitting as long as we're sending from
3643                                  * the retransmit queue.  Keeping retransmits non-zero
3644                                  * prevents us from getting new data interspersed with
3645                                  * retransmissions.
3646                                  */
3647 
3648                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
3649                                         sk->retransmits = 1;
3650                                 else
3651                                         sk->retransmits = 0;
3652                         }
3653                         /*
3654                          * Note that we only reset backoff and rto in the
3655                          * rtt recomputation code.  And that doesn't happen
3656                          * if there were retransmissions in effect.  So the
3657                          * first new packet after the retransmissions is
3658                          * sent with the backoff still in effect.  Not until
3659                          * we get an ack from a non-retransmitted packet do
3660                          * we reset the backoff and rto.  This allows us to deal
3661                          * with a situation where the network delay has increased
3662                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3663                          */
3664 
3665                         /*
3666                          *      We have one less packet out there. 
3667                          */
3668                          
3669                         if (sk->packets_out > 0) 
3670                                 sk->packets_out --;
3671                         /* 
3672                          *      Wake up the process, it can probably write more. 
3673                          */
3674                         if (!sk->dead) 
3675                                 sk->write_space(sk);
3676                         oskb = sk->send_head;
3677 
3678                         if (!(flag&2))  /* Not retransmitting */
3679                         {
3680                                 long m;
3681         
3682                                 /*
3683                                  *      The following amusing code comes from Jacobson's
3684                                  *      article in SIGCOMM '88.  Note that rtt and mdev
3685                                  *      are scaled versions of rtt and mean deviation.
3686                                  *      This is designed to be as fast as possible 
3687                                  *      m stands for "measurement".
3688                                  */
3689         
3690                                 m = jiffies - oskb->when;  /* RTT */
3691                                 if(m<=0)
3692                                         m=1;            /* IS THIS RIGHT FOR <0 ??? */
3693                                 m -= (sk->rtt >> 3);    /* m is now error in rtt est */
3694                                 sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
3695                                 if (m < 0)
3696                                         m = -m;         /* m is now abs(error) */
3697                                 m -= (sk->mdev >> 2);   /* similar update on mdev */
3698                                 sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
3699         
3700                                 /*
3701                                  *      Now update timeout.  Note that this removes any backoff.
3702                                  */
3703                          
3704                                 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3705                                 if (sk->rto > 120*HZ)
3706                                         sk->rto = 120*HZ;
3707                                 if (sk->rto < 20)       /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3708                                         sk->rto = 20;
3709                                 sk->backoff = 0;
3710                         }
3711                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
3712                                            In this case as we just set it up */
3713                         cli();
3714                         oskb = sk->send_head;
3715                         IS_SKB(oskb);
3716                         sk->send_head = oskb->link3;
3717                         if (sk->send_head == NULL) 
3718                         {
3719                                 sk->send_tail = NULL;
3720                         }
3721 
3722                 /*
3723                  *      We may need to remove this from the dev send list. 
3724                  */
3725 
3726                         if (oskb->next)
3727                                 skb_unlink(oskb);
3728                         sti();
3729                         kfree_skb(oskb, FREE_WRITE); /* write. */
3730                         if (!sk->dead) 
3731                                 sk->write_space(sk);
3732                 }
3733                 else
3734                 {
3735                         break;
3736                 }
3737         }
3738 
3739         /*
3740          * XXX someone ought to look at this too.. at the moment, if skb_peek()
3741          * returns non-NULL, we completely ignore the timer stuff in the else
3742          * clause.  We ought to organize the code so that else clause can
3743          * (should) be executed regardless, possibly moving the PROBE timer
3744          * reset over.  The skb_peek() thing should only move stuff to the
3745          * write queue, NOT also manage the timer functions.
3746          */
3747 
3748         /*
3749          * Maybe we can take some stuff off of the write queue,
3750          * and put it onto the xmit queue.
3751          */
3752         if (skb_peek(&sk->write_queue) != NULL) 
3753         {
3754                 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3755                         (sk->retransmits == 0 || 
3756                          sk->ip_xmit_timeout != TIME_WRITE ||
3757                          before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3758                         && sk->packets_out < sk->cong_window) 
3759                 {
3760                         /*
3761                          *      Add more data to the send queue.
3762                          */
3763                         flag |= 1;
3764                         tcp_write_xmit(sk);
3765                 }
3766                 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3767                         sk->send_head == NULL &&
3768                         sk->ack_backlog == 0 &&
3769                         sk->state != TCP_TIME_WAIT) 
3770                 {
3771                         /*
3772                          *      Data to queue but no room.
3773                          */
3774                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3775                 }               
3776         }
3777         else
3778         {
3779                 /*
3780                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3781                  * from TCP_CLOSE we don't do anything
3782                  *
3783                  * from anything else, if there is write data (or fin) pending,
3784                  * we use a TIME_WRITE timeout, else if keepalive we reset to
3785                  * a KEEPALIVE timeout, else we delete the timer.
3786                  *
3787                  * We do not set flag for nominal write data, otherwise we may
3788                  * force a state where we start to write itsy bitsy tidbits
3789                  * of data.
3790                  */
3791 
3792                 switch(sk->state) {
3793                 case TCP_TIME_WAIT:
3794                         /*
3795                          * keep us in TIME_WAIT until we stop getting packets,
3796                          * reset the timeout.
3797                          */
3798                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3799                         break;
3800                 case TCP_CLOSE:
3801                         /*
3802                          * don't touch the timer.
3803                          */
3804                         break;
3805                 default:
3806                         /*
3807                          *      Must check send_head, write_queue, and ack_backlog
3808                          *      to determine which timeout to use.
3809                          */
3810                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3811                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3812                         } else if (sk->keepopen) {
3813                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3814                         } else {
3815                                 del_timer(&sk->retransmit_timer);
3816                                 sk->ip_xmit_timeout = 0;
3817                         }
3818                         break;
3819                 }
3820         }
3821 
3822         /*
3823          *      We have nothing queued but space to send. Send any partial
3824          *      packets immediately (end of Nagle rule application).
3825          */
3826          
3827         if (sk->packets_out == 0 && sk->partial != NULL &&
3828                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
3829         {
3830                 flag |= 1;
3831                 tcp_send_partial(sk);
3832         }
3833 
3834         /*
3835          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
3836          * we are now waiting for an acknowledge to our FIN.  The other end is
3837          * already in TIME_WAIT.
3838          *
3839          * Move to TCP_CLOSE on success.
3840          */
3841 
3842         if (sk->state == TCP_LAST_ACK) 
3843         {
3844                 if (!sk->dead)
3845                         sk->state_change(sk);
3846                 if(sk->debug)
3847                         printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3848                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3849                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
3850                 {
3851                         flag |= 1;
3852                         tcp_set_state(sk,TCP_CLOSE);
3853                         sk->shutdown = SHUTDOWN_MASK;
3854                 }
3855         }
3856 
3857         /*
3858          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
3859          *
3860          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
3861          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3862          */
3863 
3864         if (sk->state == TCP_FIN_WAIT1) 
3865         {
3866 
3867                 if (!sk->dead) 
3868                         sk->state_change(sk);
3869                 if (sk->rcv_ack_seq == sk->write_seq) 
3870                 {
3871                         flag |= 1;
3872                         sk->shutdown |= SEND_SHUTDOWN;
3873                         tcp_set_state(sk, TCP_FIN_WAIT2);
3874                 }
3875         }
3876 
3877         /*
3878          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
3879          *
3880          *      Move to TIME_WAIT
3881          */
3882 
3883         if (sk->state == TCP_CLOSING) 
3884         {
3885 
3886                 if (!sk->dead) 
3887                         sk->state_change(sk);
3888                 if (sk->rcv_ack_seq == sk->write_seq) 
3889                 {
3890                         flag |= 1;
3891                         tcp_time_wait(sk);
3892                 }
3893         }
3894         
3895         /*
3896          *      Final ack of a three way shake 
3897          */
3898          
3899         if(sk->state==TCP_SYN_RECV)
3900         {
3901                 tcp_set_state(sk, TCP_ESTABLISHED);
3902                 tcp_options(sk,th);
3903                 sk->dummy_th.dest=th->source;
3904                 sk->copied_seq = sk->acked_seq;
3905                 if(!sk->dead)
3906                         sk->state_change(sk);
3907                 if(sk->max_window==0)
3908                 {
3909                         sk->max_window=32;      /* Sanity check */
3910                         sk->mss=min(sk->max_window,sk->mtu);
3911                 }
3912         }
3913         
3914         /*
3915          * I make no guarantees about the first clause in the following
3916          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
3917          * what conditions "!flag" would be true.  However I think the rest
3918          * of the conditions would prevent that from causing any
3919          * unnecessary retransmission. 
3920          *   Clearly if the first packet has expired it should be 
3921          * retransmitted.  The other alternative, "flag&2 && retransmits", is
3922          * harder to explain:  You have to look carefully at how and when the
3923          * timer is set and with what timeout.  The most recent transmission always
3924          * sets the timer.  So in general if the most recent thing has timed
3925          * out, everything before it has as well.  So we want to go ahead and
3926          * retransmit some more.  If we didn't explicitly test for this
3927          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3928          * would not be true.  If you look at the pattern of timing, you can
3929          * show that rto is increased fast enough that the next packet would
3930          * almost never be retransmitted immediately.  Then you'd end up
3931          * waiting for a timeout to send each packet on the retransmission
3932          * queue.  With my implementation of the Karn sampling algorithm,
3933          * the timeout would double each time.  The net result is that it would
3934          * take a hideous amount of time to recover from a single dropped packet.
3935          * It's possible that there should also be a test for TIME_WRITE, but
3936          * I think as long as "send_head != NULL" and "retransmit" is on, we've
3937          * got to be in real retransmission mode.
3938          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
3939          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3940          * As long as no further losses occur, this seems reasonable.
3941          */
3942         
3943         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3944                (((flag&2) && sk->retransmits) ||
3945                (sk->send_head->when + sk->rto < jiffies))) 
3946         {
3947                 if(sk->send_head->when + sk->rto < jiffies)
3948                         tcp_retransmit(sk,0);   
3949                 else
3950                 {
3951                         tcp_do_retransmit(sk, 1);
3952                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3953                 }
3954         }
3955 
3956         return(1);
3957 }
3958 
3959 
3960 /*
3961  *      Process the FIN bit. This now behaves as it is supposed to work
3962  *      and the FIN takes effect when it is validly part of sequence
3963  *      space. Not before when we get holes.
3964  *
3965  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3966  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
3967  *      TIME-WAIT)
3968  *
3969  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
3970  *      close and we go into CLOSING (and later onto TIME-WAIT)
3971  *
3972  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3973  *
3974  */
3975  
static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
/* [previous][next][first][last][top][bottom][index][help] */
{
        /*
         *      Record the sequence number just past the FIN: the data bytes
         *      plus one each for SYN and FIN, which occupy sequence space.
         */
        sk->fin_seq = th->seq + skb->len + th->syn + th->fin;

        if (!sk->dead) 
        {
                /* Wake any sleeping reader/writer and notify async waiters. */
                sk->state_change(sk);
                sock_wake_async(sk->socket, 1);
        }

        switch(sk->state) 
        {
                case TCP_SYN_RECV:
                case TCP_SYN_SENT:
                case TCP_ESTABLISHED:
                        /*
                         * move to CLOSE_WAIT, tcp_data() already handled
                         * sending the ack.
                         */
                        tcp_set_state(sk,TCP_CLOSE_WAIT);
                        /* A FIN carried together with RST shuts both directions. */
                        if (th->rst)
                                sk->shutdown = SHUTDOWN_MASK;
                        break;

                case TCP_CLOSE_WAIT:
                case TCP_CLOSING:
                        /*
                         * received a retransmission of the FIN, do
                         * nothing.
                         */
                        break;
                case TCP_TIME_WAIT:
                        /*
                         * received a retransmission of the FIN,
                         * restart the TIME_WAIT timer.
                         */
                        reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                        return(0);
                case TCP_FIN_WAIT1:
                        /*
                         * This case occurs when a simultaneous close
                         * happens, we must ack the received FIN and
                         * enter the CLOSING state.
                         *
                         * This causes a WRITE timeout, which will either
                         * move on to TIME_WAIT when we timeout, or resend
                         * the FIN properly (maybe we get rid of that annoying
                         * FIN lost hang). The TIME_WRITE code is already correct
                         * for handling this timeout.
                         */

                        if(sk->ip_xmit_timeout != TIME_WRITE)
                                reset_xmit_timer(sk, TIME_WRITE, sk->rto);
                        tcp_set_state(sk,TCP_CLOSING);
                        break;
                case TCP_FIN_WAIT2:
                        /*
                         * received a FIN -- send ACK and enter TIME_WAIT
                         */
                        reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                        sk->shutdown|=SHUTDOWN_MASK;
                        tcp_set_state(sk,TCP_TIME_WAIT);
                        break;
                case TCP_CLOSE:
                        /*
                         * already in CLOSE
                         */
                        break;
                default:
                        /*
                         * Any remaining state: move to LAST_ACK and start
                         * an MSL timer so the socket cannot hang forever
                         * if the final ACK is lost.
                         */
                        tcp_set_state(sk,TCP_LAST_ACK);
        
                        /* Start the timers. */
                        reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                        return(0);
        }

        return(0);
}
4054 
4055 
4056 
4057 /*
4058  *      This routine handles the data.  If there is room in the buffer,
4059  *      it will be have already been moved into it.  If there is no
4060  *      room, then we will just have to discard the packet.
4061  */
4062 
4063 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /* [previous][next][first][last][top][bottom][index][help] */
4064          unsigned long saddr, unsigned short len)
4065 {
4066         struct sk_buff *skb1, *skb2;
4067         struct tcphdr *th;
4068         int dup_dumped=0;
4069         u32 new_seq, shut_seq;
4070 
4071         th = skb->h.th;
4072         skb_pull(skb,th->doff*4);
4073         skb_trim(skb,len-(th->doff*4));
4074 
4075         /*
4076          *      The bytes in the receive read/assembly queue has increased. Needed for the
4077          *      low memory discard algorithm 
4078          */
4079            
4080         sk->bytes_rcv += skb->len;
4081         
4082         if (skb->len == 0 && !th->fin) 
4083         {
4084                 /* 
4085                  *      Don't want to keep passing ack's back and forth. 
4086                  *      (someone sent us dataless, boring frame)
4087                  */
4088                 if (!th->ack)
4089                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4090                 kfree_skb(skb, FREE_READ);
4091                 return(0);
4092         }
4093         
4094         /*
4095          *      We no longer have anyone receiving data on this connection.
4096          */
4097 
4098 #ifndef TCP_DONT_RST_SHUTDOWN            
4099 
4100         if(sk->shutdown & RCV_SHUTDOWN)
4101         {
4102                 /*
4103                  *      FIXME: BSD has some magic to avoid sending resets to
4104                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
4105                  *      BSD stacks still have broken keepalives so we want to
4106                  *      cope with it.
4107                  */
4108 
4109                 if(skb->len)    /* We don't care if it's just an ack or
4110                                    a keepalive/window probe */
4111                 {
4112                         new_seq= th->seq + skb->len + th->syn;  /* Right edge of _data_ part of frame */
4113                         
4114                         /* Do this the way 4.4BSD treats it. Not what I'd
4115                            regard as the meaning of the spec but it's what BSD
4116                            does and clearly they know everything 8) */
4117 
4118                         /*
4119                          *      This is valid because of two things
4120                          *
4121                          *      a) The way tcp_data behaves at the bottom.
4122                          *      b) A fin takes effect when read not when received.
4123                          */
4124                          
4125                         shut_seq=sk->acked_seq+1;       /* Last byte */
4126                         
4127                         if(after(new_seq,shut_seq))
4128                         {
4129                                 if(sk->debug)
4130                                         printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4131                                                 sk, new_seq, shut_seq, sk->blog);
4132                                 if(sk->dead)
4133                                 {
4134                                         sk->acked_seq = new_seq + th->fin;
4135                                         tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4136                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4137                                         tcp_statistics.TcpEstabResets++;
4138                                         tcp_set_state(sk,TCP_CLOSE);
4139                                         sk->err = EPIPE;
4140                                         sk->shutdown = SHUTDOWN_MASK;
4141                                         kfree_skb(skb, FREE_READ);
4142                                         return 0;
4143                                 }
4144                         }
4145                 }
4146         }
4147 
4148 #endif
4149 
4150         /*
4151          *      Now we have to walk the chain, and figure out where this one
4152          *      goes into it.  This is set up so that the last packet we received
4153          *      will be the first one we look at, that way if everything comes
4154          *      in order, there will be no performance loss, and if they come
4155          *      out of order we will be able to fit things in nicely.
4156          *
4157          *      [AC: This is wrong. We should assume in order first and then walk
4158          *       forwards from the first hole based upon real traffic patterns.]
4159          *      
4160          */
4161 
4162         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
4163         {
4164                 skb_queue_head(&sk->receive_queue,skb);
4165                 skb1= NULL;
4166         } 
4167         else
4168         {
4169                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
4170                 {
4171                         if(sk->debug)
4172                         {
4173                                 printk("skb1=%p :", skb1);
4174                                 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
4175                                 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
4176                                 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4177                                                 sk->acked_seq);
4178                         }
4179                         
4180                         /*
4181                          *      Optimisation: Duplicate frame or extension of previous frame from
4182                          *      same sequence point (lost ack case).
4183                          *      The frame contains duplicate data or replaces a previous frame
4184                          *      discard the previous frame (safe as sk->inuse is set) and put
4185                          *      the new one in its place.
4186                          */
4187                          
4188                         if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
4189                         {
4190                                 skb_append(skb1,skb);
4191                                 skb_unlink(skb1);
4192                                 kfree_skb(skb1,FREE_READ);
4193                                 dup_dumped=1;
4194                                 skb1=NULL;
4195                                 break;
4196                         }
4197                         
4198                         /*
4199                          *      Found where it fits
4200                          */
4201                          
4202                         if (after(th->seq+1, skb1->h.th->seq))
4203                         {
4204                                 skb_append(skb1,skb);
4205                                 break;
4206                         }
4207                         
4208                         /*
4209                          *      See if we've hit the start. If so insert.
4210                          */
4211                         if (skb1 == skb_peek(&sk->receive_queue))
4212                         {
4213                                 skb_queue_head(&sk->receive_queue, skb);
4214                                 break;
4215                         }
4216                 }
4217         }
4218 
4219         /*
4220          *      Figure out what the ack value for this frame is
4221          */
4222          
4223         th->ack_seq = th->seq + skb->len;
4224         if (th->syn) 
4225                 th->ack_seq++;
4226         if (th->fin)
4227                 th->ack_seq++;
4228 
4229         if (before(sk->acked_seq, sk->copied_seq)) 
4230         {
4231                 printk("*** tcp.c:tcp_data bug acked < copied\n");
4232                 sk->acked_seq = sk->copied_seq;
4233         }
4234 
4235         /*
4236          *      Now figure out if we can ack anything. This is very messy because we really want two
4237          *      receive queues, a completed and an assembly queue. We also want only one transmit
4238          *      queue.
4239          */
4240 
4241         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) 
4242         {
4243                 if (before(th->seq, sk->acked_seq+1)) 
4244                 {
4245                         int newwindow;
4246 
4247                         if (after(th->ack_seq, sk->acked_seq)) 
4248                         {
4249                                 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
4250                                 if (newwindow < 0)
4251                                         newwindow = 0;  
4252                                 sk->window = newwindow;
4253                                 sk->acked_seq = th->ack_seq;
4254                         }
4255                         skb->acked = 1;
4256 
4257                         /*
4258                          *      When we ack the fin, we do the FIN 
4259                          *      processing.
4260                          */
4261 
4262                         if (skb->h.th->fin) 
4263                         {
4264                                 tcp_fin(skb,sk,skb->h.th);
4265                         }
4266           
4267                         for(skb2 = skb->next;
4268                             skb2 != (struct sk_buff *)&sk->receive_queue;
4269                             skb2 = skb2->next) 
4270                         {
4271                                 if (before(skb2->h.th->seq, sk->acked_seq+1)) 
4272                                 {
4273                                         if (after(skb2->h.th->ack_seq, sk->acked_seq))
4274                                         {
4275                                                 newwindow = sk->window -
4276                                                  (skb2->h.th->ack_seq - sk->acked_seq);
4277                                                 if (newwindow < 0)
4278                                                         newwindow = 0;  
4279                                                 sk->window = newwindow;
4280                                                 sk->acked_seq = skb2->h.th->ack_seq;
4281                                         }
4282                                         skb2->acked = 1;
4283                                         /*
4284                                          *      When we ack the fin, we do
4285                                          *      the fin handling.
4286                                          */
4287                                         if (skb2->h.th->fin) 
4288                                         {
4289                                                 tcp_fin(skb,sk,skb->h.th);
4290                                         }
4291 
4292                                         /*
4293                                          *      Force an immediate ack.
4294                                          */
4295                                          
4296                                         sk->ack_backlog = sk->max_ack_backlog;
4297                                 }
4298                                 else
4299                                 {
4300                                         break;
4301                                 }
4302                         }
4303 
4304                         /*
4305                          *      This also takes care of updating the window.
4306                          *      This if statement needs to be simplified.
4307                          */
4308                         if (!sk->delay_acks ||
4309                             sk->ack_backlog >= sk->max_ack_backlog || 
4310                             sk->bytes_rcv > sk->max_unacked || th->fin) {
4311         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4312                         }
4313                         else 
4314                         {
4315                                 sk->ack_backlog++;
4316                                 if(sk->debug)
4317                                         printk("Ack queued.\n");
4318                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4319                         }
4320                 }
4321         }
4322 
4323         /*
4324          *      If we've missed a packet, send an ack.
4325          *      Also start a timer to send another.
4326          */
4327          
4328         if (!skb->acked) 
4329         {
4330         
4331         /*
4332          *      This is important.  If we don't have much room left,
4333          *      we need to throw out a few packets so we have a good
4334          *      window.  Note that mtu is used, not mss, because mss is really
4335          *      for the send side.  He could be sending us stuff as large as mtu.
4336          */
4337                  
4338                 while (sock_rspace(sk) < sk->mtu) 
4339                 {
4340                         skb1 = skb_peek(&sk->receive_queue);
4341                         if (skb1 == NULL) 
4342                         {
4343                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4344                                 break;
4345                         }
4346 
4347                         /*
4348                          *      Don't throw out something that has been acked. 
4349                          */
4350                  
4351                         if (skb1->acked) 
4352                         {
4353                                 break;
4354                         }
4355                 
4356                         skb_unlink(skb1);
4357                         kfree_skb(skb1, FREE_READ);
4358                 }
4359                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4360                 sk->ack_backlog++;
4361                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4362         }
4363         else
4364         {
4365                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4366         }
4367 
4368         /*
4369          *      Now tell the user we may have some data. 
4370          */
4371          
4372         if (!sk->dead) 
4373         {
4374                 if(sk->debug)
4375                         printk("Data wakeup.\n");
4376                 sk->data_ready(sk,0);
4377         } 
4378         return(0);
4379 }
4380 
4381 
4382 /*
4383  *      This routine is only called when we have urgent data
4384  *      signalled. Its the 'slow' part of tcp_urg. It could be
4385  *      moved inline now as tcp_urg is only called from one
4386  *      place. We handle URGent data wrong. We have to - as
4387  *      BSD still doesn't use the correction from RFC961.
4388  */
4389  
4390 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* [previous][next][first][last][top][bottom][index][help] */
4391 {
4392         u32 ptr = ntohs(th->urg_ptr);
4393 
4394         if (ptr)
4395                 ptr--;
4396         ptr += th->seq;
4397 
4398         /* ignore urgent data that we've already seen and read */
4399         if (after(sk->copied_seq, ptr))
4400                 return;
4401 
4402         /* do we already have a newer (or duplicate) urgent pointer? */
4403         if (sk->urg_data && !after(ptr, sk->urg_seq))
4404                 return;
4405 
4406         /* tell the world about our new urgent pointer */
4407         if (sk->proc != 0) {
4408                 if (sk->proc > 0) {
4409                         kill_proc(sk->proc, SIGURG, 1);
4410                 } else {
4411                         kill_pg(-sk->proc, SIGURG, 1);
4412                 }
4413         }
4414         sk->urg_data = URG_NOTYET;
4415         sk->urg_seq = ptr;
4416 }
4417 
4418 /*
4419  *      This is the 'fast' part of urgent handling.
4420  */
4421  
4422 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
     /* [previous][next][first][last][top][bottom][index][help] */
4423         unsigned long saddr, unsigned long len)
4424 {
4425         u32 ptr;
4426 
4427         /*
4428          *      Check if we get a new urgent pointer - normally not 
4429          */
4430          
4431         if (th->urg)
4432                 tcp_check_urg(sk,th);
4433 
4434         /*
4435          *      Do we wait for any urgent data? - normally not
4436          */
4437          
4438         if (sk->urg_data != URG_NOTYET)
4439                 return 0;
4440 
4441         /*
4442          *      Is the urgent pointer pointing into this packet? 
4443          */
4444          
4445         ptr = sk->urg_seq - th->seq + th->doff*4;
4446         if (ptr >= len)
4447                 return 0;
4448 
4449         /*
4450          *      Ok, got the correct packet, update info 
4451          */
4452          
4453         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4454         if (!sk->dead)
4455                 sk->data_ready(sk,0);
4456         return 0;
4457 }
4458 
4459 /*
4460  *      This will accept the next outstanding connection. 
4461  */
4462  
static struct sock *tcp_accept(struct sock *sk, int flags)
/* [previous][next][first][last][top][bottom][index][help] */
{
        struct sock *newsk;
        struct sk_buff *skb;
  
        /*
         * We need to make sure that this socket is listening,
         * and that it has something pending.
         */

        if (sk->state != TCP_LISTEN) 
        {
                sk->err = EINVAL;
                return(NULL); 
        }

        /* Avoid the race. */
        cli();
        sk->inuse = 1;

        /*
         *      Wait (unless O_NONBLOCK is set) until an established
         *      connection is queued on the listening socket.
         */
        while((skb = tcp_dequeue_established(sk)) == NULL) 
        {
                if (flags & O_NONBLOCK) 
                {
                        sti();
                        release_sock(sk);
                        sk->err = EAGAIN;
                        return(NULL);
                }

                release_sock(sk);
                interruptible_sleep_on(sk->sleep);
                /* A pending signal aborts the accept with ERESTARTSYS. */
                if (current->signal & ~current->blocked) 
                {
                        sti();
                        sk->err = ERESTARTSYS;
                        return(NULL);
                }
                /* Re-lock the socket before retesting the queue. */
                sk->inuse = 1;
        }
        sti();

        /*
         *      Now all we need to do is return skb->sk. 
         */

        newsk = skb->sk;

        kfree_skb(skb, FREE_READ);
        sk->ack_backlog--;
        release_sock(sk);
        return(newsk);
}
4516 
4517 
4518 /*
4519  *      This will initiate an outgoing connection. 
4520  */
4521  
4522 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
     /* [previous][next][first][last][top][bottom][index][help] */
4523 {
4524         struct sk_buff *buff;
4525         struct device *dev=NULL;
4526         unsigned char *ptr;
4527         int tmp;
4528         int atype;
4529         struct tcphdr *t1;
4530         struct rtable *rt;
4531 
4532         if (sk->state != TCP_CLOSE) 
4533         {
4534                 return(-EISCONN);
4535         }
4536         
4537         if (addr_len < 8) 
4538                 return(-EINVAL);
4539 
4540         if (usin->sin_family && usin->sin_family != AF_INET) 
4541                 return(-EAFNOSUPPORT);
4542 
4543         /*
4544          *      connect() to INADDR_ANY means loopback (BSD'ism).
4545          */
4546         
4547         if(usin->sin_addr.s_addr==INADDR_ANY)
4548                 usin->sin_addr.s_addr=ip_my_addr();
4549                   
4550         /*
4551          *      Don't want a TCP connection going to a broadcast address 
4552          */
4553 
4554         if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
4555                 return -ENETUNREACH;
4556   
4557         sk->inuse = 1;
4558         sk->daddr = usin->sin_addr.s_addr;
4559         sk->write_seq = tcp_init_seq();
4560         sk->window_seq = sk->write_seq;
4561         sk->rcv_ack_seq = sk->write_seq -1;
4562         sk->err = 0;
4563         sk->dummy_th.dest = usin->sin_port;
4564         release_sock(sk);
4565 
4566         buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4567         if (buff == NULL) 
4568         {
4569                 return(-ENOMEM);
4570         }
4571         sk->inuse = 1;
4572         buff->sk = sk;
4573         buff->free = 0;
4574         buff->localroute = sk->localroute;
4575         
4576 
4577         /*
4578          *      Put in the IP header and routing stuff. 
4579          */
4580          
4581         if (sk->localroute)
4582           rt=ip_rt_local(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);
4583         else
4584           rt=ip_rt_route(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);
4585 
4586         /*
4587          *      We need to build the routing stuff from the things saved in skb. 
4588          */
4589 
4590         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4591                                         IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
4592         if (tmp < 0) 
4593         {
4594                 sock_wfree(sk, buff);
4595                 release_sock(sk);
4596                 return(-ENETUNREACH);
4597         }
4598 
4599         t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
4600 
4601         memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4602         t1->seq = ntohl(sk->write_seq++);
4603         sk->sent_seq = sk->write_seq;
4604         buff->h.seq = sk->write_seq;
4605         t1->ack = 0;
4606         t1->window = 2;
4607         t1->res1=0;
4608         t1->res2=0;
4609         t1->rst = 0;
4610         t1->urg = 0;
4611         t1->psh = 0;
4612         t1->syn = 1;
4613         t1->urg_ptr = 0;
4614         t1->doff = 6;
4615         /* use 512 or whatever user asked for */
4616         
4617         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4618                 sk->window_clamp=rt->rt_window;
4619         else
4620                 sk->window_clamp=0;
4621 
4622         if (sk->user_mss)
4623                 sk->mtu = sk->user_mss;
4624         else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
4625                 sk->mtu = rt->rt_mss;
4626         else 
4627         {
4628 #ifdef CONFIG_INET_SNARL
4629                 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
4630 #else
4631                 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
4632 #endif
4633                         sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
4634                 else
4635                         sk->mtu = MAX_WINDOW;
4636         }
4637         /*
4638          *      but not bigger than device MTU 
4639          */
4640 
4641         if(sk->mtu <32)
4642                 sk->mtu = 32;   /* Sanity limit */
4643                 
4644         sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
4645         
4646         /*
4647          *      Put in the TCP options to say MTU. 
4648          */
4649 
4650         ptr = skb_put(buff,4);
4651         ptr[0] = 2;
4652         ptr[1] = 4;
4653         ptr[2] = (sk->mtu) >> 8;
4654         ptr[3] = (sk->mtu) & 0xff;
4655         tcp_send_check(t1, sk->saddr, sk->daddr,
4656                   sizeof(struct tcphdr) + 4, sk);
4657 
4658         /*
4659          *      This must go first otherwise a really quick response will get reset. 
4660          */
4661 
4662         tcp_cache_zap();
4663         tcp_set_state(sk,TCP_SYN_SENT);
4664         if(rt&&rt->rt_flags&RTF_IRTT)
4665                 sk->rto = rt->rt_irtt;
4666         else
4667                 sk->rto = TCP_TIMEOUT_INIT;
4668         sk->retransmit_timer.function=&retransmit_timer;
4669         sk->retransmit_timer.data = (unsigned long)sk;
4670         reset_xmit_timer(sk, TIME_WRITE, sk->rto);      /* Timer for repeating the SYN until an answer */
4671         sk->retransmits = 0;    /* Now works the right way instead of a hacked initial setting */
4672 
4673         sk->prot->queue_xmit(sk, dev, buff, 0);  
4674         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4675         tcp_statistics.TcpActiveOpens++;
4676         tcp_statistics.TcpOutSegs++;
4677   
4678         release_sock(sk);
4679         return(0);
4680 }
4681 
4682 
4683 /* This functions checks to see if the tcp header is actually acceptable. */
4684 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /* [previous][next][first][last][top][bottom][index][help] */
4685              struct options *opt, unsigned long saddr, struct device *dev)
4686 {
4687         u32 next_seq;
4688 
4689         next_seq = len - 4*th->doff;
4690         if (th->fin)
4691                 next_seq++;
4692         /* if we have a zero window, we can't have any data in the packet.. */
4693         if (next_seq && !sk->window)
4694                 goto ignore_it;
4695         next_seq += th->seq;
4696 
4697         /*
4698          * This isn't quite right.  sk->acked_seq could be more recent
4699          * than sk->window.  This is however close enough.  We will accept
4700          * slightly more packets than we should, but it should not cause
4701          * problems unless someone is trying to forge packets.
4702          */
4703 
4704         /* have we already seen all of this packet? */
4705         if (!after(next_seq+1, sk->acked_seq))
4706                 goto ignore_it;
4707         /* or does it start beyond the window? */
4708         if (!before(th->seq, sk->acked_seq + sk->window + 1))
4709                 goto ignore_it;
4710 
4711         /* ok, at least part of this packet would seem interesting.. */
4712         return 1;
4713 
4714 ignore_it:
4715         if (th->rst)
4716                 return 0;
4717 
4718         /*
4719          *      Send a reset if we get something not ours and we are
4720          *      unsynchronized. Note: We don't do anything to our end. We
4721          *      are just killing the bogus remote connection then we will
4722          *      connect again and it will work (with luck).
4723          */
4724          
4725         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
4726         {
4727                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4728                 return 1;
4729         }
4730 
4731         /* Try to resync things. */
4732         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4733         return 0;
4734 }
4735 
/*
 *      When we get a reset we do this: record an error code appropriate
 *      to the state we were in, close the connection, wake the owner and
 *      drop the frame.  Always returns 0 and releases the socket.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
        sk->zapped = 1;

        /* Map the connection state at reset time onto an error code. */
        switch (sk->state)
        {
                case TCP_SYN_SENT:
                        sk->err = ECONNREFUSED;
                        break;
                case TCP_CLOSE_WAIT:
                        sk->err = EPIPE;
                        break;
                default:
                        sk->err = ECONNRESET;
                        break;
        }
#ifdef TCP_DO_RFC1337
        /*
         *      Time wait assassination protection [RFC1337]:
         *      a reset must not tear down a TIME_WAIT socket.
         */
        if (sk->state != TCP_TIME_WAIT)
        {
                tcp_set_state(sk, TCP_CLOSE);
                sk->shutdown = SHUTDOWN_MASK;
        }
#else
        tcp_set_state(sk, TCP_CLOSE);
        sk->shutdown = SHUTDOWN_MASK;
#endif
        if (!sk->dead)
                sk->state_change(sk);
        kfree_skb(skb, FREE_READ);
        release_sock(sk);
        return(0);
}
4767 
/*
 *      A TCP packet has arrived.
 *              skb->h.raw is the TCP header.
 *
 *      This is the main receive entry point: it locates the owning socket
 *      (with a one-entry last-hit cache), verifies the checksum, queues
 *      the frame on the backlog if the socket is busy, then walks the
 *      RFC793/RFC1122 segment-processing steps (LISTEN / SYN_SENT special
 *      cases, sequence check, RST, SYN, ACK, URG, data).  'redo' is set
 *      when the frame is being replayed from the backlog, in which case
 *      checksum and demux work has already been done.  Always returns 0.
 */
 
int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
        __u32 daddr, unsigned short len,
        __u32 saddr, int redo, struct inet_protocol * protocol)
{
        struct tcphdr *th;
        struct sock *sk;
        int syn_ok=0;
        
        tcp_statistics.TcpInSegs++;
        /* Only segments addressed to this host are processed. */
        if(skb->pkt_type!=PACKET_HOST)
        {
                kfree_skb(skb,FREE_READ);
                return(0);
        }
  
        th = skb->h.th;

        /*
         *      Find the socket, using the last hit cache if applicable.
         */

        if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
        {
                sk=(struct sock *)th_cache_sk;
                /*
                 *      We think this is causing the bug so
                 *      (debug cross-check: the cache should always agree
                 *      with a full demux lookup)
                 */
                 if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
                        printk("Cache mismatch on TCP.\n");
        }
        else
        {
                sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
                th_cache_saddr=saddr;
                th_cache_daddr=daddr;
                th_cache_dport=th->dest;
                th_cache_sport=th->source;
                th_cache_sk=sk;
        }               

        /*
         *      If this socket has got a reset it's to all intents and purposes 
         *      really dead. Count closed sockets as dead.
         *
         *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
         *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
         *      exist so should cause resets as if the port was unreachable.
         */
         
        if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
                sk=NULL;

        if (!redo) 
        {
                /*
                 *      Pull up the IP header.
                 */
                skb_pull(skb, skb->h.raw-skb->data);
                /*
                 *      Try to use the device checksum if provided.
                 */
                if (
                        (skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
                        (!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
                    )
                {
                        skb->sk = NULL;
                        kfree_skb(skb,FREE_READ);
                        /*
                         *      We don't release the socket because it was
                         *      never marked in use.
                         */
                        return(0);
                }
                /* From here on th->seq is kept in host byte order. */
                th->seq = ntohl(th->seq);

                /* See if we know about the socket. */
                if (sk == NULL) 
                {
                        /*
                         *      No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
                         */
                        tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
                        skb->sk = NULL;
                        /*
                         *      Discard frame
                         */
                        kfree_skb(skb, FREE_READ);
                        return(0);
                }

/*              skb->len = len;*/
                skb->acked = 0;
                skb->used = 0;
                skb->free = 0;
                skb->saddr = daddr;
                skb->daddr = saddr;
        
                /* We may need to add it to the backlog here.  If the socket
                   is busy the frame is replayed later with redo set. */
                cli();
                if (sk->inuse) 
                {
                        skb_queue_tail(&sk->back_log, skb);
                        sti();
                        return(0);
                }
                sk->inuse = 1;
                sti();
        }
        else
        {
                /* Backlog replay: the socket may have gone away meanwhile. */
                if (sk==NULL) 
                {
                        tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
                        skb->sk = NULL;
                        kfree_skb(skb, FREE_READ);
                        return(0);
                }
        }


        if (!sk->prot) 
        {
                printk("IMPOSSIBLE 3\n");
                return(0);
        }


        /*
         *      Charge the memory to the socket; drop the frame when the
         *      receive buffer quota would be exceeded.
         */
         
        if (sk->rmem_alloc + skb->truesize >= sk->rcvbuf) 
        {
                kfree_skb(skb, FREE_READ);
                release_sock(sk);
                return(0);
        }

        skb->sk=sk;
        sk->rmem_alloc += skb->truesize;

        /*
         *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
         *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
         *      compatibility. We also set up variables more thoroughly [Karn notes in the
         *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
         */

        if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
        {
        
                /*
                 *      Now deal with unusual cases.
                 */
         
                if(sk->state==TCP_LISTEN)
                {
                        if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
                                tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

                        /*
                         *      We don't care for RST, and non SYN are absorbed (old segments)
                         *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
                         *      netmask on a running connection it can go broadcast. Even Sun's have
                         *      this problem so I'm ignoring it 
                         */
                           
                        if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
                        {
                                kfree_skb(skb, FREE_READ);
                                release_sock(sk);
                                return 0;
                        }
                
                        /*      
                         *      Guess we need to make a new socket up 
                         */
                
                        tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
                
                        /*
                         *      Now we have several options: In theory there is nothing else
                         *      in the frame. KA9Q has an option to send data with the syn,
                         *      BSD accepts data with the syn up to the [to be] advertised window
                         *      and Solaris 2.1 gives you a protocol error. For now we just ignore
                         *      it, that fits the spec precisely and avoids incompatibilities. It
                         *      would be nice in future to drop through and process the data.
                         */
                         
                        release_sock(sk);
                        return 0;
                }
        
                /* retransmitted SYN? (our SYN|ACK was lost - just drop it,
                   the retransmit timer will resend the SYN|ACK) */
                if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
                {
                        kfree_skb(skb, FREE_READ);
                        release_sock(sk);
                        return 0;
                }
                
                /*
                 *      SYN sent means we have to look for a suitable ack and either reset
                 *      for bad matches or go to connected 
                 */
           
                if(sk->state==TCP_SYN_SENT)
                {
                        /* Crossed SYN or previous junk segment */
                        if(th->ack)
                        {
                                /* We got an ack, but it's not a good ack */
                                if(!tcp_ack(sk,th,saddr,len))
                                {
                                        /* Reset the ack - its an ack from a 
                                           different connection  [ th->rst is checked in tcp_reset()] */
                                        tcp_statistics.TcpAttemptFails++;
                                        tcp_reset(daddr, saddr, th,
                                                sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
                                        kfree_skb(skb, FREE_READ);
                                        release_sock(sk);
                                        return(0);
                                }
                                if(th->rst)
                                        return tcp_std_reset(sk,skb);
                                if(!th->syn)
                                {
                                        /* A valid ack from a different connection
                                           start. Shouldn't happen but cover it */
                                        kfree_skb(skb, FREE_READ);
                                        release_sock(sk);
                                        return 0;
                                }
                                /*
                                 *      Ok.. it's good. Set up sequence numbers and
                                 *      move to established.
                                 */
                                syn_ok=1;       /* Don't reset this connection for the syn */
                                sk->acked_seq=th->seq+1;
                                sk->fin_seq=th->seq;
                                tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
                                tcp_set_state(sk, TCP_ESTABLISHED);
                                tcp_options(sk,th);
                                sk->dummy_th.dest=th->source;
                                sk->copied_seq = sk->acked_seq;
                                if(!sk->dead)
                                {
                                        sk->state_change(sk);
                                        sock_wake_async(sk->socket, 0);
                                }
                                /* Peer advertised no window yet: assume a
                                   tiny one so the MSS stays sane. */
                                if(sk->max_window==0)
                                {
                                        sk->max_window = 32;
                                        sk->mss = min(sk->max_window, sk->mtu);
                                }
                        }
                        else
                        {
                                /* See if SYN's cross. Drop if boring */
                                if(th->syn && !th->rst)
                                {
                                        /* Crossed SYN's are fine - but talking to
                                           yourself is right out... */
                                        if(sk->saddr==saddr && sk->daddr==daddr &&
                                                sk->dummy_th.source==th->source &&
                                                sk->dummy_th.dest==th->dest)
                                        {
                                                tcp_statistics.TcpAttemptFails++;
                                                return tcp_std_reset(sk,skb);
                                        }
                                        tcp_set_state(sk,TCP_SYN_RECV);
                                        
                                        /*
                                         *      FIXME:
                                         *      Must send SYN|ACK here
                                         */
                                }               
                                /* Discard junk segment */
                                kfree_skb(skb, FREE_READ);
                                release_sock(sk);
                                return 0;
                        }
                        /*
                         *      SYN_RECV with data maybe.. drop through
                         */
                        goto rfc_step6;
                }

        /*
         *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
         *      a more complex suggestion for fixing these reuse issues in RFC1644
         *      but not yet ready for general use. Also see RFC1379.
         */
        
#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
                if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
                        after(th->seq, sk->acked_seq) && !th->rst)
                {
                        u32 seq = sk->write_seq;
                        if(sk->debug)
                                printk("Doing a BSD time wait\n");
                        tcp_statistics.TcpEstabResets++;           
                        sk->rmem_alloc -= skb->truesize;
                        skb->sk = NULL;
                        sk->err=ECONNRESET;
                        tcp_set_state(sk, TCP_CLOSE);
                        sk->shutdown = SHUTDOWN_MASK;
                        release_sock(sk);
                        /* Re-demux: a listener on the same port can take
                           over the new incarnation of the connection. */
                        sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
                        if (sk && sk->state==TCP_LISTEN)
                        {
                                sk->inuse=1;
                                skb->sk = sk;
                                sk->rmem_alloc += skb->truesize;
                                /* Start the new connection well above the old
                                   sequence space to avoid confusion. */
                                tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
                                release_sock(sk);
                                return 0;
                        }
                        kfree_skb(skb, FREE_READ);
                        return 0;
                }
#endif  
        }

        /*
         *      We are now in normal data flow (see the step list in the RFC)
         *      Note most of these are inline now. I'll inline the lot when
         *      I have time to test it hard and look at what gcc outputs 
         */
        
        if(!tcp_sequence(sk,th,len,opt,saddr,dev))
        {
                kfree_skb(skb, FREE_READ);
                release_sock(sk);
                return 0;
        }

        if(th->rst)
                return tcp_std_reset(sk,skb);
        
        /*
         *      !syn_ok is effectively the state test in RFC793.
         */
         
        if(th->syn && !syn_ok)
        {
                tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
                return tcp_std_reset(sk,skb);   
        }

        /*
         *      Process the ACK
         */
         

        if(th->ack && !tcp_ack(sk,th,saddr,len))
        {
                /*
                 *      Our three way handshake failed.
                 */
                 
                if(sk->state==TCP_SYN_RECV)
                {
                        tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
                }
                kfree_skb(skb, FREE_READ);
                release_sock(sk);
                return 0;
        }
        
rfc_step6:              /* I'll clean this up later */

        /*
         *      Process urgent data
         */
                
        if(tcp_urg(sk, th, saddr, len))
        {
                kfree_skb(skb, FREE_READ);
                release_sock(sk);
                return 0;
        }
        
        
        /*
         *      Process the encapsulated data
         */
        
        if(tcp_data(skb,sk, saddr, len))
        {
                kfree_skb(skb, FREE_READ);
                release_sock(sk);
                return 0;
        }

        /*
         *      And done
         */     
        
        release_sock(sk);
        return 0;
}
5177 
5178 /*
5179  *      This routine sends a packet with an out of date sequence
5180  *      number. It assumes the other end will try to ack it.
5181  */
5182 
5183 static void tcp_write_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
5184 {
5185         struct sk_buff *buff,*skb;
5186         struct tcphdr *t1;
5187         struct device *dev=NULL;
5188         int tmp;
5189 
5190         if (sk->zapped)
5191                 return; /* After a valid reset we can send no more */
5192 
5193         /*
5194          *      Write data can still be transmitted/retransmitted in the
5195          *      following states.  If any other state is encountered, return.
5196          *      [listen/close will never occur here anyway]
5197          */
5198 
5199         if (sk->state != TCP_ESTABLISHED && 
5200             sk->state != TCP_CLOSE_WAIT &&
5201             sk->state != TCP_FIN_WAIT1 && 
5202             sk->state != TCP_LAST_ACK &&
5203             sk->state != TCP_CLOSING
5204         ) 
5205         {
5206                 return;
5207         }
5208         if ( before(sk->sent_seq, sk->window_seq) && 
5209             (skb=skb_peek(&sk->write_queue)))
5210         {
5211                 /*
5212                  * We are probing the opening of a window
5213                  * but the window size is != 0
5214                  * must have been a result SWS advoidance ( sender )
5215                  */
5216             
5217                 struct iphdr *iph;
5218                 struct tcphdr *th;
5219                 struct tcphdr *nth;
5220                 unsigned long win_size;
5221 #if 0
5222                 unsigned long ow_size;
5223 #endif
5224                 void * tcp_data_start;
5225         
5226                 /*
5227                  *      How many bytes can we send ?
5228                  */
5229                  
5230                 win_size = sk->window_seq - sk->sent_seq;
5231 
5232                 /*
5233                  *      Recover the buffer pointers
5234                  */
5235                  
5236                 iph = (struct iphdr *)skb->ip_hdr;
5237                 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
5238 
5239                 /*
5240                  *      Grab the data for a temporary frame
5241                  */
5242                  
5243                 buff = sock_wmalloc(sk, win_size + th->doff * 4 + 
5244                                      (iph->ihl << 2) +
5245                                      sk->prot->max_header + 15, 
5246                                      1, GFP_ATOMIC);
5247                 if ( buff == NULL )
5248                         return;
5249 
5250                 /* 
5251                  *      If we strip the packet on the write queue we must
5252                  *      be ready to retransmit this one 
5253                  */
5254             
5255                 buff->free = /*0*/1;
5256 
5257                 buff->sk = sk;
5258                 buff->localroute = sk->localroute;
5259                 
5260                 /*
5261                  *      Put headers on the new packet
5262                  */
5263 
5264                 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5265                                          IPPROTO_TCP, sk->opt, buff->truesize,
5266                                          sk->ip_tos,sk->ip_ttl);
5267                 if (tmp < 0) 
5268                 {
5269                         sock_wfree(sk, buff);
5270                         return;
5271                 }
5272                 
5273                 /*
5274                  *      Move the TCP header over
5275                  */
5276 
5277                 buff->dev = dev;
5278 
5279                 nth = (struct tcphdr *) skb_put(buff,th->doff*4);
5280 
5281                 memcpy(nth, th, th->doff * 4);
5282                 
5283                 /*
5284                  *      Correct the new header
5285                  */
5286                  
5287                 nth->ack = 1; 
5288                 nth->ack_seq = ntohl(sk->acked_seq);
5289                 nth->window = ntohs(tcp_select_window(sk));
5290                 nth->check = 0;
5291 
5292                 /*
5293                  *      Find the first data byte.
5294                  */
5295                  
5296                 tcp_data_start = skb->data + skb->dev->hard_header_len + 
5297                                 (iph->ihl << 2) + th->doff * 4;
5298 
5299                 /*
5300                  *      Add it to our new buffer
5301                  */
5302                 memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
5303                 
5304                 /*
5305                  *      Remember our right edge sequence number.
5306                  */
5307                  
5308                 buff->h.seq = sk->sent_seq + win_size;
5309                 sk->sent_seq = buff->h.seq;             /* Hack */
5310 #if 0
5311 
5312                 /*
5313                  *      now: shrink the queue head segment 
5314                  */
5315                  
5316                 th->check = 0;
5317                 ow_size = skb->len - win_size - 
5318                         ((unsigned long) (tcp_data_start - (void *) skb->data));
5319 
5320                 memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
5321                 skb_trim(skb,skb->len-win_size);
5322                 sk->sent_seq += win_size;
5323                 th->seq = htonl(sk->sent_seq);
5324                 if (th->urg)
5325                 {
5326                         unsigned short urg_ptr;
5327         
5328                         urg_ptr = ntohs(th->urg_ptr);
5329                         if (urg_ptr <= win_size)
5330                                 th->urg = 0;
5331                         else
5332                         {
5333                                 urg_ptr -= win_size;
5334                                 th->urg_ptr = htons(urg_ptr);
5335                                 nth->urg_ptr = htons(win_size);
5336                         }
5337                 }
5338 #else
5339                 if(th->urg && ntohs(th->urg_ptr) < win_size)
5340                         nth->urg = 0;
5341 #endif          
5342 
5343                 /*
5344                  *      Checksum the split buffer
5345                  */
5346                  
5347                 tcp_send_check(nth, sk->saddr, sk->daddr, 
5348                            nth->doff * 4 + win_size , sk);
5349         }
5350         else
5351         {       
5352                 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
5353                 if (buff == NULL) 
5354                         return;
5355 
5356                 buff->free = 1;
5357                 buff->sk = sk;
5358                 buff->localroute = sk->localroute;
5359 
5360                 /*
5361                  *      Put in the IP header and routing stuff. 
5362                  */
5363                  
5364                 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5365                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
5366                 if (tmp < 0) 
5367                 {
5368                         sock_wfree(sk, buff);
5369                         return;
5370                 }
5371 
5372                 t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
5373                 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5374 
5375                 /*
5376                  *      Use a previous sequence.
5377                  *      This should cause the other end to send an ack.
5378                  */
5379          
5380                 t1->seq = htonl(sk->sent_seq-1);
5381                 t1->ack = 1; 
5382                 t1->res1= 0;
5383                 t1->res2= 0;
5384                 t1->rst = 0;
5385                 t1->urg = 0;
5386                 t1->psh = 0;
5387                 t1->fin = 0;    /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
5388                 t1->syn = 0;
5389                 t1->ack_seq = ntohl(sk->acked_seq);
5390                 t1->window = ntohs(tcp_select_window(sk));
5391                 t1->doff = sizeof(*t1)/4;
5392                 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5393 
5394         }               
5395 
5396         /*
5397          *      Send it.
5398          */
5399         
5400         sk->prot->queue_xmit(sk, dev, buff, 1);
5401         tcp_statistics.TcpOutSegs++;
5402 }
5403 
5404 /*
5405  *      A window probe timeout has occurred.
5406  */
5407 
5408 void tcp_send_probe0(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
5409 {
5410         if (sk->zapped)
5411                 return;         /* After a valid reset we can send no more */
5412 
5413         tcp_write_wakeup(sk);
5414 
5415         sk->backoff++;
5416         sk->rto = min(sk->rto << 1, 120*HZ);
5417         reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5418         sk->retransmits++;
5419         sk->prot->retransmits ++;
5420 }
5421 
5422 /*
5423  *      Socket option code for TCP. 
5424  */
5425   
5426 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5427 {
5428         int val,err;
5429 
5430         if(level!=SOL_TCP)
5431                 return ip_setsockopt(sk,level,optname,optval,optlen);
5432 
5433         if (optval == NULL) 
5434                 return(-EINVAL);
5435 
5436         err=verify_area(VERIFY_READ, optval, sizeof(int));
5437         if(err)
5438                 return err;
5439         
5440         val = get_user((int *)optval);
5441 
5442         switch(optname)
5443         {
5444                 case TCP_MAXSEG:
5445 /*
5446  * values greater than interface MTU won't take effect.  however at
5447  * the point when this call is done we typically don't yet know
5448  * which interface is going to be used
5449  */
5450                         if(val<1||val>MAX_WINDOW)
5451                                 return -EINVAL;
5452                         sk->user_mss=val;
5453                         return 0;
5454                 case TCP_NODELAY:
5455                         sk->nonagle=(val==0)?0:1;
5456                         return 0;
5457                 default:
5458                         return(-ENOPROTOOPT);
5459         }
5460 }
5461 
5462 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5463 {
5464         int val,err;
5465 
5466         if(level!=SOL_TCP)
5467                 return ip_getsockopt(sk,level,optname,optval,optlen);
5468                         
5469         switch(optname)
5470         {
5471                 case TCP_MAXSEG:
5472                         val=sk->user_mss;
5473                         break;
5474                 case TCP_NODELAY:
5475                         val=sk->nonagle;
5476                         break;
5477                 default:
5478                         return(-ENOPROTOOPT);
5479         }
5480         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5481         if(err)
5482                 return err;
5483         put_user(sizeof(int),(int *) optlen);
5484 
5485         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5486         if(err)
5487                 return err;
5488         put_user(val,(int *)optval);
5489 
5490         return(0);
5491 }       
5492 
5493 
/*
 *	The TCP protocol operations vector, registered with the socket
 *	layer.  Initializers are positional; slot meanings below are
 *	inferred from the handler names — confirm against the struct
 *	proto declaration in the sock header.
 */
struct proto tcp_prot = {
        tcp_close,			/* close */
        tcp_read,			/* read */
        tcp_write,			/* write */
        tcp_sendto,			/* sendto */
        tcp_recvfrom,			/* recvfrom */
        ip_build_header,		/* build_header */
        tcp_connect,			/* connect */
        tcp_accept,			/* accept */
        ip_queue_xmit,			/* queue_xmit */
        tcp_retransmit,			/* retransmit */
        tcp_write_wakeup,		/* write_wakeup */
        tcp_read_wakeup,		/* read_wakeup */
        tcp_rcv,			/* rcv */
        tcp_select,			/* select */
        tcp_ioctl,			/* ioctl */
        NULL,				/* init — none needed for TCP */
        tcp_shutdown,			/* shutdown */
        tcp_setsockopt,			/* setsockopt */
        tcp_getsockopt,			/* getsockopt */
        tcp_sendmsg,			/* sendmsg */
        tcp_recvmsg,			/* recvmsg */
        128,				/* presumably max_header — verify */
        0,				/* presumably retransmits counter — verify */
        "TCP",				/* protocol name */
        0, 0,				/* presumably inuse / highestinuse — verify */
        {NULL,}				/* presumably per-port sock_array — verify */
};

/* [previous][next][first][last][top][bottom][index][help] */