root/net/ipv4/tcp.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. tcp_cache_zap
  2. min
  3. tcp_set_state
  4. tcp_select_window
  5. tcp_find_established
  6. tcp_dequeue_established
  7. tcp_close_pending
  8. tcp_time_wait
  9. tcp_do_retransmit
  10. reset_xmit_timer
  11. tcp_retransmit_time
  12. tcp_retransmit
  13. tcp_write_timeout
  14. retransmit_timer
  15. tcp_err
  16. tcp_readable
  17. tcp_listen_select
  18. tcp_select
  19. tcp_ioctl
  20. tcp_check
  21. tcp_send_check
  22. tcp_send_skb
  23. tcp_dequeue_partial
  24. tcp_send_partial
  25. tcp_enqueue_partial
  26. tcp_send_ack
  27. tcp_build_header
  28. tcp_sendmsg
  29. tcp_sendto
  30. tcp_write
  31. tcp_read_wakeup
  32. cleanup_rbuf
  33. tcp_recv_urg
  34. tcp_recvmsg
  35. tcp_recvfrom
  36. tcp_read
  37. tcp_close_state
  38. tcp_send_fin
  39. tcp_shutdown
  40. tcp_reset
  41. tcp_options
  42. default_mask
  43. tcp_init_seq
  44. tcp_conn_request
  45. tcp_close
  46. tcp_write_xmit
  47. tcp_ack
  48. tcp_fin
  49. tcp_data
  50. tcp_check_urg
  51. tcp_urg
  52. tcp_accept
  53. tcp_connect
  54. tcp_sequence
  55. tcp_std_reset
  56. tcp_rcv
  57. tcp_write_wakeup
  58. tcp_send_probe0
  59. tcp_setsockopt
  60. tcp_getsockopt

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:       
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect 
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed where wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It 
  33  *                                      wakes people on errors. select 
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_reset() fixed to work for 
  37  *                                      everything not just packets for 
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had 
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames. 
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst 
  46  *                                      receive otherwise odd bits of prattle 
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug. 
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list 
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential 
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the 
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries. 
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks, 
  69  *                                      so the kernel can layer network 
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised 
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer 
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing 
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties remove from the
 100  *                                      TCP code (Be very nice to this man if 
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics. 
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle select() after URG properly in 
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg() 
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in 
 110  *                                      tcp_readable(), select() after URG 
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the 
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to 
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in selecting before an 
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since 
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on 
 137  *                                      the RFC's for other useful protocol 
 138  *                                      references see: Comer, KA9Q NOS, and 
 139  *                                      for a reference on the difference 
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC 
 147  *                                      and using multiple timers for sanity. 
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       Select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and 
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if stat is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but its a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *
 180  *
 181  * To Fix:
 182  *              Fast path the code. Two things here - fix the window calculation
 183  *              so it doesn't iterate over the queue, also spot packets with no funny
 184  *              options arriving in order and process directly.
 185  *
 186  *              Implement RFC 1191 [Path MTU discovery]
 187  *              Look at the effect of implementing RFC 1337 suggestions and their impact.
 188  *              Rewrite output state machine to use a single queue and do low window
 189  *              situations as per the spec (RFC 1122)
 190  *              Speed up input assembly algorithm.
 191  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 192  *              could do with it working on IPv4
 193  *              User settable/learned rtt/max window/mtu
 194  *              Cope with MTU/device switches when retransmitting in tcp.
 195  *              Fix the window handling to use PR's new code.
 196  *
 197  *              Change the fundamental structure to a single send queue maintained
 198  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 199  *              active routes too]). Cut the queue off in tcp_retransmit/
 200  *              tcp_transmit.
 201  *              Change the receive queue to assemble as it goes. This lets us
 202  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 203  *              tcp_data/tcp_read as well as the window shrink crud.
 204  *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 205  *              tcp_queue_skb seem obvious routines to extract.
 206  *      
 207  *              This program is free software; you can redistribute it and/or
 208  *              modify it under the terms of the GNU General Public License
 209  *              as published by the Free Software Foundation; either version
 210  *              2 of the License, or(at your option) any later version.
 211  *
 212  * Description of States:
 213  *
 214  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 215  *
 216  *      TCP_SYN_RECV            received a connection request, sent ack,
 217  *                              waiting for final ack in three-way handshake.
 218  *
 219  *      TCP_ESTABLISHED         connection established
 220  *
 221  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 222  *                              transmission of remaining buffered data
 223  *
 224  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 225  *                              to shutdown
 226  *
 227  *      TCP_CLOSING             both sides have shutdown but we still have
 228  *                              data we have to finish sending
 229  *
 230  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 231  *                              closed, can only be entered from FIN_WAIT2
 232  *                              or CLOSING.  Required because the other end
 233  *                              may not have gotten our last ACK causing it
 234  *                              to retransmit the data packet (which we ignore)
 235  *
 236  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 237  *                              us to finish writing our data and to shutdown
 238  *                              (we have to close() to move on to LAST_ACK)
 239  *
 240  *      TCP_LAST_ACK            out side has shutdown after remote has
 241  *                              shutdown.  There may still be data in our
 242  *                              buffer that we have to finish sending
 243  *              
 244  *      TCP_CLOSE               socket is finished
 245  */
 246 
 247 /*
 248  * RFC1122 status:
 249  * NOTE: I'm not going to be doing comments in the code for this one except
 250  * for violations and the like.  tcp.c is just too big... If I say something
 251  * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 252  * with Alan. -- MS 950903
 253  * 
 254  * Use of PSH (4.2.2.2)
 255  *   MAY aggregate data sent without the PSH flag. (does)
 256  *   MAY queue data recieved without the PSH flag. (does)
 257  *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 258  *   MAY implement PSH on send calls. (doesn't, thus:)
 259  *     MUST NOT buffer data indefinitely (doesn't [1 second])
 260  *     MUST set PSH on last segment (does)
 261  *   MAY pass received PSH to application layer (doesn't)
 262  *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 263  * 
 264  * Window Size (4.2.2.3, 4.2.2.16)
 265  *   MUST treat window size as an unsigned number (does)
 266  *   SHOULD treat window size as a 32-bit number (does not)
 267  *   MUST NOT shrink window once it is offered (does not normally)
 268  *   
 269  * Urgent Pointer (4.2.2.4)
 270  * **MUST point urgent pointer to last byte of urgent data (not right
 271  *     after). (doesn't, to be like BSD)
 272  *   MUST inform application layer asynchronously of incoming urgent
 273  *     data. (does)
 274  *   MUST provide application with means of determining the amount of
 275  *     urgent data pending. (does)
 276  * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 277  *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 278  *      [Follows BSD 1 byte of urgent data]
 279  * 
 280  * TCP Options (4.2.2.5)
 281  *   MUST be able to recieve TCP options in any segment. (does)
 282  *   MUST ignore unsupported options (does)
 283  *   
 284  * Maximum Segment Size Option (4.2.2.6)
 285  *   MUST implement both sending and receiving MSS. (does)
 286  *   SHOULD send an MSS with every SYN where recieve MSS != 536 (MAY send
 287  *     it always). (does, even when MSS == 536, which is legal)
 288  *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 289  *   MUST calculate "effective send MSS" correctly:
 290  *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 291  *     (does - but allows operator override)
 292  *  
 293  * TCP Checksum (4.2.2.7)
 294  *   MUST generate and check TCP checksum. (does)
 295  * 
 296  * Initial Sequence Number Selection (4.2.2.8)
 297  *   MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 298  *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 299  *     necessary for 10Mbps networks - and harder than BSD to spoof!)
 300  * 
 301  * Simultaneous Open Attempts (4.2.2.10)
 302  *   MUST support simultaneous open attempts (does)
 303  * 
 304  * Recovery from Old Duplicate SYN (4.2.2.11)
 305  *   MUST keep track of active vs. passive open (does)
 306  * 
 307  * RST segment (4.2.2.12)
 308  *   SHOULD allow an RST segment to contain data (does, but doesn't do
 309  *     anything with it, which is standard)
 310  * 
 311  * Closing a Connection (4.2.2.13)
 312  *   MUST inform application of whether connectin was closed by RST or
 313  *     normal close. (does)
 314  *   MAY allow "half-duplex" close (treat connection as closed for the
 315  *     local app, even before handshake is done). (does)
 316  *   MUST linger in TIME_WAIT for 2 * MSL (does)
 317  * 
 318  * Retransmission Timeout (4.2.2.15)
 319  *   MUST implement Jacobson's slow start and congestion avoidance
 320  *     stuff. (does) 
 321  * 
 322  * Probing Zero Windows (4.2.2.17)
 323  *   MUST support probing of zero windows. (does)
 324  *   MAY keep offered window closed indefinitely. (does)
 325  *   MUST allow remote window to stay closed indefinitely. (does)
 326  * 
 327  * Passive Open Calls (4.2.2.18)
 328  *   MUST NOT let new passive open affect other connections. (doesn't)
 329  *   MUST support passive opens (LISTENs) concurrently. (does)
 330  *   
 331  * Time to Live (4.2.2.19)
 332  *   MUST make TCP TTL configurable. (does - IP_TTL option)
 333  * 
 334  * Event Processing (4.2.2.20)
 335  *   SHOULD queue out-of-order segments. (does)
 336  *   MUST aggregate ACK segments whenever possible. (does but badly)
 337  *   
 338  * Retransmission Timeout Calculation (4.2.3.1)
 339  *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 340  *     calculation. (does, or at least explains them in the comments 8*b)
 341  *  SHOULD initialize RTO to 0 and RTT to 3. (does)
 342  * 
 343  * When to Send an ACK Segment (4.2.3.2)
 344  *   SHOULD implement delayed ACK. (does not)
 345  *   MUST keep ACK delay < 0.5 sec. (N/A)
 346  * 
 347  * When to Send a Window Update (4.2.3.3)
 348  *   MUST implement receiver-side SWS. (does)
 349  *   
 350  * When to Send Data (4.2.3.4)
 351  *   MUST implement sender-side SWS. (does - imperfectly)
 352  *   SHOULD implement Nagle algorithm. (does)
 353  * 
 354  * TCP Connection Failures (4.2.3.5)
 355  *  MUST handle excessive retransmissions "properly" (see the RFC). (does)
 356  *   SHOULD inform application layer of soft errors. (doesn't)
 357  *   
 358  * TCP Keep-Alives (4.2.3.6)
 359  *   MAY provide keep-alives. (does)
 360  *   MUST make keep-alives configurable on a per-connection basis. (does)
 361  *   MUST default to no keep-alives. (does)
 362  * **MUST make keep-alive interval configurable. (doesn't)
 363  * **MUST make default keep-alive interval > 2 hours. (doesn't)
 364  *   MUST NOT interpret failure to ACK keep-alive packet as dead
 365  *     connection. (doesn't)
 366  *   SHOULD send keep-alive with no data. (does)
 367  * 
 368  * TCP Multihoming (4.2.3.7)
 369  *   MUST get source address from IP layer before sending first
 370  *     SYN. (does)
 371  *   MUST use same local address for all segments of a connection. (does)
 372  * 
 373  * IP Options (4.2.3.8)
 374  *   (I don't think the IP layer sees the IP options, yet.)
 375  *   MUST ignore unsupported IP options. (does, I guess 8*b)
 376  *   MAY support Time Stamp and Record Route. (doesn't)
 377  * **MUST allow application to specify a source route. (doesn't?)
 378  * **MUST allow receieved Source Route option to set route for all future
 379  *     segments on this connection. (doesn't, not that I think it's a
 380  *     huge problem)
 381  * 
 382  * ICMP messages (4.2.3.9)
 383  *   MUST act on ICMP errors. (does)
 384  *   MUST slow transmission upon receipt of a Source Quench. (does)
 385  *   MUST NOT abort connection upon receipt of soft Destination
 386  *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 387  *     Problems. (doesn't)
 388  *   SHOULD report soft Destination Unreachables etc. to the
 389  *     application. (doesn't)
 390  *   SHOULD abort connection upon receipt of hard Destination Unreachable
 391  *     messages (2, 3, 4). (does)
 392  * 
 393  * Remote Address Validation (4.2.3.10)
 394  *   MUST reject as an error OPEN for invalid remote IP address. (does)
 395  *   MUST ignore SYN with invalid source address. (does)
 396  *   MUST silently discard incoming SYN for broadcast/multicast
 397  *     address. (does) 
 398  * 
 399  * Asynchronous Reports (4.2.4.1)
 400  * **MUST provide mechanism for reporting soft errors to application
 401  *     layer. (doesn't)
 402  * 
 403  * Type of Service (4.2.4.2)
 404  *   MUST allow application layer to set Type of Service. (does IP_TOS)
 405  * 
 406  * (Whew. -- MS 950903)
 407  **/
 408 
 409 #include <linux/types.h>
 410 #include <linux/sched.h>
 411 #include <linux/mm.h>
 412 #include <linux/time.h>
 413 #include <linux/string.h>
 414 #include <linux/config.h>
 415 #include <linux/socket.h>
 416 #include <linux/sockios.h>
 417 #include <linux/termios.h>
 418 #include <linux/in.h>
 419 #include <linux/fcntl.h>
 420 #include <linux/inet.h>
 421 #include <linux/netdevice.h>
 422 #include <net/snmp.h>
 423 #include <net/ip.h>
 424 #include <net/protocol.h>
 425 #include <net/icmp.h>
 426 #include <net/tcp.h>
 427 #include <net/arp.h>
 428 #include <linux/skbuff.h>
 429 #include <net/sock.h>
 430 #include <net/route.h>
 431 #include <linux/errno.h>
 432 #include <linux/timer.h>
 433 #include <asm/system.h>
 434 #include <asm/segment.h>
 435 #include <linux/mm.h>
 436 #include <net/checksum.h>
 437 
 438 /*
 439  *      The MSL timer is the 'normal' timer.
 440  */
 441  
 442 #define reset_msl_timer(x,y,z)  reset_timer(x,y,z)
 443 
 444 #define SEQ_TICK 3
 445 unsigned long seq_offset;
 446 struct tcp_mib  tcp_statistics;
 447 
 448 /*
 449  *      Cached last hit socket
 450  */
 451  
 452 volatile unsigned long  th_cache_saddr,th_cache_daddr;
 453 volatile unsigned short  th_cache_dport, th_cache_sport;
 454 volatile struct sock *th_cache_sk;
 455 
 456 void tcp_cache_zap(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 457 {
 458         unsigned long flags;
 459         save_flags(flags);
 460         cli();
 461         th_cache_saddr=0;
 462         th_cache_daddr=0;
 463         th_cache_dport=0;
 464         th_cache_sport=0;
 465         th_cache_sk=NULL;
 466         restore_flags(flags);
 467 }
 468 
 469 static void tcp_close(struct sock *sk, int timeout);
 470 
 471 
 472 /*
 473  *      The less said about this the better, but it works and will do for 1.2 
 474  */
 475 
 476 static struct wait_queue *master_select_wakeup;
 477 
 478 static __inline__ int min(unsigned int a, unsigned int b)
     /* [previous][next][first][last][top][bottom][index][help] */
 479 {
 480         if (a < b) 
 481                 return(a);
 482         return(b);
 483 }
 484 
 485 #undef STATE_TRACE
 486 
 487 #ifdef STATE_TRACE
 488 static char *statename[]={
 489         "Unused","Established","Syn Sent","Syn Recv",
 490         "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
 491         "Close Wait","Last ACK","Listen","Closing"
 492 };
 493 #endif
 494 
 495 static __inline__ void tcp_set_state(struct sock *sk, int state)
     /* [previous][next][first][last][top][bottom][index][help] */
 496 {
 497         if(sk->state==TCP_ESTABLISHED)
 498                 tcp_statistics.TcpCurrEstab--;
 499 #ifdef STATE_TRACE
 500         if(sk->debug)
 501                 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
 502 #endif  
 503         /* This is a hack but it doesn't occur often and it's going to
 504            be a real        to fix nicely */
 505            
 506         if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
 507         {
 508                 wake_up_interruptible(&master_select_wakeup);
 509         }
 510         sk->state=state;
 511         if(state==TCP_ESTABLISHED)
 512                 tcp_statistics.TcpCurrEstab++;
 513         if(sk->state==TCP_CLOSE)
 514                 tcp_cache_zap();
 515 }
 516 
 517 /*
 518  *      This routine picks a TCP windows for a socket based on
 519  *      the following constraints
 520  *  
 521  *      1. The window can never be shrunk once it is offered (RFC 793)
 522  *      2. We limit memory per socket
 523  *   
 524  *      For now we use NET2E3's heuristic of offering half the memory
 525  *      we have handy. All is not as bad as this seems however because
 526  *      of two things. Firstly we will bin packets even within the window
 527  *      in order to get the data we are waiting for into the memory limit.
 528  *      Secondly we bin common duplicate forms at receive time
 529  *      Better heuristics welcome
 530  */
 531    
 532 int tcp_select_window(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 533 {
 534         int new_window = sock_rspace(sk);
 535         
 536         if(sk->window_clamp)
 537                 new_window=min(sk->window_clamp,new_window);
 538         /*
 539          *      Two things are going on here.  First, we don't ever offer a
 540          *      window less than min(sk->mss, MAX_WINDOW/2).  This is the
 541          *      receiver side of SWS as specified in RFC1122.
 542          *      Second, we always give them at least the window they
 543          *      had before, in order to avoid retracting window.  This
 544          *      is technically allowed, but RFC1122 advises against it and
 545          *      in practice it causes trouble.
 546          *
 547          *      Fixme: This doesn't correctly handle the case where
 548          *      new_window > sk->window but not by enough to allow for the
 549          *      shift in sequence space. 
 550          */
 551         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
 552                 return(sk->window);
 553         return(new_window);
 554 }
 555 
 556 /*
 557  *      Find someone to 'accept'. Must be called with
 558  *      sk->inuse=1 or cli()
 559  */ 
 560 
 561 static struct sk_buff *tcp_find_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 562 {
 563         struct sk_buff *p=skb_peek(&s->receive_queue);
 564         if(p==NULL)
 565                 return NULL;
 566         do
 567         {
 568                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 569                         return p;
 570                 p=p->next;
 571         }
 572         while(p!=(struct sk_buff *)&s->receive_queue);
 573         return NULL;
 574 }
 575 
 576 /*
 577  *      Remove a completed connection and return it. This is used by
 578  *      tcp_accept() to get connections from the queue.
 579  */
 580 
 581 static struct sk_buff *tcp_dequeue_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 582 {
 583         struct sk_buff *skb;
 584         unsigned long flags;
 585         save_flags(flags);
 586         cli(); 
 587         skb=tcp_find_established(s);
 588         if(skb!=NULL)
 589                 skb_unlink(skb);        /* Take it off the queue */
 590         restore_flags(flags);
 591         return skb;
 592 }
 593 
 594 /* 
 595  *      This routine closes sockets which have been at least partially
 596  *      opened, but not yet accepted. Currently it is only called by
 597  *      tcp_close, and timeout mirrors the value there. 
 598  */
 599 
 600 static void tcp_close_pending (struct sock *sk) 
     /* [previous][next][first][last][top][bottom][index][help] */
 601 {
 602         struct sk_buff *skb;
 603 
 604         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) 
 605         {
 606                 skb->sk->dead=1;
 607                 tcp_close(skb->sk, 0);
 608                 kfree_skb(skb, FREE_READ);
 609         }
 610         return;
 611 }
 612 
 613 /*
 614  *      Enter the time wait state. 
 615  */
 616 
 617 static void tcp_time_wait(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 618 {
 619         tcp_set_state(sk,TCP_TIME_WAIT);
 620         sk->shutdown = SHUTDOWN_MASK;
 621         if (!sk->dead)
 622                 sk->state_change(sk);
 623         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 624 }
 625 
 626 /*
 627  *      A socket has timed out on its send queue and wants to do a
 628  *      little retransmitting. Currently this means TCP.
 629  */
 630 
 631 void tcp_do_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 632 {
 633         struct sk_buff * skb;
 634         struct proto *prot;
 635         struct device *dev;
 636         int ct=0;               /* frames (re)sent so far in this call */
 637         struct rtable *rt;
 638 
 639         prot = sk->prot;
             /* NOTE(review): 'prot' is assigned but not used below; the code
              * reads sk->prot directly instead. */
 640         skb = sk->send_head;
 641 
             /* Walk the retransmit queue (linked via link3) starting from the
              * oldest unacknowledged frame. */
 642         while (skb != NULL)
 643         {
 644                 struct tcphdr *th;
 645                 struct iphdr *iph;
 646                 int size;
 647 
 648                 dev = skb->dev;
 649                 IS_SKB(skb);
 650                 skb->when = jiffies;    /* restamp send time; the timer code compares jiffies with when+rto */
 651 
 652                 /*
 653                  *      Discard the surplus MAC header
 654                  */
 655                  
 656                 skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);
 657 
 658                 /*
 659                  * In general it's OK just to use the old packet.  However we
 660                  * need to use the current ack and window fields.  Urg and
 661                  * urg_ptr could possibly stand to be updated as well, but we
 662                  * don't keep the necessary data.  That shouldn't be a problem,
 663                  * if the other end is doing the right thing.  Since we're
 664                  * changing the packet, we have to issue a new IP identifier.
 665                  */
 666 
 667                 iph = (struct iphdr *)skb->data;
 668                 th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
                     /* size = TCP header + payload length, from the IP totals */
 669                 size = ntohs(iph->tot_len) - (iph->ihl<<2);
 670                 
 671                 /*
 672                  *      Note: We ought to check for window limits here but
 673                  *      currently this is done (less efficiently) elsewhere.
 674                  */
 675 
 676                 iph->id = htons(ip_id_count++);
 677                 ip_send_check(iph);
 678                 
 679                 /*
 680                  *      Put a MAC header back on (may cause ARPing)
 681                  */
 682                  
 683                 if(skb->localroute)
 684                         rt=ip_rt_local(iph->daddr,NULL,NULL);
 685                 else
 686                         rt=ip_rt_route(iph->daddr,NULL,NULL);
 687                         
 688                 if(rt==NULL)    /* Deep poo */
 689                 {
                             /* No route at all: report the error but still fall
                              * through to the retransmit accounting below. */
 690                         if(skb->sk)
 691                         {
 692                                 skb->sk->err=ENETUNREACH;
 693                                 skb->sk->error_report(skb->sk);
 694                         }
 695                 }
 696                 else
 697                 {
 698                         dev=rt->rt_dev;
 699                         skb->raddr=rt->rt_gateway;
 700                         if(skb->raddr==0)
 701                                 skb->raddr=iph->daddr;
 702                         skb->dev=dev;
 703                         skb->arp=1;
 704                         if(dev->hard_header)
 705                         {
 706                                 if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
 707                                         skb->arp=0;
 708                         }
 709                 
 710                         /*
 711                          *      This is not the right way to handle this. We have to
 712                          *      issue an up to date window and ack report with this 
 713                          *      retransmit to keep the odd buggy tcp that relies on 
 714                          *      the fact BSD does this happy. 
 715                          *      We don't however need to recalculate the entire 
 716                          *      checksum, so someone wanting a small problem to play
 717                          *      with might like to implement RFC1141/RFC1624 and speed
 718                          *      this up by avoiding a full checksum.
 719                          */
 720                  
                             /* ntohl here performs the same transformation the
                              * intended htonl would, so the result is correct. */
 721                         th->ack_seq = ntohl(sk->acked_seq);
 722                         th->window = ntohs(tcp_select_window(sk));
 723                         tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
 724                 
 725                         /*
 726                          *      If the interface is (still) up and running, kick it.
 727                          */
 728         
 729                         if (dev->flags & IFF_UP)
 730                         {
 731                                 /*
 732                                  *      If the packet is still being sent by the device/protocol
 733                                  *      below then don't retransmit. This is both needed, and good -
 734                                  *      especially with connected mode AX.25 where it stops resends
 735                                  *      occurring of an as yet unsent anyway frame!
 736                                  *      We still add up the counts as the round trip time wants
 737                                  *      adjusting.
 738                                  */
 739                                 if (sk && !skb_device_locked(skb))
 740                                 {
 741                                         /* Remove it from any existing driver queue first! */
 742                                         skb_unlink(skb);
 743                                         /* Now queue it */
 744                                         ip_statistics.IpOutRequests++;
 745                                         dev_queue_xmit(skb, dev, sk->priority);
 746                                 }
 747                         }
 748                 }
 749                 
 750                 /*
 751                  *      Count retransmissions
 752                  */
 753                  
 754                 ct++;
 755                 sk->prot->retransmits ++;
 756                 tcp_statistics.TcpRetransSegs++;
 757                 
 758 
 759                 /*
 760                  *      Only one retransmit requested.
 761                  */
 762         
 763                 if (!all)
 764                         break;
 765 
 766                 /*
 767                  *      This should cut it off before we send too many packets.
 768                  */
 769 
 770                 if (ct >= sk->cong_window)
 771                         break;
 772                 skb = skb->link3;
 773         }
 774 }
 775 
 776 /*
 777  *      Reset the retransmission timer
 778  */
 779  
 780 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
     /* [previous][next][first][last][top][bottom][index][help] */
 781 {
 782         del_timer(&sk->retransmit_timer);
 783         sk->ip_xmit_timeout = why;
 784         if((int)when < 0)
 785         {
 786                 when=3;
 787                 printk("Error: Negative timer in xmit_timer\n");
 788         }
 789         sk->retransmit_timer.expires=jiffies+when;
 790         add_timer(&sk->retransmit_timer);
 791 }
 792 
 793 /*
 794  *      This is the normal code called for timeouts.  It does the retransmission
 795  *      and then does backoff.  tcp_do_retransmit is separated out because
 796  *      tcp_ack needs to send stuff from the retransmit queue without
 797  *      initiating a backoff.
 798  */
 799 
 800 
 801 void tcp_retransmit_time(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 802 {
 803         tcp_do_retransmit(sk, all);
 804 
 805         /*
 806          * Increase the timeout each time we retransmit.  Note that
 807          * we do not increase the rtt estimate.  rto is initialized
 808          * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
 809          * that doubling rto each time is the least we can get away with.
 810          * In KA9Q, Karn uses this for the first few times, and then
 811          * goes to quadratic.  netBSD doubles, but only goes up to *64,
 812          * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
 813          * defined in the protocol as the maximum possible RTT.  I guess
 814          * we'll have to use something other than TCP to talk to the
 815          * University of Mars.
 816          *
 817          * PAWS allows us longer timeouts and large windows, so once
 818          * implemented ftp to mars will work nicely. We will have to fix
 819          * the 120 second clamps though!
 820          */
 821 
 822         sk->retransmits++;
 823         sk->prot->retransmits++;
 824         sk->backoff++;
 825         sk->rto = min(sk->rto << 1, 120*HZ);
 826         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
 827 }
 828 
 829 
 830 /*
 831  *      A timer event has triggered a tcp retransmit timeout. The
 832  *      socket xmit queue is ready and set up to send. Because
 833  *      the ack receive code keeps the queue straight we do
 834  *      nothing clever here.
 835  */
 836 
 837 static void tcp_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 838 {
 839         if (all) 
 840         {
 841                 tcp_retransmit_time(sk, all);
 842                 return;
 843         }
 844 
 845         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 846         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 847         sk->cong_count = 0;
 848 
 849         sk->cong_window = 1;
 850 
 851         /* Do the actual retransmit. */
 852         tcp_retransmit_time(sk, all);
 853 }
 854 
 855 /*
 856  *      A write timeout has occurred. Process the after effects.
 857  */
 858 
 859 static int tcp_write_timeout(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 860 {
             /*
              *      Returns 1 if the connection should stay up, 0 if the
              *      socket has been closed (release_sock() has then already
              *      been called here and the caller must not touch sk again).
              */
 861         /*
 862          *      Look for a 'soft' timeout.
 863          */
 864         if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
 865                 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
 866         {
 867                 /*
 868                  *      Attempt to recover if arp has changed (unlikely!) or
 869                  *      a route has shifted (not supported prior to 1.3).
 870                  */
 871                 arp_destroy (sk->daddr, 0);
 872                 /*ip_route_check (sk->daddr);*/
 873         }
 874         
 875         /*
 876          *      Have we tried to SYN too many times (repent repent 8))
 877          */
 878          
 879         if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
 880         {
 881                 sk->err=ETIMEDOUT;
 882                 sk->error_report(sk);
 883                 del_timer(&sk->retransmit_timer);
 884                 tcp_statistics.TcpAttemptFails++;       /* Is this right ??? - FIXME - */
 885                 tcp_set_state(sk,TCP_CLOSE);
 886                 /* Don't FIN, we got nothing back */
 887                 release_sock(sk);
 888                 return 0;
 889         }
 890         /*
 891          *      Has it gone just too far ?
 892          */
 893         if (sk->retransmits > TCP_RETR2) 
 894         {
 895                 sk->err = ETIMEDOUT;
 896                 sk->error_report(sk);
 897                 del_timer(&sk->retransmit_timer);
 898                 /*
 899                  *      Time wait the socket 
 900                  */
 901                 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) 
 902                 {
                             /* Closing states: park in TIME_WAIT under the MSL
                              * timer; the socket is still "alive" here, so fall
                              * through to return 1 and let the caller release. */
 903                         tcp_set_state(sk,TCP_TIME_WAIT);
 904                         reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 905                 }
 906                 else
 907                 {
 908                         /*
 909                          *      Clean up time.
 910                          */
 911                         tcp_set_state(sk, TCP_CLOSE);
 912                         release_sock(sk);
 913                         return 0;
 914                 }
 915         }
 916         return 1;
 917 }
 918 
 919 /*
 920  *      The TCP retransmit timer. This lacks a few small details.
 921  *
 922  *      1.      An initial rtt timeout on the probe0 should cause what we can
 923  *              of the first write queue buffer to be split and sent.
 924  *      2.      On a 'major timeout' as defined by RFC1122 we shouldn't report
 925  *              ETIMEDOUT if we know an additional 'soft' error caused this.
 926  *              tcp_err should save a 'soft error' for us.
 927  */
 928 
 929 static void retransmit_timer(unsigned long data)
     /* [previous][next][first][last][top][bottom][index][help] */
 930 {
 931         struct sock *sk = (struct sock*)data;   /* the timer was armed with the socket pointer as its data */
 932         int why = sk->ip_xmit_timeout;          /* reason the timer was set (TIME_*), recorded by reset_xmit_timer() */
 933 
 934         /* 
 935          * only process if socket is not in use
 936          */
 937 
 938         cli();
 939         if (sk->inuse || in_bh) 
 940         {
 941                 /* Try again in 1 second */
 942                 sk->retransmit_timer.expires = jiffies+HZ;
 943                 add_timer(&sk->retransmit_timer);
 944                 sti();
 945                 return;
 946         }
 947 
             /* Claim the socket ourselves so protocol code cannot race us. */
 948         sk->inuse = 1;
 949         sti();
 950 
 951         /* Always see if we need to send an ack. */
 952 
 953         if (sk->ack_backlog && !sk->zapped) 
 954         {
 955                 sk->prot->read_wakeup (sk);
 956                 if (! sk->dead)
 957                         sk->data_ready(sk,0);
 958         }
 959 
 960         /* Now we need to figure out why the socket was on the timer. */
 961 
 962         switch (why) 
 963         {
 964                 /* Window probing */
 965                 case TIME_PROBE0:
 966                         tcp_send_probe0(sk);
 967                         tcp_write_timeout(sk);
 968                         break;
 969                 /* Retransmitting */
 970                 case TIME_WRITE:
 971                         /* It could be we got here because we needed to send an ack.
 972                          * So we need to check for that.
 973                          */
 974                 {
 975                         struct sk_buff *skb;
 976                         unsigned long flags;
 977 
 978                         save_flags(flags);
 979                         cli();
 980                         skb = sk->send_head;
 981                         if (!skb) 
 982                         {
                                     /* Nothing unacked left: ack-only wakeup. */
 983                                 restore_flags(flags);
 984                         } 
 985                         else 
 986                         {
 987                                 /*
 988                                  *      Kicked by a delayed ack. Reset timer
 989                                  *      correctly now
 990                                  */
 991                                 if (jiffies < skb->when + sk->rto) 
 992                                 {
 993                                         reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
 994                                         restore_flags(flags);
 995                                         break;
 996                                 }
 997                                 restore_flags(flags);
 998                                 /*
 999                                  *      Retransmission
1000                                  */
1001                                 sk->retransmits++;
1002                                 sk->prot->retransmits++;
1003                                 sk->prot->retransmit (sk, 0);
1004                                 tcp_write_timeout(sk);
1005                         }
1006                         break;
1007                 }
1008                 /* Sending Keepalives */
1009                 case TIME_KEEPOPEN:
1010                         /* 
1011                          * this reset_timer() call is a hack, this is not
1012                          * how KEEPOPEN is supposed to work.
1013                          */
1014                         reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1015 
1016                         /* Send something to keep the connection open. */
1017                         if (sk->prot->write_wakeup)
1018                                   sk->prot->write_wakeup (sk);
1019                         sk->retransmits++;
1020                         sk->prot->retransmits++;
1021                         tcp_write_timeout(sk);
1022                         break;
1023                 default:
1024                         printk ("rexmit_timer: timer expired - reason unknown\n");
1025                         break;
1026         }
             /* NOTE(review): tcp_write_timeout() may already have released the
              * socket on its return-0 paths; this mirrors the original flow. */
1027         release_sock(sk);
1028 }
1029 
1030 /*
1031  * This routine is called by the ICMP module when it gets some
1032  * sort of error condition.  If err < 0 then the socket should
1033  * be closed and the error returned to the user.  If err > 0
1034  * it's just the icmp type << 8 | icmp code.  After adjustment
1035  * header points to the first 8 bytes of the tcp header.  We need
1036  * to find the appropriate port.
1037  */
1038 
1039 void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
     /* [previous][next][first][last][top][bottom][index][help] */
1040         __u32 saddr, struct inet_protocol *protocol)
1041 {
1042         struct tcphdr *th;
1043         struct sock *sk;
1044         struct iphdr *iph=(struct iphdr *)header;
1045   
1046         header+=4*iph->ihl;
1047    
1048 
1049         th =(struct tcphdr *)header;
1050         sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
1051 
1052         if (sk == NULL) 
1053                 return;
1054   
1055         if (type == ICMP_SOURCE_QUENCH) 
1056         {
1057                 /*
1058                  * FIXME:
1059                  * For now we will just trigger a linear backoff.
1060                  * The slow start code should cause a real backoff here.
1061                  */
1062                 if (sk->cong_window > 4)
1063                         sk->cong_window--;
1064                 return;
1065         }
1066         
1067         if (type == ICMP_PARAMETERPROB)
1068         {
1069                 sk->err=EPROTO;
1070                 sk->error_report(sk);
1071         }
1072 
1073         /*
1074          * If we've already connected we will keep trying
1075          * until we time out, or the user gives up.
1076          */
1077 
1078         if (code < 13 && (icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV))
1079         {
1080                 sk->err = icmp_err_convert[code].errno;
1081                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) 
1082                 {
1083                         tcp_statistics.TcpAttemptFails++;
1084                         tcp_set_state(sk,TCP_CLOSE);
1085                         sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
1086                 }
1087         }
1088         return;
1089 }
1090 
1091 
1092 /*
1093  *      Walk down the receive queue counting readable data until we hit the end or we find a gap
1094  *      in the received data queue (ie a frame missing that needs sending to us). Not
1095  *      sorting using two queues as data arrives makes life so much harder.
1096  */
1097 
1098 static int tcp_readable(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1099 {
             /*
              *      Return the number of in-sequence bytes a read would
              *      deliver right now: walk the receive queue starting at
              *      copied_seq, stopping at the first hole, or at a PSH once
              *      anything has been counted.  SYN and URG octets occupy
              *      sequence space but are not readable, so they are
              *      subtracted back out.
              */
1100         unsigned long counted;
1101         unsigned long amount;
1102         struct sk_buff *skb;
1103         int sum;
1104         unsigned long flags;
1105 
1106         if(sk && sk->debug)
1107                 printk("tcp_readable: %p - ",sk);
1108 
1109         save_flags(flags);
1110         cli();
1111         if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
1112         {
1113                 restore_flags(flags);
1114                 if(sk && sk->debug) 
1115                         printk("empty\n");
1116                 return(0);
1117         }
1118   
1119         counted = sk->copied_seq;       /* Where we are at the moment */
1120         amount = 0;
1121   
1122         /* 
1123          *      Do until a push or until we are out of data. 
1124          */
1125          
1126         do 
1127         {
1128                 if (before(counted, skb->h.th->seq))    /* Found a hole so stops here */
1129                         break;
1130                 sum = skb->len -(counted - skb->h.th->seq);     /* Length - header but start from where we are up to (avoid overlaps) */
1131                 if (skb->h.th->syn)
1132                         sum++;
1133                 if (sum > 0) 
1134                 {                                       /* Add it up, move on */
1135                         amount += sum;
1136                         if (skb->h.th->syn) 
1137                                 amount--;
1138                         counted += sum;
1139                 }
1140                 /*
1141                  * Don't count urg data ... but do it in the right place!
1142                  * Consider: "old_data (ptr is here) URG PUSH data"
1143                  * The old code would stop at the first push because
1144                  * it counted the urg (amount==1) and then does amount--
1145                  * *after* the loop.  This means tcp_readable() always
1146                  * returned zero if any URG PUSH was in the queue, even
1147                  * though there was normal data available. If we subtract
1148                  * the urg data right here, we even get it to work for more
1149                  * than one URG PUSH skb without normal data.
1150                  * This means that select() finally works now with urg data
1151                  * in the queue.  Note that rlogin was never affected
1152                  * because it doesn't use select(); it uses two processes
1153                  * and a blocking read().  And the queue scan in tcp_read()
1154                  * was correct.  Mike <pall@rz.uni-karlsruhe.de>
1155                  */
1156                 if (skb->h.th->urg)
1157                         amount--;       /* don't count urg data */
1158                 if (amount && skb->h.th->psh) break;
1159                 skb = skb->next;
1160         }
1161         while(skb != (struct sk_buff *)&sk->receive_queue);
1162 
1163         restore_flags(flags);
1164         if(sk->debug)
1165                 printk("got %lu bytes.\n",amount);
1166         return(amount);
1167 }
1168 
1169 /*
1170  * LISTEN is a special case for select..
1171  */
1172 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
1173 {
1174         if (sel_type == SEL_IN) {
1175                 int retval;
1176 
1177                 sk->inuse = 1;
1178                 retval = (tcp_find_established(sk) != NULL);
1179                 release_sock(sk);
1180                 if (!retval)
1181                         select_wait(&master_select_wakeup,wait);
1182                 return retval;
1183         }
1184         return 0;
1185 }
1186 
1187 
1188 /*
1189  *      Wait for a TCP event.
1190  *
1191  *      Note that we don't need to set "sk->inuse", as the upper select layers
1192  *      take care of normal races (between the test and the event) and we don't
1193  *      go look at any of the socket buffers directly.
1194  */
1195 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
1196 {
1197         if (sk->state == TCP_LISTEN)
1198                 return tcp_listen_select(sk, sel_type, wait);
1199 
1200         switch(sel_type) {
1201         case SEL_IN:
1202                 if (sk->err)
1203                         return 1;
1204                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1205                         break;
1206 
1207                 if (sk->shutdown & RCV_SHUTDOWN)
1208                         return 1;
1209                         
1210                 if (sk->acked_seq == sk->copied_seq)
1211                         break;
1212 
1213                 if (sk->urg_seq != sk->copied_seq ||
1214                     sk->acked_seq != sk->copied_seq+1 ||
1215                     sk->urginline || !sk->urg_data)
1216                         return 1;
1217                 break;
1218 
1219         case SEL_OUT:
1220                 if (sk->err)
1221                         return 1;
1222                 if (sk->shutdown & SEND_SHUTDOWN) 
1223                         return 0;
1224                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1225                         break;
1226                 /*
1227                  * This is now right thanks to a small fix
1228                  * by Matt Dillon.
1229                  */
1230 
1231                 if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
1232                         break;
1233                 return 1;
1234 
1235         case SEL_EX:
1236                 if (sk->urg_data)
1237                         return 1;
1238                 break;
1239         }
1240         select_wait(sk->sleep, wait);
1241         return 0;
1242 }
1243 
1244 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
     /* [previous][next][first][last][top][bottom][index][help] */
1245 {
1246         int err;
1247         switch(cmd) 
1248         {
1249 
1250                 case TIOCINQ:
1251 #ifdef FIXME    /* FIXME: */
1252                 case FIONREAD:
1253 #endif
1254                 {
1255                         unsigned long amount;
1256 
1257                         if (sk->state == TCP_LISTEN) 
1258                                 return(-EINVAL);
1259 
1260                         sk->inuse = 1;
1261                         amount = tcp_readable(sk);
1262                         release_sock(sk);
1263                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1264                         if(err)
1265                                 return err;
1266                         put_user(amount, (int *)arg);
1267                         return(0);
1268                 }
1269                 case SIOCATMARK:
1270                 {
1271                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
1272 
1273                         err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1274                         if (err)
1275                                 return err;
1276                         put_user(answ,(int *) arg);
1277                         return(0);
1278                 }
1279                 case TIOCOUTQ:
1280                 {
1281                         unsigned long amount;
1282 
1283                         if (sk->state == TCP_LISTEN) return(-EINVAL);
1284                         amount = sock_wspace(sk);
1285                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1286                         if(err)
1287                                 return err;
1288                         put_user(amount, (int *)arg);
1289                         return(0);
1290                 }
1291                 default:
1292                         return(-EINVAL);
1293         }
1294 }
1295 
1296 
1297 /*
1298  *      This routine computes a TCP checksum. 
1299  *
1300  *      Modified January 1995 from a go-faster DOS routine by
1301  *      Jorge Cwik <jorge@laser.satlink.net>
1302  */
1303  
1304 unsigned short tcp_check(struct tcphdr *th, int len,
     /* [previous][next][first][last][top][bottom][index][help] */
1305           unsigned long saddr, unsigned long daddr, unsigned long base)
1306 {     
1307         return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
1308 }
1309 
1310 
1311 
1312 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
     /* [previous][next][first][last][top][bottom][index][help] */
1313                 unsigned long daddr, int len, struct sock *sk)
1314 {
1315         th->check = 0;
1316         th->check = tcp_check(th, len, saddr, daddr,
1317                 csum_partial((char *)th,len,0));
1318         return;
1319 }
1320 
1321 /*
1322  *      This is the main buffer sending routine. We queue the buffer
1323  *      having checked it is sane seeming.
1324  */
1325  
1326 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
     /* [previous][next][first][last][top][bottom][index][help] */
1327 {
1328         int size;
1329         struct tcphdr * th = skb->h.th;
1330 
1331         /*
1332          *      length of packet (not counting length of pre-tcp headers) 
1333          */
1334          
1335         size = skb->len - ((unsigned char *) th - skb->data);
1336 
1337         /*
1338          *      Sanity check it.. 
1339          */
1340          
1341         if (size < sizeof(struct tcphdr) || size > skb->len) 
1342         {
1343                 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1344                         skb, skb->data, th, skb->len);
1345                 kfree_skb(skb, FREE_WRITE);
1346                 return;
1347         }
1348 
1349         /*
1350          *      If we have queued a header size packet.. (these crash a few
1351          *      tcp stacks if ack is not set)
1352          */
1353          
1354         if (size == sizeof(struct tcphdr)) 
1355         {
1356                 /* If it's got a syn or fin it's notionally included in the size..*/
1357                 if(!th->syn && !th->fin) 
1358                 {
1359                         printk("tcp_send_skb: attempt to queue a bogon.\n");
1360                         kfree_skb(skb,FREE_WRITE);
1361                         return;
1362                 }
1363         }
1364 
1365         /*
1366          *      Actual processing.
1367          */
1368          
1369         tcp_statistics.TcpOutSegs++;  
1370         skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1371         
1372         /*
1373          *      We must queue if
1374          *
1375          *      a) The right edge of this frame exceeds the window
1376          *      b) We are retransmitting (Nagle's rule)
1377          *      c) We have too many packets 'in flight'
1378          */
1379          
1380         if (after(skb->h.seq, sk->window_seq) ||
1381             (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1382              sk->packets_out >= sk->cong_window) 
1383         {
1384                 /* checksum will be supplied by tcp_write_xmit.  So
1385                  * we shouldn't need to set it at all.  I'm being paranoid */
1386                 th->check = 0;
1387                 if (skb->next != NULL) 
1388                 {
1389                         printk("tcp_send_partial: next != NULL\n");
1390                         skb_unlink(skb);
1391                 }
1392                 skb_queue_tail(&sk->write_queue, skb);
1393                 
1394                 /*
1395                  *      If we don't fit we have to start the zero window
1396                  *      probes. This is broken - we really need to do a partial
1397                  *      send _first_ (This is what causes the Cisco and PC/TCP
1398                  *      grief).
1399                  */
1400                  
1401                 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1402                     sk->send_head == NULL && sk->ack_backlog == 0)
1403                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1404         } 
1405         else 
1406         {
1407                 /*
1408                  *      This is going straight out
1409                  */
1410                  
1411                 th->ack_seq = ntohl(sk->acked_seq);
1412                 th->window = ntohs(tcp_select_window(sk));
1413 
1414                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1415 
1416                 sk->sent_seq = sk->write_seq;
1417                 
1418                 /*
1419                  *      This is mad. The tcp retransmit queue is put together
1420                  *      by the ip layer. This causes half the problems with
1421                  *      unroutable FIN's and other things.
1422                  */
1423                  
1424                 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1425                 
1426                 /*
1427                  *      Set for next retransmit based on expected ACK time.
1428                  *      FIXME: We set this every time which means our 
1429                  *      retransmits are really about a window behind.
1430                  */
1431 
1432                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1433         }
1434 }
1435 
1436 /*
1437  *      Locking problems lead us to a messy situation where we can have
1438  *      multiple partially complete buffers queued up. This is really bad
1439  *      as we don't want to be sending partial buffers. Fix this with
1440  *      a semaphore or similar to lock tcp_write per socket.
1441  *
1442  *      These routines are pretty self descriptive.
1443  */
1444  
1445 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1446 {
1447         struct sk_buff * skb;
1448         unsigned long flags;
1449 
1450         save_flags(flags);
1451         cli();
1452         skb = sk->partial;
1453         if (skb) {
1454                 sk->partial = NULL;
1455                 del_timer(&sk->partial_timer);
1456         }
1457         restore_flags(flags);
1458         return skb;
1459 }
1460 
1461 /*
1462  *      Empty the partial queue
1463  */
1464  
1465 static void tcp_send_partial(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1466 {
1467         struct sk_buff *skb;
1468 
1469         if (sk == NULL)
1470                 return;
1471         while ((skb = tcp_dequeue_partial(sk)) != NULL)
1472                 tcp_send_skb(sk, skb);
1473 }
1474 
1475 /*
1476  *      Queue a partial frame
1477  */
1478  
1479 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1480 {
1481         struct sk_buff * tmp;
1482         unsigned long flags;
1483 
1484         save_flags(flags);
1485         cli();
1486         tmp = sk->partial;
1487         if (tmp)
1488                 del_timer(&sk->partial_timer);
1489         sk->partial = skb;
1490         init_timer(&sk->partial_timer);
1491         /*
1492          *      Wait up to 1 second for the buffer to fill.
1493          */
1494         sk->partial_timer.expires = jiffies+HZ;
1495         sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
1496         sk->partial_timer.data = (unsigned long) sk;
1497         add_timer(&sk->partial_timer);
1498         restore_flags(flags);
1499         if (tmp)
1500                 tcp_send_skb(sk, tmp);
1501 }
1502 
1503 
1504 /*
1505  *      This routine sends an ack and also updates the window. 
1506  */
1507  
/*
 *	Build and transmit a pure ACK carrying <sequence, ack> (host byte
 *	order) to daddr, echoing the port pair of the received header 'th'
 *	with source/destination swapped.  Also advertises a freshly
 *	selected receive window and, when everything outstanding has been
 *	acked, retires the ack-backlog bookkeeping and demotes the write
 *	timer to a keepalive (or deletes it).
 */
static void tcp_send_ack(u32 sequence, u32 ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack. We don't have to do this
		 *	(ACK is unreliable) but it's much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */

		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: give the buffer back and drop the ack (the
		 * peer will retransmit; ACKs are unreliable anyway). */
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	/* Start from the received header, then patch it into a reply. */
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	/* NOTE(review): ntohl/ntohs below perform host-to-network stores;
	 * they are byte-for-byte the same operation as htonl/htons, so
	 * this works, but htonl/htons would state the intent. */
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */

	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
				  && sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 *	Fill in the packet and send it
	 */

	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1618 
1619 
1620 /* 
1621  *      This routine builds a generic TCP header. 
1622  */
1623  
1624 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
     /* [previous][next][first][last][top][bottom][index][help] */
1625 {
1626 
1627         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1628         th->seq = htonl(sk->write_seq);
1629         th->psh =(push == 0) ? 1 : 0;
1630         th->doff = sizeof(*th)/4;
1631         th->ack = 1;
1632         th->fin = 0;
1633         sk->ack_backlog = 0;
1634         sk->bytes_rcv = 0;
1635         sk->ack_timed = 0;
1636         th->ack_seq = htonl(sk->acked_seq);
1637         sk->window = tcp_select_window(sk);
1638         th->window = htons(sk->window);
1639 
1640         return(sizeof(*th));
1641 }
1642 
1643 /*
1644  *      This routine copies from a user buffer into a socket,
1645  *      and starts the transmit system.
1646  */
1647 
/*
 *	Copy user data (described by msg->msg_iov) into TCP frames and
 *	start the transmit system.
 *
 *	Returns the number of bytes accepted, or a negative errno if
 *	nothing was copied.  'nonblock' makes the call return -EAGAIN
 *	instead of sleeping.  MSG_OOB sends the data as urgent;
 *	MSG_DONTROUTE marks the frames local-route only.  If msg carries
 *	an address it must match the already-connected peer.
 */
static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
	  int len, int nonblock, int flags)
{
	int copied = 0;		/* bytes accepted from the user so far */
	int copy;		/* bytes taken on the current pass */
	int tmp;
	int seglen;		/* bytes left in the current iovec entry */
	int iovct=0;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	struct proto *prot;
	struct device *dev = NULL;
	unsigned char *from;

	/*
	 *	Do sanity checking for sendmsg/sendto/send
	 */

	if (flags & ~(MSG_OOB|MSG_DONTROUTE))
		return -EINVAL;
	if (msg->msg_name)
	{
		struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
		if(sk->state == TCP_CLOSE)
			return -ENOTCONN;
		if (msg->msg_namelen < sizeof(*addr))
			return -EINVAL;
		if (addr->sin_family && addr->sin_family != AF_INET)
			return -EINVAL;
		if (addr->sin_port != sk->dummy_th.dest)
			return -EISCONN;
		if (addr->sin_addr.s_addr != sk->daddr)
			return -EISCONN;
	}

	/*
	 *	Ok commence sending
	 */

	while(iovct<msg->msg_iovlen)
	{
		seglen=msg->msg_iov[iovct].iov_len;
		from=msg->msg_iov[iovct++].iov_base;
		sk->inuse=1;
		prot = sk->prot;
		while(seglen > 0)
		{
			if (sk->err)
			{			/* Stop on an error */
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/*
			 *	First thing we do is make sure that we are established.
			 */

			if (sk->shutdown & SEND_SHUTDOWN)
			{
				release_sock(sk);
				sk->err = EPIPE;
				if (copied)
					return(copied);
				sk->err = 0;
				return(-EPIPE);
			}

			/*
			 *	Wait for a connection to finish.
			 */

			while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
			{
				if (sk->err)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				/* Any state other than the two SYN states means the
				 * connection is dead or dying: report EPIPE. */
				if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
				{
					release_sock(sk);
					if (copied)
						return(copied);

					if (sk->err)
					{
						tmp = -sk->err;
						sk->err = 0;
						return(tmp);
					}

					if (sk->keepopen)
					{
						send_sig(SIGPIPE, current, 0);
					}
					return(-EPIPE);
				}

				if (nonblock || copied)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return(-EAGAIN);
				}

				/* Sleep until the handshake completes; interrupts
				 * stay off between the state recheck and the sleep
				 * so a wakeup cannot be missed. */
				release_sock(sk);
				cli();

				if (sk->state != TCP_ESTABLISHED &&
					sk->state != TCP_CLOSE_WAIT && sk->err == 0)
				{
					interruptible_sleep_on(sk->sleep);
					if (current->signal & ~current->blocked)
					{
						sti();
						if (copied)
							return(copied);
						return(-ERESTARTSYS);
					}
				}
				sk->inuse = 1;
				sti();
			}

		/*
		 * The following code can result in copy <= if sk->mss is ever
		 * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
		 * sk->mtu is constant once SYN processing is finished.  I.e. we
		 * had better not get here until we've seen his SYN and at least one
		 * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
		 * But ESTABLISHED should guarantee that.  sk->max_window is by definition
		 * non-decreasing.  Note that any ioctl to set user_mss must be done
		 * before the exchange of SYN's.  If the initial ack from the other
		 * end has a window of 0, max_window and thus mss will both be 0.
		 */

		/*
		 *	Now we need to check if we have a half built packet.
		 */

			if ((skb = tcp_dequeue_partial(sk)) != NULL)
			{
				int hdrlen;

				 /* IP header + TCP header */
				hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
					 + sizeof(struct tcphdr);

				/* Add more stuff to the end of skb->len */
				if (!(flags & MSG_OOB))
				{
					copy = min(sk->mss - (skb->len - hdrlen), len);
					/* FIXME: this is really a bug. */
					if (copy <= 0)
					{
						printk("TCP: **bug**: \"copy\" <= 0!!\n");
						copy = 0;
					}
					memcpy_fromfs(skb_put(skb,copy), from, copy);
					from += copy;
					copied += copy;
					len -= copy;
					sk->write_seq += copy;
					seglen -= copy;
				}
				/* Send the topped-up frame if it is now full, if we
				 * are doing OOB, or if nothing is in flight; otherwise
				 * requeue it as a partial frame. */
				if ((skb->len - hdrlen) >= sk->mss ||
					(flags & MSG_OOB) || !sk->packets_out)
					tcp_send_skb(sk, skb);
				else
					tcp_enqueue_partial(skb, sk);
				continue;
			}

		/*
		 * We also need to worry about the window.
		 * If window < 1/2 the maximum window we've seen from this
		 *   host, don't use it.  This is sender side
		 *   silly window prevention, as specified in RFC1122.
		 *   (Note that this is different than earlier versions of
		 *   SWS prevention, e.g. RFC813.).  What we actually do is
		 *   use the whole MSS.  Since the results in the right
		 *   edge of the packet being outside the window, it will
		 *   be queued for later rather than sent.
		 */

			copy = sk->window_seq - sk->write_seq;
			if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
				copy = sk->mss;
			if (copy > len)
				copy = len;

		/*
		 *	We should really check the window here also.
		 */

			send_tmp = NULL;
			if (copy < sk->mss && !(flags & MSG_OOB))
			{
				/*
				 *	We will release the socket in case we sleep here.
				 */
				release_sock(sk);
				/*
				 *	NB: following must be mtu, because mss can be increased.
				 *	mss is always <= mtu
				 */
				skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
				sk->inuse = 1;
				send_tmp = skb;
			}
			else
			{
				/*
				 *	We will release the socket in case we sleep here.
				 */
				release_sock(sk);
				skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
				sk->inuse = 1;
			}

			/*
			 *	If we didn't get any memory, we need to sleep.
			 */

			if (skb == NULL)
			{
				sk->socket->flags |= SO_NOSPACE;
				if (nonblock)
				{
					release_sock(sk);
					if (copied)
						return(copied);
					return(-EAGAIN);
				}

				/*
				 *	FIXME: here is another race condition.
				 */

				tmp = sk->wmem_alloc;
				release_sock(sk);
				cli();
				/*
				 *	Again we will try to avoid it.
				 */
				if (tmp <= sk->wmem_alloc &&
					  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
					&& sk->err == 0)
				{
					sk->socket->flags &= ~SO_NOSPACE;
					interruptible_sleep_on(sk->sleep);
					if (current->signal & ~current->blocked)
					{
						sti();
						if (copied)
							return(copied);
						return(-ERESTARTSYS);
					}
				}
				sk->inuse = 1;
				sti();
				continue;
			}

			skb->sk = sk;
			skb->free = 0;
			skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

			/*
			 * FIXME: we need to optimize this.
			 * Perhaps some hints here would be good.
			 */

			tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
			if (tmp < 0 )
			{
				sock_wfree(sk, skb);
				release_sock(sk);
				if (copied)
					return(copied);
				return(tmp);
			}
			skb->dev = dev;
			skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
			tmp = tcp_build_header(skb->h.th, sk, len-copy);
			if (tmp < 0)
			{
				sock_wfree(sk, skb);
				release_sock(sk);
				if (copied)
					return(copied);
				return(tmp);
			}

			if (flags & MSG_OOB)
			{
				skb->h.th->urg = 1;
				/* NOTE(review): ntohs here is a host-to-network
				 * store; same operation as htons, but htons would
				 * state the intent. */
				skb->h.th->urg_ptr = ntohs(copy);
			}

			memcpy_fromfs(skb_put(skb,copy), from, copy);

			from += copy;
			copied += copy;
			len -= copy;
			seglen -= copy;
			skb->free = 0;
			sk->write_seq += copy;

			/* Short frame while data is in flight: hold it back as
			 * a partial frame (Nagle); otherwise send now. */
			if (send_tmp != NULL && sk->packets_out)
			{
				tcp_enqueue_partial(send_tmp, sk);
				continue;
			}
			tcp_send_skb(sk, skb);
		}
	}
	sk->err = 0;

/*
 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *	interactive fast network servers. It's meant to be on and
 *	it really improves the throughput though not the echo time
 *	on my slow slip link - Alan
 */

/*
 *	Avoid possible race on send_tmp - c/o Johannes Stille
 */

	if(sk->partial && ((!sk->packets_out)
     /* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1998 
1999 static int tcp_sendto(struct sock *sk, const unsigned char *ubuf, int size, int noblock, unsigned flags,
     /* [previous][next][first][last][top][bottom][index][help] */
2000                 struct sockaddr_in *sin, int addr_len)
2001 {
2002         struct iovec iov;
2003         struct msghdr msg;
2004 
2005         iov.iov_base = (void *)ubuf;
2006         iov.iov_len  = size;
2007 
2008         msg.msg_name      = (void *)sin;
2009         msg.msg_namelen   = addr_len;
2010         msg.msg_accrights = NULL;
2011         msg.msg_iov       = &iov;
2012         msg.msg_iovlen    = 1;
2013 
2014         return tcp_sendmsg(sk, &msg, size, noblock, flags);
2015 }
2016 
2017 static int tcp_write(struct sock *sk, const unsigned char *ubuf, int size, int noblock, unsigned flags)
     /* [previous][next][first][last][top][bottom][index][help] */
2018 {
2019         return tcp_sendto(sk,ubuf,size,noblock,flags,NULL,0);
2020 }
2021 
2022 
2023 /*
2024  *      Send an ack if one is backlogged at this point. Ought to merge
2025  *      this with tcp_send_ack().
2026  */
2027  
/*
 *	Flush a backlogged ACK: when sk->ack_backlog is non-zero, build a
 *	bare ACK from the socket's template header and transmit it.  Does
 *	nothing when there is no backlog, or in CLOSE/TIME_WAIT (an ack
 *	then would only draw a RST from the closed destination).
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog)
		return;

	/*
	 * If we're closed, don't send an ack, or we'll get a RST
	 * from the closed destination.
	 */
	if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
		return;

	/*
	 * FIXME: we need to put code here to prevent this routine from
	 * being called.  Being called once in a while is ok, so only check
	 * if this is the second time in a row.
	 */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: give the buffer back and drop the ack. */
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	/* Start from the template header, then fill in this ack. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	/* NOTE(review): ntohs/ntohl below are host-to-network stores —
	 * same operation as htons/htonl, but the latter would be clearer.
	 * t1->fin is not cleared here; presumably the dummy_th template
	 * already has it zero — confirm against the socket setup code. */
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2101 
2102 
2103 /*
2104  *      FIXME:
2105  *      This routine frees used buffers.
2106  *      It should consider sending an ACK to let the
2107  *      other end know we now have a bigger window.
2108  */
2109 
/*
 *	Reclaim fully-consumed receive buffers and, if that opened up
 *	enough window, arrange for an ACK (immediate or delayed) so the
 *	peer learns about the new space.
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);
  
	/* Interrupts off: the receive queue is also touched from IRQ context. */
	save_flags(flags);
	cli();
  
	/* Remember the free receive space before reclaiming, to detect change. */
	left = sock_rspace(sk);
 
	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL) 
	{
		/* Stop at the first skb still unread or still in use by a reader. */
		if (!skb->used || skb->users) 
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sock_rspace(sk),
					    left);
	if ((rspace=sock_rspace(sk)) != left) 
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
	/*
	 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
	 * if the other end is offering a window smaller than the agreed on MSS
	 * (called sk->mtu here).  In theory there's no connection between send
	 * and receive, and so no reason to think that they're going to send
	 * small packets.  For the moment I'm using the hack of reducing the mss
	 * only on the send side, so I'm putting mtu here.
	 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		} 
		else 
		{
			/* Force it to send an ack soon. */
			/* If the retransmit timer would fire later than
			   TCP_ACK_TIME from now, pull it forward as a
			   delayed-ACK timer; otherwise put it back as it was. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires) 
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			} 
			else
				add_timer(&sk->retransmit_timer);
		}
	}
} 
2191 
2192 
2193 /*
2194  *      Handle reading urgent data. BSD has very simple semantics for
2195  *      this, no blocking and very strange errors 8)
2196  */
2197  
/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 *
 *	Returns 1 with the single urgent byte copied to the user iovec,
 *	0 on an orderly no-data condition, or a negative errno.
 */
static int tcp_recv_urg(struct sock * sk, int nonblock,
	     struct msghdr *msg, int len, int flags, int *addr_len)
{
	/*
	 *	No URG data to read: either OOB is delivered inline, none has
	 *	arrived, or the one byte was already consumed.
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */
		
	/* Report and clear any pending socket error first. */
	if (sk->err) 
	{
		int tmp = -sk->err;
		sk->err = 0;
		return tmp;
	}

	if (sk->state == TCP_CLOSE || sk->done) 
	{
		/* First read after close returns EOF; subsequent ones error. */
		if (!sk->done) 
		{
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN) 
	{
		sk->done = 1;
		return 0;
	}
	/* Lock the socket while we examine/consume the urgent byte. */
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID) 
	{
		/* Low byte of urg_data holds the urgent octet itself. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;
		memcpy_toiovec(msg->msg_iov, &c, 1);
		if(msg->msg_name)
		{
			struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
			sin->sin_family=AF_INET;
			sin->sin_addr.s_addr=sk->daddr;
			sin->sin_port=sk->dummy_th.dest;
		}
		if(addr_len)
			*addr_len=sizeof(struct sockaddr_in);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);
	
	/*
	 * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2259 
2260 
2261 /*
2262  *      This routine copies from a sock struct into the user buffer. 
2263  */
2264  
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Walks the receive queue copying in-sequence data into msg->msg_iov,
 *	stopping at urgent data, a FIN, or when len bytes are copied.
 *	Blocks (unless nonblock) when no data is ready.  Returns the number
 *	of bytes copied or a negative errno.
 */
static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
	int len, int nonblock, int flags, int *addr_len)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/* 
	 *	This error should be checked. 
	 */
	 
	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially. 
	 */
	 
	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be 
	 *	inline and thus not flush cached variables otherwise).
	 *	A PEEK advances a private copy; a real read advances
	 *	sk->copied_seq itself.
	 */
	 
	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0) 
	{
		struct sk_buff * skb;
		u32 offset;
	
		/*
		 * Are we at urgent data? Stop if we have read anything.
		 */
		 
		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.
		 */
		 
		current->state = TASK_INTERRUPTIBLE;

		/* Scan the queue for the skb containing sequence *seq. */
		skb = skb_peek(&sk->receive_queue);
		do 
		{
			if (!skb)
				break;
			/* A hole in sequence space: wait for retransmit. */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies one sequence number */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed: reclaimable */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* Return whatever we already have rather than sleeping. */
		if (copied)
			break;

		if (sk->err) 
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE) 
		{
			if (!sk->done) 
			{
				sk->done = 1;	/* first read after close: EOF */
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN) 
		{
			sk->done = 1;
			break;
		}
			
		if (nonblock) 
		{
			copied = -EAGAIN;
			break;
		}

		/* Nothing to read: ACK consumed data, drop the lock, sleep. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked) 
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are 
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */
		
		skb->users++;
		
		/*
		 *	Ok so how much can we use ? 
		 */
		 
		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here? 
		 */
		
		if (sk->urg_data) 
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used) 
			{
				if (!urg_offset) 
				{
					/* Skip the urgent byte itself when not inline. */
					if (!sk->urginline) 
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;	/* copy only up to the urgent byte */
			}
		}
		
		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */
		 
		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */
		 
		memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		
		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */
		 
		skb->users --;
		
		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;	/* urgent byte has been passed */
		if (used + offset < skb->len)
			continue;		/* more data left in this skb */
		
		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		++*seq;		/* FIN consumes one sequence number */
		if (flags & MSG_PEEK)
			break;
			
		/*
		 *	All is done
		 */
		 
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	
	/* Fill in the peer's address for recvfrom()-style callers. */
	if(copied>0 && msg->msg_name)
	{
		struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
		sin->sin_family=AF_INET;
		sin->sin_addr.s_addr=sk->daddr;
		sin->sin_port=sk->dummy_th.dest;
	}
	if(addr_len)
		*addr_len=sizeof(struct sockaddr_in);
		
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2501 
2502 
2503 static int tcp_recvfrom(struct sock *sk, unsigned char *ubuf, int size, int noblock, unsigned flags,
     /* [previous][next][first][last][top][bottom][index][help] */
2504                 struct sockaddr_in *sa, int *addr_len)
2505 {
2506         struct iovec iov;
2507         struct msghdr msg;
2508 
2509         iov.iov_base = (void *)ubuf;
2510         iov.iov_len  = size;
2511 
2512         msg.msg_name      = (void *)sa;
2513         msg.msg_namelen   = 0;
2514         if (addr_len)
2515                 msg.msg_namelen = *addr_len;
2516         msg.msg_accrights = NULL;
2517         msg.msg_iov       = &iov;
2518         msg.msg_iovlen    = 1;
2519 
2520         return tcp_recvmsg(sk, &msg, size, noblock, flags, addr_len);
2521 }
2522 
2523 int tcp_read(struct sock *sk, unsigned char *buff, int len, int noblock,
     /* [previous][next][first][last][top][bottom][index][help] */
2524          unsigned flags)
2525 {
2526         return(tcp_recvfrom(sk, buff, len, noblock, flags, NULL, NULL));
2527 }
2528 
2529 
2530 /*
2531  *      State processing on a close. This implements the state shift for
2532  *      sending our FIN frame. Note that we only send a FIN for some 
2533  *      states. A shutdown() may have already sent the FIN, or we may be
2534  *      closed.
2535  */
2536  
2537 static int tcp_close_state(struct sock *sk, int dead)
     /* [previous][next][first][last][top][bottom][index][help] */
2538 {
2539         int ns=TCP_CLOSE;
2540         int send_fin=0;
2541         switch(sk->state)
2542         {
2543                 case TCP_SYN_SENT:      /* No SYN back, no FIN needed */
2544                         break;
2545                 case TCP_SYN_RECV:
2546                 case TCP_ESTABLISHED:   /* Closedown begin */
2547                         ns=TCP_FIN_WAIT1;
2548                         send_fin=1;
2549                         break;
2550                 case TCP_FIN_WAIT1:     /* Already closing, or FIN sent: no change */
2551                 case TCP_FIN_WAIT2:
2552                 case TCP_CLOSING:
2553                         ns=sk->state;
2554                         break;
2555                 case TCP_CLOSE:
2556                 case TCP_LISTEN:
2557                         break;
2558                 case TCP_CLOSE_WAIT:    /* They have FIN'd us. We send our FIN and
2559                                            wait only for the ACK */
2560                         ns=TCP_LAST_ACK;
2561                         send_fin=1;
2562         }
2563         
2564         tcp_set_state(sk,ns);
2565                 
2566         /*
2567          *      This is a (useful) BSD violating of the RFC. There is a
2568          *      problem with TCP as specified in that the other end could
2569          *      keep a socket open forever with no application left this end.
2570          *      We use a 3 minute timeout (about the same as BSD) then kill
2571          *      our end. If they send after that then tough - BUT: long enough
2572          *      that we won't make the old 4*rto = almost no time - whoops
2573          *      reset mistake.
2574          */
2575         if(dead && ns==TCP_FIN_WAIT2)
2576         {
2577                 int timer_active=del_timer(&sk->timer);
2578                 if(timer_active)
2579                         add_timer(&sk->timer);
2580                 else
2581                         reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2582         }
2583         
2584         return send_fin;
2585 }
2586 
2587 /*
2588  *      Send a fin.
2589  */
2590 
/*
 *	Send a fin.
 *
 *	Builds a FIN|ACK segment consuming one sequence number.  If data
 *	is still queued for transmission the FIN is appended to the write
 *	queue; otherwise it is transmitted immediately.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;
		
	release_sock(sk); /* in case the malloc sleeps. */
	
	/* GFP_KERNEL here may sleep; the socket lock is retaken just after. */
	buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */
	 
	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff. 
	 */

	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
	if (tmp < 0) 
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost. 
		 *	(Not good).  Still consume the FIN's sequence number
		 *	and make sure a close timer is running.
		 */
		 
		buff->free = 1;
		sock_wfree(sk,buff);
		sk->write_seq++;
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}
	
	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	t1->seq = ntohl(sk->write_seq);
	sk->write_seq++;		/* FIN occupies one sequence number */
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 * If there is data in the write queue, the fin must be appended to
	 * the write queue so it goes out in order behind the data.
	 */
	
	if (skb_peek(&sk->write_queue) != NULL) 
	{
		buff->free = 0;
		if (buff->next != NULL) 
		{
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	} 
	else 
	{
		/* Queue empty: transmit now and arm the retransmit timer. */
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2686 
2687 /*
2688  *      Shutdown the sending side of a connection. Much like close except
2689  *      that we don't receive shut down or set sk->dead=1.
2690  */
2691 
2692 void tcp_shutdown(struct sock *sk, int how)
     /* [previous][next][first][last][top][bottom][index][help] */
2693 {
2694         /*
2695          *      We need to grab some memory, and put together a FIN,
2696          *      and then put it into the queue to be sent.
2697          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2698          */
2699 
2700         if (!(how & SEND_SHUTDOWN)) 
2701                 return;
2702          
2703         /*
2704          *      If we've already sent a FIN, or it's a closed state
2705          */
2706          
2707         if (sk->state == TCP_FIN_WAIT1 ||
2708             sk->state == TCP_FIN_WAIT2 ||
2709             sk->state == TCP_CLOSING ||
2710             sk->state == TCP_LAST_ACK ||
2711             sk->state == TCP_TIME_WAIT || 
2712             sk->state == TCP_CLOSE ||
2713             sk->state == TCP_LISTEN
2714           )
2715         {
2716                 return;
2717         }
2718         sk->inuse = 1;
2719 
2720         /*
2721          * flag that the sender has shutdown
2722          */
2723 
2724         sk->shutdown |= SEND_SHUTDOWN;
2725 
2726         /*
2727          *  Clear out any half completed packets. 
2728          */
2729 
2730         if (sk->partial)
2731                 tcp_send_partial(sk);
2732                 
2733         /*
2734          *      FIN if needed
2735          */
2736          
2737         if(tcp_close_state(sk,0))
2738                 tcp_send_fin(sk);
2739                 
2740         release_sock(sk);
2741 }
2742 
2743 /*
2744  *      This routine will send an RST to the other tcp. 
2745  */
2746  
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Builds an un-owned (sk == NULL) RST segment from the offending
 *	header per RFC 793: if the incoming segment had ACK set, the RST
 *	carries its ack_seq as sequence; otherwise we ACK the segment.
 */
static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (Think about it).
	 */
	 
	if(th->rst)
		return;
  
	/*
	 * We need to grab some memory, and put together an RST,
	 * and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL) 
		return;

	buff->sk = NULL;
	buff->dev = dev;
	buff->localroute = 0;

	/*
	 *	Put in the IP header and routing stuff. 
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl);
	if (tmp < 0) 
	{
		buff->free = 1;
		sock_wfree(NULL, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive. 
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;  
	t1->window = 0;
  
	if(th->ack)
	{
		/* They ACKed: RST takes their ack_seq as our sequence. */
		t1->ack = 0;
		t1->seq = th->ack_seq;
		t1->ack_seq = 0;
	}
	else
	{
		/* No ACK: we must ACK their segment (SYN counts for one). */
		t1->ack = 1;
		if(!th->syn)
			t1->ack_seq=htonl(th->seq);
		else
			t1->ack_seq=htonl(th->seq+1);
		t1->seq=0;
	}

	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2825 
2826 
2827 /*
2828  *      Look for tcp options. Parses everything but only knows about MSS.
2829  *      This routine is always called with the packet containing the SYN.
2830  *      However it may also be called with the ack to the SYN.  So you
2831  *      can't assume this is always the SYN.  It's always called after
2832  *      we have set up sk->mtu to our own MTU.
2833  *
2834  *      We need at minimum to add PAWS support here. Possibly large windows
2835  *      as Linux gets deployed on 100Mb/sec networks.
2836  */
2837  
2838 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
2839 {
2840         unsigned char *ptr;
2841         int length=(th->doff*4)-sizeof(struct tcphdr);
2842         int mss_seen = 0;
2843     
2844         ptr = (unsigned char *)(th + 1);
2845   
2846         while(length>0)
2847         {
2848                 int opcode=*ptr++;
2849                 int opsize=*ptr++;
2850                 switch(opcode)
2851                 {
2852                         case TCPOPT_EOL:
2853                                 return;
2854                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
2855                                 length--;
2856                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
2857                                 continue;
2858                         
2859                         default:
2860                                 if(opsize<=2)   /* Avoid silly options looping forever */
2861                                         return;
2862                                 switch(opcode)
2863                                 {
2864                                         case TCPOPT_MSS:
2865                                                 if(opsize==4 && th->syn)
2866                                                 {
2867                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2868                                                         mss_seen = 1;
2869                                                 }
2870                                                 break;
2871                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2872                                 }
2873                                 ptr+=opsize-2;
2874                                 length-=opsize;
2875                 }
2876         }
2877         if (th->syn) 
2878         {
2879                 if (! mss_seen)
2880                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2881         }
2882 #ifdef CONFIG_INET_PCTCP
2883         sk->mss = min(sk->max_window >> 1, sk->mtu);
2884 #else    
2885         sk->mss = min(sk->max_window, sk->mtu);
2886 #endif  
2887 }
2888 
/*
 *	Classful fallback netmask for a destination address given in
 *	network byte order; the mask is returned in network byte order too.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);

	if (IN_CLASSA(host))
		return htonl(IN_CLASSA_NET);
	return IN_CLASSB(host) ? htonl(IN_CLASSB_NET) : htonl(IN_CLASSC_NET);
}
2898 
2899 /*
2900  *      Default sequence number picking algorithm.
2901  *      As close as possible to RFC 793, which
2902  *      suggests using a 250kHz clock.
2903  *      Further reading shows this assumes 2MB/s networks.
2904  *      For 10MB/s ethernet, a 1MHz clock is appropriate.
2905  *      That's funny, Linux has one built in!  Use it!
2906  */
2907 
2908 extern inline u32 tcp_init_seq(void)
     /* [previous][next][first][last][top][bottom][index][help] */
2909 {
2910         struct timeval tv;
2911         do_gettimeofday(&tv);
2912         return tv.tv_usec+tv.tv_sec*1000000;
2913 }
2914 
2915 /*
2916  *      This routine handles a connection request.
2917  *      It should make sure we haven't already responded.
2918  *      Because of the way BSD works, we have to send a syn/ack now.
2919  *      This also means it will be harder to close a socket which is
2920  *      listening.
2921  */
2922  
2923 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
     /* [previous][next][first][last][top][bottom][index][help] */
2924                  unsigned long daddr, unsigned long saddr,
2925                  struct options *opt, struct device *dev, u32 seq)
2926 {
2927         struct sk_buff *buff;
2928         struct tcphdr *t1;
2929         unsigned char *ptr;
2930         struct sock *newsk;
2931         struct tcphdr *th;
2932         struct device *ndev=NULL;
2933         int tmp;
2934         struct rtable *rt;
2935   
2936         th = skb->h.th;
2937 
2938         /* If the socket is dead, don't accept the connection. */
2939         if (!sk->dead) 
2940         {
2941                 sk->data_ready(sk,0);
2942         }
2943         else 
2944         {
2945                 if(sk->debug)
2946                         printk("Reset on %p: Connect on dead socket.\n",sk);
2947                 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2948                 tcp_statistics.TcpAttemptFails++;
2949                 kfree_skb(skb, FREE_READ);
2950                 return;
2951         }
2952 
2953         /*
2954          * Make sure we can accept more.  This will prevent a
2955          * flurry of syns from eating up all our memory.
2956          */
2957 
2958         if (sk->ack_backlog >= sk->max_ack_backlog) 
2959         {
2960                 tcp_statistics.TcpAttemptFails++;
2961                 kfree_skb(skb, FREE_READ);
2962                 return;
2963         }
2964 
2965         /*
2966          * We need to build a new sock struct.
2967          * It is sort of bad to have a socket without an inode attached
2968          * to it, but the wake_up's will just wake up the listening socket,
2969          * and if the listening socket is destroyed before this is taken
2970          * off of the queue, this will take care of it.
2971          */
2972 
2973         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2974         if (newsk == NULL) 
2975         {
2976                 /* just ignore the syn.  It will get retransmitted. */
2977                 tcp_statistics.TcpAttemptFails++;
2978                 kfree_skb(skb, FREE_READ);
2979                 return;
2980         }
2981 
2982         memcpy(newsk, sk, sizeof(*newsk));
2983         newsk->opt = NULL;
2984         if (opt && opt->optlen) {
2985           sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
2986           if (!sk->opt) {
2987                 kfree_s(newsk, sizeof(struct sock));
2988                 tcp_statistics.TcpAttemptFails++;
2989                 kfree_skb(skb, FREE_READ);
2990                 return;
2991           }
2992           if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {
2993                 kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
2994                 kfree_s(newsk, sizeof(struct sock));
2995                 tcp_statistics.TcpAttemptFails++;
2996                 kfree_skb(skb, FREE_READ);
2997                 return;
2998           }
2999         }
3000         skb_queue_head_init(&newsk->write_queue);
3001         skb_queue_head_init(&newsk->receive_queue);
3002         newsk->send_head = NULL;
3003         newsk->send_tail = NULL;
3004         skb_queue_head_init(&newsk->back_log);
3005         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
3006         newsk->rto = TCP_TIMEOUT_INIT;
3007         newsk->mdev = 0;
3008         newsk->max_window = 0;
3009         newsk->cong_window = 1;
3010         newsk->cong_count = 0;
3011         newsk->ssthresh = 0;
3012         newsk->backoff = 0;
3013         newsk->blog = 0;
3014         newsk->intr = 0;
3015         newsk->proc = 0;
3016         newsk->done = 0;
3017         newsk->partial = NULL;
3018         newsk->pair = NULL;
3019         newsk->wmem_alloc = 0;
3020         newsk->rmem_alloc = 0;
3021         newsk->localroute = sk->localroute;
3022 
3023         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3024 
3025         newsk->err = 0;
3026         newsk->shutdown = 0;
3027         newsk->ack_backlog = 0;
3028         newsk->acked_seq = skb->h.th->seq+1;
3029         newsk->copied_seq = skb->h.th->seq+1;
3030         newsk->fin_seq = skb->h.th->seq;
3031         newsk->state = TCP_SYN_RECV;
3032         newsk->timeout = 0;
3033         newsk->ip_xmit_timeout = 0;
3034         newsk->write_seq = seq; 
3035         newsk->window_seq = newsk->write_seq;
3036         newsk->rcv_ack_seq = newsk->write_seq;
3037         newsk->urg_data = 0;
3038         newsk->retransmits = 0;
3039         newsk->linger=0;
3040         newsk->destroy = 0;
3041         init_timer(&newsk->timer);
3042         newsk->timer.data = (unsigned long)newsk;
3043         newsk->timer.function = &net_timer;
3044         init_timer(&newsk->retransmit_timer);
3045         newsk->retransmit_timer.data = (unsigned long)newsk;
3046         newsk->retransmit_timer.function=&retransmit_timer;
3047         newsk->dummy_th.source = skb->h.th->dest;
3048         newsk->dummy_th.dest = skb->h.th->source;
3049         
3050         /*
3051          *      Swap these two, they are from our point of view. 
3052          */
3053          
3054         newsk->daddr = saddr;
3055         newsk->saddr = daddr;
3056         newsk->rcv_saddr = daddr;
3057 
3058         put_sock(newsk->num,newsk);
3059         newsk->dummy_th.res1 = 0;
3060         newsk->dummy_th.doff = 6;
3061         newsk->dummy_th.fin = 0;
3062         newsk->dummy_th.syn = 0;
3063         newsk->dummy_th.rst = 0;        
3064         newsk->dummy_th.psh = 0;
3065         newsk->dummy_th.ack = 0;
3066         newsk->dummy_th.urg = 0;
3067         newsk->dummy_th.res2 = 0;
3068         newsk->acked_seq = skb->h.th->seq + 1;
3069         newsk->copied_seq = skb->h.th->seq + 1;
3070         newsk->socket = NULL;
3071 
3072         /*
3073          *      Grab the ttl and tos values and use them 
3074          */
3075 
3076         newsk->ip_ttl=sk->ip_ttl;
3077         newsk->ip_tos=skb->ip_hdr->tos;
3078 
3079         /*
3080          *      Use 512 or whatever user asked for 
3081          */
3082 
3083         /*
3084          *      Note use of sk->user_mss, since user has no direct access to newsk 
3085          */
3086 
3087         rt=ip_rt_route(saddr, NULL,NULL);
3088         
3089         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3090                 newsk->window_clamp = rt->rt_window;
3091         else
3092                 newsk->window_clamp = 0;
3093                 
3094         if (sk->user_mss)
3095                 newsk->mtu = sk->user_mss;
3096         else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
3097                 newsk->mtu = rt->rt_mss - sizeof(struct iphdr) - sizeof(struct tcphdr);
3098         else 
3099         {
3100 #ifdef CONFIG_INET_SNARL        /* Sub Nets Are Local */
3101                 if ((saddr ^ daddr) & default_mask(saddr))
3102 #else
3103                 if ((saddr ^ daddr) & dev->pa_mask)
3104 #endif
3105                         newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
3106                 else
3107                         newsk->mtu = MAX_WINDOW;
3108         }
3109 
3110         /*
3111          *      But not bigger than device MTU 
3112          */
3113 
3114         newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
3115 
3116         /*
3117          *      This will min with what arrived in the packet 
3118          */
3119 
3120         tcp_options(newsk,skb->h.th);
3121         
3122         tcp_cache_zap();
3123 
3124         buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3125         if (buff == NULL) 
3126         {
3127                 sk->err = ENOMEM;
3128                 newsk->dead = 1;
3129                 newsk->state = TCP_CLOSE;
3130                 /* And this will destroy it */
3131                 release_sock(newsk);
3132                 kfree_skb(skb, FREE_READ);
3133                 tcp_statistics.TcpAttemptFails++;
3134                 return;
3135         }
3136   
3137         buff->sk = newsk;
3138         buff->localroute = newsk->localroute;
3139 
3140         /*
3141          *      Put in the IP header and routing stuff. 
3142          */
3143 
3144         tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3145                                IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
3146 
3147         /*
3148          *      Something went wrong. 
3149          */
3150 
3151         if (tmp < 0) 
3152         {
3153                 sk->err = tmp;
3154                 buff->free = 1;
3155                 kfree_skb(buff,FREE_WRITE);
3156                 newsk->dead = 1;
3157                 newsk->state = TCP_CLOSE;
3158                 release_sock(newsk);
3159                 skb->sk = sk;
3160                 kfree_skb(skb, FREE_READ);
3161                 tcp_statistics.TcpAttemptFails++;
3162                 return;
3163         }
3164 
3165         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
3166   
3167         memcpy(t1, skb->h.th, sizeof(*t1));
3168         buff->h.seq = newsk->write_seq;
3169         /*
3170          *      Swap the send and the receive. 
3171          */
3172         t1->dest = skb->h.th->source;
3173         t1->source = newsk->dummy_th.source;
3174         t1->seq = ntohl(newsk->write_seq++);
3175         t1->ack = 1;
3176         newsk->window = tcp_select_window(newsk);
3177         newsk->sent_seq = newsk->write_seq;
3178         t1->window = ntohs(newsk->window);
3179         t1->res1 = 0;
3180         t1->res2 = 0;
3181         t1->rst = 0;
3182         t1->urg = 0;
3183         t1->psh = 0;
3184         t1->syn = 1;
3185         t1->ack_seq = ntohl(skb->h.th->seq+1);
3186         t1->doff = sizeof(*t1)/4+1;
3187         ptr = skb_put(buff,4);
3188         ptr[0] = 2;
3189         ptr[1] = 4;
3190         ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3191         ptr[3] =(newsk->mtu) & 0xff;
3192 
3193         tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3194         newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3195         reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3196         skb->sk = newsk;
3197 
3198         /*
3199          *      Charge the sock_buff to newsk. 
3200          */
3201          
3202         sk->rmem_alloc -= skb->truesize;
3203         newsk->rmem_alloc += skb->truesize;
3204         
3205         skb_queue_tail(&sk->receive_queue,skb);
3206         sk->ack_backlog++;
3207         release_sock(newsk);
3208         tcp_statistics.TcpOutSegs++;
3209 }
3210 
3211 
3212 static void tcp_close(struct sock *sk, int timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
3213 {
3214         /*
3215          * We need to grab some memory, and put together a FIN, 
3216          * and then put it into the queue to be sent.
3217          */
3218         
3219         sk->inuse = 1;
3220         
3221         if(th_cache_sk==sk)
3222                 tcp_cache_zap();
3223         if(sk->state == TCP_LISTEN)
3224         {
3225                 /* Special case */
3226                 tcp_set_state(sk, TCP_CLOSE);
3227                 tcp_close_pending(sk);
3228                 release_sock(sk);
3229                 return;
3230         }
3231         
3232         sk->keepopen = 1;
3233         sk->shutdown = SHUTDOWN_MASK;
3234 
3235         if (!sk->dead) 
3236                 sk->state_change(sk);
3237 
3238         if (timeout == 0) 
3239         {
3240                 struct sk_buff *skb;
3241                 
3242                 /*
3243                  *  We need to flush the recv. buffs.  We do this only on the
3244                  *  descriptor close, not protocol-sourced closes, because the
3245                  *  reader process may not have drained the data yet!
3246                  */
3247                  
3248                 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3249                         kfree_skb(skb, FREE_READ);
3250                 /*
3251                  *      Get rid off any half-completed packets. 
3252                  */
3253 
3254                 if (sk->partial) 
3255                         tcp_send_partial(sk);
3256         }
3257 
3258                 
3259         /*
3260          *      Timeout is not the same thing - however the code likes
3261          *      to send both the same way (sigh).
3262          */
3263          
3264         if(timeout)
3265         {
3266                 tcp_set_state(sk, TCP_CLOSE);   /* Dead */
3267         }
3268         else
3269         {
3270                 if(tcp_close_state(sk,1)==1)
3271                 {
3272                         tcp_send_fin(sk);
3273                 }
3274         }
3275         release_sock(sk);
3276 }
3277 
3278 
3279 /*
3280  *      This routine takes stuff off of the write queue,
3281  *      and puts it in the xmit queue. This happens as incoming acks
3282  *      open up the remote window for us.
3283  */
3284  
3285 static void tcp_write_xmit(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
3286 {
3287         struct sk_buff *skb;
3288 
3289         /*
3290          *      The bytes will have to remain here. In time closedown will
3291          *      empty the write queue and all will be happy 
3292          */
3293 
3294         if(sk->zapped)
3295                 return;
3296 
3297         /*
3298          *      Anything on the transmit queue that fits the window can
3299          *      be added providing we are not
3300          *
3301          *      a) retransmitting (Nagle's rule)
3302          *      b) exceeding our congestion window.
3303          */
3304          
3305         while((skb = skb_peek(&sk->write_queue)) != NULL &&
3306                 before(skb->h.seq, sk->window_seq + 1) &&
3307                 (sk->retransmits == 0 ||
3308                  sk->ip_xmit_timeout != TIME_WRITE ||
3309                  before(skb->h.seq, sk->rcv_ack_seq + 1))
3310                 && sk->packets_out < sk->cong_window) 
3311         {
3312                 IS_SKB(skb);
3313                 skb_unlink(skb);
3314                 
3315                 /*
3316                  *      See if we really need to send the packet. 
3317                  */
3318                  
3319                 if (before(skb->h.seq, sk->rcv_ack_seq +1)) 
3320                 {
3321                         /*
3322                          *      This is acked data. We can discard it. This 
3323                          *      cannot currently occur.
3324                          */
3325                          
3326                         sk->retransmits = 0;
3327                         kfree_skb(skb, FREE_WRITE);
3328                         if (!sk->dead) 
3329                                 sk->write_space(sk);
3330                 } 
3331                 else
3332                 {
3333                         struct tcphdr *th;
3334                         struct iphdr *iph;
3335                         int size;
3336 /*
3337  * put in the ack seq and window at this point rather than earlier,
3338  * in order to keep them monotonic.  We really want to avoid taking
3339  * back window allocations.  That's legal, but RFC1122 says it's frowned on.
3340  * Ack and window will in general have changed since this packet was put
3341  * on the write queue.
3342  */
3343                         iph = skb->ip_hdr;
3344                         th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
3345                         size = skb->len - (((unsigned char *) th) - skb->data);
3346                         
3347                         th->ack_seq = ntohl(sk->acked_seq);
3348                         th->window = ntohs(tcp_select_window(sk));
3349 
3350                         tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
3351 
3352                         sk->sent_seq = skb->h.seq;
3353                         
3354                         /*
3355                          *      IP manages our queue for some crazy reason
3356                          */
3357                          
3358                         sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3359                         
3360                         /*
3361                          *      Again we slide the timer wrongly
3362                          */
3363                          
3364                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3365                 }
3366         }
3367 }
3368 
3369 
3370 /*
3371  *      This routine deals with incoming acks, but not outgoing ones.
3372  */
3373 
3374 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
     /* [previous][next][first][last][top][bottom][index][help] */
3375 {
3376         u32 ack;
3377         int flag = 0;
3378 
3379         /* 
3380          * 1 - there was data in packet as well as ack or new data is sent or 
3381          *     in shutdown state
3382          * 2 - data from retransmit queue was acked and removed
3383          * 4 - window shrunk or data from retransmit queue was acked and removed
3384          */
3385 
3386         if(sk->zapped)
3387                 return(1);      /* Dead, cant ack any more so why bother */
3388 
3389         /*
3390          *      Have we discovered a larger window
3391          */
3392          
3393         ack = ntohl(th->ack_seq);
3394 
3395         if (ntohs(th->window) > sk->max_window) 
3396         {
3397                 sk->max_window = ntohs(th->window);
3398 #ifdef CONFIG_INET_PCTCP
3399                 /* Hack because we don't send partial packets to non SWS
3400                    handling hosts */
3401                 sk->mss = min(sk->max_window>>1, sk->mtu);
3402 #else
3403                 sk->mss = min(sk->max_window, sk->mtu);
3404 #endif  
3405         }
3406 
3407         /*
3408          *      We have dropped back to keepalive timeouts. Thus we have
3409          *      no retransmits pending.
3410          */
3411          
3412         if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3413                 sk->retransmits = 0;
3414 
3415         /*
3416          *      If the ack is newer than sent or older than previous acks
3417          *      then we can probably ignore it.
3418          */
3419          
3420         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
3421         {
3422                 if(sk->debug)
3423                         printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3424                         
3425                 /*
3426                  *      Keepalive processing.
3427                  */
3428                  
3429                 if (after(ack, sk->sent_seq)) 
3430                 {
3431                         return(0);
3432                 }
3433                 
3434                 /*
3435                  *      Restart the keepalive timer.
3436                  */
3437                  
3438                 if (sk->keepopen) 
3439                 {
3440                         if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3441                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3442                 }
3443                 return(1);
3444         }
3445 
3446         /*
3447          *      If there is data set flag 1
3448          */
3449          
3450         if (len != th->doff*4) 
3451                 flag |= 1;
3452 
3453         /*
3454          *      See if our window has been shrunk. 
3455          */
3456 
3457         if (after(sk->window_seq, ack+ntohs(th->window))) 
3458         {
3459                 /*
3460                  * We may need to move packets from the send queue
3461                  * to the write queue, if the window has been shrunk on us.
3462                  * The RFC says you are not allowed to shrink your window
3463                  * like this, but if the other end does, you must be able
3464                  * to deal with it.
3465                  */
3466                 struct sk_buff *skb;
3467                 struct sk_buff *skb2;
3468                 struct sk_buff *wskb = NULL;
3469         
3470                 skb2 = sk->send_head;
3471                 sk->send_head = NULL;
3472                 sk->send_tail = NULL;
3473         
3474                 /*
3475                  *      This is an artifact of a flawed concept. We want one
3476                  *      queue and a smarter send routine when we send all.
3477                  */
3478         
3479                 flag |= 4;      /* Window changed */
3480         
3481                 sk->window_seq = ack + ntohs(th->window);
3482                 cli();
3483                 while (skb2 != NULL) 
3484                 {
3485                         skb = skb2;
3486                         skb2 = skb->link3;
3487                         skb->link3 = NULL;
3488                         if (after(skb->h.seq, sk->window_seq)) 
3489                         {
3490                                 if (sk->packets_out > 0) 
3491                                         sk->packets_out--;
3492                                 /* We may need to remove this from the dev send list. */
3493                                 if (skb->next != NULL) 
3494                                 {
3495                                         skb_unlink(skb);                                
3496                                 }
3497                                 /* Now add it to the write_queue. */
3498                                 if (wskb == NULL)
3499                                         skb_queue_head(&sk->write_queue,skb);
3500                                 else
3501                                         skb_append(wskb,skb);
3502                                 wskb = skb;
3503                         } 
3504                         else 
3505                         {
3506                                 if (sk->send_head == NULL) 
3507                                 {
3508                                         sk->send_head = skb;
3509                                         sk->send_tail = skb;
3510                                 }
3511                                 else
3512                                 {
3513                                         sk->send_tail->link3 = skb;
3514                                         sk->send_tail = skb;
3515                                 }
3516                                 skb->link3 = NULL;
3517                         }
3518                 }
3519                 sti();
3520         }
3521 
3522         /*
3523          *      Pipe has emptied
3524          */
3525          
3526         if (sk->send_tail == NULL || sk->send_head == NULL) 
3527         {
3528                 sk->send_head = NULL;
3529                 sk->send_tail = NULL;
3530                 sk->packets_out= 0;
3531         }
3532 
3533         /*
3534          *      Update the right hand window edge of the host
3535          */
3536          
3537         sk->window_seq = ack + ntohs(th->window);
3538 
3539         /*
3540          *      We don't want too many packets out there. 
3541          */
3542          
3543         if (sk->ip_xmit_timeout == TIME_WRITE && 
3544                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
3545         {
3546                 /* 
3547                  * This is Jacobson's slow start and congestion avoidance. 
3548                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
3549                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
3550                  * counter and increment it once every cwnd times.  It's possible
3551                  * that this should be done only if sk->retransmits == 0.  I'm
3552                  * interpreting "new data is acked" as including data that has
3553                  * been retransmitted but is just now being acked.
3554                  */
3555                 if (sk->cong_window < sk->ssthresh)  
3556                         /* 
3557                          *      In "safe" area, increase
3558                          */
3559                         sk->cong_window++;
3560                 else 
3561                 {
3562                         /*
3563                          *      In dangerous area, increase slowly.  In theory this is
3564                          *      sk->cong_window += 1 / sk->cong_window
3565                          */
3566                         if (sk->cong_count >= sk->cong_window) 
3567                         {
3568                                 sk->cong_window++;
3569                                 sk->cong_count = 0;
3570                         }
3571                         else 
3572                                 sk->cong_count++;
3573                 }
3574         }
3575 
3576         /*
3577          *      Remember the highest ack received.
3578          */
3579          
3580         sk->rcv_ack_seq = ack;
3581 
3582         /*
3583          *      If this ack opens up a zero window, clear backoff.  It was
3584          *      being used to time the probes, and is probably far higher than
3585          *      it needs to be for normal retransmission.
3586          */
3587 
3588         if (sk->ip_xmit_timeout == TIME_PROBE0) 
3589         {
3590                 sk->retransmits = 0;    /* Our probe was answered */
3591                 
3592                 /*
3593                  *      Was it a usable window open ?
3594                  */
3595                  
3596                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
3597                     ! before (sk->window_seq, sk->write_queue.next->h.seq)) 
3598                 {
3599                         sk->backoff = 0;
3600                         
3601                         /*
3602                          *      Recompute rto from rtt.  this eliminates any backoff.
3603                          */
3604 
3605                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3606                         if (sk->rto > 120*HZ)
3607                                 sk->rto = 120*HZ;
3608                         if (sk->rto < 20)       /* Was 1*HZ, then 1 - turns out we must allow about
3609                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3610                                                    .2 of a second is going to need huge windows (SIGH) */
3611                         sk->rto = 20;
3612                 }
3613         }
3614 
3615         /* 
3616          *      See if we can take anything off of the retransmit queue.
3617          */
3618    
3619         while(sk->send_head != NULL) 
3620         {
3621                 /* Check for a bug. */
3622                 if (sk->send_head->link3 &&
3623                     after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) 
3624                         printk("INET: tcp.c: *** bug send_list out of order.\n");
3625                         
3626                 /*
3627                  *      If our packet is before the ack sequence we can
3628                  *      discard it as it's confirmed to have arrived the other end.
3629                  */
3630                  
3631                 if (before(sk->send_head->h.seq, ack+1)) 
3632                 {
3633                         struct sk_buff *oskb;   
3634                         if (sk->retransmits) 
3635                         {       
3636                                 /*
3637                                  *      We were retransmitting.  don't count this in RTT est 
3638                                  */
3639                                 flag |= 2;
3640 
3641                                 /*
3642                                  * even though we've gotten an ack, we're still
3643                                  * retransmitting as long as we're sending from
3644                                  * the retransmit queue.  Keeping retransmits non-zero
3645                                  * prevents us from getting new data interspersed with
3646                                  * retransmissions.
3647                                  */
3648 
3649                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
3650                                         sk->retransmits = 1;
3651                                 else
3652                                         sk->retransmits = 0;
3653                         }
3654                         /*
3655                          * Note that we only reset backoff and rto in the
3656                          * rtt recomputation code.  And that doesn't happen
3657                          * if there were retransmissions in effect.  So the
3658                          * first new packet after the retransmissions is
3659                          * sent with the backoff still in effect.  Not until
3660                          * we get an ack from a non-retransmitted packet do
3661                          * we reset the backoff and rto.  This allows us to deal
3662                          * with a situation where the network delay has increased
3663                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3664                          */
3665 
3666                         /*
3667                          *      We have one less packet out there. 
3668                          */
3669                          
3670                         if (sk->packets_out > 0) 
3671                                 sk->packets_out --;
3672                         /* 
3673                          *      Wake up the process, it can probably write more. 
3674                          */
3675                         if (!sk->dead) 
3676                                 sk->write_space(sk);
3677                         oskb = sk->send_head;
3678 
3679                         if (!(flag&2))  /* Not retransmitting */
3680                         {
3681                                 long m;
3682         
3683                                 /*
3684                                  *      The following amusing code comes from Jacobson's
3685                                  *      article in SIGCOMM '88.  Note that rtt and mdev
3686                                  *      are scaled versions of rtt and mean deviation.
3687                                  *      This is designed to be as fast as possible 
3688                                  *      m stands for "measurement".
3689                                  */
3690         
3691                                 m = jiffies - oskb->when;  /* RTT */
3692                                 if(m<=0)
3693                                         m=1;            /* IS THIS RIGHT FOR <0 ??? */
3694                                 m -= (sk->rtt >> 3);    /* m is now error in rtt est */
3695                                 sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
3696                                 if (m < 0)
3697                                         m = -m;         /* m is now abs(error) */
3698                                 m -= (sk->mdev >> 2);   /* similar update on mdev */
3699                                 sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
3700         
3701                                 /*
3702                                  *      Now update timeout.  Note that this removes any backoff.
3703                                  */
3704                          
3705                                 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3706                                 if (sk->rto > 120*HZ)
3707                                         sk->rto = 120*HZ;
3708                                 if (sk->rto < 20)       /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3709                                         sk->rto = 20;
3710                                 sk->backoff = 0;
3711                         }
3712                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
3713                                            In this case as we just set it up */
3714                         cli();
3715                         oskb = sk->send_head;
3716                         IS_SKB(oskb);
3717                         sk->send_head = oskb->link3;
3718                         if (sk->send_head == NULL) 
3719                         {
3720                                 sk->send_tail = NULL;
3721                         }
3722 
3723                 /*
3724                  *      We may need to remove this from the dev send list. 
3725                  */
3726 
3727                         if (oskb->next)
3728                                 skb_unlink(oskb);
3729                         sti();
3730                         kfree_skb(oskb, FREE_WRITE); /* write. */
3731                         if (!sk->dead) 
3732                                 sk->write_space(sk);
3733                 }
3734                 else
3735                 {
3736                         break;
3737                 }
3738         }
3739 
3740         /*
3741          * XXX someone ought to look at this too.. at the moment, if skb_peek()
3742          * returns non-NULL, we complete ignore the timer stuff in the else
3743          * clause.  We ought to organize the code so that else clause can
3744          * (should) be executed regardless, possibly moving the PROBE timer
3745          * reset over.  The skb_peek() thing should only move stuff to the
3746          * write queue, NOT also manage the timer functions.
3747          */
3748 
3749         /*
3750          * Maybe we can take some stuff off of the write queue,
3751          * and put it onto the xmit queue.
3752          */
3753         if (skb_peek(&sk->write_queue) != NULL) 
3754         {
3755                 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3756                         (sk->retransmits == 0 || 
3757                          sk->ip_xmit_timeout != TIME_WRITE ||
3758                          before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3759                         && sk->packets_out < sk->cong_window) 
3760                 {
3761                         /*
3762                          *      Add more data to the send queue.
3763                          */
3764                         flag |= 1;
3765                         tcp_write_xmit(sk);
3766                 }
3767                 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3768                         sk->send_head == NULL &&
3769                         sk->ack_backlog == 0 &&
3770                         sk->state != TCP_TIME_WAIT) 
3771                 {
3772                         /*
3773                          *      Data to queue but no room.
3774                          */
3775                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3776                 }               
3777         }
3778         else
3779         {
3780                 /*
3781                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3782                  * from TCP_CLOSE we don't do anything
3783                  *
3784                  * from anything else, if there is write data (or fin) pending,
3785                  * we use a TIME_WRITE timeout, else if keepalive we reset to
3786                  * a KEEPALIVE timeout, else we delete the timer.
3787                  *
3788                  * We do not set flag for nominal write data, otherwise we may
3789                  * force a state where we start to write itsy bitsy tidbits
3790                  * of data.
3791                  */
3792 
3793                 switch(sk->state) {
3794                 case TCP_TIME_WAIT:
3795                         /*
3796                          * keep us in TIME_WAIT until we stop getting packets,
3797                          * reset the timeout.
3798                          */
3799                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3800                         break;
3801                 case TCP_CLOSE:
3802                         /*
3803                          * don't touch the timer.
3804                          */
3805                         break;
3806                 default:
3807                         /*
3808                          *      Must check send_head, write_queue, and ack_backlog
3809                          *      to determine which timeout to use.
3810                          */
3811                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3812                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3813                         } else if (sk->keepopen) {
3814                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3815                         } else {
3816                                 del_timer(&sk->retransmit_timer);
3817                                 sk->ip_xmit_timeout = 0;
3818                         }
3819                         break;
3820                 }
3821         }
3822 
3823         /*
3824          *      We have nothing queued but space to send. Send any partial
3825          *      packets immediately (end of Nagle rule application).
3826          */
3827          
3828         if (sk->packets_out == 0 && sk->partial != NULL &&
3829                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
3830         {
3831                 flag |= 1;
3832                 tcp_send_partial(sk);
3833         }
3834 
3835         /*
3836          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
3837          * we are now waiting for an acknowledge to our FIN.  The other end is
3838          * already in TIME_WAIT.
3839          *
3840          * Move to TCP_CLOSE on success.
3841          */
3842 
3843         if (sk->state == TCP_LAST_ACK) 
3844         {
3845                 if (!sk->dead)
3846                         sk->state_change(sk);
3847                 if(sk->debug)
3848                         printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3849                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3850                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
3851                 {
3852                         flag |= 1;
3853                         tcp_set_state(sk,TCP_CLOSE);
3854                         sk->shutdown = SHUTDOWN_MASK;
3855                 }
3856         }
3857 
3858         /*
3859          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
3860          *
3861          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
3862          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3863          */
3864 
3865         if (sk->state == TCP_FIN_WAIT1) 
3866         {
3867 
3868                 if (!sk->dead) 
3869                         sk->state_change(sk);
3870                 if (sk->rcv_ack_seq == sk->write_seq) 
3871                 {
3872                         flag |= 1;
3873                         sk->shutdown |= SEND_SHUTDOWN;
3874                         tcp_set_state(sk, TCP_FIN_WAIT2);
3875                 }
3876         }
3877 
3878         /*
3879          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
3880          *
3881          *      Move to TIME_WAIT
3882          */
3883 
3884         if (sk->state == TCP_CLOSING) 
3885         {
3886 
3887                 if (!sk->dead) 
3888                         sk->state_change(sk);
3889                 if (sk->rcv_ack_seq == sk->write_seq) 
3890                 {
3891                         flag |= 1;
3892                         tcp_time_wait(sk);
3893                 }
3894         }
3895         
3896         /*
3897          *      Final ack of a three way shake 
3898          */
3899          
3900         if(sk->state==TCP_SYN_RECV)
3901         {
3902                 tcp_set_state(sk, TCP_ESTABLISHED);
3903                 tcp_options(sk,th);
3904                 sk->dummy_th.dest=th->source;
3905                 sk->copied_seq = sk->acked_seq;
3906                 if(!sk->dead)
3907                         sk->state_change(sk);
3908                 if(sk->max_window==0)
3909                 {
3910                         sk->max_window=32;      /* Sanity check */
3911                         sk->mss=min(sk->max_window,sk->mtu);
3912                 }
3913         }
3914         
3915         /*
3916          * I make no guarantees about the first clause in the following
3917          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
3918          * what conditions "!flag" would be true.  However I think the rest
3919          * of the conditions would prevent that from causing any
3920          * unnecessary retransmission. 
3921          *   Clearly if the first packet has expired it should be 
3922          * retransmitted.  The other alternative, "flag&2 && retransmits", is
3923          * harder to explain:  You have to look carefully at how and when the
3924          * timer is set and with what timeout.  The most recent transmission always
3925          * sets the timer.  So in general if the most recent thing has timed
3926          * out, everything before it has as well.  So we want to go ahead and
3927          * retransmit some more.  If we didn't explicitly test for this
3928          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3929          * would not be true.  If you look at the pattern of timing, you can
3930          * show that rto is increased fast enough that the next packet would
3931          * almost never be retransmitted immediately.  Then you'd end up
3932          * waiting for a timeout to send each packet on the retransmission
3933          * queue.  With my implementation of the Karn sampling algorithm,
3934          * the timeout would double each time.  The net result is that it would
3935          * take a hideous amount of time to recover from a single dropped packet.
3936          * It's possible that there should also be a test for TIME_WRITE, but
3937          * I think as long as "send_head != NULL" and "retransmit" is on, we've
3938          * got to be in real retransmission mode.
3939          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
3940          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3941          * As long as no further losses occur, this seems reasonable.
3942          */
3943         
3944         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3945                (((flag&2) && sk->retransmits) ||
3946                (sk->send_head->when + sk->rto < jiffies))) 
3947         {
3948                 if(sk->send_head->when + sk->rto < jiffies)
3949                         tcp_retransmit(sk,0);   
3950                 else
3951                 {
3952                         tcp_do_retransmit(sk, 1);
3953                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3954                 }
3955         }
3956 
3957         return(1);
3958 }
3959 
3960 
3961 /*
3962  *      Process the FIN bit. This now behaves as it is supposed to work
3963  *      and the FIN takes effect when it is validly part of sequence
3964  *      space. Not before when we get holes.
3965  *
3966  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3967  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
3968  *      TIME-WAIT)
3969  *
3970  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
3971  *      close and we go into CLOSING (and later onto TIME-WAIT)
3972  *
3973  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3974  *
3975  */
3976  
3977 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
3978 {
3979         sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3980 
3981         if (!sk->dead) 
3982         {
3983                 sk->state_change(sk);
3984                 sock_wake_async(sk->socket, 1);
3985         }
3986 
3987         switch(sk->state) 
3988         {
3989                 case TCP_SYN_RECV:
3990                 case TCP_SYN_SENT:
3991                 case TCP_ESTABLISHED:
3992                         /*
3993                          * move to CLOSE_WAIT, tcp_data() already handled
3994                          * sending the ack.
3995                          */
3996                         tcp_set_state(sk,TCP_CLOSE_WAIT);
3997                         if (th->rst)
3998                                 sk->shutdown = SHUTDOWN_MASK;
3999                         break;
4000 
4001                 case TCP_CLOSE_WAIT:
4002                 case TCP_CLOSING:
4003                         /*
4004                          * received a retransmission of the FIN, do
4005                          * nothing.
4006                          */
4007                         break;
4008                 case TCP_TIME_WAIT:
4009                         /*
4010                          * received a retransmission of the FIN,
4011                          * restart the TIME_WAIT timer.
4012                          */
4013                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4014                         return(0);
4015                 case TCP_FIN_WAIT1:
4016                         /*
4017                          * This case occurs when a simultaneous close
4018                          * happens, we must ack the received FIN and
4019                          * enter the CLOSING state.
4020                          *
4021                          * This causes a WRITE timeout, which will either
4022                          * move on to TIME_WAIT when we timeout, or resend
4023                          * the FIN properly (maybe we get rid of that annoying
4024                          * FIN lost hang). The TIME_WRITE code is already correct
4025                          * for handling this timeout.
4026                          */
4027 
4028                         if(sk->ip_xmit_timeout != TIME_WRITE)
4029                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4030                         tcp_set_state(sk,TCP_CLOSING);
4031                         break;
4032                 case TCP_FIN_WAIT2:
4033                         /*
4034                          * received a FIN -- send ACK and enter TIME_WAIT
4035                          */
4036                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4037                         sk->shutdown|=SHUTDOWN_MASK;
4038                         tcp_set_state(sk,TCP_TIME_WAIT);
4039                         break;
4040                 case TCP_CLOSE:
4041                         /*
4042                          * already in CLOSE
4043                          */
4044                         break;
4045                 default:
4046                         tcp_set_state(sk,TCP_LAST_ACK);
4047         
4048                         /* Start the timers. */
4049                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4050                         return(0);
4051         }
4052 
4053         return(0);
4054 }
4055 
4056 
4057 
4058 /*
4059  *      This routine handles the data.  If there is room in the buffer,
4060  *      it will be have already been moved into it.  If there is no
4061  *      room, then we will just have to discard the packet.
4062  */
4063 
4064 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /* [previous][next][first][last][top][bottom][index][help] */
4065          unsigned long saddr, unsigned short len)
4066 {
4067         struct sk_buff *skb1, *skb2;
4068         struct tcphdr *th;
4069         int dup_dumped=0;
4070         u32 new_seq, shut_seq;
4071 
4072         th = skb->h.th;
4073         skb_pull(skb,th->doff*4);
4074         skb_trim(skb,len-(th->doff*4));
4075 
4076         /*
4077          *      The bytes in the receive read/assembly queue has increased. Needed for the
4078          *      low memory discard algorithm 
4079          */
4080            
4081         sk->bytes_rcv += skb->len;
4082         
4083         if (skb->len == 0 && !th->fin) 
4084         {
4085                 /* 
4086                  *      Don't want to keep passing ack's back and forth. 
4087                  *      (someone sent us dataless, boring frame)
4088                  */
4089                 if (!th->ack)
4090                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4091                 kfree_skb(skb, FREE_READ);
4092                 return(0);
4093         }
4094         
4095         /*
4096          *      We no longer have anyone receiving data on this connection.
4097          */
4098 
4099 #ifndef TCP_DONT_RST_SHUTDOWN            
4100 
4101         if(sk->shutdown & RCV_SHUTDOWN)
4102         {
4103                 /*
4104                  *      FIXME: BSD has some magic to avoid sending resets to
4105                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
4106                  *      BSD stacks still have broken keepalives so we want to
4107                  *      cope with it.
4108                  */
4109 
4110                 if(skb->len)    /* We don't care if it's just an ack or
4111                                    a keepalive/window probe */
4112                 {
4113                         new_seq= th->seq + skb->len + th->syn;  /* Right edge of _data_ part of frame */
4114                         
4115                         /* Do this the way 4.4BSD treats it. Not what I'd
4116                            regard as the meaning of the spec but it's what BSD
4117                            does and clearly they know everything 8) */
4118 
4119                         /*
4120                          *      This is valid because of two things
4121                          *
4122                          *      a) The way tcp_data behaves at the bottom.
4123                          *      b) A fin takes effect when read not when received.
4124                          */
4125                          
4126                         shut_seq=sk->acked_seq+1;       /* Last byte */
4127                         
4128                         if(after(new_seq,shut_seq))
4129                         {
4130                                 if(sk->debug)
4131                                         printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4132                                                 sk, new_seq, shut_seq, sk->blog);
4133                                 if(sk->dead)
4134                                 {
4135                                         sk->acked_seq = new_seq + th->fin;
4136                                         tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4137                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4138                                         tcp_statistics.TcpEstabResets++;
4139                                         tcp_set_state(sk,TCP_CLOSE);
4140                                         sk->err = EPIPE;
4141                                         sk->shutdown = SHUTDOWN_MASK;
4142                                         kfree_skb(skb, FREE_READ);
4143                                         return 0;
4144                                 }
4145                         }
4146                 }
4147         }
4148 
4149 #endif
4150 
4151         /*
4152          *      Now we have to walk the chain, and figure out where this one
4153          *      goes into it.  This is set up so that the last packet we received
4154          *      will be the first one we look at, that way if everything comes
4155          *      in order, there will be no performance loss, and if they come
4156          *      out of order we will be able to fit things in nicely.
4157          *
4158          *      [AC: This is wrong. We should assume in order first and then walk
4159          *       forwards from the first hole based upon real traffic patterns.]
4160          *      
4161          */
4162 
4163         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
4164         {
4165                 skb_queue_head(&sk->receive_queue,skb);
4166                 skb1= NULL;
4167         } 
4168         else
4169         {
4170                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
4171                 {
4172                         if(sk->debug)
4173                         {
4174                                 printk("skb1=%p :", skb1);
4175                                 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
4176                                 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
4177                                 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4178                                                 sk->acked_seq);
4179                         }
4180                         
4181                         /*
4182                          *      Optimisation: Duplicate frame or extension of previous frame from
4183                          *      same sequence point (lost ack case).
4184                          *      The frame contains duplicate data or replaces a previous frame
4185                          *      discard the previous frame (safe as sk->inuse is set) and put
4186                          *      the new one in its place.
4187                          */
4188                          
4189                         if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
4190                         {
4191                                 skb_append(skb1,skb);
4192                                 skb_unlink(skb1);
4193                                 kfree_skb(skb1,FREE_READ);
4194                                 dup_dumped=1;
4195                                 skb1=NULL;
4196                                 break;
4197                         }
4198                         
4199                         /*
4200                          *      Found where it fits
4201                          */
4202                          
4203                         if (after(th->seq+1, skb1->h.th->seq))
4204                         {
4205                                 skb_append(skb1,skb);
4206                                 break;
4207                         }
4208                         
4209                         /*
4210                          *      See if we've hit the start. If so insert.
4211                          */
4212                         if (skb1 == skb_peek(&sk->receive_queue))
4213                         {
4214                                 skb_queue_head(&sk->receive_queue, skb);
4215                                 break;
4216                         }
4217                 }
4218         }
4219 
4220         /*
4221          *      Figure out what the ack value for this frame is
4222          */
4223          
4224         th->ack_seq = th->seq + skb->len;
4225         if (th->syn) 
4226                 th->ack_seq++;
4227         if (th->fin)
4228                 th->ack_seq++;
4229 
4230         if (before(sk->acked_seq, sk->copied_seq)) 
4231         {
4232                 printk("*** tcp.c:tcp_data bug acked < copied\n");
4233                 sk->acked_seq = sk->copied_seq;
4234         }
4235 
4236         /*
4237          *      Now figure out if we can ack anything. This is very messy because we really want two
4238          *      receive queues, a completed and an assembly queue. We also want only one transmit
4239          *      queue.
4240          */
4241 
4242         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) 
4243         {
4244                 if (before(th->seq, sk->acked_seq+1)) 
4245                 {
4246                         int newwindow;
4247 
4248                         if (after(th->ack_seq, sk->acked_seq)) 
4249                         {
4250                                 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
4251                                 if (newwindow < 0)
4252                                         newwindow = 0;  
4253                                 sk->window = newwindow;
4254                                 sk->acked_seq = th->ack_seq;
4255                         }
4256                         skb->acked = 1;
4257 
4258                         /*
4259                          *      When we ack the fin, we do the FIN 
4260                          *      processing.
4261                          */
4262 
4263                         if (skb->h.th->fin) 
4264                         {
4265                                 tcp_fin(skb,sk,skb->h.th);
4266                         }
4267           
4268                         for(skb2 = skb->next;
4269                             skb2 != (struct sk_buff *)&sk->receive_queue;
4270                             skb2 = skb2->next) 
4271                         {
4272                                 if (before(skb2->h.th->seq, sk->acked_seq+1)) 
4273                                 {
4274                                         if (after(skb2->h.th->ack_seq, sk->acked_seq))
4275                                         {
4276                                                 newwindow = sk->window -
4277                                                  (skb2->h.th->ack_seq - sk->acked_seq);
4278                                                 if (newwindow < 0)
4279                                                         newwindow = 0;  
4280                                                 sk->window = newwindow;
4281                                                 sk->acked_seq = skb2->h.th->ack_seq;
4282                                         }
4283                                         skb2->acked = 1;
4284                                         /*
4285                                          *      When we ack the fin, we do
4286                                          *      the fin handling.
4287                                          */
4288                                         if (skb2->h.th->fin) 
4289                                         {
4290                                                 tcp_fin(skb,sk,skb->h.th);
4291                                         }
4292 
4293                                         /*
4294                                          *      Force an immediate ack.
4295                                          */
4296                                          
4297                                         sk->ack_backlog = sk->max_ack_backlog;
4298                                 }
4299                                 else
4300                                 {
4301                                         break;
4302                                 }
4303                         }
4304 
4305                         /*
4306                          *      This also takes care of updating the window.
4307                          *      This if statement needs to be simplified.
4308                          */
4309                         if (!sk->delay_acks ||
4310                             sk->ack_backlog >= sk->max_ack_backlog || 
4311                             sk->bytes_rcv > sk->max_unacked || th->fin) {
4312         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4313                         }
4314                         else 
4315                         {
4316                                 sk->ack_backlog++;
4317                                 if(sk->debug)
4318                                         printk("Ack queued.\n");
4319                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4320                         }
4321                 }
4322         }
4323 
4324         /*
4325          *      If we've missed a packet, send an ack.
4326          *      Also start a timer to send another.
4327          */
4328          
4329         if (!skb->acked) 
4330         {
4331         
4332         /*
4333          *      This is important.  If we don't have much room left,
4334          *      we need to throw out a few packets so we have a good
4335          *      window.  Note that mtu is used, not mss, because mss is really
4336          *      for the send side.  He could be sending us stuff as large as mtu.
4337          */
4338                  
4339                 while (sock_rspace(sk) < sk->mtu) 
4340                 {
4341                         skb1 = skb_peek(&sk->receive_queue);
4342                         if (skb1 == NULL) 
4343                         {
4344                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4345                                 break;
4346                         }
4347 
4348                         /*
4349                          *      Don't throw out something that has been acked. 
4350                          */
4351                  
4352                         if (skb1->acked) 
4353                         {
4354                                 break;
4355                         }
4356                 
4357                         skb_unlink(skb1);
4358                         kfree_skb(skb1, FREE_READ);
4359                 }
4360                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4361                 sk->ack_backlog++;
4362                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4363         }
4364         else
4365         {
4366                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4367         }
4368 
4369         /*
4370          *      Now tell the user we may have some data. 
4371          */
4372          
4373         if (!sk->dead) 
4374         {
4375                 if(sk->debug)
4376                         printk("Data wakeup.\n");
4377                 sk->data_ready(sk,0);
4378         } 
4379         return(0);
4380 }
4381 
4382 
4383 /*
4384  *      This routine is only called when we have urgent data
4385  *      signalled. Its the 'slow' part of tcp_urg. It could be
4386  *      moved inline now as tcp_urg is only called from one
4387  *      place. We handle URGent data wrong. We have to - as
4388  *      BSD still doesn't use the correction from RFC961.
4389  */
4390  
4391 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* [previous][next][first][last][top][bottom][index][help] */
4392 {
4393         u32 ptr = ntohs(th->urg_ptr);
4394 
4395         if (ptr)
4396                 ptr--;
4397         ptr += th->seq;
4398 
4399         /* ignore urgent data that we've already seen and read */
4400         if (after(sk->copied_seq, ptr))
4401                 return;
4402 
4403         /* do we already have a newer (or duplicate) urgent pointer? */
4404         if (sk->urg_data && !after(ptr, sk->urg_seq))
4405                 return;
4406 
4407         /* tell the world about our new urgent pointer */
4408         if (sk->proc != 0) {
4409                 if (sk->proc > 0) {
4410                         kill_proc(sk->proc, SIGURG, 1);
4411                 } else {
4412                         kill_pg(-sk->proc, SIGURG, 1);
4413                 }
4414         }
4415         sk->urg_data = URG_NOTYET;
4416         sk->urg_seq = ptr;
4417 }
4418 
4419 /*
4420  *      This is the 'fast' part of urgent handling.
4421  */
4422  
4423 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
     /* [previous][next][first][last][top][bottom][index][help] */
4424         unsigned long saddr, unsigned long len)
4425 {
4426         u32 ptr;
4427 
4428         /*
4429          *      Check if we get a new urgent pointer - normally not 
4430          */
4431          
4432         if (th->urg)
4433                 tcp_check_urg(sk,th);
4434 
4435         /*
4436          *      Do we wait for any urgent data? - normally not
4437          */
4438          
4439         if (sk->urg_data != URG_NOTYET)
4440                 return 0;
4441 
4442         /*
4443          *      Is the urgent pointer pointing into this packet? 
4444          */
4445          
4446         ptr = sk->urg_seq - th->seq + th->doff*4;
4447         if (ptr >= len)
4448                 return 0;
4449 
4450         /*
4451          *      Ok, got the correct packet, update info 
4452          */
4453          
4454         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4455         if (!sk->dead)
4456                 sk->data_ready(sk,0);
4457         return 0;
4458 }
4459 
4460 /*
4461  *      This will accept the next outstanding connection. 
4462  */
4463  
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;
  
	/*
	 *	Accept the next completed connection on a listening socket.
	 *	Blocks until one is available, unless O_NONBLOCK is set in
	 *	'flags'.  Returns the new connection's sock, or NULL with
	 *	sk->err set to EINVAL, EAGAIN or ERESTARTSYS.
	 *
	 *	We need to make sure that this socket is listening,
	 *	and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN) 
	{
		sk->err = EINVAL;
		return(NULL); 
	}

	/* Avoid the race: interrupts off while we claim the socket. */
	cli();
	sk->inuse = 1;

	while((skb = tcp_dequeue_established(sk)) == NULL) 
	{
		/* Nothing established yet: fail immediately if non-blocking. */
		if (flags & O_NONBLOCK) 
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Let the backlog be processed, then sleep until woken. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		/* Woken by a signal rather than a new connection? */
		if (current->signal & ~current->blocked) 
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		/* Re-claim the socket before checking the queue again. */
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk. 
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	/* One queued connection has been consumed. */
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4517 
4518 
4519 /*
4520  *      This will initiate an outgoing connection. 
4521  */
4522  
4523 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
     /* [previous][next][first][last][top][bottom][index][help] */
4524 {
4525         struct sk_buff *buff;
4526         struct device *dev=NULL;
4527         unsigned char *ptr;
4528         int tmp;
4529         int atype;
4530         struct tcphdr *t1;
4531         struct rtable *rt;
4532 
             /* Only a CLOSED socket may start an active open. */
4533         if (sk->state != TCP_CLOSE) 
4534         {
4535                 return(-EISCONN);
4536         }
4537         
             /* Caller must supply at least family+port+address (8 bytes). */
4538         if (addr_len < 8) 
4539                 return(-EINVAL);
4540 
4541         if (usin->sin_family && usin->sin_family != AF_INET) 
4542                 return(-EAFNOSUPPORT);
4543 
4544         /*
4545          *      connect() to INADDR_ANY means loopback (BSD'ism).
4546          */
4547         
4548         if(usin->sin_addr.s_addr==INADDR_ANY)
4549                 usin->sin_addr.s_addr=ip_my_addr();
4550                   
4551         /*
4552          *      Don't want a TCP connection going to a broadcast address 
4553          */
4554 
4555         if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
4556                 return -ENETUNREACH;
4557   
             /* Initialise connection identity and sequence space. */
4558         sk->inuse = 1;
4559         sk->daddr = usin->sin_addr.s_addr;
4560         sk->write_seq = tcp_init_seq();
4561         sk->window_seq = sk->write_seq;
4562         sk->rcv_ack_seq = sk->write_seq -1;
4563         sk->err = 0;
4564         sk->dummy_th.dest = usin->sin_port;
             /* Release so sock_wmalloc may sleep (GFP_KERNEL below). */
4565         release_sock(sk);
4566 
4567         buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4568         if (buff == NULL) 
4569         {
4570                 return(-ENOMEM);
4571         }
4572         sk->inuse = 1;
4573         buff->sk = sk;
4574         buff->free = 0;
4575         buff->localroute = sk->localroute;
4576         
4577 
4578         /*
4579          *      Put in the IP header and routing stuff.
4580          */
4581          
             /* Route lookup also fills in our source address if unbound. */
4582         if (sk->localroute)
4583           rt=ip_rt_local(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);
4584         else
4585           rt=ip_rt_route(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);
4586 
4587         /*
4588          *      When we connect we enforce receive requirements too.
4589          */
4590          
4591         sk->rcv_saddr=sk->saddr;
4592         
4593         /*
4594          *      We need to build the routing stuff from the things saved in skb. 
4595          */
4596 
4597         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4598                                         IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
4599         if (tmp < 0) 
4600         {
4601                 sock_wfree(sk, buff);
4602                 release_sock(sk);
4603                 return(-ENETUNREACH);
4604         }
4605 
             /* Build the SYN segment from the socket's template header. */
4606         t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
4607 
4608         memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4609         t1->seq = ntohl(sk->write_seq++);
4610         sk->sent_seq = sk->write_seq;
4611         buff->h.seq = sk->write_seq;
4612         t1->ack = 0;
             /* NOTE(review): window is stored without htons() here — the
                on-wire value depends on host byte order; confirm intended. */
4613         t1->window = 2;
4614         t1->res1=0;
4615         t1->res2=0;
4616         t1->rst = 0;
4617         t1->urg = 0;
4618         t1->psh = 0;
4619         t1->syn = 1;
4620         t1->urg_ptr = 0;
             /* doff = 6: 20 byte header plus 4 bytes of MSS option. */
4621         t1->doff = 6;
4622         /* use 512 or whatever user asked for */
4623         
4624         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4625                 sk->window_clamp=rt->rt_window;
4626         else
4627                 sk->window_clamp=0;
4628 
             /* MSS selection: user setting, then route metric, then the
                576-byte default for off-net peers, else MAX_WINDOW. */
4629         if (sk->user_mss)
4630                 sk->mtu = sk->user_mss;
4631         else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
4632                 sk->mtu = rt->rt_mss;
4633         else 
4634         {
4635 #ifdef CONFIG_INET_SNARL
4636                 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
4637 #else
4638                 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
4639 #endif
4640                         sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
4641                 else
4642                         sk->mtu = MAX_WINDOW;
4643         }
4644         /*
4645          *      but not bigger than device MTU 
4646          */
4647 
4648         if(sk->mtu <32)
4649                 sk->mtu = 32;   /* Sanity limit */
4650                 
             /* NOTE(review): dev is assumed non-NULL after a successful
                build_header() call above — confirm against ip_build_header. */
4651         sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
4652         
4653         /*
4654          *      Put in the TCP options to say MTU. 
4655          */
4656 
             /* Option kind 2 (MSS), length 4, value in network byte order. */
4657         ptr = skb_put(buff,4);
4658         ptr[0] = 2;
4659         ptr[1] = 4;
4660         ptr[2] = (sk->mtu) >> 8;
4661         ptr[3] = (sk->mtu) & 0xff;
4662         tcp_send_check(t1, sk->saddr, sk->daddr,
4663                   sizeof(struct tcphdr) + 4, sk);
4664 
4665         /*
4666          *      This must go first otherwise a really quick response will get reset. 
4667          */
4668 
4669         tcp_cache_zap();
4670         tcp_set_state(sk,TCP_SYN_SENT);
             /* Seed the retransmit timeout from the route's initial RTT
                when available, otherwise the compiled-in default. */
4671         if(rt&&rt->rt_flags&RTF_IRTT)
4672                 sk->rto = rt->rt_irtt;
4673         else
4674                 sk->rto = TCP_TIMEOUT_INIT;
4675         sk->retransmit_timer.function=&retransmit_timer;
4676         sk->retransmit_timer.data = (unsigned long)sk;
4677         reset_xmit_timer(sk, TIME_WRITE, sk->rto);      /* Timer for repeating the SYN until an answer */
4678         sk->retransmits = 0;    /* Now works the right way instead of a hacked initial setting */
4679 
4680         sk->prot->queue_xmit(sk, dev, buff, 0);  
4681         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4682         tcp_statistics.TcpActiveOpens++;
4683         tcp_statistics.TcpOutSegs++;
4684   
4685         release_sock(sk);
4686         return(0);
4687 }
4688 
4689 
4690 /* This functions checks to see if the tcp header is actually acceptable. */
4691 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /* [previous][next][first][last][top][bottom][index][help] */
4692              struct options *opt, unsigned long saddr, struct device *dev)
4693 {
4694         u32 next_seq;
4695 
4696         next_seq = len - 4*th->doff;
4697         if (th->fin)
4698                 next_seq++;
4699         /* if we have a zero window, we can't have any data in the packet.. */
4700         if (next_seq && !sk->window)
4701                 goto ignore_it;
4702         next_seq += th->seq;
4703 
4704         /*
4705          * This isn't quite right.  sk->acked_seq could be more recent
4706          * than sk->window.  This is however close enough.  We will accept
4707          * slightly more packets than we should, but it should not cause
4708          * problems unless someone is trying to forge packets.
4709          */
4710 
4711         /* have we already seen all of this packet? */
4712         if (!after(next_seq+1, sk->acked_seq))
4713                 goto ignore_it;
4714         /* or does it start beyond the window? */
4715         if (!before(th->seq, sk->acked_seq + sk->window + 1))
4716                 goto ignore_it;
4717 
4718         /* ok, at least part of this packet would seem interesting.. */
4719         return 1;
4720 
4721 ignore_it:
4722         if (th->rst)
4723                 return 0;
4724 
4725         /*
4726          *      Send a reset if we get something not ours and we are
4727          *      unsynchronized. Note: We don't do anything to our end. We
4728          *      are just killing the bogus remote connection then we will
4729          *      connect again and it will work (with luck).
4730          */
4731          
4732         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
4733         {
4734                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4735                 return 1;
4736         }
4737 
4738         /* Try to resync things. */
4739         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4740         return 0;
4741 }
4742 
4743 /*
4744  *      When we get a reset we do this.
4745  */
4746 
/*
 *      Standard handling for an incoming RST: record an errno appropriate
 *      to our current state, move to CLOSE (subject to RFC1337 protection
 *      of TIME_WAIT), wake the owner and discard the segment.
 */
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
        sk->zapped = 1;

        /* Map the state we were in to the error the user will see. */
        switch (sk->state) {
        case TCP_SYN_SENT:
                sk->err = ECONNREFUSED;
                break;
        case TCP_CLOSE_WAIT:
                sk->err = EPIPE;
                break;
        default:
                sk->err = ECONNRESET;
                break;
        }
#ifdef TCP_DO_RFC1337           
        /*
         *      Time wait assassination protection [RFC1337]
         */
        if (sk->state != TCP_TIME_WAIT)
        {
                tcp_set_state(sk, TCP_CLOSE);
                sk->shutdown = SHUTDOWN_MASK;
        }
#else   
        tcp_set_state(sk, TCP_CLOSE);
        sk->shutdown = SHUTDOWN_MASK;
#endif  
        if (!sk->dead)
                sk->state_change(sk);
        kfree_skb(skb, FREE_READ);
        release_sock(sk);
        return(0);
}
4774 
4775 /*
4776  *      A TCP packet has arrived.
4777  *              skb->h.raw is the TCP header.
4778  */
4779  
4780 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /* [previous][next][first][last][top][bottom][index][help] */
4781         __u32 daddr, unsigned short len,
4782         __u32 saddr, int redo, struct inet_protocol * protocol)
4783 {
4784         struct tcphdr *th;
4785         struct sock *sk;
4786         int syn_ok=0;
4787         
4788         tcp_statistics.TcpInSegs++;
             /* Only frames addressed to this host are processed. */
4789         if(skb->pkt_type!=PACKET_HOST)
4790         {
4791                 kfree_skb(skb,FREE_READ);
4792                 return(0);
4793         }
4794   
4795         th = skb->h.th;
4796 
4797         /*
4798          *      Find the socket, using the last hit cache if applicable.
4799          */
4800 
4801         if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
4802         {
4803                 sk=(struct sock *)th_cache_sk;
4804                 /*
4805                  *      Sanity check: verify the demux cache still agrees with a full lookup
4806                  */
4807                  if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
4808                         printk("Cache mismatch on TCP.\n");
4809         }
4810         else
4811         {
4812                 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4813                 th_cache_saddr=saddr;
4814                 th_cache_daddr=daddr;
4815                 th_cache_dport=th->dest;
4816                 th_cache_sport=th->source;
4817                 th_cache_sk=sk;
4818         }               
4819 
4820         /*
4821          *      If this socket has got a reset it's to all intents and purposes 
4822          *      really dead. Count closed sockets as dead.
4823          *
4824          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4825          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
4826          *      exist so should cause resets as if the port was unreachable.
4827          */
4828          
4829         if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4830                 sk=NULL;
4831 
             /* First pass (not replayed from the backlog): verify checksum,
                convert the sequence number and take the socket lock. */
4832         if (!redo) 
4833         {
4834                 /*
4835                  *      Pull up the IP header.
4836                  */
4837                 skb_pull(skb, skb->h.raw-skb->data);
4838                 /*
4839                  *      Try to use the device checksum if provided.
4840                  */
4841                 if (
4842                         (skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
4843                         (!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
4844                     )
4845                 {
4846                         skb->sk = NULL;
4847                         kfree_skb(skb,FREE_READ);
4848                         /*
4849                          *      We don't release the socket because it was
4850                          *      never marked in use.
4851                          */
4852                         return(0);
4853                 }
                     /* From here on th->seq is in host byte order. */
4854                 th->seq = ntohl(th->seq);
4855 
4856                 /* See if we know about the socket. */
4857                 if (sk == NULL) 
4858                 {
4859                         /*
4860                          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4861                          */
4862                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4863                         skb->sk = NULL;
4864                         /*
4865                          *      Discard frame
4866                          */
4867                         kfree_skb(skb, FREE_READ);
4868                         return(0);
4869                 }
4870 
4871 /*              skb->len = len;*/
4872                 skb->acked = 0;
4873                 skb->used = 0;
4874                 skb->free = 0;
4875                 skb->saddr = daddr;
4876                 skb->daddr = saddr;
4877         
4878                 /* We may need to add it to the backlog here. */
                     /* If the socket is busy, queue for later replay with
                        redo=1 rather than processing now. */
4879                 cli();
4880                 if (sk->inuse) 
4881                 {
4882                         skb_queue_tail(&sk->back_log, skb);
4883                         sti();
4884                         return(0);
4885                 }
4886                 sk->inuse = 1;
4887                 sti();
4888         }
4889         else
4890         {
4891                 if (sk==NULL) 
4892                 {
4893                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4894                         skb->sk = NULL;
4895                         kfree_skb(skb, FREE_READ);
4896                         return(0);
4897                 }
4898         }
4899 
4900 
4901         if (!sk->prot) 
4902         {
4903                 printk("IMPOSSIBLE 3\n");
4904                 return(0);
4905         }
4906 
4907 
4908         /*
4909          *      Charge the memory to the socket. 
4910          */
4911          
             /* Over receive-buffer quota: silently drop the segment. */
4912         if (sk->rmem_alloc + skb->truesize >= sk->rcvbuf) 
4913         {
4914                 kfree_skb(skb, FREE_READ);
4915                 release_sock(sk);
4916                 return(0);
4917         }
4918 
4919         skb->sk=sk;
4920         sk->rmem_alloc += skb->truesize;
4921 
4922         /*
4923          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4924          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4925          *      compatibility. We also set up variables more thoroughly [Karn notes in the
4926          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4927          */
4928 
4929         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
4930         {
4931         
4932                 /*
4933                  *      Now deal with unusual cases.
4934                  */
4935          
4936                 if(sk->state==TCP_LISTEN)
4937                 {
4938                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
4939                                 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4940 
4941                         /*
4942                          *      We don't care for RST, and non SYN are absorbed (old segments)
4943                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
4944                          *      netmask on a running connection it can go broadcast. Even Sun's have
4945                          *      this problem so I'm ignoring it 
4946                          */
4947                            
4948                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4949                         {
4950                                 kfree_skb(skb, FREE_READ);
4951                                 release_sock(sk);
4952                                 return 0;
4953                         }
4954                 
4955                         /*      
4956                          *      Guess we need to make a new socket up 
4957                          */
4958                 
4959                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4960                 
4961                         /*
4962                          *      Now we have several options: In theory there is nothing else
4963                          *      in the frame. KA9Q has an option to send data with the syn,
4964                          *      BSD accepts data with the syn up to the [to be] advertised window
4965                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
4966                          *      it, that fits the spec precisely and avoids incompatibilities. It
4967                          *      would be nice in future to drop through and process the data.
4968                          */
4969                          
4970                         release_sock(sk);
4971                         return 0;
4972                 }
4973         
4974                 /* retransmitted SYN? */
4975                 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4976                 {
4977                         kfree_skb(skb, FREE_READ);
4978                         release_sock(sk);
4979                         return 0;
4980                 }
4981                 
4982                 /*
4983                  *      SYN sent means we have to look for a suitable ack and either reset
4984                  *      for bad matches or go to connected 
4985                  */
4986            
4987                 if(sk->state==TCP_SYN_SENT)
4988                 {
4989                         /* Crossed SYN or previous junk segment */
4990                         if(th->ack)
4991                         {
4992                                 /* We got an ack, but it's not a good ack */
4993                                 if(!tcp_ack(sk,th,saddr,len))
4994                                 {
4995                                         /* Reset the ack - its an ack from a 
4996                                            different connection  [ th->rst is checked in tcp_reset()] */
4997                                         tcp_statistics.TcpAttemptFails++;
4998                                         tcp_reset(daddr, saddr, th,
4999                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
5000                                         kfree_skb(skb, FREE_READ);
5001                                         release_sock(sk);
5002                                         return(0);
5003                                 }
5004                                 if(th->rst)
5005                                         return tcp_std_reset(sk,skb);
5006                                 if(!th->syn)
5007                                 {
5008                                         /* A valid ack from a different connection
5009                                            start. Shouldn't happen but cover it */
5010                                         kfree_skb(skb, FREE_READ);
5011                                         release_sock(sk);
5012                                         return 0;
5013                                 }
5014                                 /*
5015                                  *      Ok.. it's good. Set up sequence numbers and
5016                                  *      move to established.
5017                                  */
5018                                 syn_ok=1;       /* Don't reset this connection for the syn */
5019                                 sk->acked_seq=th->seq+1;
5020                                 sk->fin_seq=th->seq;
5021                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
5022                                 tcp_set_state(sk, TCP_ESTABLISHED);
5023                                 tcp_options(sk,th);
5024                                 sk->dummy_th.dest=th->source;
5025                                 sk->copied_seq = sk->acked_seq;
5026                                 if(!sk->dead)
5027                                 {
5028                                         sk->state_change(sk);
5029                                         sock_wake_async(sk->socket, 0);
5030                                 }
                                     /* Peer advertised no window yet: assume a
                                        tiny one so we can make progress. */
5031                                 if(sk->max_window==0)
5032                                 {
5033                                         sk->max_window = 32;
5034                                         sk->mss = min(sk->max_window, sk->mtu);
5035                                 }
5036                         }
5037                         else
5038                         {
5039                                 /* See if SYN's cross. Drop if boring */
5040                                 if(th->syn && !th->rst)
5041                                 {
5042                                         /* Crossed SYN's are fine - but talking to
5043                                            yourself is right out... */
5044                                         if(sk->saddr==saddr && sk->daddr==daddr &&
5045                                                 sk->dummy_th.source==th->source &&
5046                                                 sk->dummy_th.dest==th->dest)
5047                                         {
5048                                                 tcp_statistics.TcpAttemptFails++;
5049                                                 return tcp_std_reset(sk,skb);
5050                                         }
5051                                         tcp_set_state(sk,TCP_SYN_RECV);
5052                                         
5053                                         /*
5054                                          *      FIXME:
5055                                          *      Must send SYN|ACK here
5056                                          */
5057                                 }               
5058                                 /* Discard junk segment */
5059                                 kfree_skb(skb, FREE_READ);
5060                                 release_sock(sk);
5061                                 return 0;
5062                         }
5063                         /*
5064                          *      SYN_RECV with data maybe.. drop through
5065                          */
5066                         goto rfc_step6;
5067                 }
5068 
5069         /*
5070          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
5071          *      a more complex suggestion for fixing these reuse issues in RFC1644
5072          *      but not yet ready for general use. Also see RFC1379.
5073          */
5074         
5075 #define BSD_TIME_WAIT
5076 #ifdef BSD_TIME_WAIT
                     /* A fresh SYN arriving for a dead TIME_WAIT socket: kill the
                        old connection and, if a listener exists, hand the SYN to
                        it with a sequence number safely above the old one. */
5077                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
5078                         after(th->seq, sk->acked_seq) && !th->rst)
5079                 {
5080                         u32 seq = sk->write_seq;
5081                         if(sk->debug)
5082                                 printk("Doing a BSD time wait\n");
5083                         tcp_statistics.TcpEstabResets++;           
5084                         sk->rmem_alloc -= skb->truesize;
5085                         skb->sk = NULL;
5086                         sk->err=ECONNRESET;
5087                         tcp_set_state(sk, TCP_CLOSE);
5088                         sk->shutdown = SHUTDOWN_MASK;
5089                         release_sock(sk);
5090                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
5091                         if (sk && sk->state==TCP_LISTEN)
5092                         {
5093                                 sk->inuse=1;
5094                                 skb->sk = sk;
5095                                 sk->rmem_alloc += skb->truesize;
5096                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
5097                                 release_sock(sk);
5098                                 return 0;
5099                         }
5100                         kfree_skb(skb, FREE_READ);
5101                         return 0;
5102                 }
5103 #endif  
5104         }
5105 
5106         /*
5107          *      We are now in normal data flow (see the step list in the RFC)
5108          *      Note most of these are inline now. I'll inline the lot when
5109          *      I have time to test it hard and look at what gcc outputs 
5110          */
5111         
5112         if(!tcp_sequence(sk,th,len,opt,saddr,dev))
5113         {
5114                 kfree_skb(skb, FREE_READ);
5115                 release_sock(sk);
5116                 return 0;
5117         }
5118 
5119         if(th->rst)
5120                 return tcp_std_reset(sk,skb);
5121         
5122         /*
5123          *      !syn_ok is effectively the state test in RFC793.
5124          */
5125          
5126         if(th->syn && !syn_ok)
5127         {
5128                 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
5129                 return tcp_std_reset(sk,skb);   
5130         }
5131 
5132         /*
5133          *      Process the ACK
5134          */
5135          
5136 
5137         if(th->ack && !tcp_ack(sk,th,saddr,len))
5138         {
5139                 /*
5140                  *      Our three way handshake failed.
5141                  */
5142                  
5143                 if(sk->state==TCP_SYN_RECV)
5144                 {
5145                         tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
5146                 }
5147                 kfree_skb(skb, FREE_READ);
5148                 release_sock(sk);
5149                 return 0;
5150         }
5151         
5152 rfc_step6:              /* I'll clean this up later */
5153 
5154         /*
5155          *      Process urgent data
5156          */
5157                 
5158         if(tcp_urg(sk, th, saddr, len))
5159         {
5160                 kfree_skb(skb, FREE_READ);
5161                 release_sock(sk);
5162                 return 0;
5163         }
5164         
5165         
5166         /*
5167          *      Process the encapsulated data
5168          */
5169         
5170         if(tcp_data(skb,sk, saddr, len))
5171         {
5172                 kfree_skb(skb, FREE_READ);
5173                 release_sock(sk);
5174                 return 0;
5175         }
5176 
5177         /*
5178          *      And done
5179          */     
5180         
5181         release_sock(sk);
5182         return 0;
5183 }
5184 
5185 /*
5186  *      This routine sends a packet with an out of date sequence
5187  *      number. It assumes the other end will try to ack it.
5188  */
5189 
5190 static void tcp_write_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
5191 {
5192         struct sk_buff *buff,*skb;
5193         struct tcphdr *t1;
5194         struct device *dev=NULL;
5195         int tmp;
5196 
5197         if (sk->zapped)
5198                 return; /* After a valid reset we can send no more */
5199 
5200         /*
5201          *      Write data can still be transmitted/retransmitted in the
5202          *      following states.  If any other state is encountered, return.
5203          *      [listen/close will never occur here anyway]
5204          */
5205 
5206         if (sk->state != TCP_ESTABLISHED && 
5207             sk->state != TCP_CLOSE_WAIT &&
5208             sk->state != TCP_FIN_WAIT1 && 
5209             sk->state != TCP_LAST_ACK &&
5210             sk->state != TCP_CLOSING
5211         ) 
5212         {
5213                 return;
5214         }
5215         if ( before(sk->sent_seq, sk->window_seq) && 
5216             (skb=skb_peek(&sk->write_queue)))
5217         {
5218                 /*
5219                  * We are probing the opening of a window
5220                  * but the window size is != 0
5221                  * must have been a result SWS advoidance ( sender )
5222                  */
5223             
5224                 struct iphdr *iph;
5225                 struct tcphdr *th;
5226                 struct tcphdr *nth;
5227                 unsigned long win_size;
5228 #if 0
5229                 unsigned long ow_size;
5230 #endif
5231                 void * tcp_data_start;
5232         
5233                 /*
5234                  *      How many bytes can we send ?
5235                  */
5236                  
5237                 win_size = sk->window_seq - sk->sent_seq;
5238 
5239                 /*
5240                  *      Recover the buffer pointers
5241                  */
5242                  
5243                 iph = (struct iphdr *)skb->ip_hdr;
5244                 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
5245 
5246                 /*
5247                  *      Grab the data for a temporary frame
5248                  */
5249                  
5250                 buff = sock_wmalloc(sk, win_size + th->doff * 4 + 
5251                                      (iph->ihl << 2) +
5252                                      sk->prot->max_header + 15, 
5253                                      1, GFP_ATOMIC);
5254                 if ( buff == NULL )
5255                         return;
5256 
5257                 /* 
5258                  *      If we strip the packet on the write queue we must
5259                  *      be ready to retransmit this one 
5260                  */
5261             
5262                 buff->free = /*0*/1;
5263 
5264                 buff->sk = sk;
5265                 buff->localroute = sk->localroute;
5266                 
5267                 /*
5268                  *      Put headers on the new packet
5269                  */
5270 
5271                 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5272                                          IPPROTO_TCP, sk->opt, buff->truesize,
5273                                          sk->ip_tos,sk->ip_ttl);
5274                 if (tmp < 0) 
5275                 {
5276                         sock_wfree(sk, buff);
5277                         return;
5278                 }
5279                 
5280                 /*
5281                  *      Move the TCP header over
5282                  */
5283 
5284                 buff->dev = dev;
5285 
5286                 nth = (struct tcphdr *) skb_put(buff,th->doff*4);
5287 
5288                 memcpy(nth, th, th->doff * 4);
5289                 
5290                 /*
5291                  *      Correct the new header
5292                  */
5293                  
5294                 nth->ack = 1; 
5295                 nth->ack_seq = ntohl(sk->acked_seq);
5296                 nth->window = ntohs(tcp_select_window(sk));
5297                 nth->check = 0;
5298 
5299                 /*
5300                  *      Find the first data byte.
5301                  */
5302                  
5303                 tcp_data_start = skb->data + skb->dev->hard_header_len + 
5304                                 (iph->ihl << 2) + th->doff * 4;
5305 
5306                 /*
5307                  *      Add it to our new buffer
5308                  */
5309                 memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
5310                 
5311                 /*
5312                  *      Remember our right edge sequence number.
5313                  */
5314                  
5315                 buff->h.seq = sk->sent_seq + win_size;
5316                 sk->sent_seq = buff->h.seq;             /* Hack */
5317 #if 0
5318 
5319                 /*
5320                  *      now: shrink the queue head segment 
5321                  */
5322                  
5323                 th->check = 0;
5324                 ow_size = skb->len - win_size - 
5325                         ((unsigned long) (tcp_data_start - (void *) skb->data));
5326 
5327                 memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
5328                 skb_trim(skb,skb->len-win_size);
5329                 sk->sent_seq += win_size;
5330                 th->seq = htonl(sk->sent_seq);
5331                 if (th->urg)
5332                 {
5333                         unsigned short urg_ptr;
5334         
5335                         urg_ptr = ntohs(th->urg_ptr);
5336                         if (urg_ptr <= win_size)
5337                                 th->urg = 0;
5338                         else
5339                         {
5340                                 urg_ptr -= win_size;
5341                                 th->urg_ptr = htons(urg_ptr);
5342                                 nth->urg_ptr = htons(win_size);
5343                         }
5344                 }
5345 #else
5346                 if(th->urg && ntohs(th->urg_ptr) < win_size)
5347                         nth->urg = 0;
5348 #endif          
5349 
5350                 /*
5351                  *      Checksum the split buffer
5352                  */
5353                  
5354                 tcp_send_check(nth, sk->saddr, sk->daddr, 
5355                            nth->doff * 4 + win_size , sk);
5356         }
5357         else
5358         {       
5359                 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
5360                 if (buff == NULL) 
5361                         return;
5362 
5363                 buff->free = 1;
5364                 buff->sk = sk;
5365                 buff->localroute = sk->localroute;
5366 
5367                 /*
5368                  *      Put in the IP header and routing stuff. 
5369                  */
5370                  
5371                 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5372                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
5373                 if (tmp < 0) 
5374                 {
5375                         sock_wfree(sk, buff);
5376                         return;
5377                 }
5378 
5379                 t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
5380                 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5381 
5382                 /*
5383                  *      Use a previous sequence.
5384                  *      This should cause the other end to send an ack.
5385                  */
5386          
5387                 t1->seq = htonl(sk->sent_seq-1);
5388                 t1->ack = 1; 
5389                 t1->res1= 0;
5390                 t1->res2= 0;
5391                 t1->rst = 0;
5392                 t1->urg = 0;
5393                 t1->psh = 0;
5394                 t1->fin = 0;    /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
5395                 t1->syn = 0;
5396                 t1->ack_seq = ntohl(sk->acked_seq);
5397                 t1->window = ntohs(tcp_select_window(sk));
5398                 t1->doff = sizeof(*t1)/4;
5399                 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5400 
5401         }               
5402 
5403         /*
5404          *      Send it.
5405          */
5406         
5407         sk->prot->queue_xmit(sk, dev, buff, 1);
5408         tcp_statistics.TcpOutSegs++;
5409 }
5410 
5411 /*
5412  *      A window probe timeout has occurred.
5413  */
5414 
5415 void tcp_send_probe0(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
5416 {
5417         if (sk->zapped)
5418                 return;         /* After a valid reset we can send no more */
5419 
5420         tcp_write_wakeup(sk);
5421 
5422         sk->backoff++;
5423         sk->rto = min(sk->rto << 1, 120*HZ);
5424         reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5425         sk->retransmits++;
5426         sk->prot->retransmits ++;
5427 }
5428 
5429 /*
5430  *      Socket option code for TCP. 
5431  */
5432   
5433 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5434 {
5435         int val,err;
5436 
5437         if(level!=SOL_TCP)
5438                 return ip_setsockopt(sk,level,optname,optval,optlen);
5439 
5440         if (optval == NULL) 
5441                 return(-EINVAL);
5442 
5443         err=verify_area(VERIFY_READ, optval, sizeof(int));
5444         if(err)
5445                 return err;
5446         
5447         val = get_user((int *)optval);
5448 
5449         switch(optname)
5450         {
5451                 case TCP_MAXSEG:
5452 /*
5453  * values greater than interface MTU won't take effect.  however at
5454  * the point when this call is done we typically don't yet know
5455  * which interface is going to be used
5456  */
5457                         if(val<1||val>MAX_WINDOW)
5458                                 return -EINVAL;
5459                         sk->user_mss=val;
5460                         return 0;
5461                 case TCP_NODELAY:
5462                         sk->nonagle=(val==0)?0:1;
5463                         return 0;
5464                 default:
5465                         return(-ENOPROTOOPT);
5466         }
5467 }
5468 
5469 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5470 {
5471         int val,err;
5472 
5473         if(level!=SOL_TCP)
5474                 return ip_getsockopt(sk,level,optname,optval,optlen);
5475                         
5476         switch(optname)
5477         {
5478                 case TCP_MAXSEG:
5479                         val=sk->user_mss;
5480                         break;
5481                 case TCP_NODELAY:
5482                         val=sk->nonagle;
5483                         break;
5484                 default:
5485                         return(-ENOPROTOOPT);
5486         }
5487         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5488         if(err)
5489                 return err;
5490         put_user(sizeof(int),(int *) optlen);
5491 
5492         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5493         if(err)
5494                 return err;
5495         put_user(val,(int *)optval);
5496 
5497         return(0);
5498 }       
5499 
5500 
/*
 *	Protocol operations vector for TCP.  The INET socket layer
 *	dispatches through this table for SOCK_STREAM/IPPROTO_TCP
 *	sockets.  Initialization is positional, so the slot comments
 *	below follow the order of the handlers assigned; the member
 *	names for the trailing scalar fields are inferred from the
 *	values — TODO confirm against the struct proto declaration.
 */
struct proto tcp_prot = {
	tcp_close,		/* close */
	tcp_read,		/* read */
	tcp_write,		/* write */
	tcp_sendto,		/* sendto */
	tcp_recvfrom,		/* recvfrom */
	ip_build_header,	/* build_header (shared with IP) */
	tcp_connect,		/* connect */
	tcp_accept,		/* accept */
	ip_queue_xmit,		/* queue_xmit (shared with IP) */
	tcp_retransmit,		/* retransmit */
	tcp_write_wakeup,	/* write_wakeup */
	tcp_read_wakeup,	/* read_wakeup */
	tcp_rcv,		/* packet receive */
	tcp_select,		/* select */
	tcp_ioctl,		/* ioctl */
	NULL,			/* init — none needed for TCP */
	tcp_shutdown,		/* shutdown */
	tcp_setsockopt,		/* setsockopt */
	tcp_getsockopt,		/* getsockopt */
	tcp_sendmsg,		/* sendmsg */
	tcp_recvmsg,		/* recvmsg */
	128,			/* presumably max_header — TODO confirm */
	0,			/* presumably retransmit counter — TODO confirm */
	"TCP",			/* protocol name */
	0, 0,			/* presumably inuse / highestinuse — TODO confirm */
	{NULL,}			/* per-port socket hash array, initially empty */
};

/* [previous][next][first][last][top][bottom][index][help] */