root/net/ipv4/tcp.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. tcp_cache_zap
  2. min
  3. tcp_set_state
  4. tcp_select_window
  5. tcp_find_established
  6. tcp_dequeue_established
  7. tcp_close_pending
  8. tcp_time_wait
  9. tcp_do_retransmit
  10. reset_xmit_timer
  11. tcp_retransmit_time
  12. tcp_retransmit
  13. tcp_write_timeout
  14. retransmit_timer
  15. tcp_err
  16. tcp_readable
  17. tcp_listen_select
  18. tcp_select
  19. tcp_ioctl
  20. tcp_check
  21. tcp_send_check
  22. tcp_send_skb
  23. tcp_dequeue_partial
  24. tcp_send_partial
  25. tcp_enqueue_partial
  26. tcp_send_ack
  27. tcp_build_header
  28. tcp_write
  29. tcp_sendto
  30. tcp_read_wakeup
  31. cleanup_rbuf
  32. tcp_read_urg
  33. tcp_read
  34. tcp_close_state
  35. tcp_send_fin
  36. tcp_shutdown
  37. tcp_recvfrom
  38. tcp_reset
  39. tcp_options
  40. default_mask
  41. tcp_init_seq
  42. tcp_conn_request
  43. tcp_close
  44. tcp_write_xmit
  45. tcp_ack
  46. tcp_fin
  47. tcp_data
  48. tcp_check_urg
  49. tcp_urg
  50. tcp_accept
  51. tcp_connect
  52. tcp_sequence
  53. tcp_std_reset
  54. tcp_rcv
  55. tcp_write_wakeup
  56. tcp_send_probe0
  57. tcp_setsockopt
  58. tcp_getsockopt

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:       
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect 
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed where wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It 
  33  *                                      wakes people on errors. select 
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_reset() fixed to work for 
  37  *                                      everything not just packets for 
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had 
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames. 
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst 
  46  *                                      receive otherwise odd bits of prattle 
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug. 
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list 
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential 
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the 
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries. 
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks, 
  69  *                                      so the kernel can layer network 
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised 
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer 
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing 
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties remove from the
 100  *                                      TCP code (Be very nice to this man if 
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics. 
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle select() after URG properly in 
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg() 
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in 
 110  *                                      tcp_readable(), select() after URG 
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the 
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to 
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in selecting before an 
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since 
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on 
 137  *                                      the RFC's for other useful protocol 
 138  *                                      references see: Comer, KA9Q NOS, and 
 139  *                                      for a reference on the difference 
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC 
 147  *                                      and using multiple timers for sanity. 
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       Select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and 
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if stat is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but its a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications
 178  *
 179  *
 180  * To Fix:
 181  *              Fast path the code. Two things here - fix the window calculation
 182  *              so it doesn't iterate over the queue, also spot packets with no funny
 183  *              options arriving in order and process directly.
 184  *
 185  *              Implement RFC 1191 [Path MTU discovery]
 186  *              Look at the effect of implementing RFC 1337 suggestions and their impact.
 187  *              Rewrite output state machine to use a single queue and do low window
 188  *              situations as per the spec (RFC 1122)
 189  *              Speed up input assembly algorithm.
 190  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 191  *              could do with it working on IPv4
 192  *              User settable/learned rtt/max window/mtu
 193  *              Cope with MTU/device switches when retransmitting in tcp.
 194  *              Fix the window handling to use PR's new code.
 195  *
 196  *              Change the fundamental structure to a single send queue maintained
 197  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 198  *              active routes too]). Cut the queue off in tcp_retransmit/
 199  *              tcp_transmit.
 200  *              Change the receive queue to assemble as it goes. This lets us
 201  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 202  *              tcp_data/tcp_read as well as the window shrink crud.
 203  *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 204  *              tcp_queue_skb seem obvious routines to extract.
 205  *      
 206  *              This program is free software; you can redistribute it and/or
 207  *              modify it under the terms of the GNU General Public License
 208  *              as published by the Free Software Foundation; either version
 209  *              2 of the License, or(at your option) any later version.
 210  *
 211  * Description of States:
 212  *
 213  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 214  *
 215  *      TCP_SYN_RECV            received a connection request, sent ack,
 216  *                              waiting for final ack in three-way handshake.
 217  *
 218  *      TCP_ESTABLISHED         connection established
 219  *
 220  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 221  *                              transmission of remaining buffered data
 222  *
 223  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 224  *                              to shutdown
 225  *
 226  *      TCP_CLOSING             both sides have shutdown but we still have
 227  *                              data we have to finish sending
 228  *
 229  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 230  *                              closed, can only be entered from FIN_WAIT2
 231  *                              or CLOSING.  Required because the other end
 232  *                              may not have gotten our last ACK causing it
 233  *                              to retransmit the data packet (which we ignore)
 234  *
 235  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 236  *                              us to finish writing our data and to shutdown
 237  *                              (we have to close() to move on to LAST_ACK)
 238  *
  239  *      TCP_LAST_ACK            our side has shutdown after remote has
 240  *                              shutdown.  There may still be data in our
 241  *                              buffer that we have to finish sending
 242  *              
 243  *      TCP_CLOSE               socket is finished
 244  */
 245 
 246 /*
 247  * RFC1122 status:
 248  * NOTE: I'm not going to be doing comments in the code for this one except
 249  * for violations and the like.  tcp.c is just too big... If I say something
 250  * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 251  * with Alan. -- MS 950903
 252  * 
 253  * Use of PSH (4.2.2.2)
 254  *   MAY aggregate data sent without the PSH flag. (does)
 255  *   MAY queue data recieved without the PSH flag. (does)
 256  *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 257  *   MAY implement PSH on send calls. (doesn't, thus:)
 258  *     MUST NOT buffer data indefinitely (doesn't [1 second])
 259  *     MUST set PSH on last segment (does)
 260  *   MAY pass received PSH to application layer (doesn't)
 261  *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 262  * 
 263  * Window Size (4.2.2.3, 4.2.2.16)
 264  *   MUST treat window size as an unsigned number (does)
 265  *   SHOULD treat window size as a 32-bit number (does not)
 266  *   MUST NOT shrink window once it is offered (does not normally)
 267  *   
 268  * Urgent Pointer (4.2.2.4)
 269  * **MUST point urgent pointer to last byte of urgent data (not right
 270  *     after). (doesn't, to be like BSD)
 271  *   MUST inform application layer asynchronously of incoming urgent
 272  *     data. (does)
 273  *   MUST provide application with means of determining the amount of
 274  *     urgent data pending. (does)
 275  * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 276  *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 277  *      [Follows BSD 1 byte of urgent data]
 278  * 
 279  * TCP Options (4.2.2.5)
  280  *   MUST be able to receive TCP options in any segment. (does)
 281  *   MUST ignore unsupported options (does)
 282  *   
 283  * Maximum Segment Size Option (4.2.2.6)
 284  *   MUST implement both sending and receiving MSS. (does)
 285  *   SHOULD send an MSS with every SYN where recieve MSS != 536 (MAY send
 286  *     it always). (does, even when MSS == 536, which is legal)
 287  *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 288  *   MUST calculate "effective send MSS" correctly:
 289  *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 290  *     (does - but allows operator override)
 291  *  
 292  * TCP Checksum (4.2.2.7)
 293  *   MUST generate and check TCP checksum. (does)
 294  * 
 295  * Initial Sequence Number Selection (4.2.2.8)
 296  *   MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 297  *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 298  *     necessary for 10Mbps networks - and harder than BSD to spoof!)
 299  * 
 300  * Simultaneous Open Attempts (4.2.2.10)
 301  *   MUST support simultaneous open attempts (does)
 302  * 
 303  * Recovery from Old Duplicate SYN (4.2.2.11)
 304  *   MUST keep track of active vs. passive open (does)
 305  * 
 306  * RST segment (4.2.2.12)
 307  *   SHOULD allow an RST segment to contain data (does, but doesn't do
 308  *     anything with it, which is standard)
 309  * 
 310  * Closing a Connection (4.2.2.13)
  311  *   MUST inform application of whether connection was closed by RST or
 312  *     normal close. (does)
 313  *   MAY allow "half-duplex" close (treat connection as closed for the
 314  *     local app, even before handshake is done). (does)
 315  *   MUST linger in TIME_WAIT for 2 * MSL (does)
 316  * 
 317  * Retransmission Timeout (4.2.2.15)
 318  *   MUST implement Jacobson's slow start and congestion avoidance
 319  *     stuff. (does) 
 320  * 
 321  * Probing Zero Windows (4.2.2.17)
 322  *   MUST support probing of zero windows. (does)
 323  *   MAY keep offered window closed indefinitely. (does)
 324  *   MUST allow remote window to stay closed indefinitely. (does)
 325  * 
 326  * Passive Open Calls (4.2.2.18)
 327  *   MUST NOT let new passive open affect other connections. (doesn't)
 328  *   MUST support passive opens (LISTENs) concurrently. (does)
 329  *   
 330  * Time to Live (4.2.2.19)
 331  *   MUST make TCP TTL configurable. (does - IP_TTL option)
 332  * 
 333  * Event Processing (4.2.2.20)
 334  *   SHOULD queue out-of-order segments. (does)
 335  *   MUST aggregate ACK segments whenever possible. (does but badly)
 336  *   
 337  * Retransmission Timeout Calculation (4.2.3.1)
 338  *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 339  *     calculation. (does, or at least explains them in the comments 8*b)
 340  *  SHOULD initialize RTO to 0 and RTT to 3. (does)
 341  * 
 342  * When to Send an ACK Segment (4.2.3.2)
 343  *   SHOULD implement delayed ACK. (does not)
 344  *   MUST keep ACK delay < 0.5 sec. (N/A)
 345  * 
 346  * When to Send a Window Update (4.2.3.3)
 347  *   MUST implement receiver-side SWS. (does)
 348  *   
 349  * When to Send Data (4.2.3.4)
 350  *   MUST implement sender-side SWS. (does - imperfectly)
 351  *   SHOULD implement Nagle algorithm. (does)
 352  * 
 353  * TCP Connection Failures (4.2.3.5)
 354  *  MUST handle excessive retransmissions "properly" (see the RFC). (does)
 355  *   SHOULD inform application layer of soft errors. (doesn't)
 356  *   
 357  * TCP Keep-Alives (4.2.3.6)
 358  *   MAY provide keep-alives. (does)
 359  *   MUST make keep-alives configurable on a per-connection basis. (does)
 360  *   MUST default to no keep-alives. (does)
 361  * **MUST make keep-alive interval configurable. (doesn't)
 362  * **MUST make default keep-alive interval > 2 hours. (doesn't)
 363  *   MUST NOT interpret failure to ACK keep-alive packet as dead
 364  *     connection. (doesn't)
 365  *   SHOULD send keep-alive with no data. (does)
 366  * 
 367  * TCP Multihoming (4.2.3.7)
 368  *   MUST get source address from IP layer before sending first
 369  *     SYN. (does)
 370  *   MUST use same local address for all segments of a connection. (does)
 371  * 
 372  * IP Options (4.2.3.8)
 373  *   (I don't think the IP layer sees the IP options, yet.)
 374  *   MUST ignore unsupported IP options. (does, I guess 8*b)
 375  *   MAY support Time Stamp and Record Route. (doesn't)
 376  * **MUST allow application to specify a source route. (doesn't?)
  377  * **MUST allow received Source Route option to set route for all future
 378  *     segments on this connection. (doesn't, not that I think it's a
 379  *     huge problem)
 380  * 
 381  * ICMP messages (4.2.3.9)
 382  *   MUST act on ICMP errors. (does)
 383  *   MUST slow transmission upon receipt of a Source Quench. (does)
 384  *   MUST NOT abort connection upon receipt of soft Destination
 385  *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 386  *     Problems. (doesn't)
 387  *   SHOULD report soft Destination Unreachables etc. to the
 388  *     application. (doesn't)
 389  *   SHOULD abort connection upon receipt of hard Destination Unreachable
 390  *     messages (2, 3, 4). (does)
 391  * 
 392  * Remote Address Validation (4.2.3.10)
 393  *   MUST reject as an error OPEN for invalid remote IP address. (does)
 394  *   MUST ignore SYN with invalid source address. (does)
 395  *   MUST silently discard incoming SYN for broadcast/multicast
 396  *     address. (does) 
 397  * 
 398  * Asynchronous Reports (4.2.4.1)
 399  * **MUST provide mechanism for reporting soft errors to application
 400  *     layer. (doesn't)
 401  * 
 402  * Type of Service (4.2.4.2)
 403  *   MUST allow application layer to set Type of Service. (does IP_TOS)
 404  * 
 405  * (Whew. -- MS 950903)
 406  **/
 407 
 408 #include <linux/types.h>
 409 #include <linux/sched.h>
 410 #include <linux/mm.h>
 411 #include <linux/time.h>
 412 #include <linux/string.h>
 413 #include <linux/config.h>
 414 #include <linux/socket.h>
 415 #include <linux/sockios.h>
 416 #include <linux/termios.h>
 417 #include <linux/in.h>
 418 #include <linux/fcntl.h>
 419 #include <linux/inet.h>
 420 #include <linux/netdevice.h>
 421 #include <net/snmp.h>
 422 #include <net/ip.h>
 423 #include <net/protocol.h>
 424 #include <net/icmp.h>
 425 #include <net/tcp.h>
 426 #include <net/arp.h>
 427 #include <linux/skbuff.h>
 428 #include <net/sock.h>
 429 #include <net/route.h>
 430 #include <linux/errno.h>
 431 #include <linux/timer.h>
 432 #include <asm/system.h>
 433 #include <asm/segment.h>
 434 #include <linux/mm.h>
 435 #include <net/checksum.h>
 436 
 437 /*
 438  *      The MSL timer is the 'normal' timer.
 439  */
 440  
 441 #define reset_msl_timer(x,y,z)  reset_timer(x,y,z)
 442 
 443 #define SEQ_TICK 3
 444 unsigned long seq_offset;
 445 struct tcp_mib  tcp_statistics;
 446 
 447 /*
 448  *      Cached last hit socket
 449  */
 450  
 451 volatile unsigned long  th_cache_saddr,th_cache_daddr;
 452 volatile unsigned short  th_cache_dport, th_cache_sport;
 453 volatile struct sock *th_cache_sk;
 454 
 455 void tcp_cache_zap(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 456 {
 457         unsigned long flags;
 458         save_flags(flags);
 459         cli();
 460         th_cache_saddr=0;
 461         th_cache_daddr=0;
 462         th_cache_dport=0;
 463         th_cache_sport=0;
 464         th_cache_sk=NULL;
 465         restore_flags(flags);
 466 }
 467 
 468 static void tcp_close(struct sock *sk, int timeout);
 469 
 470 
 471 /*
 472  *      The less said about this the better, but it works and will do for 1.2 
 473  */
 474 
 475 static struct wait_queue *master_select_wakeup;
 476 
 477 static __inline__ int min(unsigned int a, unsigned int b)
     /* [previous][next][first][last][top][bottom][index][help] */
 478 {
 479         if (a < b) 
 480                 return(a);
 481         return(b);
 482 }
 483 
 484 #undef STATE_TRACE
 485 
 486 #ifdef STATE_TRACE
 487 static char *statename[]={
 488         "Unused","Established","Syn Sent","Syn Recv",
 489         "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
 490         "Close Wait","Last ACK","Listen","Closing"
 491 };
 492 #endif
 493 
 494 static __inline__ void tcp_set_state(struct sock *sk, int state)
     /* [previous][next][first][last][top][bottom][index][help] */
 495 {
 496         if(sk->state==TCP_ESTABLISHED)
 497                 tcp_statistics.TcpCurrEstab--;
 498 #ifdef STATE_TRACE
 499         if(sk->debug)
 500                 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
 501 #endif  
 502         /* This is a hack but it doesn't occur often and it's going to
 503            be a real        to fix nicely */
 504            
 505         if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
 506         {
 507                 wake_up_interruptible(&master_select_wakeup);
 508         }
 509         sk->state=state;
 510         if(state==TCP_ESTABLISHED)
 511                 tcp_statistics.TcpCurrEstab++;
 512         if(sk->state==TCP_CLOSE)
 513                 tcp_cache_zap();
 514 }
 515 
 516 /*
 517  *      This routine picks a TCP windows for a socket based on
 518  *      the following constraints
 519  *  
 520  *      1. The window can never be shrunk once it is offered (RFC 793)
 521  *      2. We limit memory per socket
 522  *   
 523  *      For now we use NET2E3's heuristic of offering half the memory
 524  *      we have handy. All is not as bad as this seems however because
 525  *      of two things. Firstly we will bin packets even within the window
 526  *      in order to get the data we are waiting for into the memory limit.
 527  *      Secondly we bin common duplicate forms at receive time
 528  *      Better heuristics welcome
 529  */
 530    
 531 int tcp_select_window(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 532 {
 533         int new_window = sk->prot->rspace(sk);
 534         
 535         if(sk->window_clamp)
 536                 new_window=min(sk->window_clamp,new_window);
 537         /*
 538          *      Two things are going on here.  First, we don't ever offer a
 539          *      window less than min(sk->mss, MAX_WINDOW/2).  This is the
 540          *      receiver side of SWS as specified in RFC1122.
 541          *      Second, we always give them at least the window they
 542          *      had before, in order to avoid retracting window.  This
 543          *      is technically allowed, but RFC1122 advises against it and
 544          *      in practice it causes trouble.
 545          *
 546          *      Fixme: This doesn't correctly handle the case where
 547          *      new_window > sk->window but not by enough to allow for the
 548          *      shift in sequence space. 
 549          */
 550         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
 551                 return(sk->window);
 552         return(new_window);
 553 }
 554 
 555 /*
 556  *      Find someone to 'accept'. Must be called with
 557  *      sk->inuse=1 or cli()
 558  */ 
 559 
 560 static struct sk_buff *tcp_find_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 561 {
 562         struct sk_buff *p=skb_peek(&s->receive_queue);
 563         if(p==NULL)
 564                 return NULL;
 565         do
 566         {
 567                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 568                         return p;
 569                 p=p->next;
 570         }
 571         while(p!=(struct sk_buff *)&s->receive_queue);
 572         return NULL;
 573 }
 574 
 575 /*
 576  *      Remove a completed connection and return it. This is used by
 577  *      tcp_accept() to get connections from the queue.
 578  */
 579 
 580 static struct sk_buff *tcp_dequeue_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 581 {
 582         struct sk_buff *skb;
 583         unsigned long flags;
 584         save_flags(flags);
 585         cli(); 
 586         skb=tcp_find_established(s);
 587         if(skb!=NULL)
 588                 skb_unlink(skb);        /* Take it off the queue */
 589         restore_flags(flags);
 590         return skb;
 591 }
 592 
 593 /* 
 594  *      This routine closes sockets which have been at least partially
 595  *      opened, but not yet accepted. Currently it is only called by
 596  *      tcp_close, and timeout mirrors the value there. 
 597  */
 598 
 599 static void tcp_close_pending (struct sock *sk) 
     /* [previous][next][first][last][top][bottom][index][help] */
 600 {
 601         struct sk_buff *skb;
 602 
 603         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) 
 604         {
 605                 skb->sk->dead=1;
 606                 tcp_close(skb->sk, 0);
 607                 kfree_skb(skb, FREE_READ);
 608         }
 609         return;
 610 }
 611 
 612 /*
 613  *      Enter the time wait state. 
 614  */
 615 
 616 static void tcp_time_wait(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 617 {
 618         tcp_set_state(sk,TCP_TIME_WAIT);
 619         sk->shutdown = SHUTDOWN_MASK;
 620         if (!sk->dead)
 621                 sk->state_change(sk);
 622         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 623 }
 624 
/*
 *	A socket has timed out on its send queue and wants to do a
 *	little retransmitting. Currently this means TCP.
 *
 *	Walks the retransmit queue (sk->send_head, chained through
 *	link3) and pushes frames back to the device layer.  If 'all'
 *	is zero only the head frame is resent; otherwise we resend up
 *	to one congestion window's worth of frames.
 */

void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;		/* frames resent so far this call */
	struct rtable *rt;

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;	/* restamp: timer/RTT code reads this */

		/*
		 *	Discard the surplus MAC header: the route (and thus
		 *	the link-level header) may differ from when the frame
		 *	was first built, so we rebuild it below.
		 */
		 
		skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);

		/*
		 * In general it's OK just to use the old packet.  However we
		 * need to use the current ack and window fields.  Urg and
		 * urg_ptr could possibly stand to be updated as well, but we
		 * don't keep the necessary data.  That shouldn't be a problem,
		 * if the other end is doing the right thing.  Since we're
		 * changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)skb->data;
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = ntohs(iph->tot_len) - (iph->ihl<<2);	/* TCP segment length */
		
		/*
		 *	Note: We ought to check for window limits here but
		 *	currently this is done (less efficiently) elsewhere.
		 */

		/* Fresh IP id means the IP header checksum must be redone. */
		iph->id = htons(ip_id_count++);
		ip_send_check(iph);
		
		/*
		 *	Put a MAC header back on (may cause ARPing)
		 */
		 
		if(skb->localroute)
			rt=ip_rt_local(iph->daddr,NULL,NULL);
		else
			rt=ip_rt_route(iph->daddr,NULL,NULL);
			
		if(rt==NULL)	/* Deep poo */
		{
			/* No route any more: report ENETUNREACH to the owner. */
			if(skb->sk)
			{
				skb->sk->err=ENETUNREACH;
				skb->sk->error_report(skb->sk);
			}
		}
		else
		{
			dev=rt->rt_dev;
			skb->raddr=rt->rt_gateway;
			if(skb->raddr==0)	/* no gateway: deliver directly */
				skb->raddr=iph->daddr;
			skb->dev=dev;
			skb->arp=1;
			if(dev->hard_header)
			{
				/* Negative return: address not yet resolved (ARP pending). */
				if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
					skb->arp=0;
			}
		
			/*
			 *	This is not the right way to handle this. We have to
			 *	issue an up to date window and ack report with this 
			 *	retransmit to keep the odd buggy tcp that relies on 
			 *	the fact BSD does this happy. 
			 *	We don't however need to recalculate the entire 
			 *	checksum, so someone wanting a small problem to play
			 *	with might like to implement RFC1141/RFC1624 and speed
			 *	this up by avoiding a full checksum.
			 */
		 
			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
		
			/*
			 *	If the interface is (still) up and running, kick it.
			 */
	
			if (dev->flags & IFF_UP)
			{
				/*
				 *	If the packet is still being sent by the device/protocol
				 *	below then don't retransmit. This is both needed, and good -
				 *	especially with connected mode AX.25 where it stops resends
				 *	occurring of an as yet unsent anyway frame!
				 *	We still add up the counts as the round trip time wants
				 *	adjusting.
				 */
				if (sk && !skb_device_locked(skb))
				{
					/* Remove it from any existing driver queue first! */
					skb_unlink(skb);
					/* Now queue it */
					ip_statistics.IpOutRequests++;
					dev_queue_xmit(skb, dev, sk->priority);
				}
			}
		}
		
		/*
		 *	Count retransmissions (even when the frame was left on
		 *	the device queue above - see the AX.25 note).
		 */
		 
		ct++;
		sk->prot->retransmits ++;
		tcp_statistics.TcpRetransSegs++;
		

		/*
		 *	Only one retransmit requested.
		 */
	
		if (!all)
			break;

		/*
		 *	This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;
	}
}
 774 
 775 /*
 776  *      Reset the retransmission timer
 777  */
 778  
 779 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
     /* [previous][next][first][last][top][bottom][index][help] */
 780 {
 781         del_timer(&sk->retransmit_timer);
 782         sk->ip_xmit_timeout = why;
 783         if((int)when < 0)
 784         {
 785                 when=3;
 786                 printk("Error: Negative timer in xmit_timer\n");
 787         }
 788         sk->retransmit_timer.expires=jiffies+when;
 789         add_timer(&sk->retransmit_timer);
 790 }
 791 
 792 /*
 793  *      This is the normal code called for timeouts.  It does the retransmission
 794  *      and then does backoff.  tcp_do_retransmit is separated out because
 795  *      tcp_ack needs to send stuff from the retransmit queue without
 796  *      initiating a backoff.
 797  */
 798 
 799 
 800 void tcp_retransmit_time(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 801 {
 802         tcp_do_retransmit(sk, all);
 803 
 804         /*
 805          * Increase the timeout each time we retransmit.  Note that
 806          * we do not increase the rtt estimate.  rto is initialized
 807          * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
 808          * that doubling rto each time is the least we can get away with.
 809          * In KA9Q, Karn uses this for the first few times, and then
 810          * goes to quadratic.  netBSD doubles, but only goes up to *64,
 811          * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
 812          * defined in the protocol as the maximum possible RTT.  I guess
 813          * we'll have to use something other than TCP to talk to the
 814          * University of Mars.
 815          *
 816          * PAWS allows us longer timeouts and large windows, so once
 817          * implemented ftp to mars will work nicely. We will have to fix
 818          * the 120 second clamps though!
 819          */
 820 
 821         sk->retransmits++;
 822         sk->prot->retransmits++;
 823         sk->backoff++;
 824         sk->rto = min(sk->rto << 1, 120*HZ);
 825         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
 826 }
 827 
 828 
 829 /*
 830  *      A timer event has trigger a tcp retransmit timeout. The
 831  *      socket xmit queue is ready and set up to send. Because
 832  *      the ack receive code keeps the queue straight we do
 833  *      nothing clever here.
 834  */
 835 
 836 static void tcp_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 837 {
 838         if (all) 
 839         {
 840                 tcp_retransmit_time(sk, all);
 841                 return;
 842         }
 843 
 844         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 845         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 846         sk->cong_count = 0;
 847 
 848         sk->cong_window = 1;
 849 
 850         /* Do the actual retransmit. */
 851         tcp_retransmit_time(sk, all);
 852 }
 853 
/*
 *	A write timeout has occurred. Process the after effects.
 *
 *	Returns 1 if the socket is still usable by the caller, or 0 if
 *	it was moved to TCP_CLOSE and released here - in the 0 case the
 *	caller must not touch the socket again.
 */

static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout: an established connection that
	 *	keeps retransmitting (every 8th attempt), or any other
	 *	state past TCP_RETR1 attempts.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		/*ip_route_check (sk->daddr);*/
	}
	
	/*
	 *	Have we tried to SYN too many times (repent repent 8))
	 */
	 
	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
	{
		sk->err=ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
		tcp_set_state(sk,TCP_CLOSE);
		/* Don't FIN, we got nothing back */
		release_sock(sk);
		return 0;
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2) 
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 *	Time wait the socket: if we were closing, park the
		 *	identity in TIME_WAIT for 2MSL instead of dropping
		 *	it on the floor.
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) 
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time: give up on the connection.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			release_sock(sk);
			return 0;
		}
	}
	return 1;
}
 917 
/*
 *	The TCP retransmit timer. This lacks a few small details.
 *
 *	1.	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 *
 *	Runs as a timer callback; 'data' is the struct sock pointer the
 *	timer was armed with.  The reason the timer was set is read
 *	from sk->ip_xmit_timeout (TIME_PROBE0/TIME_WRITE/TIME_KEEPOPEN).
 */

static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;

	/* 
	 * only process if socket is not in use: if somebody (user or
	 * bottom half) owns the socket, back off for one second and
	 * try again rather than racing them.
	 */

	cli();
	if (sk->inuse || in_bh) 
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;		/* We own the socket from here on */
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped) 
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why) 
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb) 
			{
				/* Nothing outstanding - nothing to resend. */
				restore_flags(flags);
			} 
			else 
			{
				/*
				 *	Kicked by a delayed ack. Reset timer
				 *	correctly now: the head frame has not
				 *	yet been out for a full rto.
				 */
				if (jiffies < skb->when + sk->rto) 
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission (tcp_write_timeout may
				 *	close and release the socket if we have
				 *	gone past the retry limits).
				 */
				sk->retransmits++;
				sk->prot->retransmits++;
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/* 
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				  sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
1028 
1029 /*
1030  * This routine is called by the ICMP module when it gets some
1031  * sort of error condition.  If err < 0 then the socket should
1032  * be closed and the error returned to the user.  If err > 0
1033  * it's just the icmp type << 8 | icmp code.  After adjustment
1034  * header points to the first 8 bytes of the tcp header.  We need
1035  * to find the appropriate port.
1036  */
1037 
1038 void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
     /* [previous][next][first][last][top][bottom][index][help] */
1039         __u32 saddr, struct inet_protocol *protocol)
1040 {
1041         struct tcphdr *th;
1042         struct sock *sk;
1043         struct iphdr *iph=(struct iphdr *)header;
1044   
1045         header+=4*iph->ihl;
1046    
1047 
1048         th =(struct tcphdr *)header;
1049         sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
1050 
1051         if (sk == NULL) 
1052                 return;
1053   
1054         if (type == ICMP_SOURCE_QUENCH) 
1055         {
1056                 /*
1057                  * FIXME:
1058                  * For now we will just trigger a linear backoff.
1059                  * The slow start code should cause a real backoff here.
1060                  */
1061                 if (sk->cong_window > 4)
1062                         sk->cong_window--;
1063                 return;
1064         }
1065         
1066         if (type == ICMP_PARAMETERPROB)
1067         {
1068                 sk->err=EPROTO;
1069                 sk->error_report(sk);
1070         }
1071 
1072         /*
1073          * If we've already connected we will keep trying
1074          * until we time out, or the user gives up.
1075          */
1076 
1077         if (code < 13 && (icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV))
1078         {
1079                 sk->err = icmp_err_convert[code].errno;
1080                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) 
1081                 {
1082                         tcp_statistics.TcpAttemptFails++;
1083                         tcp_set_state(sk,TCP_CLOSE);
1084                         sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
1085                 }
1086         }
1087         return;
1088 }
1089 
1090 
/*
 *	Walk down the receive queue counting readable data until we hit the end or we find a gap
 *	in the received data queue (ie a frame missing that needs sending to us). Not
 *	sorting using two queues as data arrives makes life so much harder.
 *
 *	Returns the number of in-sequence bytes a reader could consume
 *	right now, starting from sk->copied_seq.  Stops at the first
 *	sequence hole, and at a PSH once some data has been counted.
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;		/* sequence number counted up to */
	unsigned long amount;		/* readable bytes accumulated */
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	/* Queue may be modified from interrupt context: walk it cli'd. */
	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug) 
			printk("empty\n");
		return(0);
	}
  
	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;
  
	/* 
	 *	Do until a push or until we are out of data. 
	 */
	 
	do 
	{
		if (before(counted, skb->h.th->seq))	/* Found a hole so stops here */
			break;
		/* Length from where we are up to, so overlapping frames don't double count. */
		sum = skb->len -(counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;		/* SYN occupies one sequence number but no data byte */
		if (sum > 0) 
		{					/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn) 
				amount--;	/* ...but it is not readable data */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop.  This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
1167 
1168 /*
1169  * LISTEN is a special case for select..
1170  */
1171 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
1172 {
1173         if (sel_type == SEL_IN) {
1174                 int retval;
1175 
1176                 sk->inuse = 1;
1177                 retval = (tcp_find_established(sk) != NULL);
1178                 release_sock(sk);
1179                 if (!retval)
1180                         select_wait(&master_select_wakeup,wait);
1181                 return retval;
1182         }
1183         return 0;
1184 }
1185 
1186 
1187 /*
1188  *      Wait for a TCP event.
1189  *
1190  *      Note that we don't need to set "sk->inuse", as the upper select layers
1191  *      take care of normal races (between the test and the event) and we don't
1192  *      go look at any of the socket buffers directly.
1193  */
1194 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
1195 {
1196         if (sk->state == TCP_LISTEN)
1197                 return tcp_listen_select(sk, sel_type, wait);
1198 
1199         switch(sel_type) {
1200         case SEL_IN:
1201                 if (sk->err)
1202                         return 1;
1203                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1204                         break;
1205 
1206                 if (sk->shutdown & RCV_SHUTDOWN)
1207                         return 1;
1208                         
1209                 if (sk->acked_seq == sk->copied_seq)
1210                         break;
1211 
1212                 if (sk->urg_seq != sk->copied_seq ||
1213                     sk->acked_seq != sk->copied_seq+1 ||
1214                     sk->urginline || !sk->urg_data)
1215                         return 1;
1216                 break;
1217 
1218         case SEL_OUT:
1219                 if (sk->err)
1220                         return 1;
1221                 if (sk->shutdown & SEND_SHUTDOWN) 
1222                         return 0;
1223                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1224                         break;
1225                 /*
1226                  * This is now right thanks to a small fix
1227                  * by Matt Dillon.
1228                  */
1229 
1230                 if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
1231                         break;
1232                 return 1;
1233 
1234         case SEL_EX:
1235                 if (sk->urg_data)
1236                         return 1;
1237                 break;
1238         }
1239         select_wait(sk->sleep, wait);
1240         return 0;
1241 }
1242 
1243 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
     /* [previous][next][first][last][top][bottom][index][help] */
1244 {
1245         int err;
1246         switch(cmd) 
1247         {
1248 
1249                 case TIOCINQ:
1250 #ifdef FIXME    /* FIXME: */
1251                 case FIONREAD:
1252 #endif
1253                 {
1254                         unsigned long amount;
1255 
1256                         if (sk->state == TCP_LISTEN) 
1257                                 return(-EINVAL);
1258 
1259                         sk->inuse = 1;
1260                         amount = tcp_readable(sk);
1261                         release_sock(sk);
1262                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1263                         if(err)
1264                                 return err;
1265                         put_user(amount, (int *)arg);
1266                         return(0);
1267                 }
1268                 case SIOCATMARK:
1269                 {
1270                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
1271 
1272                         err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1273                         if (err)
1274                                 return err;
1275                         put_user(answ,(int *) arg);
1276                         return(0);
1277                 }
1278                 case TIOCOUTQ:
1279                 {
1280                         unsigned long amount;
1281 
1282                         if (sk->state == TCP_LISTEN) return(-EINVAL);
1283                         amount = sk->prot->wspace(sk);
1284                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1285                         if(err)
1286                                 return err;
1287                         put_user(amount, (int *)arg);
1288                         return(0);
1289                 }
1290                 default:
1291                         return(-EINVAL);
1292         }
1293 }
1294 
1295 
1296 /*
1297  *      This routine computes a TCP checksum. 
1298  *
1299  *      Modified January 1995 from a go-faster DOS routine by
1300  *      Jorge Cwik <jorge@laser.satlink.net>
1301  */
1302  
1303 unsigned short tcp_check(struct tcphdr *th, int len,
     /* [previous][next][first][last][top][bottom][index][help] */
1304           unsigned long saddr, unsigned long daddr, unsigned long base)
1305 {     
1306         return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
1307 }
1308 
1309 
1310 
1311 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
     /* [previous][next][first][last][top][bottom][index][help] */
1312                 unsigned long daddr, int len, struct sock *sk)
1313 {
1314         th->check = 0;
1315         th->check = tcp_check(th, len, saddr, daddr,
1316                 csum_partial((char *)th,len,0));
1317         return;
1318 }
1319 
1320 /*
1321  *      This is the main buffer sending routine. We queue the buffer
1322  *      having checked it is sane seeming.
1323  */
1324  
1325 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
     /* [previous][next][first][last][top][bottom][index][help] */
1326 {
1327         int size;
1328         struct tcphdr * th = skb->h.th;
1329 
1330         /*
1331          *      length of packet (not counting length of pre-tcp headers) 
1332          */
1333          
1334         size = skb->len - ((unsigned char *) th - skb->data);
1335 
1336         /*
1337          *      Sanity check it.. 
1338          */
1339          
1340         if (size < sizeof(struct tcphdr) || size > skb->len) 
1341         {
1342                 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1343                         skb, skb->data, th, skb->len);
1344                 kfree_skb(skb, FREE_WRITE);
1345                 return;
1346         }
1347 
1348         /*
1349          *      If we have queued a header size packet.. (these crash a few
1350          *      tcp stacks if ack is not set)
1351          */
1352          
1353         if (size == sizeof(struct tcphdr)) 
1354         {
1355                 /* If it's got a syn or fin it's notionally included in the size..*/
1356                 if(!th->syn && !th->fin) 
1357                 {
1358                         printk("tcp_send_skb: attempt to queue a bogon.\n");
1359                         kfree_skb(skb,FREE_WRITE);
1360                         return;
1361                 }
1362         }
1363 
1364         /*
1365          *      Actual processing.
1366          */
1367          
1368         tcp_statistics.TcpOutSegs++;  
1369         skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1370         
1371         /*
1372          *      We must queue if
1373          *
1374          *      a) The right edge of this frame exceeds the window
1375          *      b) We are retransmitting (Nagle's rule)
1376          *      c) We have too many packets 'in flight'
1377          */
1378          
1379         if (after(skb->h.seq, sk->window_seq) ||
1380             (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1381              sk->packets_out >= sk->cong_window) 
1382         {
1383                 /* checksum will be supplied by tcp_write_xmit.  So
1384                  * we shouldn't need to set it at all.  I'm being paranoid */
1385                 th->check = 0;
1386                 if (skb->next != NULL) 
1387                 {
1388                         printk("tcp_send_partial: next != NULL\n");
1389                         skb_unlink(skb);
1390                 }
1391                 skb_queue_tail(&sk->write_queue, skb);
1392                 
1393                 /*
1394                  *      If we don't fit we have to start the zero window
1395                  *      probes. This is broken - we really need to do a partial
1396                  *      send _first_ (This is what causes the Cisco and PC/TCP
1397                  *      grief).
1398                  */
1399                  
1400                 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1401                     sk->send_head == NULL && sk->ack_backlog == 0)
1402                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1403         } 
1404         else 
1405         {
1406                 /*
1407                  *      This is going straight out
1408                  */
1409                  
1410                 th->ack_seq = ntohl(sk->acked_seq);
1411                 th->window = ntohs(tcp_select_window(sk));
1412 
1413                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1414 
1415                 sk->sent_seq = sk->write_seq;
1416                 
1417                 /*
1418                  *      This is mad. The tcp retransmit queue is put together
1419                  *      by the ip layer. This causes half the problems with
1420                  *      unroutable FIN's and other things.
1421                  */
1422                  
1423                 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1424                 
1425                 /*
1426                  *      Set for next retransmit based on expected ACK time.
1427                  *      FIXME: We set this every time which means our 
1428                  *      retransmits are really about a window behind.
1429                  */
1430 
1431                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1432         }
1433 }
1434 
1435 /*
1436  *      Locking problems lead us to a messy situation where we can have
1437  *      multiple partially complete buffers queued up. This is really bad
1438  *      as we don't want to be sending partial buffers. Fix this with
1439  *      a semaphore or similar to lock tcp_write per socket.
1440  *
1441  *      These routines are pretty self descriptive.
1442  */
1443  
1444 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1445 {
1446         struct sk_buff * skb;
1447         unsigned long flags;
1448 
1449         save_flags(flags);
1450         cli();
1451         skb = sk->partial;
1452         if (skb) {
1453                 sk->partial = NULL;
1454                 del_timer(&sk->partial_timer);
1455         }
1456         restore_flags(flags);
1457         return skb;
1458 }
1459 
1460 /*
1461  *      Empty the partial queue
1462  */
1463  
1464 static void tcp_send_partial(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1465 {
1466         struct sk_buff *skb;
1467 
1468         if (sk == NULL)
1469                 return;
1470         while ((skb = tcp_dequeue_partial(sk)) != NULL)
1471                 tcp_send_skb(sk, skb);
1472 }
1473 
1474 /*
1475  *      Queue a partial frame
1476  */
1477  
1478 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1479 {
1480         struct sk_buff * tmp;
1481         unsigned long flags;
1482 
1483         save_flags(flags);
1484         cli();
1485         tmp = sk->partial;
1486         if (tmp)
1487                 del_timer(&sk->partial_timer);
1488         sk->partial = skb;
1489         init_timer(&sk->partial_timer);
1490         /*
1491          *      Wait up to 1 second for the buffer to fill.
1492          */
1493         sk->partial_timer.expires = jiffies+HZ;
1494         sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
1495         sk->partial_timer.data = (unsigned long) sk;
1496         add_timer(&sk->partial_timer);
1497         restore_flags(flags);
1498         if (tmp)
1499                 tcp_send_skb(sk, tmp);
1500 }
1501 
1502 
1503 /*
1504  *      This routine sends an ack and also updates the window. 
1505  */
1506  
/*
 *	This routine sends an ack and also updates the window.
 *
 *	'sequence' - sequence number for the ACK (host byte order)
 *	'ack'      - acknowledgement number (host byte order)
 *	'th'       - header of the segment being answered; its ports
 *	             are swapped into the reply header
 *	'daddr'    - destination IP address for the reply
 */
 
static void tcp_send_ack(u32 sequence, u32 ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */
		
	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL) 
	{
		/* 
		 *	Force it to send an ack. We don't have to do this
		 *	(ACK is unreliable) but it's much better use of 
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets. 
		 */
		 
		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) 
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */
	 
	buff->sk = sk;
	buff->localroute = sk->localroute;

	/* 
	 *	Put in the IP header and routing stuff. 
	 */
	 
	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0) 
	{
		/* No route: drop the ack (ACKs are unreliable anyway) */
		buff->free = 1;
		sk->prot->wfree(sk, buff);
		return;
	}
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive. 
	 */
	 
	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = ntohl(sequence);	/* NOTE(review): htonl is meant here; ntohl performs the same swap */
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);	/* NOTE(review): htons is meant here; same operation */
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;
	
	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */
	 
	if (ack == sk->acked_seq) 
	{
		/* This ACK is fully up to date - clear pending-ack state */
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
				  && sk->ip_xmit_timeout == TIME_WRITE) 
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}
	
	/*
	 *	Fill in the packet and send it
	 */
	 
	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1617 
1618 
1619 /* 
1620  *      This routine builds a generic TCP header. 
1621  */
1622  
1623 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
     /* [previous][next][first][last][top][bottom][index][help] */
1624 {
1625 
1626         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1627         th->seq = htonl(sk->write_seq);
1628         th->psh =(push == 0) ? 1 : 0;
1629         th->doff = sizeof(*th)/4;
1630         th->ack = 1;
1631         th->fin = 0;
1632         sk->ack_backlog = 0;
1633         sk->bytes_rcv = 0;
1634         sk->ack_timed = 0;
1635         th->ack_seq = htonl(sk->acked_seq);
1636         sk->window = tcp_select_window(sk);
1637         th->window = htons(sk->window);
1638 
1639         return(sizeof(*th));
1640 }
1641 
1642 /*
1643  *      This routine copies from a user buffer into a socket,
1644  *      and starts the transmit system.
1645  */
1646 
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	Returns the number of bytes queued/sent, or a negative errno.
 *	Unless 'nonblock' is set it may sleep waiting for the connection
 *	to complete or for send-buffer memory.  MSG_OOB in 'flags'
 *	sends the data as urgent.
 */

static int tcp_write(struct sock *sk, const unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;
	int copy;
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0) 
	{
		if (sk->err) 
		{			/* Stop on an error */
			release_sock(sk);
			if (copied) 
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established. 
		 */
	
		if (sk->shutdown & SEND_SHUTDOWN) 
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied) 
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/* 
		 *	Wait for a connection to finish.
		 */
	
		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
		{
			if (sk->err) 
			{
				release_sock(sk);
				if (copied) 
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/* Not connecting either: the connection is dead */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
			{
				release_sock(sk);
				if (copied) 
					return(copied);

				if (sk->err) 
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen) 
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied) 
			{
				release_sock(sk);
				if (copied) 
					return(copied);
				return(-EAGAIN);
			}

			release_sock(sk);
			cli();
		
			/* Sleep until the state changes or an error arrives */
			if (sk->state != TCP_ESTABLISHED &&
				sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked) 
				{
					sti();
					if (copied) 
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

	/*
	 * The following code can result in copy <= if sk->mss is ever
	 * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
	 * sk->mtu is constant once SYN processing is finished.  I.e. we
	 * had better not get here until we've seen his SYN and at least one
	 * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
	 * But ESTABLISHED should guarantee that.  sk->max_window is by definition
	 * non-decreasing.  Note that any ioctl to set user_mss must be done
	 * before the exchange of SYN's.  If the initial ack from the other
	 * end has a window of 0, max_window and thus mss will both be 0.
	 */

	/* 
	 *	Now we need to check if we have a half built packet. 
	 */

		if ((skb = tcp_dequeue_partial(sk)) != NULL) 
		{
			int hdrlen;

			 /* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				 + sizeof(struct tcphdr);
	
			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB)) 
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0) 
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}
	  
				memcpy_fromfs(skb_put(skb,copy), from, copy);
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/* Send it now if full, urgent or nothing in flight;
			 * otherwise re-queue it as a partial frame. */
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

	/*
	 * We also need to worry about the window.
	 * If window < 1/2 the maximum window we've seen from this
	 *   host, don't use it.  This is sender side
	 *   silly window prevention, as specified in RFC1122.
	 *   (Note that this is different than earlier versions of
	 *   SWS prevention, e.g. RFC813.).  What we actually do is 
	 *   use the whole MSS.  Since the results in the right
	 *   edge of the packet being outside the window, it will
	 *   be queued for later rather than sent.
	 */

		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

	/*
	 *	We should really check the window here also. 
	 */
	 
		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB)) 
		{
			/*
			 *	We will release the socket in case we sleep here. 
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu 
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		} 
		else 
		{
			/*
			 *	We will release the socket in case we sleep here. 
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep. 
		 */

		if (skb == NULL) 
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock) 
			{
				release_sock(sk);
				if (copied) 
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition. 
			 */

			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it. 
			 */
			if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0) 
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked) 
				{
					sti();
					if (copied) 
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
	
		/*
		 * FIXME: we need to optimize this.
		 * Perhaps some hints here would be good.
		 */
		
		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 ) 
		{
			prot->wfree(sk, skb);
			release_sock(sk);
			if (copied) 
				return(copied);
			return(tmp);
		}
		skb->dev = dev;
		skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
		tmp = tcp_build_header(skb->h.th, sk, len-copy);
		if (tmp < 0) 
		{
			prot->wfree(sk, skb);
			release_sock(sk);
			if (copied) 
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB) 
		{
			skb->h.th->urg = 1;
			/* NOTE(review): htons is meant; ntohs is the same swap */
			skb->h.th->urg_ptr = ntohs(copy);
		}

		memcpy_fromfs(skb_put(skb,copy), from, copy);
		
		from += copy;
		copied += copy;
		len -= copy;
		skb->free = 0;
		sk->write_seq += copy;
	
		/* Undersized frame with data in flight: hold it back as a
		 * partial buffer (Nagle) rather than sending immediately. */
		if (send_tmp != NULL && sk->packets_out) 
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

/*
 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *	interactive fast network servers. It's meant to be on and
 *	it really improves the throughput though not the echo time
 *	on my slow slip link - Alan
 */

/*
 *	Avoid possible race on send_tmp - c/o Johannes Stille 
 */
 
	if(sk->partial && ((!sk->packets_out) 
     /* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1963 
1964 /*
1965  *      This is just a wrapper. 
1966  */
1967 
1968 static int tcp_sendto(struct sock *sk, const unsigned char *from,
     /* [previous][next][first][last][top][bottom][index][help] */
1969            int len, int nonblock, unsigned flags,
1970            struct sockaddr_in *addr, int addr_len)
1971 {
1972         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1973                 return -EINVAL;
1974         if (sk->state == TCP_CLOSE)
1975                 return -ENOTCONN;
1976         if (addr_len < sizeof(*addr))
1977                 return -EINVAL;
1978         if (addr->sin_family && addr->sin_family != AF_INET) 
1979                 return -EINVAL;
1980         if (addr->sin_port != sk->dummy_th.dest) 
1981                 return -EISCONN;
1982         if (addr->sin_addr.s_addr != sk->daddr) 
1983                 return -EISCONN;
1984         return tcp_write(sk, from, len, nonblock, flags);
1985 }
1986 
1987 
1988 /*
1989  *      Send an ack if one is backlogged at this point. Ought to merge
1990  *      this with tcp_send_ack().
1991  */
1992  
/*
 *	Send an ack if one is backlogged at this point. Ought to merge
 *	this with tcp_send_ack().
 *
 *	Called from the read path after buffer space has been freed,
 *	so the new (larger) window is advertised to the sender.
 */
 
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	/* No ack is owed to the other end - nothing to do */
	if (!sk->ack_backlog) 
		return;

	/*
	 * If we're closed, don't send an ack, or we'll get a RST
	 * from the closed destination.
	 */
	if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
		return; 

	/*
	 * FIXME: we need to put code here to prevent this routine from
	 * being called.  Being called once in a while is ok, so only check
	 * if this is the second time in a row.
	 */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL) 
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->sk = sk;
	buff->localroute = sk->localroute;
	
	/*
	 *	Put in the IP header and routing stuff. 
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0) 
	{
		/* No route: drop the ack (ACKs are unreliable anyway) */
		buff->free = 1;
		sk->prot->wfree(sk, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	/* Clone the template header, then build a pure ACK frame */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	/* NOTE(review): ntohs/ntohl below are host-to-network uses;
	 * htons/htonl are meant but perform the identical swap. */
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2066 
2067 
2068 /*
2069  *      FIXME:
2070  *      This routine frees used buffers.
2071  *      It should consider sending an ACK to let the
2072  *      other end know we now have a bigger window.
2073  */
2074 
/*
 *	FIXME:
 *	This routine frees used buffers.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window.
 *
 *	Called from the read path: releases consumed receive buffers
 *	and, when enough space has been freed, arranges for a window
 *	update to be sent (immediately or via a short ack timer).
 */

static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);
  
	save_flags(flags);
	cli();
  
	/* Remember how much receive space we had before freeing */
	left = sk->prot->rspace(sk);
 
	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL) 
	{
		/* Stop at the first buffer still unread or in use */
		if (!skb->used || skb->users) 
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
					    left);
	if ((rspace=sk->prot->rspace(sk)) != left) 
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
	/*
	 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
	 * if the other end is offering a window smaller than the agreed on MSS
	 * (called sk->mtu here).  In theory there's no connection between send
	 * and receive, and so no reason to think that they're going to send
	 * small packets.  For the moment I'm using the hack of reducing the mss
	 * only on the send side, so I'm putting mtu here.
	 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		} 
		else 
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			/* NOTE(review): the timer deleted above is
			 * sk->retransmit_timer, but the expiry compared here
			 * is sk->timer.expires - these look like different
			 * timers; confirm which field is intended. */
			if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires) 
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			} 
			else
				add_timer(&sk->retransmit_timer);
		}
	}
} 
2156 
2157 
2158 /*
2159  *      Handle reading urgent data. BSD has very simple semantics for
2160  *      this, no blocking and very strange errors 8)
2161  */
2162  
2163 static int tcp_read_urg(struct sock * sk, int nonblock,
     /* [previous][next][first][last][top][bottom][index][help] */
2164              unsigned char *to, int len, unsigned flags)
2165 {
2166         /*
2167          *      No URG data to read
2168          */
2169         if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
2170                 return -EINVAL; /* Yes this is right ! */
2171                 
2172         if (sk->err) 
2173         {
2174                 int tmp = -sk->err;
2175                 sk->err = 0;
2176                 return tmp;
2177         }
2178 
2179         if (sk->state == TCP_CLOSE || sk->done) 
2180         {
2181                 if (!sk->done) {
2182                         sk->done = 1;
2183                         return 0;
2184                 }
2185                 return -ENOTCONN;
2186         }
2187 
2188         if (sk->shutdown & RCV_SHUTDOWN) 
2189         {
2190                 sk->done = 1;
2191                 return 0;
2192         }
2193         sk->inuse = 1;
2194         if (sk->urg_data & URG_VALID) 
2195         {
2196                 char c = sk->urg_data;
2197                 if (!(flags & MSG_PEEK))
2198                         sk->urg_data = URG_READ;
2199                 put_fs_byte(c, to);
2200                 release_sock(sk);
2201                 return 1;
2202         }
2203         release_sock(sk);
2204         
2205         /*
2206          * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
2207          * the available implementations agree in this case:
2208          * this call should never block, independent of the
2209          * blocking state of the socket.
2210          * Mike <pall@rz.uni-karlsruhe.de>
2211          */
2212         return -EAGAIN;
2213 }
2214 
2215 
2216 /*
2217  *      This routine copies from a sock struct into the user buffer. 
2218  */
2219  
/*
 *      Copy queued TCP data from the socket into the user buffer.
 *
 *      sk       - socket to read from (must not be in LISTEN state)
 *      to       - destination buffer in user space
 *      len      - maximum bytes to copy
 *      nonblock - non-zero for non-blocking semantics (-EAGAIN on empty)
 *      flags    - MSG_OOB diverts to tcp_read_urg(); MSG_PEEK reads
 *                 without consuming
 *
 *      Returns the number of bytes copied, 0 at EOF/shutdown, or a
 *      negative error.  Sleeps (interruptibly) when blocking and no
 *      data is queued.
 */
static int tcp_read(struct sock *sk, unsigned char *to,
        int len, int nonblock, unsigned flags)
{
        struct wait_queue wait = { current, NULL };
        int copied = 0;
        u32 peek_seq;
        volatile u32 *seq;      /* So gcc doesn't overoptimise */
        unsigned long used;

        /* 
         *      This error should be checked. 
         */
         
        if (sk->state == TCP_LISTEN)
                return -ENOTCONN;

        /*
         *      Urgent data needs to be handled specially. 
         */
         
        if (flags & MSG_OOB)
                return tcp_read_urg(sk, nonblock, to, len, flags);

        /*
         *      Copying sequence to update. This is volatile to handle
         *      the multi-reader case neatly (memcpy_to/fromfs might be 
         *      inline and thus not flush cached variables otherwise).
         *
         *      A PEEK advances a private copy of copied_seq so the
         *      real stream position is untouched.
         */
         
        peek_seq = sk->copied_seq;
        seq = &sk->copied_seq;
        if (flags & MSG_PEEK)
                seq = &peek_seq;

        add_wait_queue(sk->sleep, &wait);
        sk->inuse = 1;          /* lock the socket against the bottom half */
        while (len > 0) 
        {
                struct sk_buff * skb;
                u32 offset;
        
                /*
                 * Are we at urgent data? Stop if we have read anything.
                 */
                 
                if (copied && sk->urg_data && sk->urg_seq == *seq)
                        break;

                /*
                 *      Next get a buffer.  Set our state before scanning
                 *      the queue so a wakeup between the scan and
                 *      schedule() is not lost.
                 */
                 
                current->state = TASK_INTERRUPTIBLE;

                skb = skb_peek(&sk->receive_queue);
                do 
                {
                        if (!skb)
                                break;
                        /* gap before this skb: nothing contiguous to read */
                        if (before(*seq, skb->h.th->seq))
                                break;
                        offset = *seq - skb->h.th->seq;
                        if (skb->h.th->syn)
                                offset--;       /* SYN occupies one sequence number */
                        if (offset < skb->len)
                                goto found_ok_skb;
                        if (skb->h.th->fin)
                                goto found_fin_ok;
                        if (!(flags & MSG_PEEK))
                                skb->used = 1;  /* fully consumed: reclaimable */
                        skb = skb->next;
                }
                while (skb != (struct sk_buff *)&sk->receive_queue);

                /* Partial read satisfied: return what we have */
                if (copied)
                        break;

                if (sk->err) 
                {
                        copied = -sk->err;
                        sk->err = 0;
                        break;
                }

                /* Closed connection: EOF once, -ENOTCONN afterwards */
                if (sk->state == TCP_CLOSE) 
                {
                        if (!sk->done) 
                        {
                                sk->done = 1;
                                break;
                        }
                        copied = -ENOTCONN;
                        break;
                }

                if (sk->shutdown & RCV_SHUTDOWN) 
                {
                        sk->done = 1;
                        break;
                }
                        
                if (nonblock) 
                {
                        copied = -EAGAIN;
                        break;
                }

                /*
                 *      Nothing queued: ack what we've consumed, drop the
                 *      socket lock and sleep until data (or a signal)
                 *      arrives.
                 */
                cleanup_rbuf(sk);
                release_sock(sk);
                sk->socket->flags |= SO_WAITDATA;
                schedule();
                sk->socket->flags &= ~SO_WAITDATA;
                sk->inuse = 1;

                if (current->signal & ~current->blocked) 
                {
                        copied = -ERESTARTSYS;
                        break;
                }
                continue;

        found_ok_skb:
                /*
                 *      Lock the buffer. We can be fairly relaxed as
                 *      an interrupt will never steal a buffer we are 
                 *      using unless I've missed something serious in
                 *      tcp_data.
                 */
                
                skb->users++;
                
                /*
                 *      Ok so how much can we use ? 
                 */
                 
                used = skb->len - offset;
                if (len < used)
                        used = len;
                /*
                 *      Do we have urgent data here?  If the urgent byte
                 *      lies inside this chunk we either skip over it
                 *      (out-of-line) or stop short of it.
                 */
                
                if (sk->urg_data) 
                {
                        u32 urg_offset = sk->urg_seq - *seq;
                        if (urg_offset < used) 
                        {
                                if (!urg_offset) 
                                {
                                        if (!sk->urginline) 
                                        {
                                                ++*seq; /* step over the urgent byte */
                                                offset++;
                                                used--;
                                        }
                                }
                                else
                                        used = urg_offset;      /* read up to, not past, it */
                        }
                }
                
                /*
                 *      Copy it - We _MUST_ update *seq first so that we
                 *      don't ever double read when we have dual readers
                 */
                 
                *seq += used;

                /*
                 *      This memcpy_tofs can sleep. If it sleeps and we
                 *      do a second read it relies on the skb->users to avoid
                 *      a crash when cleanup_rbuf() gets called.
                 */
                 
                memcpy_tofs(to,((unsigned char *)skb->h.th) +
                        skb->h.th->doff*4 + offset, used);
                copied += used;
                len -= used;
                to += used;
                
                /*
                 *      We now will not sleep again until we are finished
                 *      with skb. Sorry if you are doing the SMP port
                 *      but you'll just have to fix it neatly ;)
                 */
                 
                skb->users --;
                
                if (after(sk->copied_seq,sk->urg_seq))
                        sk->urg_data = 0;       /* urgent byte consumed */
                if (used + offset < skb->len)
                        continue;               /* more data left in this skb */
                
                /*
                 *      Process the FIN.
                 */

                if (skb->h.th->fin)
                        goto found_fin_ok;
                if (flags & MSG_PEEK)
                        continue;
                skb->used = 1;                  /* exhausted: reclaimable */
                continue;

        found_fin_ok:
                ++*seq;                         /* FIN takes a sequence number */
                if (flags & MSG_PEEK)
                        break;
                        
                /*
                 *      All is done
                 */
                 
                skb->used = 1;
                sk->shutdown |= RCV_SHUTDOWN;
                break;

        }
        remove_wait_queue(sk->sleep, &wait);
        current->state = TASK_RUNNING;

        /* Clean up data we have read: This will do ACK frames */
        cleanup_rbuf(sk);
        release_sock(sk);
        return copied;
}
2446 
2447 /*
2448  *      State processing on a close. This implements the state shift for
2449  *      sending our FIN frame. Note that we only send a FIN for some 
2450  *      states. A shutdown() may have already sent the FIN, or we may be
2451  *      closed.
2452  */
2453  
2454 static int tcp_close_state(struct sock *sk, int dead)
     /* [previous][next][first][last][top][bottom][index][help] */
2455 {
2456         int ns=TCP_CLOSE;
2457         int send_fin=0;
2458         switch(sk->state)
2459         {
2460                 case TCP_SYN_SENT:      /* No SYN back, no FIN needed */
2461                         break;
2462                 case TCP_SYN_RECV:
2463                 case TCP_ESTABLISHED:   /* Closedown begin */
2464                         ns=TCP_FIN_WAIT1;
2465                         send_fin=1;
2466                         break;
2467                 case TCP_FIN_WAIT1:     /* Already closing, or FIN sent: no change */
2468                 case TCP_FIN_WAIT2:
2469                 case TCP_CLOSING:
2470                         ns=sk->state;
2471                         break;
2472                 case TCP_CLOSE:
2473                 case TCP_LISTEN:
2474                         break;
2475                 case TCP_CLOSE_WAIT:    /* They have FIN'd us. We send our FIN and
2476                                            wait only for the ACK */
2477                         ns=TCP_LAST_ACK;
2478                         send_fin=1;
2479         }
2480         
2481         tcp_set_state(sk,ns);
2482                 
2483         /*
2484          *      This is a (useful) BSD violating of the RFC. There is a
2485          *      problem with TCP as specified in that the other end could
2486          *      keep a socket open forever with no application left this end.
2487          *      We use a 3 minute timeout (about the same as BSD) then kill
2488          *      our end. If they send after that then tough - BUT: long enough
2489          *      that we won't make the old 4*rto = almost no time - whoops
2490          *      reset mistake.
2491          */
2492         if(dead && ns==TCP_FIN_WAIT2)
2493         {
2494                 int timer_active=del_timer(&sk->timer);
2495                 if(timer_active)
2496                         add_timer(&sk->timer);
2497                 else
2498                         reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2499         }
2500         
2501         return send_fin;
2502 }
2503 
2504 /*
2505  *      Send a fin.
2506  */
2507 
/*
 *      Build and transmit (or queue) our FIN segment.  The caller holds
 *      the socket (sk->inuse) and has already moved the state machine
 *      via tcp_close_state().  If the write queue is non-empty the FIN
 *      is appended behind the pending data rather than sent ahead of it.
 */
static void tcp_send_fin(struct sock *sk)
{
        struct proto *prot =(struct proto *)sk->prot;
        struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
        struct tcphdr *t1;
        struct sk_buff *buff;
        struct device *dev=NULL;
        int tmp;
                
        release_sock(sk); /* in case the malloc sleeps. */
        
        /* GFP_KERNEL: we may sleep here, which is why the socket was released */
        buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
        sk->inuse = 1;

        if (buff == NULL)
        {
                /* This is a disaster if it occurs */
                printk("tcp_send_fin: Impossible malloc failure");
                return;
        }

        /*
         *      Administrivia
         */
         
        buff->sk = sk;
        buff->localroute = sk->localroute;

        /*
         *      Put in the IP header and routing stuff. 
         */

        tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
                           IPPROTO_TCP, sk->opt,
                           sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                int t;
                /*
                 *      Finish anyway, treat this as a send that got lost. 
                 *      (Not good).  We still consume the FIN's sequence
                 *      number and arm the close timer so the connection
                 *      eventually dies instead of hanging.
                 */
                 
                buff->free = 1;
                prot->wfree(sk,buff);
                sk->write_seq++;
                t=del_timer(&sk->timer);
                if(t)
                        add_timer(&sk->timer);  /* a timer was pending: keep it */
                else
                        reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                return;
        }
        
        /*
         *      We ought to check if the end of the queue is a buffer and
         *      if so simply add the fin to that buffer, not send it ahead.
         */

        t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
        buff->dev = dev;
        memcpy(t1, th, sizeof(*t1));
        t1->seq = ntohl(sk->write_seq);
        sk->write_seq++;                /* the FIN occupies one sequence number */
        buff->h.seq = sk->write_seq;
        t1->ack = 1;
        t1->ack_seq = ntohl(sk->acked_seq);
        t1->window = ntohs(sk->window=tcp_select_window(sk));
        t1->fin = 1;
        t1->rst = 0;
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

        /*
         * If there is data in the write queue, the fin must be appended to
         * the write queue (so it goes out after the data), otherwise we
         * can transmit it immediately.
         */
        
        if (skb_peek(&sk->write_queue) != NULL) 
        {
                buff->free = 0;
                if (buff->next != NULL) 
                {
                        /* should never happen: buffer already on a list */
                        printk("tcp_send_fin: next != NULL\n");
                        skb_unlink(buff);
                }
                skb_queue_tail(&sk->write_queue, buff);
        } 
        else 
        {
                sk->sent_seq = sk->write_seq;
                sk->prot->queue_xmit(sk, dev, buff, 0);
                reset_xmit_timer(sk, TIME_WRITE, sk->rto);
        }
}
2603 
2604 /*
2605  *      Shutdown the sending side of a connection. Much like close except
2606  *      that we don't receive shut down or set sk->dead=1.
2607  */
2608 
2609 void tcp_shutdown(struct sock *sk, int how)
     /* [previous][next][first][last][top][bottom][index][help] */
2610 {
2611         /*
2612          *      We need to grab some memory, and put together a FIN,
2613          *      and then put it into the queue to be sent.
2614          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2615          */
2616 
2617         if (!(how & SEND_SHUTDOWN)) 
2618                 return;
2619          
2620         /*
2621          *      If we've already sent a FIN, or it's a closed state
2622          */
2623          
2624         if (sk->state == TCP_FIN_WAIT1 ||
2625             sk->state == TCP_FIN_WAIT2 ||
2626             sk->state == TCP_CLOSING ||
2627             sk->state == TCP_LAST_ACK ||
2628             sk->state == TCP_TIME_WAIT || 
2629             sk->state == TCP_CLOSE ||
2630             sk->state == TCP_LISTEN
2631           )
2632         {
2633                 return;
2634         }
2635         sk->inuse = 1;
2636 
2637         /*
2638          * flag that the sender has shutdown
2639          */
2640 
2641         sk->shutdown |= SEND_SHUTDOWN;
2642 
2643         /*
2644          *  Clear out any half completed packets. 
2645          */
2646 
2647         if (sk->partial)
2648                 tcp_send_partial(sk);
2649                 
2650         /*
2651          *      FIN if needed
2652          */
2653          
2654         if(tcp_close_state(sk,0))
2655                 tcp_send_fin(sk);
2656                 
2657         release_sock(sk);
2658 }
2659 
2660 
2661 static int
2662 tcp_recvfrom(struct sock *sk, unsigned char *to,
     /* [previous][next][first][last][top][bottom][index][help] */
2663              int to_len, int nonblock, unsigned flags,
2664              struct sockaddr_in *addr, int *addr_len)
2665 {
2666         int result;
2667   
2668         /* 
2669          *      Have to check these first unlike the old code. If 
2670          *      we check them after we lose data on an error
2671          *      which is wrong 
2672          */
2673 
2674         if(addr_len)
2675                 *addr_len = sizeof(*addr);
2676         result=tcp_read(sk, to, to_len, nonblock, flags);
2677 
2678         if (result < 0) 
2679                 return(result);
2680   
2681         if(addr)
2682         {
2683                 addr->sin_family = AF_INET;
2684                 addr->sin_port = sk->dummy_th.dest;
2685                 addr->sin_addr.s_addr = sk->daddr;
2686         }
2687         return(result);
2688 }
2689 
2690 
2691 /*
2692  *      This routine will send an RST to the other tcp. 
2693  */
2694  
2695 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
     /* [previous][next][first][last][top][bottom][index][help] */
2696           struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2697 {
2698         struct sk_buff *buff;
2699         struct tcphdr *t1;
2700         int tmp;
2701         struct device *ndev=NULL;
2702 
2703         /*
2704          *      Cannot reset a reset (Think about it).
2705          */
2706          
2707         if(th->rst)
2708                 return;
2709   
2710         /*
2711          * We need to grab some memory, and put together an RST,
2712          * and then put it into the queue to be sent.
2713          */
2714 
2715         buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2716         if (buff == NULL) 
2717                 return;
2718 
2719         buff->sk = NULL;
2720         buff->dev = dev;
2721         buff->localroute = 0;
2722 
2723         /*
2724          *      Put in the IP header and routing stuff. 
2725          */
2726 
2727         tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2728                            sizeof(struct tcphdr),tos,ttl);
2729         if (tmp < 0) 
2730         {
2731                 buff->free = 1;
2732                 prot->wfree(NULL, buff);
2733                 return;
2734         }
2735 
2736         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2737         memcpy(t1, th, sizeof(*t1));
2738 
2739         /*
2740          *      Swap the send and the receive. 
2741          */
2742 
2743         t1->dest = th->source;
2744         t1->source = th->dest;
2745         t1->rst = 1;  
2746         t1->window = 0;
2747   
2748         if(th->ack)
2749         {
2750                 t1->ack = 0;
2751                 t1->seq = th->ack_seq;
2752                 t1->ack_seq = 0;
2753         }
2754         else
2755         {
2756                 t1->ack = 1;
2757                 if(!th->syn)
2758                         t1->ack_seq=htonl(th->seq);
2759                 else
2760                         t1->ack_seq=htonl(th->seq+1);
2761                 t1->seq=0;
2762         }
2763 
2764         t1->syn = 0;
2765         t1->urg = 0;
2766         t1->fin = 0;
2767         t1->psh = 0;
2768         t1->doff = sizeof(*t1)/4;
2769         tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2770         prot->queue_xmit(NULL, ndev, buff, 1);
2771         tcp_statistics.TcpOutSegs++;
2772 }
2773 
2774 
2775 /*
2776  *      Look for tcp options. Parses everything but only knows about MSS.
2777  *      This routine is always called with the packet containing the SYN.
2778  *      However it may also be called with the ack to the SYN.  So you
2779  *      can't assume this is always the SYN.  It's always called after
2780  *      we have set up sk->mtu to our own MTU.
2781  *
2782  *      We need at minimum to add PAWS support here. Possibly large windows
2783  *      as Linux gets deployed on 100Mb/sec networks.
2784  */
2785  
2786 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
2787 {
2788         unsigned char *ptr;
2789         int length=(th->doff*4)-sizeof(struct tcphdr);
2790         int mss_seen = 0;
2791     
2792         ptr = (unsigned char *)(th + 1);
2793   
2794         while(length>0)
2795         {
2796                 int opcode=*ptr++;
2797                 int opsize=*ptr++;
2798                 switch(opcode)
2799                 {
2800                         case TCPOPT_EOL:
2801                                 return;
2802                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
2803                                 length--;
2804                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
2805                                 continue;
2806                         
2807                         default:
2808                                 if(opsize<=2)   /* Avoid silly options looping forever */
2809                                         return;
2810                                 switch(opcode)
2811                                 {
2812                                         case TCPOPT_MSS:
2813                                                 if(opsize==4 && th->syn)
2814                                                 {
2815                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2816                                                         mss_seen = 1;
2817                                                 }
2818                                                 break;
2819                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2820                                 }
2821                                 ptr+=opsize-2;
2822                                 length-=opsize;
2823                 }
2824         }
2825         if (th->syn) 
2826         {
2827                 if (! mss_seen)
2828                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2829         }
2830 #ifdef CONFIG_INET_PCTCP
2831         sk->mss = min(sk->max_window >> 1, sk->mtu);
2832 #else    
2833         sk->mss = min(sk->max_window, sk->mtu);
2834 #endif  
2835 }
2836 
/*
 *      Return the classful (A/B/C) network mask for a destination
 *      address.  Both the argument and the result are in network
 *      byte order.
 */
static inline unsigned long default_mask(unsigned long dst)
{
        unsigned long host = ntohl(dst);

        if (IN_CLASSA(host))
                return htonl(IN_CLASSA_NET);

        return IN_CLASSB(host) ? htonl(IN_CLASSB_NET)
                               : htonl(IN_CLASSC_NET);
}
2846 
2847 /*
2848  *      Default sequence number picking algorithm.
2849  *      As close as possible to RFC 793, which
2850  *      suggests using a 250kHz clock.
2851  *      Further reading shows this assumes 2MB/s networks.
2852  *      For 10MB/s ethernet, a 1MHz clock is appropriate.
2853  *      That's funny, Linux has one built in!  Use it!
2854  */
2855 
extern inline u32 tcp_init_seq(void)
{
        /* Microsecond timestamp as the initial send sequence: the
           clock-driven ISS scheme of RFC 793, run at 1MHz instead of
           250kHz (see the comment above).  Wraps naturally in u32. */
        struct timeval tv;
        do_gettimeofday(&tv);
        return tv.tv_usec+tv.tv_sec*1000000;
}
2862 
2863 /*
2864  *      This routine handles a connection request.
2865  *      It should make sure we haven't already responded.
2866  *      Because of the way BSD works, we have to send a syn/ack now.
2867  *      This also means it will be harder to close a socket which is
2868  *      listening.
2869  */
2870  
2871 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
     /* [previous][next][first][last][top][bottom][index][help] */
2872                  unsigned long daddr, unsigned long saddr,
2873                  struct options *opt, struct device *dev, u32 seq)
2874 {
2875         struct sk_buff *buff;
2876         struct tcphdr *t1;
2877         unsigned char *ptr;
2878         struct sock *newsk;
2879         struct tcphdr *th;
2880         struct device *ndev=NULL;
2881         int tmp;
2882         struct rtable *rt;
2883   
2884         th = skb->h.th;
2885 
2886         /* If the socket is dead, don't accept the connection. */
2887         if (!sk->dead) 
2888         {
2889                 sk->data_ready(sk,0);
2890         }
2891         else 
2892         {
2893                 if(sk->debug)
2894                         printk("Reset on %p: Connect on dead socket.\n",sk);
2895                 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2896                 tcp_statistics.TcpAttemptFails++;
2897                 kfree_skb(skb, FREE_READ);
2898                 return;
2899         }
2900 
2901         /*
2902          * Make sure we can accept more.  This will prevent a
2903          * flurry of syns from eating up all our memory.
2904          */
2905 
2906         if (sk->ack_backlog >= sk->max_ack_backlog) 
2907         {
2908                 tcp_statistics.TcpAttemptFails++;
2909                 kfree_skb(skb, FREE_READ);
2910                 return;
2911         }
2912 
2913         /*
2914          * We need to build a new sock struct.
2915          * It is sort of bad to have a socket without an inode attached
2916          * to it, but the wake_up's will just wake up the listening socket,
2917          * and if the listening socket is destroyed before this is taken
2918          * off of the queue, this will take care of it.
2919          */
2920 
2921         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2922         if (newsk == NULL) 
2923         {
2924                 /* just ignore the syn.  It will get retransmitted. */
2925                 tcp_statistics.TcpAttemptFails++;
2926                 kfree_skb(skb, FREE_READ);
2927                 return;
2928         }
2929 
2930         memcpy(newsk, sk, sizeof(*newsk));
2931         newsk->opt = NULL;
2932         if (opt && opt->optlen) {
2933           sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
2934           if (!sk->opt) {
2935                 kfree_s(newsk, sizeof(struct sock));
2936                 tcp_statistics.TcpAttemptFails++;
2937                 kfree_skb(skb, FREE_READ);
2938                 return;
2939           }
2940           if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {
2941                 kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
2942                 kfree_s(newsk, sizeof(struct sock));
2943                 tcp_statistics.TcpAttemptFails++;
2944                 kfree_skb(skb, FREE_READ);
2945                 return;
2946           }
2947         }
2948         skb_queue_head_init(&newsk->write_queue);
2949         skb_queue_head_init(&newsk->receive_queue);
2950         newsk->send_head = NULL;
2951         newsk->send_tail = NULL;
2952         skb_queue_head_init(&newsk->back_log);
2953         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
2954         newsk->rto = TCP_TIMEOUT_INIT;
2955         newsk->mdev = 0;
2956         newsk->max_window = 0;
2957         newsk->cong_window = 1;
2958         newsk->cong_count = 0;
2959         newsk->ssthresh = 0;
2960         newsk->backoff = 0;
2961         newsk->blog = 0;
2962         newsk->intr = 0;
2963         newsk->proc = 0;
2964         newsk->done = 0;
2965         newsk->partial = NULL;
2966         newsk->pair = NULL;
2967         newsk->wmem_alloc = 0;
2968         newsk->rmem_alloc = 0;
2969         newsk->localroute = sk->localroute;
2970 
2971         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
2972 
2973         newsk->err = 0;
2974         newsk->shutdown = 0;
2975         newsk->ack_backlog = 0;
2976         newsk->acked_seq = skb->h.th->seq+1;
2977         newsk->copied_seq = skb->h.th->seq+1;
2978         newsk->fin_seq = skb->h.th->seq;
2979         newsk->state = TCP_SYN_RECV;
2980         newsk->timeout = 0;
2981         newsk->ip_xmit_timeout = 0;
2982         newsk->write_seq = seq; 
2983         newsk->window_seq = newsk->write_seq;
2984         newsk->rcv_ack_seq = newsk->write_seq;
2985         newsk->urg_data = 0;
2986         newsk->retransmits = 0;
2987         newsk->linger=0;
2988         newsk->destroy = 0;
2989         init_timer(&newsk->timer);
2990         newsk->timer.data = (unsigned long)newsk;
2991         newsk->timer.function = &net_timer;
2992         init_timer(&newsk->retransmit_timer);
2993         newsk->retransmit_timer.data = (unsigned long)newsk;
2994         newsk->retransmit_timer.function=&retransmit_timer;
2995         newsk->dummy_th.source = skb->h.th->dest;
2996         newsk->dummy_th.dest = skb->h.th->source;
2997         
2998         /*
2999          *      Swap these two, they are from our point of view. 
3000          */
3001          
3002         newsk->daddr = saddr;
3003         newsk->saddr = daddr;
3004 
3005         put_sock(newsk->num,newsk);
3006         newsk->dummy_th.res1 = 0;
3007         newsk->dummy_th.doff = 6;
3008         newsk->dummy_th.fin = 0;
3009         newsk->dummy_th.syn = 0;
3010         newsk->dummy_th.rst = 0;        
3011         newsk->dummy_th.psh = 0;
3012         newsk->dummy_th.ack = 0;
3013         newsk->dummy_th.urg = 0;
3014         newsk->dummy_th.res2 = 0;
3015         newsk->acked_seq = skb->h.th->seq + 1;
3016         newsk->copied_seq = skb->h.th->seq + 1;
3017         newsk->socket = NULL;
3018 
3019         /*
3020          *      Grab the ttl and tos values and use them 
3021          */
3022 
3023         newsk->ip_ttl=sk->ip_ttl;
3024         newsk->ip_tos=skb->ip_hdr->tos;
3025 
3026         /*
3027          *      Use 512 or whatever user asked for 
3028          */
3029 
3030         /*
3031          *      Note use of sk->user_mss, since user has no direct access to newsk 
3032          */
3033 
3034         rt=ip_rt_route(saddr, NULL,NULL);
3035         
3036         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3037                 newsk->window_clamp = rt->rt_window;
3038         else
3039                 newsk->window_clamp = 0;
3040                 
3041         if (sk->user_mss)
3042                 newsk->mtu = sk->user_mss;
3043         else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
3044                 newsk->mtu = rt->rt_mss - sizeof(struct iphdr) - sizeof(struct tcphdr);
3045         else 
3046         {
3047 #ifdef CONFIG_INET_SNARL        /* Sub Nets Are Local */
3048                 if ((saddr ^ daddr) & default_mask(saddr))
3049 #else
3050                 if ((saddr ^ daddr) & dev->pa_mask)
3051 #endif
3052                         newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
3053                 else
3054                         newsk->mtu = MAX_WINDOW;
3055         }
3056 
3057         /*
3058          *      But not bigger than device MTU 
3059          */
3060 
3061         newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
3062 
3063         /*
3064          *      This will min with what arrived in the packet 
3065          */
3066 
3067         tcp_options(newsk,skb->h.th);
3068         
3069         tcp_cache_zap();
3070 
3071         buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3072         if (buff == NULL) 
3073         {
3074                 sk->err = ENOMEM;
3075                 newsk->dead = 1;
3076                 newsk->state = TCP_CLOSE;
3077                 /* And this will destroy it */
3078                 release_sock(newsk);
3079                 kfree_skb(skb, FREE_READ);
3080                 tcp_statistics.TcpAttemptFails++;
3081                 return;
3082         }
3083   
3084         buff->sk = newsk;
3085         buff->localroute = newsk->localroute;
3086 
3087         /*
3088          *      Put in the IP header and routing stuff. 
3089          */
3090 
3091         tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3092                                IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
3093 
3094         /*
3095          *      Something went wrong. 
3096          */
3097 
3098         if (tmp < 0) 
3099         {
3100                 sk->err = tmp;
3101                 buff->free = 1;
3102                 kfree_skb(buff,FREE_WRITE);
3103                 newsk->dead = 1;
3104                 newsk->state = TCP_CLOSE;
3105                 release_sock(newsk);
3106                 skb->sk = sk;
3107                 kfree_skb(skb, FREE_READ);
3108                 tcp_statistics.TcpAttemptFails++;
3109                 return;
3110         }
3111 
3112         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
3113   
3114         memcpy(t1, skb->h.th, sizeof(*t1));
3115         buff->h.seq = newsk->write_seq;
3116         /*
3117          *      Swap the send and the receive. 
3118          */
3119         t1->dest = skb->h.th->source;
3120         t1->source = newsk->dummy_th.source;
3121         t1->seq = ntohl(newsk->write_seq++);
3122         t1->ack = 1;
3123         newsk->window = tcp_select_window(newsk);
3124         newsk->sent_seq = newsk->write_seq;
3125         t1->window = ntohs(newsk->window);
3126         t1->res1 = 0;
3127         t1->res2 = 0;
3128         t1->rst = 0;
3129         t1->urg = 0;
3130         t1->psh = 0;
3131         t1->syn = 1;
3132         t1->ack_seq = ntohl(skb->h.th->seq+1);
3133         t1->doff = sizeof(*t1)/4+1;
3134         ptr = skb_put(buff,4);
3135         ptr[0] = 2;
3136         ptr[1] = 4;
3137         ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3138         ptr[3] =(newsk->mtu) & 0xff;
3139 
3140         tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3141         newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3142         reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3143         skb->sk = newsk;
3144 
3145         /*
3146          *      Charge the sock_buff to newsk. 
3147          */
3148          
3149         sk->rmem_alloc -= skb->truesize;
3150         newsk->rmem_alloc += skb->truesize;
3151         
3152         skb_queue_tail(&sk->receive_queue,skb);
3153         sk->ack_backlog++;
3154         release_sock(newsk);
3155         tcp_statistics.TcpOutSegs++;
3156 }
3157 
3158 
3159 static void tcp_close(struct sock *sk, int timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
3160 {
3161         /*
3162          * We need to grab some memory, and put together a FIN, 
3163          * and then put it into the queue to be sent.
3164          */
3165         
3166         sk->inuse = 1;
3167         
3168         if(th_cache_sk==sk)
3169                 tcp_cache_zap();
3170         if(sk->state == TCP_LISTEN)
3171         {
3172                 /* Special case */
3173                 tcp_set_state(sk, TCP_CLOSE);
3174                 tcp_close_pending(sk);
3175                 release_sock(sk);
3176                 return;
3177         }
3178         
3179         sk->keepopen = 1;
3180         sk->shutdown = SHUTDOWN_MASK;
3181 
3182         if (!sk->dead) 
3183                 sk->state_change(sk);
3184 
3185         if (timeout == 0) 
3186         {
3187                 struct sk_buff *skb;
3188                 
3189                 /*
3190                  *  We need to flush the recv. buffs.  We do this only on the
3191                  *  descriptor close, not protocol-sourced closes, because the
3192                  *  reader process may not have drained the data yet!
3193                  */
3194                  
3195                 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3196                         kfree_skb(skb, FREE_READ);
3197                 /*
3198                  *      Get rid off any half-completed packets. 
3199                  */
3200 
3201                 if (sk->partial) 
3202                         tcp_send_partial(sk);
3203         }
3204 
3205                 
3206         /*
3207          *      Timeout is not the same thing - however the code likes
3208          *      to send both the same way (sigh).
3209          */
3210          
3211         if(timeout)
3212         {
3213                 tcp_set_state(sk, TCP_CLOSE);   /* Dead */
3214         }
3215         else
3216         {
3217                 if(tcp_close_state(sk,1)==1)
3218                 {
3219                         tcp_send_fin(sk);
3220                 }
3221         }
3222         release_sock(sk);
3223 }
3224 
3225 
3226 /*
3227  *      This routine takes stuff off of the write queue,
3228  *      and puts it in the xmit queue. This happens as incoming acks
3229  *      open up the remote window for us.
3230  */
3231  
static void tcp_write_xmit(struct sock *sk)
{
        struct sk_buff *skb;

        /*
         *      A zapped socket has seen a reset; nothing may be sent.
         *      The bytes will have to remain here. In time closedown will
         *      empty the write queue and all will be happy
         */

        if(sk->zapped)
                return;

        /*
         *      Anything on the transmit queue that fits the window can
         *      be added providing we are not
         *
         *      a) retransmitting (Nagle's rule)
         *      b) exceeding our congestion window.
         */

        while((skb = skb_peek(&sk->write_queue)) != NULL &&
                before(skb->h.seq, sk->window_seq + 1) &&
                (sk->retransmits == 0 ||
                 sk->ip_xmit_timeout != TIME_WRITE ||
                 before(skb->h.seq, sk->rcv_ack_seq + 1))
                && sk->packets_out < sk->cong_window)
        {
                IS_SKB(skb);
                /* Take it off the write queue; it now belongs to the
                   retransmit machinery / device queue. */
                skb_unlink(skb);

                /*
                 *      See if we really need to send the packet.
                 */

                if (before(skb->h.seq, sk->rcv_ack_seq +1))
                {
                        /*
                         *      This is acked data. We can discard it. This
                         *      cannot currently occur.
                         */

                        sk->retransmits = 0;
                        kfree_skb(skb, FREE_WRITE);
                        if (!sk->dead)
                                sk->write_space(sk);
                }
                else
                {
                        struct tcphdr *th;
                        struct iphdr *iph;
                        int size;
/*
 * put in the ack seq and window at this point rather than earlier,
 * in order to keep them monotonic.  We really want to avoid taking
 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
 * Ack and window will in general have changed since this packet was put
 * on the write queue.
 */
                        iph = skb->ip_hdr;
                        /* TCP header sits ihl 32-bit words past the IP header. */
                        th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
                        /* Length of TCP header + payload: total buffer length
                           minus everything preceding the TCP header. */
                        size = skb->len - (((unsigned char *) th) - skb->data);

                        th->ack_seq = ntohl(sk->acked_seq);
                        th->window = ntohs(tcp_select_window(sk));

                        /* Recompute the checksum: ack_seq/window were just
                           rewritten above, invalidating any earlier one. */
                        tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

                        /* Highest sequence handed to IP so far. */
                        sk->sent_seq = skb->h.seq;

                        /*
                         *      IP manages our queue for some crazy reason
                         */

                        sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

                        /*
                         *      Again we slide the timer wrongly
                         *      (restarted per packet, not per oldest unacked).
                         */

                        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
                }
        }
}
3315 
3316 
3317 /*
3318  *      This routine deals with incoming acks, but not outgoing ones.
3319  */
3320 
3321 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
     /* [previous][next][first][last][top][bottom][index][help] */
3322 {
3323         u32 ack;
3324         int flag = 0;
3325 
3326         /* 
3327          * 1 - there was data in packet as well as ack or new data is sent or 
3328          *     in shutdown state
3329          * 2 - data from retransmit queue was acked and removed
3330          * 4 - window shrunk or data from retransmit queue was acked and removed
3331          */
3332 
3333         if(sk->zapped)
3334                 return(1);      /* Dead, cant ack any more so why bother */
3335 
3336         /*
3337          *      Have we discovered a larger window
3338          */
3339          
3340         ack = ntohl(th->ack_seq);
3341 
3342         if (ntohs(th->window) > sk->max_window) 
3343         {
3344                 sk->max_window = ntohs(th->window);
3345 #ifdef CONFIG_INET_PCTCP
3346                 /* Hack because we don't send partial packets to non SWS
3347                    handling hosts */
3348                 sk->mss = min(sk->max_window>>1, sk->mtu);
3349 #else
3350                 sk->mss = min(sk->max_window, sk->mtu);
3351 #endif  
3352         }
3353 
3354         /*
3355          *      We have dropped back to keepalive timeouts. Thus we have
3356          *      no retransmits pending.
3357          */
3358          
3359         if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3360                 sk->retransmits = 0;
3361 
3362         /*
3363          *      If the ack is newer than sent or older than previous acks
3364          *      then we can probably ignore it.
3365          */
3366          
3367         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
3368         {
3369                 if(sk->debug)
3370                         printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3371                         
3372                 /*
3373                  *      Keepalive processing.
3374                  */
3375                  
3376                 if (after(ack, sk->sent_seq)) 
3377                 {
3378                         return(0);
3379                 }
3380                 
3381                 /*
3382                  *      Restart the keepalive timer.
3383                  */
3384                  
3385                 if (sk->keepopen) 
3386                 {
3387                         if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3388                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3389                 }
3390                 return(1);
3391         }
3392 
3393         /*
3394          *      If there is data set flag 1
3395          */
3396          
3397         if (len != th->doff*4) 
3398                 flag |= 1;
3399 
3400         /*
3401          *      See if our window has been shrunk. 
3402          */
3403 
3404         if (after(sk->window_seq, ack+ntohs(th->window))) 
3405         {
3406                 /*
3407                  * We may need to move packets from the send queue
3408                  * to the write queue, if the window has been shrunk on us.
3409                  * The RFC says you are not allowed to shrink your window
3410                  * like this, but if the other end does, you must be able
3411                  * to deal with it.
3412                  */
3413                 struct sk_buff *skb;
3414                 struct sk_buff *skb2;
3415                 struct sk_buff *wskb = NULL;
3416         
3417                 skb2 = sk->send_head;
3418                 sk->send_head = NULL;
3419                 sk->send_tail = NULL;
3420         
3421                 /*
3422                  *      This is an artifact of a flawed concept. We want one
3423                  *      queue and a smarter send routine when we send all.
3424                  */
3425         
3426                 flag |= 4;      /* Window changed */
3427         
3428                 sk->window_seq = ack + ntohs(th->window);
3429                 cli();
3430                 while (skb2 != NULL) 
3431                 {
3432                         skb = skb2;
3433                         skb2 = skb->link3;
3434                         skb->link3 = NULL;
3435                         if (after(skb->h.seq, sk->window_seq)) 
3436                         {
3437                                 if (sk->packets_out > 0) 
3438                                         sk->packets_out--;
3439                                 /* We may need to remove this from the dev send list. */
3440                                 if (skb->next != NULL) 
3441                                 {
3442                                         skb_unlink(skb);                                
3443                                 }
3444                                 /* Now add it to the write_queue. */
3445                                 if (wskb == NULL)
3446                                         skb_queue_head(&sk->write_queue,skb);
3447                                 else
3448                                         skb_append(wskb,skb);
3449                                 wskb = skb;
3450                         } 
3451                         else 
3452                         {
3453                                 if (sk->send_head == NULL) 
3454                                 {
3455                                         sk->send_head = skb;
3456                                         sk->send_tail = skb;
3457                                 }
3458                                 else
3459                                 {
3460                                         sk->send_tail->link3 = skb;
3461                                         sk->send_tail = skb;
3462                                 }
3463                                 skb->link3 = NULL;
3464                         }
3465                 }
3466                 sti();
3467         }
3468 
3469         /*
3470          *      Pipe has emptied
3471          */
3472          
3473         if (sk->send_tail == NULL || sk->send_head == NULL) 
3474         {
3475                 sk->send_head = NULL;
3476                 sk->send_tail = NULL;
3477                 sk->packets_out= 0;
3478         }
3479 
3480         /*
3481          *      Update the right hand window edge of the host
3482          */
3483          
3484         sk->window_seq = ack + ntohs(th->window);
3485 
3486         /*
3487          *      We don't want too many packets out there. 
3488          */
3489          
3490         if (sk->ip_xmit_timeout == TIME_WRITE && 
3491                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
3492         {
3493                 /* 
3494                  * This is Jacobson's slow start and congestion avoidance. 
3495                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
3496                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
3497                  * counter and increment it once every cwnd times.  It's possible
3498                  * that this should be done only if sk->retransmits == 0.  I'm
3499                  * interpreting "new data is acked" as including data that has
3500                  * been retransmitted but is just now being acked.
3501                  */
3502                 if (sk->cong_window < sk->ssthresh)  
3503                         /* 
3504                          *      In "safe" area, increase
3505                          */
3506                         sk->cong_window++;
3507                 else 
3508                 {
3509                         /*
3510                          *      In dangerous area, increase slowly.  In theory this is
3511                          *      sk->cong_window += 1 / sk->cong_window
3512                          */
3513                         if (sk->cong_count >= sk->cong_window) 
3514                         {
3515                                 sk->cong_window++;
3516                                 sk->cong_count = 0;
3517                         }
3518                         else 
3519                                 sk->cong_count++;
3520                 }
3521         }
3522 
3523         /*
3524          *      Remember the highest ack received.
3525          */
3526          
3527         sk->rcv_ack_seq = ack;
3528 
3529         /*
3530          *      If this ack opens up a zero window, clear backoff.  It was
3531          *      being used to time the probes, and is probably far higher than
3532          *      it needs to be for normal retransmission.
3533          */
3534 
3535         if (sk->ip_xmit_timeout == TIME_PROBE0) 
3536         {
3537                 sk->retransmits = 0;    /* Our probe was answered */
3538                 
3539                 /*
3540                  *      Was it a usable window open ?
3541                  */
3542                  
3543                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
3544                     ! before (sk->window_seq, sk->write_queue.next->h.seq)) 
3545                 {
3546                         sk->backoff = 0;
3547                         
3548                         /*
3549                          *      Recompute rto from rtt.  this eliminates any backoff.
3550                          */
3551 
3552                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3553                         if (sk->rto > 120*HZ)
3554                                 sk->rto = 120*HZ;
3555                         if (sk->rto < 20)       /* Was 1*HZ, then 1 - turns out we must allow about
3556                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3557                                                    .2 of a second is going to need huge windows (SIGH) */
3558                         sk->rto = 20;
3559                 }
3560         }
3561 
3562         /* 
3563          *      See if we can take anything off of the retransmit queue.
3564          */
3565    
3566         while(sk->send_head != NULL) 
3567         {
3568                 /* Check for a bug. */
3569                 if (sk->send_head->link3 &&
3570                     after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) 
3571                         printk("INET: tcp.c: *** bug send_list out of order.\n");
3572                         
3573                 /*
3574                  *      If our packet is before the ack sequence we can
3575                  *      discard it as it's confirmed to have arrived the other end.
3576                  */
3577                  
3578                 if (before(sk->send_head->h.seq, ack+1)) 
3579                 {
3580                         struct sk_buff *oskb;   
3581                         if (sk->retransmits) 
3582                         {       
3583                                 /*
3584                                  *      We were retransmitting.  don't count this in RTT est 
3585                                  */
3586                                 flag |= 2;
3587 
3588                                 /*
3589                                  * even though we've gotten an ack, we're still
3590                                  * retransmitting as long as we're sending from
3591                                  * the retransmit queue.  Keeping retransmits non-zero
3592                                  * prevents us from getting new data interspersed with
3593                                  * retransmissions.
3594                                  */
3595 
3596                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
3597                                         sk->retransmits = 1;
3598                                 else
3599                                         sk->retransmits = 0;
3600                         }
3601                         /*
3602                          * Note that we only reset backoff and rto in the
3603                          * rtt recomputation code.  And that doesn't happen
3604                          * if there were retransmissions in effect.  So the
3605                          * first new packet after the retransmissions is
3606                          * sent with the backoff still in effect.  Not until
3607                          * we get an ack from a non-retransmitted packet do
3608                          * we reset the backoff and rto.  This allows us to deal
3609                          * with a situation where the network delay has increased
3610                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3611                          */
3612 
3613                         /*
3614                          *      We have one less packet out there. 
3615                          */
3616                          
3617                         if (sk->packets_out > 0) 
3618                                 sk->packets_out --;
3619                         /* 
3620                          *      Wake up the process, it can probably write more. 
3621                          */
3622                         if (!sk->dead) 
3623                                 sk->write_space(sk);
3624                         oskb = sk->send_head;
3625 
3626                         if (!(flag&2))  /* Not retransmitting */
3627                         {
3628                                 long m;
3629         
3630                                 /*
3631                                  *      The following amusing code comes from Jacobson's
3632                                  *      article in SIGCOMM '88.  Note that rtt and mdev
3633                                  *      are scaled versions of rtt and mean deviation.
3634                                  *      This is designed to be as fast as possible 
3635                                  *      m stands for "measurement".
3636                                  */
3637         
3638                                 m = jiffies - oskb->when;  /* RTT */
3639                                 if(m<=0)
3640                                         m=1;            /* IS THIS RIGHT FOR <0 ??? */
3641                                 m -= (sk->rtt >> 3);    /* m is now error in rtt est */
3642                                 sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
3643                                 if (m < 0)
3644                                         m = -m;         /* m is now abs(error) */
3645                                 m -= (sk->mdev >> 2);   /* similar update on mdev */
3646                                 sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
3647         
3648                                 /*
3649                                  *      Now update timeout.  Note that this removes any backoff.
3650                                  */
3651                          
3652                                 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3653                                 if (sk->rto > 120*HZ)
3654                                         sk->rto = 120*HZ;
3655                                 if (sk->rto < 20)       /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3656                                         sk->rto = 20;
3657                                 sk->backoff = 0;
3658                         }
3659                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
3660                                            In this case as we just set it up */
3661                         cli();
3662                         oskb = sk->send_head;
3663                         IS_SKB(oskb);
3664                         sk->send_head = oskb->link3;
3665                         if (sk->send_head == NULL) 
3666                         {
3667                                 sk->send_tail = NULL;
3668                         }
3669 
3670                 /*
3671                  *      We may need to remove this from the dev send list. 
3672                  */
3673 
3674                         if (oskb->next)
3675                                 skb_unlink(oskb);
3676                         sti();
3677                         kfree_skb(oskb, FREE_WRITE); /* write. */
3678                         if (!sk->dead) 
3679                                 sk->write_space(sk);
3680                 }
3681                 else
3682                 {
3683                         break;
3684                 }
3685         }
3686 
3687         /*
3688          * XXX someone ought to look at this too.. at the moment, if skb_peek()
3689          * returns non-NULL, we complete ignore the timer stuff in the else
3690          * clause.  We ought to organize the code so that else clause can
3691          * (should) be executed regardless, possibly moving the PROBE timer
3692          * reset over.  The skb_peek() thing should only move stuff to the
3693          * write queue, NOT also manage the timer functions.
3694          */
3695 
3696         /*
3697          * Maybe we can take some stuff off of the write queue,
3698          * and put it onto the xmit queue.
3699          */
3700         if (skb_peek(&sk->write_queue) != NULL) 
3701         {
3702                 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3703                         (sk->retransmits == 0 || 
3704                          sk->ip_xmit_timeout != TIME_WRITE ||
3705                          before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3706                         && sk->packets_out < sk->cong_window) 
3707                 {
3708                         /*
3709                          *      Add more data to the send queue.
3710                          */
3711                         flag |= 1;
3712                         tcp_write_xmit(sk);
3713                 }
3714                 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3715                         sk->send_head == NULL &&
3716                         sk->ack_backlog == 0 &&
3717                         sk->state != TCP_TIME_WAIT) 
3718                 {
3719                         /*
3720                          *      Data to queue but no room.
3721                          */
3722                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3723                 }               
3724         }
3725         else
3726         {
3727                 /*
3728                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3729                  * from TCP_CLOSE we don't do anything
3730                  *
3731                  * from anything else, if there is write data (or fin) pending,
3732                  * we use a TIME_WRITE timeout, else if keepalive we reset to
3733                  * a KEEPALIVE timeout, else we delete the timer.
3734                  *
3735                  * We do not set flag for nominal write data, otherwise we may
3736                  * force a state where we start to write itsy bitsy tidbits
3737                  * of data.
3738                  */
3739 
3740                 switch(sk->state) {
3741                 case TCP_TIME_WAIT:
3742                         /*
3743                          * keep us in TIME_WAIT until we stop getting packets,
3744                          * reset the timeout.
3745                          */
3746                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3747                         break;
3748                 case TCP_CLOSE:
3749                         /*
3750                          * don't touch the timer.
3751                          */
3752                         break;
3753                 default:
3754                         /*
3755                          *      Must check send_head, write_queue, and ack_backlog
3756                          *      to determine which timeout to use.
3757                          */
3758                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3759                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3760                         } else if (sk->keepopen) {
3761                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3762                         } else {
3763                                 del_timer(&sk->retransmit_timer);
3764                                 sk->ip_xmit_timeout = 0;
3765                         }
3766                         break;
3767                 }
3768         }
3769 
3770         /*
3771          *      We have nothing queued but space to send. Send any partial
3772          *      packets immediately (end of Nagle rule application).
3773          */
3774          
3775         if (sk->packets_out == 0 && sk->partial != NULL &&
3776                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
3777         {
3778                 flag |= 1;
3779                 tcp_send_partial(sk);
3780         }
3781 
3782         /*
3783          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
3784          * we are now waiting for an acknowledge to our FIN.  The other end is
3785          * already in TIME_WAIT.
3786          *
3787          * Move to TCP_CLOSE on success.
3788          */
3789 
3790         if (sk->state == TCP_LAST_ACK) 
3791         {
3792                 if (!sk->dead)
3793                         sk->state_change(sk);
3794                 if(sk->debug)
3795                         printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3796                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3797                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
3798                 {
3799                         flag |= 1;
3800                         tcp_set_state(sk,TCP_CLOSE);
3801                         sk->shutdown = SHUTDOWN_MASK;
3802                 }
3803         }
3804 
3805         /*
3806          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
3807          *
3808          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
3809          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3810          */
3811 
3812         if (sk->state == TCP_FIN_WAIT1) 
3813         {
3814 
3815                 if (!sk->dead) 
3816                         sk->state_change(sk);
3817                 if (sk->rcv_ack_seq == sk->write_seq) 
3818                 {
3819                         flag |= 1;
3820                         sk->shutdown |= SEND_SHUTDOWN;
3821                         tcp_set_state(sk, TCP_FIN_WAIT2);
3822                 }
3823         }
3824 
3825         /*
3826          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
3827          *
3828          *      Move to TIME_WAIT
3829          */
3830 
3831         if (sk->state == TCP_CLOSING) 
3832         {
3833 
3834                 if (!sk->dead) 
3835                         sk->state_change(sk);
3836                 if (sk->rcv_ack_seq == sk->write_seq) 
3837                 {
3838                         flag |= 1;
3839                         tcp_time_wait(sk);
3840                 }
3841         }
3842         
3843         /*
3844          *      Final ack of a three way shake 
3845          */
3846          
3847         if(sk->state==TCP_SYN_RECV)
3848         {
3849                 tcp_set_state(sk, TCP_ESTABLISHED);
3850                 tcp_options(sk,th);
3851                 sk->dummy_th.dest=th->source;
3852                 sk->copied_seq = sk->acked_seq;
3853                 if(!sk->dead)
3854                         sk->state_change(sk);
3855                 if(sk->max_window==0)
3856                 {
3857                         sk->max_window=32;      /* Sanity check */
3858                         sk->mss=min(sk->max_window,sk->mtu);
3859                 }
3860         }
3861         
3862         /*
3863          * I make no guarantees about the first clause in the following
3864          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
3865          * what conditions "!flag" would be true.  However I think the rest
3866          * of the conditions would prevent that from causing any
3867          * unnecessary retransmission. 
3868          *   Clearly if the first packet has expired it should be 
3869          * retransmitted.  The other alternative, "flag&2 && retransmits", is
3870          * harder to explain:  You have to look carefully at how and when the
3871          * timer is set and with what timeout.  The most recent transmission always
3872          * sets the timer.  So in general if the most recent thing has timed
3873          * out, everything before it has as well.  So we want to go ahead and
3874          * retransmit some more.  If we didn't explicitly test for this
3875          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3876          * would not be true.  If you look at the pattern of timing, you can
3877          * show that rto is increased fast enough that the next packet would
3878          * almost never be retransmitted immediately.  Then you'd end up
3879          * waiting for a timeout to send each packet on the retransmission
3880          * queue.  With my implementation of the Karn sampling algorithm,
3881          * the timeout would double each time.  The net result is that it would
3882          * take a hideous amount of time to recover from a single dropped packet.
3883          * It's possible that there should also be a test for TIME_WRITE, but
3884          * I think as long as "send_head != NULL" and "retransmit" is on, we've
3885          * got to be in real retransmission mode.
3886          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
3887          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3888          * As long as no further losses occur, this seems reasonable.
3889          */
3890         
3891         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3892                (((flag&2) && sk->retransmits) ||
3893                (sk->send_head->when + sk->rto < jiffies))) 
3894         {
3895                 if(sk->send_head->when + sk->rto < jiffies)
3896                         tcp_retransmit(sk,0);   
3897                 else
3898                 {
3899                         tcp_do_retransmit(sk, 1);
3900                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3901                 }
3902         }
3903 
3904         return(1);
3905 }
3906 
3907 
3908 /*
3909  *      Process the FIN bit. This now behaves as it is supposed to work
3910  *      and the FIN takes effect when it is validly part of sequence
3911  *      space. Not before when we get holes.
3912  *
3913  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3914  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
3915  *      TIME-WAIT)
3916  *
3917  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
3918  *      close and we go into CLOSING (and later onto TIME-WAIT)
3919  *
3920  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3921  *
3922  */
3923  
3924 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
3925 {
3926         sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3927 
3928         if (!sk->dead) 
3929         {
3930                 sk->state_change(sk);
3931                 sock_wake_async(sk->socket, 1);
3932         }
3933 
3934         switch(sk->state) 
3935         {
3936                 case TCP_SYN_RECV:
3937                 case TCP_SYN_SENT:
3938                 case TCP_ESTABLISHED:
3939                         /*
3940                          * move to CLOSE_WAIT, tcp_data() already handled
3941                          * sending the ack.
3942                          */
3943                         tcp_set_state(sk,TCP_CLOSE_WAIT);
3944                         if (th->rst)
3945                                 sk->shutdown = SHUTDOWN_MASK;
3946                         break;
3947 
3948                 case TCP_CLOSE_WAIT:
3949                 case TCP_CLOSING:
3950                         /*
3951                          * received a retransmission of the FIN, do
3952                          * nothing.
3953                          */
3954                         break;
3955                 case TCP_TIME_WAIT:
3956                         /*
3957                          * received a retransmission of the FIN,
3958                          * restart the TIME_WAIT timer.
3959                          */
3960                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3961                         return(0);
3962                 case TCP_FIN_WAIT1:
3963                         /*
3964                          * This case occurs when a simultaneous close
3965                          * happens, we must ack the received FIN and
3966                          * enter the CLOSING state.
3967                          *
3968                          * This causes a WRITE timeout, which will either
3969                          * move on to TIME_WAIT when we timeout, or resend
3970                          * the FIN properly (maybe we get rid of that annoying
3971                          * FIN lost hang). The TIME_WRITE code is already correct
3972                          * for handling this timeout.
3973                          */
3974 
3975                         if(sk->ip_xmit_timeout != TIME_WRITE)
3976                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3977                         tcp_set_state(sk,TCP_CLOSING);
3978                         break;
3979                 case TCP_FIN_WAIT2:
3980                         /*
3981                          * received a FIN -- send ACK and enter TIME_WAIT
3982                          */
3983                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3984                         sk->shutdown|=SHUTDOWN_MASK;
3985                         tcp_set_state(sk,TCP_TIME_WAIT);
3986                         break;
3987                 case TCP_CLOSE:
3988                         /*
3989                          * already in CLOSE
3990                          */
3991                         break;
3992                 default:
3993                         tcp_set_state(sk,TCP_LAST_ACK);
3994         
3995                         /* Start the timers. */
3996                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3997                         return(0);
3998         }
3999 
4000         return(0);
4001 }
4002 
4003 
4004 
4005 /*
4006  *      This routine handles the data.  If there is room in the buffer,
4007  *      it will be have already been moved into it.  If there is no
4008  *      room, then we will just have to discard the packet.
4009  */
4010 
4011 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /* [previous][next][first][last][top][bottom][index][help] */
4012          unsigned long saddr, unsigned short len)
4013 {
4014         struct sk_buff *skb1, *skb2;
4015         struct tcphdr *th;
4016         int dup_dumped=0;
4017         u32 new_seq, shut_seq;
4018 
4019         th = skb->h.th;
4020         skb_pull(skb,th->doff*4);
4021         skb_trim(skb,len-(th->doff*4));
4022 
4023         /*
4024          *      The bytes in the receive read/assembly queue has increased. Needed for the
4025          *      low memory discard algorithm 
4026          */
4027            
4028         sk->bytes_rcv += skb->len;
4029         
4030         if (skb->len == 0 && !th->fin) 
4031         {
4032                 /* 
4033                  *      Don't want to keep passing ack's back and forth. 
4034                  *      (someone sent us dataless, boring frame)
4035                  */
4036                 if (!th->ack)
4037                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4038                 kfree_skb(skb, FREE_READ);
4039                 return(0);
4040         }
4041         
4042         /*
4043          *      We no longer have anyone receiving data on this connection.
4044          */
4045 
4046 #ifndef TCP_DONT_RST_SHUTDOWN            
4047 
4048         if(sk->shutdown & RCV_SHUTDOWN)
4049         {
4050                 /*
4051                  *      FIXME: BSD has some magic to avoid sending resets to
4052                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
4053                  *      BSD stacks still have broken keepalives so we want to
4054                  *      cope with it.
4055                  */
4056 
4057                 if(skb->len)    /* We don't care if it's just an ack or
4058                                    a keepalive/window probe */
4059                 {
4060                         new_seq= th->seq + skb->len + th->syn;  /* Right edge of _data_ part of frame */
4061                         
4062                         /* Do this the way 4.4BSD treats it. Not what I'd
4063                            regard as the meaning of the spec but it's what BSD
4064                            does and clearly they know everything 8) */
4065 
4066                         /*
4067                          *      This is valid because of two things
4068                          *
4069                          *      a) The way tcp_data behaves at the bottom.
4070                          *      b) A fin takes effect when read not when received.
4071                          */
4072                          
4073                         shut_seq=sk->acked_seq+1;       /* Last byte */
4074                         
4075                         if(after(new_seq,shut_seq))
4076                         {
4077                                 if(sk->debug)
4078                                         printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4079                                                 sk, new_seq, shut_seq, sk->blog);
4080                                 if(sk->dead)
4081                                 {
4082                                         sk->acked_seq = new_seq + th->fin;
4083                                         tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4084                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4085                                         tcp_statistics.TcpEstabResets++;
4086                                         tcp_set_state(sk,TCP_CLOSE);
4087                                         sk->err = EPIPE;
4088                                         sk->shutdown = SHUTDOWN_MASK;
4089                                         kfree_skb(skb, FREE_READ);
4090                                         return 0;
4091                                 }
4092                         }
4093                 }
4094         }
4095 
4096 #endif
4097 
4098         /*
4099          *      Now we have to walk the chain, and figure out where this one
4100          *      goes into it.  This is set up so that the last packet we received
4101          *      will be the first one we look at, that way if everything comes
4102          *      in order, there will be no performance loss, and if they come
4103          *      out of order we will be able to fit things in nicely.
4104          *
4105          *      [AC: This is wrong. We should assume in order first and then walk
4106          *       forwards from the first hole based upon real traffic patterns.]
4107          *      
4108          */
4109 
4110         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
4111         {
4112                 skb_queue_head(&sk->receive_queue,skb);
4113                 skb1= NULL;
4114         } 
4115         else
4116         {
4117                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
4118                 {
4119                         if(sk->debug)
4120                         {
4121                                 printk("skb1=%p :", skb1);
4122                                 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
4123                                 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
4124                                 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4125                                                 sk->acked_seq);
4126                         }
4127                         
4128                         /*
4129                          *      Optimisation: Duplicate frame or extension of previous frame from
4130                          *      same sequence point (lost ack case).
4131                          *      The frame contains duplicate data or replaces a previous frame
4132                          *      discard the previous frame (safe as sk->inuse is set) and put
4133                          *      the new one in its place.
4134                          */
4135                          
4136                         if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
4137                         {
4138                                 skb_append(skb1,skb);
4139                                 skb_unlink(skb1);
4140                                 kfree_skb(skb1,FREE_READ);
4141                                 dup_dumped=1;
4142                                 skb1=NULL;
4143                                 break;
4144                         }
4145                         
4146                         /*
4147                          *      Found where it fits
4148                          */
4149                          
4150                         if (after(th->seq+1, skb1->h.th->seq))
4151                         {
4152                                 skb_append(skb1,skb);
4153                                 break;
4154                         }
4155                         
4156                         /*
4157                          *      See if we've hit the start. If so insert.
4158                          */
4159                         if (skb1 == skb_peek(&sk->receive_queue))
4160                         {
4161                                 skb_queue_head(&sk->receive_queue, skb);
4162                                 break;
4163                         }
4164                 }
4165         }
4166 
4167         /*
4168          *      Figure out what the ack value for this frame is
4169          */
4170          
4171         th->ack_seq = th->seq + skb->len;
4172         if (th->syn) 
4173                 th->ack_seq++;
4174         if (th->fin)
4175                 th->ack_seq++;
4176 
4177         if (before(sk->acked_seq, sk->copied_seq)) 
4178         {
4179                 printk("*** tcp.c:tcp_data bug acked < copied\n");
4180                 sk->acked_seq = sk->copied_seq;
4181         }
4182 
4183         /*
4184          *      Now figure out if we can ack anything. This is very messy because we really want two
4185          *      receive queues, a completed and an assembly queue. We also want only one transmit
4186          *      queue.
4187          */
4188 
4189         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) 
4190         {
4191                 if (before(th->seq, sk->acked_seq+1)) 
4192                 {
4193                         int newwindow;
4194 
4195                         if (after(th->ack_seq, sk->acked_seq)) 
4196                         {
4197                                 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
4198                                 if (newwindow < 0)
4199                                         newwindow = 0;  
4200                                 sk->window = newwindow;
4201                                 sk->acked_seq = th->ack_seq;
4202                         }
4203                         skb->acked = 1;
4204 
4205                         /*
4206                          *      When we ack the fin, we do the FIN 
4207                          *      processing.
4208                          */
4209 
4210                         if (skb->h.th->fin) 
4211                         {
4212                                 tcp_fin(skb,sk,skb->h.th);
4213                         }
4214           
4215                         for(skb2 = skb->next;
4216                             skb2 != (struct sk_buff *)&sk->receive_queue;
4217                             skb2 = skb2->next) 
4218                         {
4219                                 if (before(skb2->h.th->seq, sk->acked_seq+1)) 
4220                                 {
4221                                         if (after(skb2->h.th->ack_seq, sk->acked_seq))
4222                                         {
4223                                                 newwindow = sk->window -
4224                                                  (skb2->h.th->ack_seq - sk->acked_seq);
4225                                                 if (newwindow < 0)
4226                                                         newwindow = 0;  
4227                                                 sk->window = newwindow;
4228                                                 sk->acked_seq = skb2->h.th->ack_seq;
4229                                         }
4230                                         skb2->acked = 1;
4231                                         /*
4232                                          *      When we ack the fin, we do
4233                                          *      the fin handling.
4234                                          */
4235                                         if (skb2->h.th->fin) 
4236                                         {
4237                                                 tcp_fin(skb,sk,skb->h.th);
4238                                         }
4239 
4240                                         /*
4241                                          *      Force an immediate ack.
4242                                          */
4243                                          
4244                                         sk->ack_backlog = sk->max_ack_backlog;
4245                                 }
4246                                 else
4247                                 {
4248                                         break;
4249                                 }
4250                         }
4251 
4252                         /*
4253                          *      This also takes care of updating the window.
4254                          *      This if statement needs to be simplified.
4255                          */
4256                         if (!sk->delay_acks ||
4257                             sk->ack_backlog >= sk->max_ack_backlog || 
4258                             sk->bytes_rcv > sk->max_unacked || th->fin) {
4259         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4260                         }
4261                         else 
4262                         {
4263                                 sk->ack_backlog++;
4264                                 if(sk->debug)
4265                                         printk("Ack queued.\n");
4266                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4267                         }
4268                 }
4269         }
4270 
4271         /*
4272          *      If we've missed a packet, send an ack.
4273          *      Also start a timer to send another.
4274          */
4275          
4276         if (!skb->acked) 
4277         {
4278         
4279         /*
4280          *      This is important.  If we don't have much room left,
4281          *      we need to throw out a few packets so we have a good
4282          *      window.  Note that mtu is used, not mss, because mss is really
4283          *      for the send side.  He could be sending us stuff as large as mtu.
4284          */
4285                  
4286                 while (sk->prot->rspace(sk) < sk->mtu) 
4287                 {
4288                         skb1 = skb_peek(&sk->receive_queue);
4289                         if (skb1 == NULL) 
4290                         {
4291                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4292                                 break;
4293                         }
4294 
4295                         /*
4296                          *      Don't throw out something that has been acked. 
4297                          */
4298                  
4299                         if (skb1->acked) 
4300                         {
4301                                 break;
4302                         }
4303                 
4304                         skb_unlink(skb1);
4305                         kfree_skb(skb1, FREE_READ);
4306                 }
4307                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4308                 sk->ack_backlog++;
4309                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4310         }
4311         else
4312         {
4313                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4314         }
4315 
4316         /*
4317          *      Now tell the user we may have some data. 
4318          */
4319          
4320         if (!sk->dead) 
4321         {
4322                 if(sk->debug)
4323                         printk("Data wakeup.\n");
4324                 sk->data_ready(sk,0);
4325         } 
4326         return(0);
4327 }
4328 
4329 
4330 /*
4331  *      This routine is only called when we have urgent data
4332  *      signalled. Its the 'slow' part of tcp_urg. It could be
4333  *      moved inline now as tcp_urg is only called from one
4334  *      place. We handle URGent data wrong. We have to - as
4335  *      BSD still doesn't use the correction from RFC961.
4336  */
4337  
4338 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* [previous][next][first][last][top][bottom][index][help] */
4339 {
4340         u32 ptr = ntohs(th->urg_ptr);
4341 
4342         if (ptr)
4343                 ptr--;
4344         ptr += th->seq;
4345 
4346         /* ignore urgent data that we've already seen and read */
4347         if (after(sk->copied_seq, ptr))
4348                 return;
4349 
4350         /* do we already have a newer (or duplicate) urgent pointer? */
4351         if (sk->urg_data && !after(ptr, sk->urg_seq))
4352                 return;
4353 
4354         /* tell the world about our new urgent pointer */
4355         if (sk->proc != 0) {
4356                 if (sk->proc > 0) {
4357                         kill_proc(sk->proc, SIGURG, 1);
4358                 } else {
4359                         kill_pg(-sk->proc, SIGURG, 1);
4360                 }
4361         }
4362         sk->urg_data = URG_NOTYET;
4363         sk->urg_seq = ptr;
4364 }
4365 
4366 /*
4367  *      This is the 'fast' part of urgent handling.
4368  */
4369  
4370 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
     /* [previous][next][first][last][top][bottom][index][help] */
4371         unsigned long saddr, unsigned long len)
4372 {
4373         u32 ptr;
4374 
4375         /*
4376          *      Check if we get a new urgent pointer - normally not 
4377          */
4378          
4379         if (th->urg)
4380                 tcp_check_urg(sk,th);
4381 
4382         /*
4383          *      Do we wait for any urgent data? - normally not
4384          */
4385          
4386         if (sk->urg_data != URG_NOTYET)
4387                 return 0;
4388 
4389         /*
4390          *      Is the urgent pointer pointing into this packet? 
4391          */
4392          
4393         ptr = sk->urg_seq - th->seq + th->doff*4;
4394         if (ptr >= len)
4395                 return 0;
4396 
4397         /*
4398          *      Ok, got the correct packet, update info 
4399          */
4400          
4401         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4402         if (!sk->dead)
4403                 sk->data_ready(sk,0);
4404         return 0;
4405 }
4406 
4407 /*
4408  *      This will accept the next outstanding connection. 
4409  */
4410  
/*
 *	Accept the next established connection on a listening socket.
 *
 *	Pending connections are queued as sk_buffs whose ->sk points at
 *	the newly created socket (see tcp_dequeue_established).  On
 *	success returns that new sock; on failure returns NULL with
 *	sk->err set (EINVAL, EAGAIN or ERESTARTSYS).
 */
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;
  
  /*
   * We need to make sure that this socket is listening,
   * and that it has something pending.
   */

	if (sk->state != TCP_LISTEN) 
	{
		sk->err = EINVAL;
		return(NULL); 
	}

	/*
	 *	Avoid the race: interrupts stay off while we check the
	 *	queue and (if empty) go to sleep, so an arriving connection
	 *	cannot slip in between the check and the sleep.
	 */
	cli();
	sk->inuse = 1;

	while((skb = tcp_dequeue_established(sk)) == NULL) 
	{
		if (flags & O_NONBLOCK) 
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Drop the socket lock before sleeping so the bottom half can queue. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		/* Woken by a signal rather than a connection? Bail out. */
		if (current->signal & ~current->blocked) 
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		/* Re-take the socket before re-checking the queue. */
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk. 
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	/* One fewer pending connection awaiting accept(). */
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4464 
4465 
4466 /*
4467  *      This will initiate an outgoing connection. 
4468  */
4469  
4470 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
     /* [previous][next][first][last][top][bottom][index][help] */
4471 {
4472         struct sk_buff *buff;
4473         struct device *dev=NULL;
4474         unsigned char *ptr;
4475         int tmp;
4476         int atype;
4477         struct tcphdr *t1;
4478         struct rtable *rt;
4479 
4480         if (sk->state != TCP_CLOSE) 
4481         {
4482                 return(-EISCONN);
4483         }
4484         
4485         if (addr_len < 8) 
4486                 return(-EINVAL);
4487 
4488         if (usin->sin_family && usin->sin_family != AF_INET) 
4489                 return(-EAFNOSUPPORT);
4490 
4491         /*
4492          *      connect() to INADDR_ANY means loopback (BSD'ism).
4493          */
4494         
4495         if(usin->sin_addr.s_addr==INADDR_ANY)
4496                 usin->sin_addr.s_addr=ip_my_addr();
4497                   
4498         /*
4499          *      Don't want a TCP connection going to a broadcast address 
4500          */
4501 
4502         if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
4503                 return -ENETUNREACH;
4504   
4505         sk->inuse = 1;
4506         sk->daddr = usin->sin_addr.s_addr;
4507         sk->write_seq = tcp_init_seq();
4508         sk->window_seq = sk->write_seq;
4509         sk->rcv_ack_seq = sk->write_seq -1;
4510         sk->err = 0;
4511         sk->dummy_th.dest = usin->sin_port;
4512         release_sock(sk);
4513 
4514         buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4515         if (buff == NULL) 
4516         {
4517                 return(-ENOMEM);
4518         }
4519         sk->inuse = 1;
4520         buff->sk = sk;
4521         buff->free = 0;
4522         buff->localroute = sk->localroute;
4523         
4524 
4525         /*
4526          *      Put in the IP header and routing stuff. 
4527          */
4528          
4529         if (sk->localroute)
4530           rt=ip_rt_local(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);
4531         else
4532           rt=ip_rt_route(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);
4533 
4534         /*
4535          *      We need to build the routing stuff from the things saved in skb. 
4536          */
4537 
4538         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4539                                         IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
4540         if (tmp < 0) 
4541         {
4542                 sk->prot->wfree(sk, buff);
4543                 release_sock(sk);
4544                 return(-ENETUNREACH);
4545         }
4546 
4547         t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
4548 
4549         memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4550         t1->seq = ntohl(sk->write_seq++);
4551         sk->sent_seq = sk->write_seq;
4552         buff->h.seq = sk->write_seq;
4553         t1->ack = 0;
4554         t1->window = 2;
4555         t1->res1=0;
4556         t1->res2=0;
4557         t1->rst = 0;
4558         t1->urg = 0;
4559         t1->psh = 0;
4560         t1->syn = 1;
4561         t1->urg_ptr = 0;
4562         t1->doff = 6;
4563         /* use 512 or whatever user asked for */
4564         
4565         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4566                 sk->window_clamp=rt->rt_window;
4567         else
4568                 sk->window_clamp=0;
4569 
4570         if (sk->user_mss)
4571                 sk->mtu = sk->user_mss;
4572         else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
4573                 sk->mtu = rt->rt_mss;
4574         else 
4575         {
4576 #ifdef CONFIG_INET_SNARL
4577                 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
4578 #else
4579                 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
4580 #endif
4581                         sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
4582                 else
4583                         sk->mtu = MAX_WINDOW;
4584         }
4585         /*
4586          *      but not bigger than device MTU 
4587          */
4588 
4589         if(sk->mtu <32)
4590                 sk->mtu = 32;   /* Sanity limit */
4591                 
4592         sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
4593         
4594         /*
4595          *      Put in the TCP options to say MTU. 
4596          */
4597 
4598         ptr = skb_put(buff,4);
4599         ptr[0] = 2;
4600         ptr[1] = 4;
4601         ptr[2] = (sk->mtu) >> 8;
4602         ptr[3] = (sk->mtu) & 0xff;
4603         tcp_send_check(t1, sk->saddr, sk->daddr,
4604                   sizeof(struct tcphdr) + 4, sk);
4605 
4606         /*
4607          *      This must go first otherwise a really quick response will get reset. 
4608          */
4609 
4610         tcp_cache_zap();
4611         tcp_set_state(sk,TCP_SYN_SENT);
4612         if(rt&&rt->rt_flags&RTF_IRTT)
4613                 sk->rto = rt->rt_irtt;
4614         else
4615                 sk->rto = TCP_TIMEOUT_INIT;
4616         sk->retransmit_timer.function=&retransmit_timer;
4617         sk->retransmit_timer.data = (unsigned long)sk;
4618         reset_xmit_timer(sk, TIME_WRITE, sk->rto);      /* Timer for repeating the SYN until an answer */
4619         sk->retransmits = 0;    /* Now works the right way instead of a hacked initial setting */
4620 
4621         sk->prot->queue_xmit(sk, dev, buff, 0);  
4622         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4623         tcp_statistics.TcpActiveOpens++;
4624         tcp_statistics.TcpOutSegs++;
4625   
4626         release_sock(sk);
4627         return(0);
4628 }
4629 
4630 
4631 /* This functions checks to see if the tcp header is actually acceptable. */
4632 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /* [previous][next][first][last][top][bottom][index][help] */
4633              struct options *opt, unsigned long saddr, struct device *dev)
4634 {
4635         u32 next_seq;
4636 
4637         next_seq = len - 4*th->doff;
4638         if (th->fin)
4639                 next_seq++;
4640         /* if we have a zero window, we can't have any data in the packet.. */
4641         if (next_seq && !sk->window)
4642                 goto ignore_it;
4643         next_seq += th->seq;
4644 
4645         /*
4646          * This isn't quite right.  sk->acked_seq could be more recent
4647          * than sk->window.  This is however close enough.  We will accept
4648          * slightly more packets than we should, but it should not cause
4649          * problems unless someone is trying to forge packets.
4650          */
4651 
4652         /* have we already seen all of this packet? */
4653         if (!after(next_seq+1, sk->acked_seq))
4654                 goto ignore_it;
4655         /* or does it start beyond the window? */
4656         if (!before(th->seq, sk->acked_seq + sk->window + 1))
4657                 goto ignore_it;
4658 
4659         /* ok, at least part of this packet would seem interesting.. */
4660         return 1;
4661 
4662 ignore_it:
4663         if (th->rst)
4664                 return 0;
4665 
4666         /*
4667          *      Send a reset if we get something not ours and we are
4668          *      unsynchronized. Note: We don't do anything to our end. We
4669          *      are just killing the bogus remote connection then we will
4670          *      connect again and it will work (with luck).
4671          */
4672          
4673         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
4674         {
4675                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4676                 return 1;
4677         }
4678 
4679         /* Try to resync things. */
4680         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4681         return 0;
4682 }
4683 
4684 /*
4685  *      When we get a reset we do this.
4686  */
4687 
/*
 *	Handle an inbound RST: mark the socket dead, pick the errno the
 *	state calls for, move to CLOSE (unless RFC1337 protection keeps a
 *	TIME_WAIT socket alive), wake the owner and drop the frame.
 */
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* The error reported depends on where in the handshake we were. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337		
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{	
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else	
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif	
	/* Tell anybody sleeping on the socket that it changed state. */
	if (!sk->dead) 
		sk->state_change(sk);

	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
4715 
4716 /*
4717  *      A TCP packet has arrived.
4718  *              skb->h.raw is the TCP header.
4719  */
4720  
4721 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /* [previous][next][first][last][top][bottom][index][help] */
4722         __u32 daddr, unsigned short len,
4723         __u32 saddr, int redo, struct inet_protocol * protocol)
4724 {
4725         struct tcphdr *th;
4726         struct sock *sk;
4727         int syn_ok=0;
4728         
4729         tcp_statistics.TcpInSegs++;
4730         if(skb->pkt_type!=PACKET_HOST)
4731         {
4732                 kfree_skb(skb,FREE_READ);
4733                 return(0);
4734         }
4735   
4736         th = skb->h.th;
4737 
4738         /*
4739          *      Find the socket, using the last hit cache if applicable.
4740          */
4741 
4742         if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
4743         {
4744                 sk=(struct sock *)th_cache_sk;
4745                 /*
4746                  *      We think this is causing the bug so
4747                  */
4748                  if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
4749                         printk("Cache mismatch on TCP.\n");
4750         }
4751         else
4752         {
4753                 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4754                 th_cache_saddr=saddr;
4755                 th_cache_daddr=daddr;
4756                 th_cache_dport=th->dest;
4757                 th_cache_sport=th->source;
4758                 th_cache_sk=sk;
4759         }               
4760 
4761         /*
4762          *      If this socket has got a reset it's to all intents and purposes 
4763          *      really dead. Count closed sockets as dead.
4764          *
4765          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4766          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
4767          *      exist so should cause resets as if the port was unreachable.
4768          */
4769          
4770         if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4771                 sk=NULL;
4772 
4773         if (!redo) 
4774         {
4775                 /*
4776                  *      Pull up the IP header.
4777                  */
4778                 skb_pull(skb, skb->h.raw-skb->data);
4779                 /*
4780                  *      Try to use the device checksum if provided.
4781                  */
4782                 if (
4783                         (skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
4784                         (!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
4785                     )
4786                 {
4787                         skb->sk = NULL;
4788                         kfree_skb(skb,FREE_READ);
4789                         /*
4790                          *      We don't release the socket because it was
4791                          *      never marked in use.
4792                          */
4793                         return(0);
4794                 }
4795                 th->seq = ntohl(th->seq);
4796 
4797                 /* See if we know about the socket. */
4798                 if (sk == NULL) 
4799                 {
4800                         /*
4801                          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4802                          */
4803                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4804                         skb->sk = NULL;
4805                         /*
4806                          *      Discard frame
4807                          */
4808                         kfree_skb(skb, FREE_READ);
4809                         return(0);
4810                 }
4811 
4812 /*              skb->len = len;*/
4813                 skb->acked = 0;
4814                 skb->used = 0;
4815                 skb->free = 0;
4816                 skb->saddr = daddr;
4817                 skb->daddr = saddr;
4818         
4819                 /* We may need to add it to the backlog here. */
4820                 cli();
4821                 if (sk->inuse) 
4822                 {
4823                         skb_queue_tail(&sk->back_log, skb);
4824                         sti();
4825                         return(0);
4826                 }
4827                 sk->inuse = 1;
4828                 sti();
4829         }
4830         else
4831         {
4832                 if (sk==NULL) 
4833                 {
4834                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4835                         skb->sk = NULL;
4836                         kfree_skb(skb, FREE_READ);
4837                         return(0);
4838                 }
4839         }
4840 
4841 
4842         if (!sk->prot) 
4843         {
4844                 printk("IMPOSSIBLE 3\n");
4845                 return(0);
4846         }
4847 
4848 
4849         /*
4850          *      Charge the memory to the socket. 
4851          */
4852          
4853         if (sk->rmem_alloc + skb->truesize >= sk->rcvbuf) 
4854         {
4855                 kfree_skb(skb, FREE_READ);
4856                 release_sock(sk);
4857                 return(0);
4858         }
4859 
4860         skb->sk=sk;
4861         sk->rmem_alloc += skb->truesize;
4862 
4863         /*
4864          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4865          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4866          *      compatibility. We also set up variables more thoroughly [Karn notes in the
4867          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4868          */
4869 
4870         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
4871         {
4872         
4873                 /*
4874                  *      Now deal with unusual cases.
4875                  */
4876          
4877                 if(sk->state==TCP_LISTEN)
4878                 {
4879                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
4880                                 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4881 
4882                         /*
4883                          *      We don't care for RST, and non SYN are absorbed (old segments)
4884                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
4885                          *      netmask on a running connection it can go broadcast. Even Sun's have
4886                          *      this problem so I'm ignoring it 
4887                          */
4888                            
4889                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4890                         {
4891                                 kfree_skb(skb, FREE_READ);
4892                                 release_sock(sk);
4893                                 return 0;
4894                         }
4895                 
4896                         /*      
4897                          *      Guess we need to make a new socket up 
4898                          */
4899                 
4900                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4901                 
4902                         /*
4903                          *      Now we have several options: In theory there is nothing else
4904                          *      in the frame. KA9Q has an option to send data with the syn,
4905                          *      BSD accepts data with the syn up to the [to be] advertised window
4906                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
4907                          *      it, that fits the spec precisely and avoids incompatibilities. It
4908                          *      would be nice in future to drop through and process the data.
4909                          */
4910                          
4911                         release_sock(sk);
4912                         return 0;
4913                 }
4914         
4915                 /* retransmitted SYN? */
4916                 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4917                 {
4918                         kfree_skb(skb, FREE_READ);
4919                         release_sock(sk);
4920                         return 0;
4921                 }
4922                 
4923                 /*
4924                  *      SYN sent means we have to look for a suitable ack and either reset
4925                  *      for bad matches or go to connected 
4926                  */
4927            
4928                 if(sk->state==TCP_SYN_SENT)
4929                 {
4930                         /* Crossed SYN or previous junk segment */
4931                         if(th->ack)
4932                         {
4933                                 /* We got an ack, but it's not a good ack */
4934                                 if(!tcp_ack(sk,th,saddr,len))
4935                                 {
4936                                         /* Reset the ack - its an ack from a 
4937                                            different connection  [ th->rst is checked in tcp_reset()] */
4938                                         tcp_statistics.TcpAttemptFails++;
4939                                         tcp_reset(daddr, saddr, th,
4940                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4941                                         kfree_skb(skb, FREE_READ);
4942                                         release_sock(sk);
4943                                         return(0);
4944                                 }
4945                                 if(th->rst)
4946                                         return tcp_std_reset(sk,skb);
4947                                 if(!th->syn)
4948                                 {
4949                                         /* A valid ack from a different connection
4950                                            start. Shouldn't happen but cover it */
4951                                         kfree_skb(skb, FREE_READ);
4952                                         release_sock(sk);
4953                                         return 0;
4954                                 }
4955                                 /*
4956                                  *      Ok.. it's good. Set up sequence numbers and
4957                                  *      move to established.
4958                                  */
4959                                 syn_ok=1;       /* Don't reset this connection for the syn */
4960                                 sk->acked_seq=th->seq+1;
4961                                 sk->fin_seq=th->seq;
4962                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4963                                 tcp_set_state(sk, TCP_ESTABLISHED);
4964                                 tcp_options(sk,th);
4965                                 sk->dummy_th.dest=th->source;
4966                                 sk->copied_seq = sk->acked_seq;
4967                                 if(!sk->dead)
4968                                 {
4969                                         sk->state_change(sk);
4970                                         sock_wake_async(sk->socket, 0);
4971                                 }
4972                                 if(sk->max_window==0)
4973                                 {
4974                                         sk->max_window = 32;
4975                                         sk->mss = min(sk->max_window, sk->mtu);
4976                                 }
4977                         }
4978                         else
4979                         {
4980                                 /* See if SYN's cross. Drop if boring */
4981                                 if(th->syn && !th->rst)
4982                                 {
4983                                         /* Crossed SYN's are fine - but talking to
4984                                            yourself is right out... */
4985                                         if(sk->saddr==saddr && sk->daddr==daddr &&
4986                                                 sk->dummy_th.source==th->source &&
4987                                                 sk->dummy_th.dest==th->dest)
4988                                         {
4989                                                 tcp_statistics.TcpAttemptFails++;
4990                                                 return tcp_std_reset(sk,skb);
4991                                         }
4992                                         tcp_set_state(sk,TCP_SYN_RECV);
4993                                         
4994                                         /*
4995                                          *      FIXME:
4996                                          *      Must send SYN|ACK here
4997                                          */
4998                                 }               
4999                                 /* Discard junk segment */
5000                                 kfree_skb(skb, FREE_READ);
5001                                 release_sock(sk);
5002                                 return 0;
5003                         }
5004                         /*
5005                          *      SYN_RECV with data maybe.. drop through
5006                          */
5007                         goto rfc_step6;
5008                 }
5009 
5010         /*
5011          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
5012          *      a more complex suggestion for fixing these reuse issues in RFC1644
5013          *      but not yet ready for general use. Also see RFC1379.
5014          */
5015         
5016 #define BSD_TIME_WAIT
5017 #ifdef BSD_TIME_WAIT
5018                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
5019                         after(th->seq, sk->acked_seq) && !th->rst)
5020                 {
5021                         u32 seq = sk->write_seq;
5022                         if(sk->debug)
5023                                 printk("Doing a BSD time wait\n");
5024                         tcp_statistics.TcpEstabResets++;           
5025                         sk->rmem_alloc -= skb->truesize;
5026                         skb->sk = NULL;
5027                         sk->err=ECONNRESET;
5028                         tcp_set_state(sk, TCP_CLOSE);
5029                         sk->shutdown = SHUTDOWN_MASK;
5030                         release_sock(sk);
5031                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
5032                         if (sk && sk->state==TCP_LISTEN)
5033                         {
5034                                 sk->inuse=1;
5035                                 skb->sk = sk;
5036                                 sk->rmem_alloc += skb->truesize;
5037                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
5038                                 release_sock(sk);
5039                                 return 0;
5040                         }
5041                         kfree_skb(skb, FREE_READ);
5042                         return 0;
5043                 }
5044 #endif  
5045         }
5046 
5047         /*
5048          *      We are now in normal data flow (see the step list in the RFC)
5049          *      Note most of these are inline now. I'll inline the lot when
5050          *      I have time to test it hard and look at what gcc outputs 
5051          */
5052         
5053         if(!tcp_sequence(sk,th,len,opt,saddr,dev))
5054         {
5055                 kfree_skb(skb, FREE_READ);
5056                 release_sock(sk);
5057                 return 0;
5058         }
5059 
5060         if(th->rst)
5061                 return tcp_std_reset(sk,skb);
5062         
5063         /*
5064          *      !syn_ok is effectively the state test in RFC793.
5065          */
5066          
5067         if(th->syn && !syn_ok)
5068         {
5069                 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
5070                 return tcp_std_reset(sk,skb);   
5071         }
5072 
5073         /*
5074          *      Process the ACK
5075          */
5076          
5077 
5078         if(th->ack && !tcp_ack(sk,th,saddr,len))
5079         {
5080                 /*
5081                  *      Our three way handshake failed.
5082                  */
5083                  
5084                 if(sk->state==TCP_SYN_RECV)
5085                 {
5086                         tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
5087                 }
5088                 kfree_skb(skb, FREE_READ);
5089                 release_sock(sk);
5090                 return 0;
5091         }
5092         
5093 rfc_step6:              /* I'll clean this up later */
5094 
5095         /*
5096          *      Process urgent data
5097          */
5098                 
5099         if(tcp_urg(sk, th, saddr, len))
5100         {
5101                 kfree_skb(skb, FREE_READ);
5102                 release_sock(sk);
5103                 return 0;
5104         }
5105         
5106         
5107         /*
5108          *      Process the encapsulated data
5109          */
5110         
5111         if(tcp_data(skb,sk, saddr, len))
5112         {
5113                 kfree_skb(skb, FREE_READ);
5114                 release_sock(sk);
5115                 return 0;
5116         }
5117 
5118         /*
5119          *      And done
5120          */     
5121         
5122         release_sock(sk);
5123         return 0;
5124 }
5125 
5126 /*
5127  *      This routine sends a packet with an out of date sequence
5128  *      number. It assumes the other end will try to ack it.
5129  */
5130 
static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff,*skb;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states.  If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED && 
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 && 
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	) 
	{
		return;
	}
	if ( before(sk->sent_seq, sk->window_seq) && 
	    (skb=skb_peek(&sk->write_queue)))
	{
		/*
		 * We are probing the opening of a window
		 * but the window size is != 0
		 * must have been a result of SWS avoidance ( sender ).
		 * Copy as much of the queue head as fits the window into a
		 * fresh frame and transmit that, leaving the original queued.
		 */
	    
		struct iphdr *iph;
		struct tcphdr *th;
		struct tcphdr *nth;
		unsigned long win_size;
#if 0
		unsigned long ow_size;
#endif
		void * tcp_data_start;
	
		/*
		 *	How many bytes can we send ?
		 */
		 
		win_size = sk->window_seq - sk->sent_seq;

		/*
		 *	Recover the buffer pointers
		 */
		 
		iph = (struct iphdr *)skb->ip_hdr;
		th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));

		/*
		 *	Grab the data for a temporary frame
		 */
		 
		buff = sk->prot->wmalloc(sk, win_size + th->doff * 4 + 
				     (iph->ihl << 2) +
				     sk->prot->max_header + 15, 
				     1, GFP_ATOMIC);
		if ( buff == NULL )
			return;

		/* 
		 *	If we strip the packet on the write queue we must
		 *	be ready to retransmit this one 
		 */
	    
		buff->free = /*0*/1;	/* deliberately 1: copy is disposable, original stays queued for retransmit */

		buff->sk = sk;
		buff->localroute = sk->localroute;
		
		/*
		 *	Put headers on the new packet
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					 IPPROTO_TCP, sk->opt, buff->truesize,
					 sk->ip_tos,sk->ip_ttl);
		if (tmp < 0) 
		{
			sk->prot->wfree(sk, buff);
			return;
		}
		
		/*
		 *	Move the TCP header over
		 */

		buff->dev = dev;

		nth = (struct tcphdr *) skb_put(buff,th->doff*4);

		memcpy(nth, th, th->doff * 4);
		
		/*
		 *	Correct the new header
		 */
		 
		nth->ack = 1; 
		/* ntohl/ntohs swap both ways, so these are effectively htonl/htons */
		nth->ack_seq = ntohl(sk->acked_seq);
		nth->window = ntohs(tcp_select_window(sk));
		nth->check = 0;

		/*
		 *	Find the first data byte.
		 */
		 
		tcp_data_start = skb->data + skb->dev->hard_header_len + 
				(iph->ihl << 2) + th->doff * 4;

		/*
		 *	Add it to our new buffer
		 */
		memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
		
		/*
		 *	Remember our right edge sequence number.
		 */
		 
		buff->h.seq = sk->sent_seq + win_size;
		sk->sent_seq = buff->h.seq;		/* Hack */
#if 0

		/*
		 *	now: shrink the queue head segment 
		 */
		 
		th->check = 0;
		ow_size = skb->len - win_size - 
			((unsigned long) (tcp_data_start - (void *) skb->data));

		memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
		skb_trim(skb,skb->len-win_size);
		sk->sent_seq += win_size;
		th->seq = htonl(sk->sent_seq);
		if (th->urg)
		{
			unsigned short urg_ptr;
	
			urg_ptr = ntohs(th->urg_ptr);
			if (urg_ptr <= win_size)
				th->urg = 0;
			else
			{
				urg_ptr -= win_size;
				th->urg_ptr = htons(urg_ptr);
				nth->urg_ptr = htons(win_size);
			}
		}
#else
		/* Urgent pointer falls beyond the copied slice - clear URG on the probe.
		   NOTE(review): the disabled variant above used <=, this uses < - confirm intended boundary. */
		if(th->urg && ntohs(th->urg_ptr) < win_size)
			nth->urg = 0;
#endif		

		/*
		 *	Checksum the split buffer
		 */
		 
		tcp_send_check(nth, sk->saddr, sk->daddr, 
			   nth->doff * 4 + win_size , sk);
	}
	else
	{	
		/* Nothing sendable: emit a zero-length keepalive-style probe
		   carrying an old sequence number to provoke an ACK. */
		buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
		if (buff == NULL) 
			return;

		buff->free = 1;
		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put in the IP header and routing stuff. 
		 */
		 
		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0) 
		{
			sk->prot->wfree(sk, buff);
			return;
		}

		t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
		memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

		/*
		 *	Use a previous sequence.
		 *	This should cause the other end to send an ack.
		 */
	 
		t1->seq = htonl(sk->sent_seq-1);
		t1->ack = 1; 
		t1->res1= 0;
		t1->res2= 0;
		t1->rst = 0;
		t1->urg = 0;
		t1->psh = 0;
		t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
		t1->syn = 0;
		t1->ack_seq = ntohl(sk->acked_seq);
		t1->window = ntohs(tcp_select_window(sk));
		t1->doff = sizeof(*t1)/4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	}		

	/*
	 *	Send it.
	 */
	
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
5351 
5352 /*
5353  *      A window probe timeout has occurred.
5354  */
5355 
5356 void tcp_send_probe0(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
5357 {
5358         if (sk->zapped)
5359                 return;         /* After a valid reset we can send no more */
5360 
5361         tcp_write_wakeup(sk);
5362 
5363         sk->backoff++;
5364         sk->rto = min(sk->rto << 1, 120*HZ);
5365         reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5366         sk->retransmits++;
5367         sk->prot->retransmits ++;
5368 }
5369 
5370 /*
5371  *      Socket option code for TCP. 
5372  */
5373   
5374 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5375 {
5376         int val,err;
5377 
5378         if(level!=SOL_TCP)
5379                 return ip_setsockopt(sk,level,optname,optval,optlen);
5380 
5381         if (optval == NULL) 
5382                 return(-EINVAL);
5383 
5384         err=verify_area(VERIFY_READ, optval, sizeof(int));
5385         if(err)
5386                 return err;
5387         
5388         val = get_user((int *)optval);
5389 
5390         switch(optname)
5391         {
5392                 case TCP_MAXSEG:
5393 /*
5394  * values greater than interface MTU won't take effect.  however at
5395  * the point when this call is done we typically don't yet know
5396  * which interface is going to be used
5397  */
5398                         if(val<1||val>MAX_WINDOW)
5399                                 return -EINVAL;
5400                         sk->user_mss=val;
5401                         return 0;
5402                 case TCP_NODELAY:
5403                         sk->nonagle=(val==0)?0:1;
5404                         return 0;
5405                 default:
5406                         return(-ENOPROTOOPT);
5407         }
5408 }
5409 
5410 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5411 {
5412         int val,err;
5413 
5414         if(level!=SOL_TCP)
5415                 return ip_getsockopt(sk,level,optname,optval,optlen);
5416                         
5417         switch(optname)
5418         {
5419                 case TCP_MAXSEG:
5420                         val=sk->user_mss;
5421                         break;
5422                 case TCP_NODELAY:
5423                         val=sk->nonagle;
5424                         break;
5425                 default:
5426                         return(-ENOPROTOOPT);
5427         }
5428         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5429         if(err)
5430                 return err;
5431         put_user(sizeof(int),(int *) optlen);
5432 
5433         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5434         if(err)
5435                 return err;
5436         put_user(val,(int *)optval);
5437 
5438         return(0);
5439 }       
5440 
5441 
/*
 *      Protocol operations table for TCP, plugged into the generic
 *      socket layer. NOTE(review): struct proto is declared elsewhere;
 *      the per-slot notes below are inferred from the handler names and
 *      their positions - confirm against the struct proto declaration.
 */
struct proto tcp_prot = {
        sock_wmalloc,           /* write-buffer allocation */
        sock_rmalloc,           /* read-buffer allocation */
        sock_wfree,             /* write-buffer free */
        sock_rfree,             /* read-buffer free */
        sock_rspace,            /* receive space remaining */
        sock_wspace,            /* send space remaining */
        tcp_close,
        tcp_read,
        tcp_write,
        tcp_sendto,
        tcp_recvfrom,
        ip_build_header,        /* header building delegated to IP */
        tcp_connect,
        tcp_accept,
        ip_queue_xmit,          /* transmit delegated to IP */
        tcp_retransmit,
        tcp_write_wakeup,
        tcp_read_wakeup,
        tcp_rcv,
        tcp_select,
        tcp_ioctl,
        NULL,                   /* no init hook - presumably optional */
        tcp_shutdown,
        tcp_setsockopt,
        tcp_getsockopt,
        128,                    /* presumably max header space - TODO confirm */
        0,
        "TCP",                  /* protocol name */
        0, 0,
        {NULL,}
};

/* [previous][next][first][last][top][bottom][index][help] */