root/net/ipv4/tcp.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. tcp_cache_zap
  2. min
  3. tcp_set_state
  4. tcp_select_window
  5. tcp_find_established
  6. tcp_dequeue_established
  7. tcp_close_pending
  8. tcp_time_wait
  9. tcp_do_retransmit
  10. reset_xmit_timer
  11. tcp_retransmit_time
  12. tcp_retransmit
  13. tcp_write_timeout
  14. retransmit_timer
  15. tcp_err
  16. tcp_readable
  17. tcp_listen_select
  18. tcp_select
  19. tcp_ioctl
  20. tcp_check
  21. tcp_send_check
  22. tcp_send_skb
  23. tcp_dequeue_partial
  24. tcp_send_partial
  25. tcp_enqueue_partial
  26. tcp_send_ack
  27. tcp_build_header
  28. tcp_write
  29. tcp_sendto
  30. tcp_read_wakeup
  31. cleanup_rbuf
  32. tcp_read_urg
  33. tcp_read
  34. tcp_close_state
  35. tcp_send_fin
  36. tcp_shutdown
  37. tcp_recvfrom
  38. tcp_reset
  39. tcp_options
  40. default_mask
  41. tcp_init_seq
  42. tcp_conn_request
  43. tcp_close
  44. tcp_write_xmit
  45. tcp_ack
  46. tcp_fin
  47. tcp_data
  48. tcp_check_urg
  49. tcp_urg
  50. tcp_accept
  51. tcp_connect
  52. tcp_sequence
  53. tcp_std_reset
  54. tcp_rcv
  55. tcp_write_wakeup
  56. tcp_send_probe0
  57. tcp_setsockopt
  58. tcp_getsockopt

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:       
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect 
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed where wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It 
  33  *                                      wakes people on errors. select 
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_reset() fixed to work for 
  37  *                                      everything not just packets for 
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had 
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames. 
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst 
  46  *                                      receive otherwise odd bits of prattle 
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug. 
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list 
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential 
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the 
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries. 
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks, 
  69  *                                      so the kernel can layer network 
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised 
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer 
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing 
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties removed from the
 100  *                                      TCP code (Be very nice to this man if 
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics. 
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle select() after URG properly in 
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg() 
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in 
 110  *                                      tcp_readable(), select() after URG 
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the 
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to 
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in selecting before an 
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since 
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on 
 137  *                                      the RFC's for other useful protocol 
 138  *                                      references see: Comer, KA9Q NOS, and 
 139  *                                      for a reference on the difference 
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC 
 147  *                                      and using multiple timers for sanity. 
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       Select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and 
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if stat is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but its a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications
 178  *
 179  *
 180  * To Fix:
 181  *              Fast path the code. Two things here - fix the window calculation
 182  *              so it doesn't iterate over the queue, also spot packets with no funny
 183  *              options arriving in order and process directly.
 184  *
 185  *              Implement RFC 1191 [Path MTU discovery]
 186  *              Look at the effect of implementing RFC 1337 suggestions and their impact.
 187  *              Rewrite output state machine to use a single queue and do low window
 188  *              situations as per the spec (RFC 1122)
 189  *              Speed up input assembly algorithm.
 190  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 191  *              could do with it working on IPv4
 192  *              User settable/learned rtt/max window/mtu
 193  *              Cope with MTU/device switches when retransmitting in tcp.
 194  *              Fix the window handling to use PR's new code.
 195  *
 196  *              Change the fundamental structure to a single send queue maintained
 197  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 198  *              active routes too]). Cut the queue off in tcp_retransmit/
 199  *              tcp_transmit.
 200  *              Change the receive queue to assemble as it goes. This lets us
 201  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 202  *              tcp_data/tcp_read as well as the window shrink crud.
 203  *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 204  *              tcp_queue_skb seem obvious routines to extract.
 205  *      
 206  *              This program is free software; you can redistribute it and/or
 207  *              modify it under the terms of the GNU General Public License
 208  *              as published by the Free Software Foundation; either version
 209  *              2 of the License, or(at your option) any later version.
 210  *
 211  * Description of States:
 212  *
 213  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 214  *
 215  *      TCP_SYN_RECV            received a connection request, sent ack,
 216  *                              waiting for final ack in three-way handshake.
 217  *
 218  *      TCP_ESTABLISHED         connection established
 219  *
 220  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 221  *                              transmission of remaining buffered data
 222  *
 223  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 224  *                              to shutdown
 225  *
 226  *      TCP_CLOSING             both sides have shutdown but we still have
 227  *                              data we have to finish sending
 228  *
 229  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 230  *                              closed, can only be entered from FIN_WAIT2
 231  *                              or CLOSING.  Required because the other end
 232  *                              may not have gotten our last ACK causing it
 233  *                              to retransmit the data packet (which we ignore)
 234  *
 235  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 236  *                              us to finish writing our data and to shutdown
 237  *                              (we have to close() to move on to LAST_ACK)
 238  *
  239  *      TCP_LAST_ACK            our side has shutdown after remote has
 240  *                              shutdown.  There may still be data in our
 241  *                              buffer that we have to finish sending
 242  *              
 243  *      TCP_CLOSE               socket is finished
 244  */
 245 
 246 /*
 247  * RFC1122 status:
 248  * NOTE: I'm not going to be doing comments in the code for this one except
 249  * for violations and the like.  tcp.c is just too big... If I say something
 250  * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 251  * with Alan. -- MS 950903
 252  * 
 253  * Use of PSH (4.2.2.2)
 254  *   MAY aggregate data sent without the PSH flag. (does)
  255  *   MAY queue data received without the PSH flag. (does)
 256  *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 257  *   MAY implement PSH on send calls. (doesn't, thus:)
 258  *     MUST NOT buffer data indefinitely (doesn't [1 second])
 259  *     MUST set PSH on last segment (does)
 260  *   MAY pass received PSH to application layer (doesn't)
 261  *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 262  * 
 263  * Window Size (4.2.2.3, 4.2.2.16)
 264  *   MUST treat window size as an unsigned number (does)
 265  *   SHOULD treat window size as a 32-bit number (does not)
 266  *   MUST NOT shrink window once it is offered (does not normally)
 267  *   
 268  * Urgent Pointer (4.2.2.4)
 269  * **MUST point urgent pointer to last byte of urgent data (not right
 270  *     after). (doesn't, to be like BSD)
 271  *   MUST inform application layer asynchronously of incoming urgent
 272  *     data. (does)
 273  *   MUST provide application with means of determining the amount of
 274  *     urgent data pending. (does)
 275  * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 276  *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 277  *      [Follows BSD 1 byte of urgent data]
 278  * 
 279  * TCP Options (4.2.2.5)
  280  *   MUST be able to receive TCP options in any segment. (does)
 281  *   MUST ignore unsupported options (does)
 282  *   
 283  * Maximum Segment Size Option (4.2.2.6)
 284  *   MUST implement both sending and receiving MSS. (does)
  285  *   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 286  *     it always). (does, even when MSS == 536, which is legal)
 287  *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 288  *   MUST calculate "effective send MSS" correctly:
 289  *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 290  *     (does - but allows operator override)
 291  *  
 292  * TCP Checksum (4.2.2.7)
 293  *   MUST generate and check TCP checksum. (does)
 294  * 
 295  * Initial Sequence Number Selection (4.2.2.8)
 296  *   MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 297  *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 298  *     necessary for 10Mbps networks - and harder than BSD to spoof!)
 299  * 
 300  * Simultaneous Open Attempts (4.2.2.10)
 301  *   MUST support simultaneous open attempts (does)
 302  * 
 303  * Recovery from Old Duplicate SYN (4.2.2.11)
 304  *   MUST keep track of active vs. passive open (does)
 305  * 
 306  * RST segment (4.2.2.12)
 307  *   SHOULD allow an RST segment to contain data (does, but doesn't do
 308  *     anything with it, which is standard)
 309  * 
 310  * Closing a Connection (4.2.2.13)
  311  *   MUST inform application of whether connection was closed by RST or
 312  *     normal close. (does)
 313  *   MAY allow "half-duplex" close (treat connection as closed for the
 314  *     local app, even before handshake is done). (does)
 315  *   MUST linger in TIME_WAIT for 2 * MSL (does)
 316  * 
 317  * Retransmission Timeout (4.2.2.15)
 318  *   MUST implement Jacobson's slow start and congestion avoidance
 319  *     stuff. (does) 
 320  * 
 321  * Probing Zero Windows (4.2.2.17)
 322  *   MUST support probing of zero windows. (does)
 323  *   MAY keep offered window closed indefinitely. (does)
 324  *   MUST allow remote window to stay closed indefinitely. (does)
 325  * 
 326  * Passive Open Calls (4.2.2.18)
 327  *   MUST NOT let new passive open affect other connections. (doesn't)
 328  *   MUST support passive opens (LISTENs) concurrently. (does)
 329  *   
 330  * Time to Live (4.2.2.19)
 331  *   MUST make TCP TTL configurable. (does - IP_TTL option)
 332  * 
 333  * Event Processing (4.2.2.20)
 334  *   SHOULD queue out-of-order segments. (does)
 335  *   MUST aggregate ACK segments whenever possible. (does but badly)
 336  *   
 337  * Retransmission Timeout Calculation (4.2.3.1)
 338  *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 339  *     calculation. (does, or at least explains them in the comments 8*b)
 340  *  SHOULD initialize RTO to 0 and RTT to 3. (does)
 341  * 
 342  * When to Send an ACK Segment (4.2.3.2)
 343  *   SHOULD implement delayed ACK. (does not)
 344  *   MUST keep ACK delay < 0.5 sec. (N/A)
 345  * 
 346  * When to Send a Window Update (4.2.3.3)
 347  *   MUST implement receiver-side SWS. (does)
 348  *   
 349  * When to Send Data (4.2.3.4)
 350  *   MUST implement sender-side SWS. (does - imperfectly)
 351  *   SHOULD implement Nagle algorithm. (does)
 352  * 
 353  * TCP Connection Failures (4.2.3.5)
 354  *  MUST handle excessive retransmissions "properly" (see the RFC). (does)
 355  *   SHOULD inform application layer of soft errors. (doesn't)
 356  *   
 357  * TCP Keep-Alives (4.2.3.6)
 358  *   MAY provide keep-alives. (does)
 359  *   MUST make keep-alives configurable on a per-connection basis. (does)
 360  *   MUST default to no keep-alives. (does)
 361  * **MUST make keep-alive interval configurable. (doesn't)
 362  * **MUST make default keep-alive interval > 2 hours. (doesn't)
 363  *   MUST NOT interpret failure to ACK keep-alive packet as dead
 364  *     connection. (doesn't)
 365  *   SHOULD send keep-alive with no data. (does)
 366  * 
 367  * TCP Multihoming (4.2.3.7)
 368  *   MUST get source address from IP layer before sending first
 369  *     SYN. (does)
 370  *   MUST use same local address for all segments of a connection. (does)
 371  * 
 372  * IP Options (4.2.3.8)
 373  *   (I don't think the IP layer sees the IP options, yet.)
 374  *   MUST ignore unsupported IP options. (does, I guess 8*b)
 375  *   MAY support Time Stamp and Record Route. (doesn't)
 376  * **MUST allow application to specify a source route. (doesn't?)
  377  * **MUST allow received Source Route option to set route for all future
 378  *     segments on this connection. (doesn't, not that I think it's a
 379  *     huge problem)
 380  * 
 381  * ICMP messages (4.2.3.9)
 382  *   MUST act on ICMP errors. (does)
 383  *   MUST slow transmission upon receipt of a Source Quench. (does)
 384  *   MUST NOT abort connection upon receipt of soft Destination
 385  *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 386  *     Problems. (doesn't)
 387  *   SHOULD report soft Destination Unreachables etc. to the
 388  *     application. (doesn't)
 389  *   SHOULD abort connection upon receipt of hard Destination Unreachable
 390  *     messages (2, 3, 4). (does)
 391  * 
 392  * Remote Address Validation (4.2.3.10)
 393  *   MUST reject as an error OPEN for invalid remote IP address. (does)
 394  *   MUST ignore SYN with invalid source address. (does)
 395  *   MUST silently discard incoming SYN for broadcast/multicast
 396  *     address. (does) 
 397  * 
 398  * Asynchronous Reports (4.2.4.1)
 399  * **MUST provide mechanism for reporting soft errors to application
 400  *     layer. (doesn't)
 401  * 
 402  * Type of Service (4.2.4.2)
 403  *   MUST allow application layer to set Type of Service. (does IP_TOS)
 404  * 
 405  * (Whew. -- MS 950903)
 406  **/
 407 
 408 #include <linux/types.h>
 409 #include <linux/sched.h>
 410 #include <linux/mm.h>
 411 #include <linux/time.h>
 412 #include <linux/string.h>
 413 #include <linux/config.h>
 414 #include <linux/socket.h>
 415 #include <linux/sockios.h>
 416 #include <linux/termios.h>
 417 #include <linux/in.h>
 418 #include <linux/fcntl.h>
 419 #include <linux/inet.h>
 420 #include <linux/netdevice.h>
 421 #include <net/snmp.h>
 422 #include <net/ip.h>
 423 #include <net/protocol.h>
 424 #include <net/icmp.h>
 425 #include <net/tcp.h>
 426 #include <net/arp.h>
 427 #include <linux/skbuff.h>
 428 #include <net/sock.h>
 429 #include <net/route.h>
 430 #include <linux/errno.h>
 431 #include <linux/timer.h>
 432 #include <asm/system.h>
 433 #include <asm/segment.h>
 434 #include <linux/mm.h>
 435 #include <net/checksum.h>
 436 
 437 /*
 438  *      The MSL timer is the 'normal' timer.
 439  */
 440  
 441 #define reset_msl_timer(x,y,z)  reset_timer(x,y,z)
 442 
 443 #define SEQ_TICK 3
 444 unsigned long seq_offset;
 445 struct tcp_mib  tcp_statistics;
 446 
 447 /*
 448  *      Cached last hit socket
 449  */
 450  
 451 volatile unsigned long  th_cache_saddr,th_cache_daddr;
 452 volatile unsigned short  th_cache_dport, th_cache_sport;
 453 volatile struct sock *th_cache_sk;
 454 
 455 void tcp_cache_zap(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 456 {
 457         unsigned long flags;
 458         save_flags(flags);
 459         cli();
 460         th_cache_saddr=0;
 461         th_cache_daddr=0;
 462         th_cache_dport=0;
 463         th_cache_sport=0;
 464         th_cache_sk=NULL;
 465         restore_flags(flags);
 466 }
 467 
 468 static void tcp_close(struct sock *sk, int timeout);
 469 
 470 
 471 /*
 472  *      The less said about this the better, but it works and will do for 1.2 
 473  */
 474 
 475 static struct wait_queue *master_select_wakeup;
 476 
 477 static __inline__ int min(unsigned int a, unsigned int b)
     /* [previous][next][first][last][top][bottom][index][help] */
 478 {
 479         if (a < b) 
 480                 return(a);
 481         return(b);
 482 }
 483 
 484 #undef STATE_TRACE
 485 
 486 #ifdef STATE_TRACE
 487 static char *statename[]={
 488         "Unused","Established","Syn Sent","Syn Recv",
 489         "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
 490         "Close Wait","Last ACK","Listen","Closing"
 491 };
 492 #endif
 493 
 494 static __inline__ void tcp_set_state(struct sock *sk, int state)
     /* [previous][next][first][last][top][bottom][index][help] */
 495 {
 496         if(sk->state==TCP_ESTABLISHED)
 497                 tcp_statistics.TcpCurrEstab--;
 498 #ifdef STATE_TRACE
 499         if(sk->debug)
 500                 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
 501 #endif  
 502         /* This is a hack but it doesn't occur often and it's going to
 503            be a real        to fix nicely */
 504            
 505         if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
 506         {
 507                 wake_up_interruptible(&master_select_wakeup);
 508         }
 509         sk->state=state;
 510         if(state==TCP_ESTABLISHED)
 511                 tcp_statistics.TcpCurrEstab++;
 512 }
 513 
 514 /*
 515  *      This routine picks a TCP windows for a socket based on
 516  *      the following constraints
 517  *  
 518  *      1. The window can never be shrunk once it is offered (RFC 793)
 519  *      2. We limit memory per socket
 520  *   
 521  *      For now we use NET2E3's heuristic of offering half the memory
 522  *      we have handy. All is not as bad as this seems however because
 523  *      of two things. Firstly we will bin packets even within the window
 524  *      in order to get the data we are waiting for into the memory limit.
 525  *      Secondly we bin common duplicate forms at receive time
 526  *      Better heuristics welcome
 527  */
 528    
 529 int tcp_select_window(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 530 {
 531         int new_window = sk->prot->rspace(sk);
 532         
 533         if(sk->window_clamp)
 534                 new_window=min(sk->window_clamp,new_window);
 535         /*
 536          *      Two things are going on here.  First, we don't ever offer a
 537          *      window less than min(sk->mss, MAX_WINDOW/2).  This is the
 538          *      receiver side of SWS as specified in RFC1122.
 539          *      Second, we always give them at least the window they
 540          *      had before, in order to avoid retracting window.  This
 541          *      is technically allowed, but RFC1122 advises against it and
 542          *      in practice it causes trouble.
 543          *
 544          *      Fixme: This doesn't correctly handle the case where
 545          *      new_window > sk->window but not by enough to allow for the
 546          *      shift in sequence space. 
 547          */
 548         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
 549                 return(sk->window);
 550         return(new_window);
 551 }
 552 
 553 /*
 554  *      Find someone to 'accept'. Must be called with
 555  *      sk->inuse=1 or cli()
 556  */ 
 557 
 558 static struct sk_buff *tcp_find_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 559 {
 560         struct sk_buff *p=skb_peek(&s->receive_queue);
 561         if(p==NULL)
 562                 return NULL;
 563         do
 564         {
 565                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 566                         return p;
 567                 p=p->next;
 568         }
 569         while(p!=(struct sk_buff *)&s->receive_queue);
 570         return NULL;
 571 }
 572 
 573 /*
 574  *      Remove a completed connection and return it. This is used by
 575  *      tcp_accept() to get connections from the queue.
 576  */
 577 
 578 static struct sk_buff *tcp_dequeue_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 579 {
 580         struct sk_buff *skb;
 581         unsigned long flags;
 582         save_flags(flags);
 583         cli(); 
 584         skb=tcp_find_established(s);
 585         if(skb!=NULL)
 586                 skb_unlink(skb);        /* Take it off the queue */
 587         restore_flags(flags);
 588         return skb;
 589 }
 590 
 591 /* 
 592  *      This routine closes sockets which have been at least partially
 593  *      opened, but not yet accepted. Currently it is only called by
 594  *      tcp_close, and timeout mirrors the value there. 
 595  */
 596 
 597 static void tcp_close_pending (struct sock *sk) 
     /* [previous][next][first][last][top][bottom][index][help] */
 598 {
 599         struct sk_buff *skb;
 600 
 601         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) 
 602         {
 603                 skb->sk->dead=1;
 604                 tcp_close(skb->sk, 0);
 605                 kfree_skb(skb, FREE_READ);
 606         }
 607         return;
 608 }
 609 
 610 /*
 611  *      Enter the time wait state. 
 612  */
 613 
 614 static void tcp_time_wait(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 615 {
 616         tcp_set_state(sk,TCP_TIME_WAIT);
 617         sk->shutdown = SHUTDOWN_MASK;
 618         if (!sk->dead)
 619                 sk->state_change(sk);
 620         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 621 }
 622 
 623 /*
 624  *      A socket has timed out on its send queue and wants to do a
 625  *      little retransmitting. Currently this means TCP.
 626  */
 627 
/*
 *	Retransmit frames from the send (retransmit) queue.  If 'all' is
 *	zero only the head frame is resent; otherwise we resend until the
 *	queue is empty or the congestion window is filled.  Each frame is
 *	rebuilt in place: the stale MAC header is stripped, a fresh IP id
 *	is issued, the route is looked up again (so we survive an ARP or
 *	gateway change) and the current ack/window values are stamped in
 *	before the frame is handed back to the device.
 */
void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;		/* frames resent during this call */
	struct rtable *rt;

	prot = sk->prot;	/* NOTE(review): assigned but only sk->prot is used below */
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;	/* restart the round-trip clock for this frame */

		/*
		 *	Discard the surplus MAC header
		 */
		 
		skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);

		/*
		 * In general it's OK just to use the old packet.  However we
		 * need to use the current ack and window fields.  Urg and
		 * urg_ptr could possibly stand to be updated as well, but we
		 * don't keep the necessary data.  That shouldn't be a problem,
		 * if the other end is doing the right thing.  Since we're
		 * changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)skb->data;
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		/* TCP segment length = IP total length minus the IP header. */
		size = ntohs(iph->tot_len) - (iph->ihl<<2);
		
		/*
		 *	Note: We ought to check for window limits here but
		 *	currently this is done (less efficiently) elsewhere.
		 */

		iph->id = htons(ip_id_count++);
		ip_send_check(iph);	/* IP header checksum must be redone after the id change */
		
		/*
		 *	Put a MAC header back on (may cause ARPing)
		 */
		 
		if(skb->localroute)
			rt=ip_rt_local(iph->daddr,NULL,NULL);
		else
			rt=ip_rt_route(iph->daddr,NULL,NULL);
			
		if(rt==NULL)	/* Deep poo */
		{
			/* No route any more: report ENETUNREACH and skip sending,
			   but still fall through to the retransmit accounting. */
			if(skb->sk)
			{
				skb->sk->err=ENETUNREACH;
				skb->sk->error_report(skb->sk);
			}
		}
		else
		{
			dev=rt->rt_dev;
			/* Next hop is the gateway, or the destination if directly attached. */
			skb->raddr=rt->rt_gateway;
			if(skb->raddr==0)
				skb->raddr=iph->daddr;
			skb->dev=dev;
			skb->arp=1;
			if(dev->hard_header)
			{
				/* hard_header < 0 means the MAC address isn't resolved
				   yet; clear arp so the device layer ARPs first. */
				if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
					skb->arp=0;
			}
		
			/*
			 *	This is not the right way to handle this. We have to
			 *	issue an up to date window and ack report with this 
			 *	retransmit to keep the odd buggy tcp that relies on 
			 *	the fact BSD does this happy. 
			 *	We don't however need to recalculate the entire 
			 *	checksum, so someone wanting a small problem to play
			 *	with might like to implement RFC1141/RFC1624 and speed
			 *	this up by avoiding a full checksum.
			 */
		 
			/* NOTE(review): ntohl here is the conventional htonl - the two
			   are the same operation on every supported architecture. */
			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
		
			/*
			 *	If the interface is (still) up and running, kick it.
			 */
	
			if (dev->flags & IFF_UP)
			{
				/*
				 *	If the packet is still being sent by the device/protocol
				 *	below then don't retransmit. This is both needed, and good -
				 *	especially with connected mode AX.25 where it stops resends
				 *	occurring of an as yet unsent anyway frame!
				 *	We still add up the counts as the round trip time wants
				 *	adjusting.
				 */
				if (sk && !skb_device_locked(skb))
				{
					/* Remove it from any existing driver queue first! */
					skb_unlink(skb);
					/* Now queue it */
					ip_statistics.IpOutRequests++;
					dev_queue_xmit(skb, dev, sk->priority);
				}
			}
		}
		
		/*
		 *	Count retransmissions
		 */
		 
		ct++;
		sk->prot->retransmits ++;
		tcp_statistics.TcpRetransSegs++;
		

		/*
		 *	Only one retransmit requested.
		 */
	
		if (!all)
			break;

		/*
		 *	This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;	/* link3 chains the retransmit queue */
	}
}
 772 
 773 /*
 774  *      Reset the retransmission timer
 775  */
 776  
 777 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
     /* [previous][next][first][last][top][bottom][index][help] */
 778 {
 779         del_timer(&sk->retransmit_timer);
 780         sk->ip_xmit_timeout = why;
 781         if((int)when < 0)
 782         {
 783                 when=3;
 784                 printk("Error: Negative timer in xmit_timer\n");
 785         }
 786         sk->retransmit_timer.expires=jiffies+when;
 787         add_timer(&sk->retransmit_timer);
 788 }
 789 
 790 /*
 791  *      This is the normal code called for timeouts.  It does the retransmission
 792  *      and then does backoff.  tcp_do_retransmit is separated out because
 793  *      tcp_ack needs to send stuff from the retransmit queue without
 794  *      initiating a backoff.
 795  */
 796 
 797 
 798 void tcp_retransmit_time(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 799 {
 800         tcp_do_retransmit(sk, all);
 801 
 802         /*
 803          * Increase the timeout each time we retransmit.  Note that
 804          * we do not increase the rtt estimate.  rto is initialized
 805          * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
 806          * that doubling rto each time is the least we can get away with.
 807          * In KA9Q, Karn uses this for the first few times, and then
 808          * goes to quadratic.  netBSD doubles, but only goes up to *64,
 809          * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
 810          * defined in the protocol as the maximum possible RTT.  I guess
 811          * we'll have to use something other than TCP to talk to the
 812          * University of Mars.
 813          *
 814          * PAWS allows us longer timeouts and large windows, so once
 815          * implemented ftp to mars will work nicely. We will have to fix
 816          * the 120 second clamps though!
 817          */
 818 
 819         sk->retransmits++;
 820         sk->prot->retransmits++;
 821         sk->backoff++;
 822         sk->rto = min(sk->rto << 1, 120*HZ);
 823         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
 824 }
 825 
 826 
 827 /*
 828  *      A timer event has trigger a tcp retransmit timeout. The
 829  *      socket xmit queue is ready and set up to send. Because
 830  *      the ack receive code keeps the queue straight we do
 831  *      nothing clever here.
 832  */
 833 
 834 static void tcp_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 835 {
 836         if (all) 
 837         {
 838                 tcp_retransmit_time(sk, all);
 839                 return;
 840         }
 841 
 842         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 843         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 844         sk->cong_count = 0;
 845 
 846         sk->cong_window = 1;
 847 
 848         /* Do the actual retransmit. */
 849         tcp_retransmit_time(sk, all);
 850 }
 851 
 852 /*
 853  *      A write timeout has occurred. Process the after effects.
 854  */
 855 
/*
 *	Handle the after effects of a write (retransmission) timeout.
 *	Returns 1 if the socket is still usable; returns 0 when the socket
 *	has been moved to TCP_CLOSE - in that case release_sock() has
 *	already been called here, so the caller must not touch it further.
 */
static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout.
	 */
	/* Every 8th retransmit on an established connection (or past
	   TCP_RETR1 otherwise) we suspect stale link-layer state. */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		/*ip_route_check (sk->daddr);*/
	}
	
	/*
	 *	Have we tried to SYN too many times (repent repent 8))
	 */
	 
	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
	{
		/* Connection attempt failed: report ETIMEDOUT and close. */
		sk->err=ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
		tcp_set_state(sk,TCP_CLOSE);
		/* Don't FIN, we got nothing back */
		release_sock(sk);
		return 0;
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2) 
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 *	Time wait the socket 
		 */
		/* If we were already closing, park the identity in TIME_WAIT
		   for 2*MSL instead of dropping it on the floor. */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) 
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			release_sock(sk);
			return 0;
		}
	}
	return 1;
}
 915 
 916 /*
 917  *      The TCP retransmit timer. This lacks a few small details.
 918  *
 919  *      1.      An initial rtt timeout on the probe0 should cause what we can
 920  *              of the first write queue buffer to be split and sent.
 921  *      2.      On a 'major timeout' as defined by RFC1122 we shouldn't report
 922  *              ETIMEDOUT if we know an additional 'soft' error caused this.
 923  *              tcp_err should save a 'soft error' for us.
 924  */
 925 
/*
 *	Timer callback for the per-socket retransmit timer.  'data' is the
 *	socket pointer cast to an unsigned long.  Dispatches on
 *	sk->ip_xmit_timeout: zero-window probing, data retransmission, or
 *	keepalive, then lets tcp_write_timeout() decide whether the
 *	connection has expired.
 */
static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;

	/* 
	 * only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh) 
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	/* Claim the socket ourselves before re-enabling interrupts. */
	sk->inuse = 1;
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped) 
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why) 
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			/* NOTE(review): the return value of tcp_write_timeout is
			   ignored here and below; when it returns 0 it has already
			   called release_sock, so the release_sock at the end of
			   this function runs a second time - confirm this is
			   harmless with this kernel's release_sock semantics. */
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb) 
			{
				/* Nothing awaiting an ack: nothing to resend. */
				restore_flags(flags);
			} 
			else 
			{
				/*
				 *	Kicked by a delayed ack. Reset timer
				 *	correctly now
				 */
				/* Head frame hasn't aged a full rto yet; rearm
				   for the remaining time instead of resending. */
				if (jiffies < skb->when + sk->rto) 
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission
				 */
				sk->retransmits++;
				sk->prot->retransmits++;
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/* 
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				  sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
1026 
1027 /*
1028  * This routine is called by the ICMP module when it gets some
1029  * sort of error condition.  If err < 0 then the socket should
1030  * be closed and the error returned to the user.  If err > 0
1031  * it's just the icmp type << 8 | icmp code.  After adjustment
1032  * header points to the first 8 bytes of the tcp header.  We need
1033  * to find the appropriate port.
1034  */
1035 
void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
	__u32 saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	struct iphdr *iph=(struct iphdr *)header;
  
	/* Step past the returned IP header to reach the embedded copy of
	   our original TCP header (first 8 bytes: ports + sequence). */
	header+=4*iph->ihl;
   

	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	/* Not one of our connections - nothing to do. */
	if (sk == NULL) 
		return;
  
	if (type == ICMP_SOURCE_QUENCH) 
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}
	
	if (type == ICMP_PARAMETERPROB)
	{
		/* Our own headers were malformed according to a router. */
		sk->err=EPROTO;
		sk->error_report(sk);
	}

	/*
	 * If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 */

	/* NOTE(review): 'code < 13' bounds the icmp_err_convert[] lookup -
	   presumably the table has 13 entries; confirm against its
	   definition before changing. */
	if (code < 13 && (icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV))
	{
		sk->err = icmp_err_convert[code].errno;
		/* A fatal error during the handshake aborts the connection
		   attempt immediately; an established socket only records err. */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) 
		{
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);		/* Wake people up to see the error (see connect in sock.c) */
		}
	}
	return;
}
1087 
1088 
1089 /*
1090  *      Walk down the receive queue counting readable data until we hit the end or we find a gap
1091  *      in the received data queue (ie a frame missing that needs sending to us). Not
1092  *      sorting using two queues as data arrives makes life so much harder.
1093  */
1094 
/*
 *	Return the number of bytes the user could read right now: in-order
 *	queued data up to the first hole, the first PSH boundary, counted
 *	from sk->copied_seq.  SYNs and out-of-band urgent bytes are
 *	excluded from the count.  Runs with interrupts off while walking
 *	the receive queue.
 */
static int tcp_readable(struct sock *sk)
{
	unsigned long counted;
	unsigned long amount;
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug) 
			printk("empty\n");
		return(0);
	}
  
	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;
  
	/* 
	 *	Do until a push or until we are out of data. 
	 */
	 
	do 
	{
		if (before(counted, skb->h.th->seq))	/* Found a hole so stops here */
			break;
		/* Bytes in this skb beyond what we've already counted
		   (skbs may overlap in sequence space). */
		sum = skb->len -(counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;	/* a SYN occupies one sequence number but no data byte */
		if (sum > 0) 
		{					/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn) 
				amount--;	/* take the SYN back out of the byte count */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop.  This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;	/* stop at a push boundary */
		skb = skb->next;	/* circular queue: next skb */
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
1165 
1166 /*
1167  * LISTEN is a special case for select..
1168  */
1169 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
1170 {
1171         if (sel_type == SEL_IN) {
1172                 int retval;
1173 
1174                 sk->inuse = 1;
1175                 retval = (tcp_find_established(sk) != NULL);
1176                 release_sock(sk);
1177                 if (!retval)
1178                         select_wait(&master_select_wakeup,wait);
1179                 return retval;
1180         }
1181         return 0;
1182 }
1183 
1184 
1185 /*
1186  *      Wait for a TCP event.
1187  *
1188  *      Note that we don't need to set "sk->inuse", as the upper select layers
1189  *      take care of normal races (between the test and the event) and we don't
1190  *      go look at any of the socket buffers directly.
1191  */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	/* Listening sockets have their own, simpler rules. */
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		/* Pending errors always make the socket "readable". */
		if (sk->err)
			return 1;
		/* Handshake not finished: no data can be waiting. */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;

		/* A shut-down receive side reads EOF immediately. */
		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;
			
		/* Nothing acked beyond what the user already copied. */
		if (sk->acked_seq == sk->copied_seq)
			break;

		/* There is unread data.  Report readable unless the ONLY
		 * unread byte is the urgent byte and it is being kept out
		 * of band (not urginline) - in that case plain read()
		 * would see nothing.
		 */
		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->err)
			return 1;
		/* Sending side closed: never writable again. */
		if (sk->shutdown & SEND_SHUTDOWN) 
			return 0;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 * This is now right thanks to a small fix
		 * by Matt Dillon.
		 */

		/* Writable only if a full MSS frame (plus headers and
		   slack) fits in the send buffer. */
		if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;
		return 1;

	case SEL_EX:
		/* Exceptional condition == urgent data pending. */
		if (sk->urg_data)
			return 1;
		break;
	}
	/* Not ready: sleep until some socket event wakes us. */
	select_wait(sk->sleep, wait);
	return 0;
}
1240 
1241 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
     /* [previous][next][first][last][top][bottom][index][help] */
1242 {
1243         int err;
1244         switch(cmd) 
1245         {
1246 
1247                 case TIOCINQ:
1248 #ifdef FIXME    /* FIXME: */
1249                 case FIONREAD:
1250 #endif
1251                 {
1252                         unsigned long amount;
1253 
1254                         if (sk->state == TCP_LISTEN) 
1255                                 return(-EINVAL);
1256 
1257                         sk->inuse = 1;
1258                         amount = tcp_readable(sk);
1259                         release_sock(sk);
1260                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1261                         if(err)
1262                                 return err;
1263                         put_user(amount, (int *)arg);
1264                         return(0);
1265                 }
1266                 case SIOCATMARK:
1267                 {
1268                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
1269 
1270                         err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1271                         if (err)
1272                                 return err;
1273                         put_user(answ,(int *) arg);
1274                         return(0);
1275                 }
1276                 case TIOCOUTQ:
1277                 {
1278                         unsigned long amount;
1279 
1280                         if (sk->state == TCP_LISTEN) return(-EINVAL);
1281                         amount = sk->prot->wspace(sk);
1282                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1283                         if(err)
1284                                 return err;
1285                         put_user(amount, (int *)arg);
1286                         return(0);
1287                 }
1288                 default:
1289                         return(-EINVAL);
1290         }
1291 }
1292 
1293 
1294 /*
1295  *      This routine computes a TCP checksum. 
1296  *
1297  *      Modified January 1995 from a go-faster DOS routine by
1298  *      Jorge Cwik <jorge@laser.satlink.net>
1299  */
1300  
/*
 *	Finish the TCP checksum: fold the pseudo-header (saddr, daddr,
 *	protocol, length) into 'base', the partial one's-complement sum of
 *	the segment already computed by csum_partial.  'th' is unused here
 *	but kept for the established call signature.
 */
unsigned short tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr, unsigned long base)
{     
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}
1306 
1307 
1308 
1309 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
     /* [previous][next][first][last][top][bottom][index][help] */
1310                 unsigned long daddr, int len, struct sock *sk)
1311 {
1312         th->check = 0;
1313         th->check = tcp_check(th, len, saddr, daddr,
1314                 csum_partial((char *)th,len,0));
1315         return;
1316 }
1317 
1318 /*
1319  *      This is the main buffer sending routine. We queue the buffer
1320  *      having checked it is sane seeming.
1321  */
1322  
1323 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
     /* [previous][next][first][last][top][bottom][index][help] */
1324 {
1325         int size;
1326         struct tcphdr * th = skb->h.th;
1327 
1328         /*
1329          *      length of packet (not counting length of pre-tcp headers) 
1330          */
1331          
1332         size = skb->len - ((unsigned char *) th - skb->data);
1333 
1334         /*
1335          *      Sanity check it.. 
1336          */
1337          
1338         if (size < sizeof(struct tcphdr) || size > skb->len) 
1339         {
1340                 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1341                         skb, skb->data, th, skb->len);
1342                 kfree_skb(skb, FREE_WRITE);
1343                 return;
1344         }
1345 
1346         /*
1347          *      If we have queued a header size packet.. (these crash a few
1348          *      tcp stacks if ack is not set)
1349          */
1350          
1351         if (size == sizeof(struct tcphdr)) 
1352         {
1353                 /* If it's got a syn or fin it's notionally included in the size..*/
1354                 if(!th->syn && !th->fin) 
1355                 {
1356                         printk("tcp_send_skb: attempt to queue a bogon.\n");
1357                         kfree_skb(skb,FREE_WRITE);
1358                         return;
1359                 }
1360         }
1361 
1362         /*
1363          *      Actual processing.
1364          */
1365          
1366         tcp_statistics.TcpOutSegs++;  
1367         skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1368         
1369         /*
1370          *      We must queue if
1371          *
1372          *      a) The right edge of this frame exceeds the window
1373          *      b) We are retransmitting (Nagle's rule)
1374          *      c) We have too many packets 'in flight'
1375          */
1376          
1377         if (after(skb->h.seq, sk->window_seq) ||
1378             (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1379              sk->packets_out >= sk->cong_window) 
1380         {
1381                 /* checksum will be supplied by tcp_write_xmit.  So
1382                  * we shouldn't need to set it at all.  I'm being paranoid */
1383                 th->check = 0;
1384                 if (skb->next != NULL) 
1385                 {
1386                         printk("tcp_send_partial: next != NULL\n");
1387                         skb_unlink(skb);
1388                 }
1389                 skb_queue_tail(&sk->write_queue, skb);
1390                 
1391                 /*
1392                  *      If we don't fit we have to start the zero window
1393                  *      probes. This is broken - we really need to do a partial
1394                  *      send _first_ (This is what causes the Cisco and PC/TCP
1395                  *      grief).
1396                  */
1397                  
1398                 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1399                     sk->send_head == NULL && sk->ack_backlog == 0)
1400                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1401         } 
1402         else 
1403         {
1404                 /*
1405                  *      This is going straight out
1406                  */
1407                  
1408                 th->ack_seq = ntohl(sk->acked_seq);
1409                 th->window = ntohs(tcp_select_window(sk));
1410 
1411                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1412 
1413                 sk->sent_seq = sk->write_seq;
1414                 
1415                 /*
1416                  *      This is mad. The tcp retransmit queue is put together
1417                  *      by the ip layer. This causes half the problems with
1418                  *      unroutable FIN's and other things.
1419                  */
1420                  
1421                 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1422                 
1423                 /*
1424                  *      Set for next retransmit based on expected ACK time.
1425                  *      FIXME: We set this every time which means our 
1426                  *      retransmits are really about a window behind.
1427                  */
1428 
1429                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1430         }
1431 }
1432 
1433 /*
1434  *      Locking problems lead us to a messy situation where we can have
1435  *      multiple partially complete buffers queued up. This is really bad
1436  *      as we don't want to be sending partial buffers. Fix this with
1437  *      a semaphore or similar to lock tcp_write per socket.
1438  *
1439  *      These routines are pretty self descriptive.
1440  */
1441  
1442 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1443 {
1444         struct sk_buff * skb;
1445         unsigned long flags;
1446 
1447         save_flags(flags);
1448         cli();
1449         skb = sk->partial;
1450         if (skb) {
1451                 sk->partial = NULL;
1452                 del_timer(&sk->partial_timer);
1453         }
1454         restore_flags(flags);
1455         return skb;
1456 }
1457 
1458 /*
1459  *      Empty the partial queue
1460  */
1461  
1462 static void tcp_send_partial(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1463 {
1464         struct sk_buff *skb;
1465 
1466         if (sk == NULL)
1467                 return;
1468         while ((skb = tcp_dequeue_partial(sk)) != NULL)
1469                 tcp_send_skb(sk, skb);
1470 }
1471 
1472 /*
1473  *      Queue a partial frame
1474  */
1475  
1476 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1477 {
1478         struct sk_buff * tmp;
1479         unsigned long flags;
1480 
1481         save_flags(flags);
1482         cli();
1483         tmp = sk->partial;
1484         if (tmp)
1485                 del_timer(&sk->partial_timer);
1486         sk->partial = skb;
1487         init_timer(&sk->partial_timer);
1488         /*
1489          *      Wait up to 1 second for the buffer to fill.
1490          */
1491         sk->partial_timer.expires = jiffies+HZ;
1492         sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
1493         sk->partial_timer.data = (unsigned long) sk;
1494         add_timer(&sk->partial_timer);
1495         restore_flags(flags);
1496         if (tmp)
1497                 tcp_send_skb(sk, tmp);
1498 }
1499 
1500 
/*
 *      This routine sends an ack and also updates the window.
 *
 *      'sequence' and 'ack' are in host order; 'th' is the header of the
 *      segment being answered and supplies the port pair we swap back.
 *      If no memory is available the ack is merely recorded in
 *      ack_backlog and the write timer is used to retry shortly - ACKs
 *      are unreliable anyway, so dropping one is legal.
 */
 
static void tcp_send_ack(u32 sequence, u32 ack,
             struct sock *sk,
             struct tcphdr *th, unsigned long daddr)
{
        struct sk_buff *buff;
        struct tcphdr *t1;
        struct device *dev = NULL;
        int tmp;

        if(sk->zapped)
                return;         /* We have been reset, we may not send again */
                
        /*
         * We need to grab some memory, and put together an ack,
         * and then put it into the queue to be sent.
         */

        buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
        if (buff == NULL) 
        {
                /* 
                 *      Force it to send an ack. We don't have to do this
                 *      (ACK is unreliable) but it's much better use of 
                 *      bandwidth on slow links to send a spare ack than
                 *      resend packets. 
                 */
                 
                sk->ack_backlog++;
                if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) 
                {
                        reset_xmit_timer(sk, TIME_WRITE, HZ);
                }
                return;
        }

        /*
         *      Assemble a suitable TCP frame
         */
         
        buff->sk = sk;
        buff->localroute = sk->localroute;

        /* 
         *      Put in the IP header and routing stuff. 
         */
         
        tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                /* No route - quietly drop the ack; it will be regenerated */
                buff->free = 1;
                sk->prot->wfree(sk, buff);
                return;
        }
        t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

        memcpy(t1, th, sizeof(*t1));

        /*
         *      Swap the send and the receive. 
         */
         
        t1->dest = th->source;
        t1->source = th->dest;
        /* ntohl/ntohs perform the same byte swap as htonl/htons, so the
         * fields below still go out in network order */
        t1->seq = ntohl(sequence);
        t1->ack = 1;
        sk->window = tcp_select_window(sk);
        t1->window = ntohs(sk->window);
        t1->res1 = 0;
        t1->res2 = 0;
        t1->rst = 0;
        t1->urg = 0;
        t1->syn = 0;
        t1->psh = 0;
        t1->fin = 0;
        
        /*
         *      If we have nothing queued for transmit and the transmit timer
         *      is on we are just doing an ACK timeout and need to switch
         *      to a keepalive.
         */
         
        if (ack == sk->acked_seq) 
        {
                /* This ack covers everything received - clear the backlog */
                sk->ack_backlog = 0;
                sk->bytes_rcv = 0;
                sk->ack_timed = 0;
                if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
                                  && sk->ip_xmit_timeout == TIME_WRITE) 
                {
                        if(sk->keepopen) {
                                reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
                        } else {
                                delete_timer(sk);
                        }
                }
        }
        
        /*
         *      Fill in the packet and send it
         */
         
        t1->ack_seq = ntohl(ack);
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
        if (sk->debug)
                 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
        tcp_statistics.TcpOutSegs++;
        /* Final argument 1: frame may be freed once transmitted */
        sk->prot->queue_xmit(sk, dev, buff, 1);
}
1615 
1616 
1617 /* 
1618  *      This routine builds a generic TCP header. 
1619  */
1620  
1621 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
     /* [previous][next][first][last][top][bottom][index][help] */
1622 {
1623 
1624         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1625         th->seq = htonl(sk->write_seq);
1626         th->psh =(push == 0) ? 1 : 0;
1627         th->doff = sizeof(*th)/4;
1628         th->ack = 1;
1629         th->fin = 0;
1630         sk->ack_backlog = 0;
1631         sk->bytes_rcv = 0;
1632         sk->ack_timed = 0;
1633         th->ack_seq = htonl(sk->acked_seq);
1634         sk->window = tcp_select_window(sk);
1635         th->window = htons(sk->window);
1636 
1637         return(sizeof(*th));
1638 }
1639 
/*
 *      This routine copies from a user buffer into a socket,
 *      and starts the transmit system.
 *
 *      Returns the number of bytes queued/sent, or a negative errno if
 *      nothing was copied. Once some data has been copied, errors are
 *      suppressed and the partial count is returned instead. May sleep
 *      (unless 'nonblock') waiting for connection establishment or for
 *      send buffer memory.
 */

static int tcp_write(struct sock *sk, const unsigned char *from,
          int len, int nonblock, unsigned flags)
{
        int copied = 0;
        int copy;
        int tmp;
        struct sk_buff *skb;
        struct sk_buff *send_tmp;
        struct proto *prot;
        struct device *dev = NULL;

        /* Lock the socket against the bottom half for the duration */
        sk->inuse=1;
        prot = sk->prot;
        while(len > 0) 
        {
                if (sk->err) 
                {                       /* Stop on an error */
                        release_sock(sk);
                        if (copied) 
                                return(copied);
                        tmp = -sk->err;
                        sk->err = 0;
                        return(tmp);
                }

                /*
                 *      First thing we do is make sure that we are established. 
                 */
        
                if (sk->shutdown & SEND_SHUTDOWN) 
                {
                        release_sock(sk);
                        sk->err = EPIPE;
                        if (copied) 
                                return(copied);
                        sk->err = 0;
                        return(-EPIPE);
                }

                /* 
                 *      Wait for a connection to finish.
                 */
        
                while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
                {
                        if (sk->err) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);
                                tmp = -sk->err;
                                sk->err = 0;
                                return(tmp);
                        }

                        /* Not connecting either - the connection is dead */
                        if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);

                                if (sk->err) 
                                {
                                        tmp = -sk->err;
                                        sk->err = 0;
                                        return(tmp);
                                }

                                if (sk->keepopen) 
                                {
                                        send_sig(SIGPIPE, current, 0);
                                }
                                return(-EPIPE);
                        }

                        if (nonblock || copied) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);
                                return(-EAGAIN);
                        }

                        /* Sleep until the state changes; recheck under cli()
                         * to avoid losing a wakeup between test and sleep */
                        release_sock(sk);
                        cli();
                
                        if (sk->state != TCP_ESTABLISHED &&
                                sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
                        {
                                interruptible_sleep_on(sk->sleep);
                                if (current->signal & ~current->blocked) 
                                {
                                        sti();
                                        if (copied) 
                                                return(copied);
                                        return(-ERESTARTSYS);
                                }
                        }
                        sk->inuse = 1;
                        sti();
                }

        /*
         * The following code can result in copy <= 0 if sk->mss is ever
         * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
         * sk->mtu is constant once SYN processing is finished.  I.e. we
         * had better not get here until we've seen his SYN and at least one
         * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
         * But ESTABLISHED should guarantee that.  sk->max_window is by definition
         * non-decreasing.  Note that any ioctl to set user_mss must be done
         * before the exchange of SYN's.  If the initial ack from the other
         * end has a window of 0, max_window and thus mss will both be 0.
         */

        /* 
         *      Now we need to check if we have a half built packet. 
         */

                if ((skb = tcp_dequeue_partial(sk)) != NULL) 
                {
                        int hdrlen;

                         /* IP header + TCP header */
                        hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
                                 + sizeof(struct tcphdr);
        
                        /* Add more stuff to the end of skb->len */
                        if (!(flags & MSG_OOB)) 
                        {
                                /* Top the partial frame up to at most one MSS */
                                copy = min(sk->mss - (skb->len - hdrlen), len);
                                /* FIXME: this is really a bug. */
                                if (copy <= 0) 
                                {
                                        printk("TCP: **bug**: \"copy\" <= 0!!\n");
                                        copy = 0;
                                }
          
                                memcpy_fromfs(skb_put(skb,copy), from, copy);
                                from += copy;
                                copied += copy;
                                len -= copy;
                                sk->write_seq += copy;
                        }
                        /* Full frame, urgent data, or nothing in flight:
                         * send now. Otherwise park it again (Nagle). */
                        if ((skb->len - hdrlen) >= sk->mss ||
                                (flags & MSG_OOB) || !sk->packets_out)
                                tcp_send_skb(sk, skb);
                        else
                                tcp_enqueue_partial(skb, sk);
                        continue;
                }

        /*
         * We also need to worry about the window.
         * If window < 1/2 the maximum window we've seen from this
         *   host, don't use it.  This is sender side
         *   silly window prevention, as specified in RFC1122.
         *   (Note that this is different than earlier versions of
         *   SWS prevention, e.g. RFC813.).  What we actually do is 
         *   use the whole MSS.  Since the results in the right
         *   edge of the packet being outside the window, it will
         *   be queued for later rather than sent.
         */

                copy = sk->window_seq - sk->write_seq;
                if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
                        copy = sk->mss;
                if (copy > len)
                        copy = len;

        /*
         *      We should really check the window here also. 
         */
         
                send_tmp = NULL;
                if (copy < sk->mss && !(flags & MSG_OOB)) 
                {
                        /*
                         *      We will release the socket in case we sleep here. 
                         */
                        release_sock(sk);
                        /*
                         *      NB: following must be mtu, because mss can be increased.
                         *      mss is always <= mtu 
                         */
                        skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
                        sk->inuse = 1;
                        /* Remember this frame is a candidate partial frame */
                        send_tmp = skb;
                } 
                else 
                {
                        /*
                         *      We will release the socket in case we sleep here. 
                         */
                        release_sock(sk);
                        skb = prot->wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
                        sk->inuse = 1;
                }

                /*
                 *      If we didn't get any memory, we need to sleep. 
                 */

                if (skb == NULL) 
                {
                        sk->socket->flags |= SO_NOSPACE;
                        if (nonblock) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);
                                return(-EAGAIN);
                        }

                        /*
                         *      FIXME: here is another race condition. 
                         */

                        tmp = sk->wmem_alloc;
                        release_sock(sk);
                        cli();
                        /*
                         *      Again we will try to avoid it. 
                         */
                        /* Only sleep if no write memory was freed meanwhile */
                        if (tmp <= sk->wmem_alloc &&
                                  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
                                && sk->err == 0) 
                        {
                                sk->socket->flags &= ~SO_NOSPACE;
                                interruptible_sleep_on(sk->sleep);
                                if (current->signal & ~current->blocked) 
                                {
                                        sti();
                                        if (copied) 
                                                return(copied);
                                        return(-ERESTARTSYS);
                                }
                        }
                        sk->inuse = 1;
                        sti();
                        continue;
                }

                skb->sk = sk;
                skb->free = 0;
                skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
        
                /*
                 * FIXME: we need to optimize this.
                 * Perhaps some hints here would be good.
                 */
                
                tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
                                 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
                if (tmp < 0 ) 
                {
                        prot->wfree(sk, skb);
                        release_sock(sk);
                        if (copied) 
                                return(copied);
                        return(tmp);
                }
                skb->dev = dev;
                skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
                tmp = tcp_build_header(skb->h.th, sk, len-copy);
                if (tmp < 0) 
                {
                        prot->wfree(sk, skb);
                        release_sock(sk);
                        if (copied) 
                                return(copied);
                        return(tmp);
                }

                if (flags & MSG_OOB) 
                {
                        skb->h.th->urg = 1;
                        skb->h.th->urg_ptr = ntohs(copy);
                }

                /* Copy the user data in after the headers */
                memcpy_fromfs(skb_put(skb,copy), from, copy);
                
                from += copy;
                copied += copy;
                len -= copy;
                skb->free = 0;
                sk->write_seq += copy;
        
                /* Sub-MSS frame while data is in flight: queue as partial */
                if (send_tmp != NULL && sk->packets_out) 
                {
                        tcp_enqueue_partial(send_tmp, sk);
                        continue;
                }
                tcp_send_skb(sk, skb);
        }
        sk->err = 0;

/*
 *      Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *      interactive fast network servers. It's meant to be on and
 *      it really improves the throughput though not the echo time
 *      on my slow slip link - Alan
 */

/*
 *      Avoid possible race on send_tmp - c/o Johannes Stille 
 */
 
        if(sk->partial && ((!sk->packets_out) 
     /* If not nagling we can send on the before case too.. */
              || (sk->nonagle && before(sk->write_seq , sk->window_seq))
        ))
                tcp_send_partial(sk);

        release_sock(sk);
        return(copied);
}
1961 
1962 /*
1963  *      This is just a wrapper. 
1964  */
1965 
1966 static int tcp_sendto(struct sock *sk, const unsigned char *from,
     /* [previous][next][first][last][top][bottom][index][help] */
1967            int len, int nonblock, unsigned flags,
1968            struct sockaddr_in *addr, int addr_len)
1969 {
1970         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1971                 return -EINVAL;
1972         if (sk->state == TCP_CLOSE)
1973                 return -ENOTCONN;
1974         if (addr_len < sizeof(*addr))
1975                 return -EINVAL;
1976         if (addr->sin_family && addr->sin_family != AF_INET) 
1977                 return -EINVAL;
1978         if (addr->sin_port != sk->dummy_th.dest) 
1979                 return -EISCONN;
1980         if (addr->sin_addr.s_addr != sk->daddr) 
1981                 return -EISCONN;
1982         return tcp_write(sk, from, len, nonblock, flags);
1983 }
1984 
1985 
/*
 *      Send an ack if one is backlogged at this point. Ought to merge
 *      this with tcp_send_ack().
 *
 *      Called from the read path (cleanup_rbuf) to push out a window
 *      update once the user has drained receive buffers. Does nothing
 *      when no ack is owed or the connection is closed/closing.
 */
 
static void tcp_read_wakeup(struct sock *sk)
{
        int tmp;
        struct device *dev = NULL;
        struct tcphdr *t1;
        struct sk_buff *buff;

        if (!sk->ack_backlog) 
                return;

        /*
         * If we're closed, don't send an ack, or we'll get a RST
         * from the closed destination.
         */
        if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
                return; 

        /*
         * FIXME: we need to put code here to prevent this routine from
         * being called.  Being called once in a while is ok, so only check
         * if this is the second time in a row.
         */

        /*
         * We need to grab some memory, and put together an ack,
         * and then put it into the queue to be sent.
         */

        buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
        if (buff == NULL) 
        {
                /* Try again real soon. */
                reset_xmit_timer(sk, TIME_WRITE, HZ);
                return;
        }

        buff->sk = sk;
        buff->localroute = sk->localroute;
        
        /*
         *      Put in the IP header and routing stuff. 
         */

        tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
                               IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                /* No route - drop the ack; a later one will carry the news */
                buff->free = 1;
                sk->prot->wfree(sk, buff);
                return;
        }

        t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

        /* Build the ack from the socket's template header */
        memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
        t1->seq = htonl(sk->sent_seq);
        t1->ack = 1;
        t1->res1 = 0;
        t1->res2 = 0;
        t1->rst = 0;
        t1->urg = 0;
        t1->syn = 0;
        t1->psh = 0;
        sk->ack_backlog = 0;
        sk->bytes_rcv = 0;
        /* Advertise the newly freed receive space */
        sk->window = tcp_select_window(sk);
        /* ntohs/ntohl swap identically to htons/htonl - still network order */
        t1->window = ntohs(sk->window);
        t1->ack_seq = ntohl(sk->acked_seq);
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
        sk->prot->queue_xmit(sk, dev, buff, 1);
        tcp_statistics.TcpOutSegs++;
}
2064 
2065 
/*
 *      FIXME:
 *      This routine frees used buffers.
 *      It should consider sending an ACK to let the
 *      other end know we now have a bigger window.
 *
 *      Called after the user has consumed receive data: reclaims fully
 *      read skbs from the receive queue and, if that changed the
 *      advertised window materially, arranges for a window-update ack
 *      either immediately or via a short timer.
 */

static void cleanup_rbuf(struct sock *sk)
{
        unsigned long flags;
        unsigned long left;
        struct sk_buff *skb;
        unsigned long rspace;

        if(sk->debug)
                printk("cleaning rbuf for sk=%p\n", sk);
  
        save_flags(flags);
        cli();
  
        /* Receive space before reclaiming, to measure the change below */
        left = sk->prot->rspace(sk);
 
        /*
         *      We have to loop through all the buffer headers,
         *      and try to free up all the space we can.
         */

        while((skb=skb_peek(&sk->receive_queue)) != NULL) 
        {
                /* Stop at the first skb still unread or still referenced */
                if (!skb->used || skb->users) 
                        break;
                skb_unlink(skb);
                skb->sk = sk;
                kfree_skb(skb, FREE_READ);
        }

        restore_flags(flags);

        /*
         *      FIXME:
         *      At this point we should send an ack if the difference
         *      in the window, and the amount of space is bigger than
         *      TCP_WINDOW_DIFF.
         */

        if(sk->debug)
                printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
                                            left);
        if ((rspace=sk->prot->rspace(sk)) != left) 
        {
                /*
                 * This area has caused the most trouble.  The current strategy
                 * is to simply do nothing if the other end has room to send at
                 * least 3 full packets, because the ack from those will auto-
                 * matically update the window.  If the other end doesn't think
                 * we have much space left, but we have room for at least 1 more
                 * complete packet than it thinks we do, we will send an ack
                 * immediately.  Otherwise we will wait up to .5 seconds in case
                 * the user reads some more.
                 */
                sk->ack_backlog++;
        /*
         * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
         * if the other end is offering a window smaller than the agreed on MSS
         * (called sk->mtu here).  In theory there's no connection between send
         * and receive, and so no reason to think that they're going to send
         * small packets.  For the moment I'm using the hack of reducing the mss
         * only on the send side, so I'm putting mtu here.
         */

                if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
                {
                        /* Send an ack right now. */
                        tcp_read_wakeup(sk);
                } 
                else 
                {
                        /* Force it to send an ack soon. */
                        int was_active = del_timer(&sk->retransmit_timer);
                        /* NOTE(review): this compares against sk->timer.expires
                         * although the timer deleted above is
                         * sk->retransmit_timer - looks suspicious; confirm
                         * which timer is intended before changing. */
                        if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires) 
                        {
                                reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
                        } 
                        else
                                add_timer(&sk->retransmit_timer);
                }
        }
} 
2154 
2155 
2156 /*
2157  *      Handle reading urgent data. BSD has very simple semantics for
2158  *      this, no blocking and very strange errors 8)
2159  */
2160  
static int tcp_read_urg(struct sock * sk, int nonblock,
             unsigned char *to, int len, unsigned flags)
{
        /*
         *      Copy at most one byte of urgent data to 'to'.
         *      Note: 'len' and 'nonblock' are deliberately unused - at most
         *      one urgent byte exists and, per BSD semantics, this call
         *      never blocks regardless of the socket's blocking mode.
         *
         *      Returns 1 (byte copied), 0 (connection finished),
         *      -EINVAL (no urgent data pending / already consumed),
         *      a pending socket error, -ENOTCONN, or -EAGAIN (urgent
         *      data signalled but the byte has not arrived yet).
         */

        /*
         *      No URG data to read
         */
        if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
                return -EINVAL; /* Yes this is right ! */
                
        /* Report and clear any asynchronous error first. */
        if (sk->err) 
        {
                int tmp = -sk->err;
                sk->err = 0;
                return tmp;
        }

        /* First read after close returns 0 (EOF); later ones -ENOTCONN. */
        if (sk->state == TCP_CLOSE || sk->done) 
        {
                if (!sk->done) {
                        sk->done = 1;
                        return 0;
                }
                return -ENOTCONN;
        }

        if (sk->shutdown & RCV_SHUTDOWN) 
        {
                sk->done = 1;
                return 0;
        }

        /* Take the socket lock while we examine/consume the urgent byte. */
        sk->inuse = 1;
        if (sk->urg_data & URG_VALID) 
        {
                /* Low 8 bits of urg_data hold the byte itself. */
                char c = sk->urg_data;
                if (!(flags & MSG_PEEK))
                        sk->urg_data = URG_READ;        /* mark consumed */
                put_fs_byte(c, to);
                release_sock(sk);
                return 1;
        }
        release_sock(sk);
        
        /*
         * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
         * the available implementations agree in this case:
         * this call should never block, independent of the
         * blocking state of the socket.
         * Mike <pall@rz.uni-karlsruhe.de>
         */
        return -EAGAIN;
}
2212 
2213 
2214 /*
2215  *      This routine copies from a sock struct into the user buffer. 
2216  */
2217  
static int tcp_read(struct sock *sk, unsigned char *to,
        int len, int nonblock, unsigned flags)
{
        struct wait_queue wait = { current, NULL };
        int copied = 0;         /* bytes copied so far, or error code */
        u32 peek_seq;           /* private cursor used for MSG_PEEK */
        volatile u32 *seq;      /* So gcc doesn't overoptimise */
        unsigned long used;     /* bytes taken from the current skb */

        /* 
         *      This error should be checked. 
         */
         
        if (sk->state == TCP_LISTEN)
                return -ENOTCONN;

        /*
         *      Urgent data needs to be handled specially. 
         */
         
        if (flags & MSG_OOB)
                return tcp_read_urg(sk, nonblock, to, len, flags);

        /*
         *      Copying sequence to update. This is volatile to handle
         *      the multi-reader case neatly (memcpy_to/fromfs might be 
         *      inline and thus not flush cached variables otherwise).
         *      A PEEK advances a local copy so copied_seq is untouched.
         */
         
        peek_seq = sk->copied_seq;
        seq = &sk->copied_seq;
        if (flags & MSG_PEEK)
                seq = &peek_seq;

        add_wait_queue(sk->sleep, &wait);
        sk->inuse = 1;          /* take the socket lock */
        while (len > 0) 
        {
                struct sk_buff * skb;
                u32 offset;
        
                /*
                 * Are we at urgent data? Stop if we have read anything.
                 */
                 
                if (copied && sk->urg_data && sk->urg_seq == *seq)
                        break;

                /*
                 *      Next get a buffer.
                 */
                 
                /* Set before scanning the queue so a wakeup between the
                   scan and schedule() below is not lost. */
                current->state = TASK_INTERRUPTIBLE;

                /* Scan the receive queue for the skb containing *seq.
                   SYN occupies a sequence number but no data byte, hence
                   the offset--.  Fully-consumed non-FIN skbs are marked
                   used so cleanup_rbuf() can reclaim them. */
                skb = skb_peek(&sk->receive_queue);
                do 
                {
                        if (!skb)
                                break;
                        if (before(*seq, skb->h.th->seq))
                                break;
                        offset = *seq - skb->h.th->seq;
                        if (skb->h.th->syn)
                                offset--;
                        if (offset < skb->len)
                                goto found_ok_skb;
                        if (skb->h.th->fin)
                                goto found_fin_ok;
                        if (!(flags & MSG_PEEK))
                                skb->used = 1;
                        skb = skb->next;
                }
                while (skb != (struct sk_buff *)&sk->receive_queue);

                /* Nothing more in sequence: return what we already have. */
                if (copied)
                        break;

                if (sk->err) 
                {
                        copied = -sk->err;
                        sk->err = 0;
                        break;
                }

                /* First read on a closed connection is EOF, after that
                   it is an error. */
                if (sk->state == TCP_CLOSE) 
                {
                        if (!sk->done) 
                        {
                                sk->done = 1;
                                break;
                        }
                        copied = -ENOTCONN;
                        break;
                }

                if (sk->shutdown & RCV_SHUTDOWN) 
                {
                        sk->done = 1;
                        break;
                }
                        
                if (nonblock) 
                {
                        copied = -EAGAIN;
                        break;
                }

                /* No data yet: ack consumed data, drop the lock and sleep
                   until more arrives or a signal interrupts us. */
                cleanup_rbuf(sk);
                release_sock(sk);
                sk->socket->flags |= SO_WAITDATA;
                schedule();
                sk->socket->flags &= ~SO_WAITDATA;
                sk->inuse = 1;

                if (current->signal & ~current->blocked) 
                {
                        copied = -ERESTARTSYS;
                        break;
                }
                continue;

        found_ok_skb:
                /*
                 *      Lock the buffer. We can be fairly relaxed as
                 *      an interrupt will never steal a buffer we are 
                 *      using unless I've missed something serious in
                 *      tcp_data.
                 */
                
                skb->users++;
                
                /*
                 *      Ok so how much can we use ? 
                 */
                 
                used = skb->len - offset;
                if (len < used)
                        used = len;
                /*
                 *      Do we have urgent data here?  If the urgent byte
                 *      falls inside this chunk, either skip it (when not
                 *      inline) or truncate the copy just before it.
                 */
                
                if (sk->urg_data) 
                {
                        u32 urg_offset = sk->urg_seq - *seq;
                        if (urg_offset < used) 
                        {
                                if (!urg_offset) 
                                {
                                        if (!sk->urginline) 
                                        {
                                                ++*seq;
                                                offset++;
                                                used--;
                                        }
                                }
                                else
                                        used = urg_offset;
                        }
                }
                
                /*
                 *      Copy it - We _MUST_ update *seq first so that we
                 *      don't ever double read when we have dual readers
                 */
                 
                *seq += used;

                /*
                 *      This memcpy_tofs can sleep. If it sleeps and we
                 *      do a second read it relies on the skb->users to avoid
                 *      a crash when cleanup_rbuf() gets called.
                 */
                 
                memcpy_tofs(to,((unsigned char *)skb->h.th) +
                        skb->h.th->doff*4 + offset, used);
                copied += used;
                len -= used;
                to += used;
                
                /*
                 *      We now will not sleep again until we are finished
                 *      with skb. Sorry if you are doing the SMP port
                 *      but you'll just have to fix it neatly ;)
                 */
                 
                skb->users --;
                
                /* Note: tested against the real copied_seq, so a pure
                   PEEK never clears the pending-urgent flag. */
                if (after(sk->copied_seq,sk->urg_seq))
                        sk->urg_data = 0;
                if (used + offset < skb->len)
                        continue;
                
                /*
                 *      Process the FIN.
                 */

                if (skb->h.th->fin)
                        goto found_fin_ok;
                if (flags & MSG_PEEK)
                        continue;
                skb->used = 1;
                continue;

        found_fin_ok:
                /* The FIN consumes one sequence number of its own. */
                ++*seq;
                if (flags & MSG_PEEK)
                        break;
                        
                /*
                 *      All is done
                 */
                 
                skb->used = 1;
                sk->shutdown |= RCV_SHUTDOWN;
                break;

        }
        remove_wait_queue(sk->sleep, &wait);
        current->state = TASK_RUNNING;

        /* Clean up data we have read: This will do ACK frames */
        cleanup_rbuf(sk);
        release_sock(sk);
        return copied;
}
2444 
2445 /*
2446  *      State processing on a close. This implements the state shift for
2447  *      sending our FIN frame. Note that we only send a FIN for some 
2448  *      states. A shutdown() may have already sent the FIN, or we may be
2449  *      closed.
2450  */
2451  
2452 static int tcp_close_state(struct sock *sk, int dead)
     /* [previous][next][first][last][top][bottom][index][help] */
2453 {
2454         int ns=TCP_CLOSE;
2455         int send_fin=0;
2456         switch(sk->state)
2457         {
2458                 case TCP_SYN_SENT:      /* No SYN back, no FIN needed */
2459                         break;
2460                 case TCP_SYN_RECV:
2461                 case TCP_ESTABLISHED:   /* Closedown begin */
2462                         ns=TCP_FIN_WAIT1;
2463                         send_fin=1;
2464                         break;
2465                 case TCP_FIN_WAIT1:     /* Already closing, or FIN sent: no change */
2466                 case TCP_FIN_WAIT2:
2467                 case TCP_CLOSING:
2468                         ns=sk->state;
2469                         break;
2470                 case TCP_CLOSE:
2471                 case TCP_LISTEN:
2472                         break;
2473                 case TCP_CLOSE_WAIT:    /* They have FIN'd us. We send our FIN and
2474                                            wait only for the ACK */
2475                         ns=TCP_LAST_ACK;
2476                         send_fin=1;
2477         }
2478         
2479         tcp_set_state(sk,ns);
2480                 
2481         /*
2482          *      This is a (useful) BSD violating of the RFC. There is a
2483          *      problem with TCP as specified in that the other end could
2484          *      keep a socket open forever with no application left this end.
2485          *      We use a 3 minute timeout (about the same as BSD) then kill
2486          *      our end. If they send after that then tough - BUT: long enough
2487          *      that we won't make the old 4*rto = almost no time - whoops
2488          *      reset mistake.
2489          */
2490         if(dead && ns==TCP_FIN_WAIT2)
2491         {
2492                 int timer_active=del_timer(&sk->timer);
2493                 if(timer_active)
2494                         add_timer(&sk->timer);
2495                 else
2496                         reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2497         }
2498         
2499         return send_fin;
2500 }
2501 
2502 /*
2503  *      Send a fin.
2504  */
2505 
static void tcp_send_fin(struct sock *sk)
{
        /*
         *      Build and transmit (or queue) the FIN segment for this
         *      socket.  Called with the socket lock held; the lock is
         *      dropped around the possibly-sleeping allocation and
         *      re-taken afterwards.
         */
        struct proto *prot =(struct proto *)sk->prot;
        struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
        struct tcphdr *t1;
        struct sk_buff *buff;
        struct device *dev=NULL;
        int tmp;
                
        release_sock(sk); /* in case the malloc sleeps. */
        
        buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
        sk->inuse = 1;

        if (buff == NULL)
        {
                /* This is a disaster if it occurs */
                printk("tcp_send_fin: Impossible malloc failure");
                return;
        }

        /*
         *      Administrivia
         */
         
        buff->sk = sk;
        buff->localroute = sk->localroute;

        /*
         *      Put in the IP header and routing stuff. 
         */

        tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
                           IPPROTO_TCP, sk->opt,
                           sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                int t;
                /*
                 *      Finish anyway, treat this as a send that got lost. 
                 *      (Not good).  write_seq is still advanced so the
                 *      state machine believes a FIN went out, and a close
                 *      timer is armed unless one is already pending.
                 */
                 
                buff->free = 1;
                prot->wfree(sk,buff);
                sk->write_seq++;
                t=del_timer(&sk->timer);
                if(t)
                        add_timer(&sk->timer);
                else
                        reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                return;
        }
        
        /*
         *      We ought to check if the end of the queue is a buffer and
         *      if so simply add the fin to that buffer, not send it ahead.
         */

        t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
        buff->dev = dev;
        /* Start from the socket's template header, then fill in the
           sequence numbers and the FIN/ACK flags. */
        memcpy(t1, th, sizeof(*t1));
        /* NOTE(review): ntohl on the send path works only because it is
           the same operation as htonl on every supported platform. */
        t1->seq = ntohl(sk->write_seq);
        sk->write_seq++;                /* FIN consumes a sequence number */
        buff->h.seq = sk->write_seq;
        t1->ack = 1;
        t1->ack_seq = ntohl(sk->acked_seq);
        t1->window = ntohs(sk->window=tcp_select_window(sk));
        t1->fin = 1;
        t1->rst = 0;
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

        /*
         * If there is data in the write queue, the fin must be appended to
         * the write queue.  Otherwise it can go straight out, with the
         * retransmit timer armed in case it is lost.
         */
        
        if (skb_peek(&sk->write_queue) != NULL) 
        {
                buff->free = 0;
                if (buff->next != NULL) 
                {
                        /* Should never happen: buffer already on a list. */
                        printk("tcp_send_fin: next != NULL\n");
                        skb_unlink(buff);
                }
                skb_queue_tail(&sk->write_queue, buff);
        } 
        else 
        {
                sk->sent_seq = sk->write_seq;
                sk->prot->queue_xmit(sk, dev, buff, 0);
                reset_xmit_timer(sk, TIME_WRITE, sk->rto);
        }
}
2601 
2602 /*
2603  *      Shutdown the sending side of a connection. Much like close except
2604  *      that we don't receive shut down or set sk->dead=1.
2605  */
2606 
2607 void tcp_shutdown(struct sock *sk, int how)
     /* [previous][next][first][last][top][bottom][index][help] */
2608 {
2609         /*
2610          *      We need to grab some memory, and put together a FIN,
2611          *      and then put it into the queue to be sent.
2612          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2613          */
2614 
2615         if (!(how & SEND_SHUTDOWN)) 
2616                 return;
2617          
2618         /*
2619          *      If we've already sent a FIN, or it's a closed state
2620          */
2621          
2622         if (sk->state == TCP_FIN_WAIT1 ||
2623             sk->state == TCP_FIN_WAIT2 ||
2624             sk->state == TCP_CLOSING ||
2625             sk->state == TCP_LAST_ACK ||
2626             sk->state == TCP_TIME_WAIT || 
2627             sk->state == TCP_CLOSE ||
2628             sk->state == TCP_LISTEN
2629           )
2630         {
2631                 return;
2632         }
2633         sk->inuse = 1;
2634 
2635         /*
2636          * flag that the sender has shutdown
2637          */
2638 
2639         sk->shutdown |= SEND_SHUTDOWN;
2640 
2641         /*
2642          *  Clear out any half completed packets. 
2643          */
2644 
2645         if (sk->partial)
2646                 tcp_send_partial(sk);
2647                 
2648         /*
2649          *      FIN if needed
2650          */
2651          
2652         if(tcp_close_state(sk,0))
2653                 tcp_send_fin(sk);
2654                 
2655         release_sock(sk);
2656 }
2657 
2658 
2659 static int
2660 tcp_recvfrom(struct sock *sk, unsigned char *to,
     /* [previous][next][first][last][top][bottom][index][help] */
2661              int to_len, int nonblock, unsigned flags,
2662              struct sockaddr_in *addr, int *addr_len)
2663 {
2664         int result;
2665   
2666         /* 
2667          *      Have to check these first unlike the old code. If 
2668          *      we check them after we lose data on an error
2669          *      which is wrong 
2670          */
2671 
2672         if(addr_len)
2673                 *addr_len = sizeof(*addr);
2674         result=tcp_read(sk, to, to_len, nonblock, flags);
2675 
2676         if (result < 0) 
2677                 return(result);
2678   
2679         if(addr)
2680         {
2681                 addr->sin_family = AF_INET;
2682                 addr->sin_port = sk->dummy_th.dest;
2683                 addr->sin_addr.s_addr = sk->daddr;
2684         }
2685         return(result);
2686 }
2687 
2688 
2689 /*
2690  *      This routine will send an RST to the other tcp. 
2691  */
2692  
static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
          struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
        /*
         *      Build and transmit an RST in response to the segment 'th'
         *      received from daddr->saddr.  The reset is sourced from a
         *      NULL socket since there may be no local socket for it.
         */
        struct sk_buff *buff;
        struct tcphdr *t1;
        int tmp;
        struct device *ndev=NULL;

        /*
         *      Cannot reset a reset (Think about it).
         */
         
        if(th->rst)
                return;
  
        /*
         * We need to grab some memory, and put together an RST,
         * and then put it into the queue to be sent.
         */

        buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
        if (buff == NULL) 
                return;         /* no memory: silently drop, peer will retry */

        buff->sk = NULL;
        buff->dev = dev;
        buff->localroute = 0;

        /*
         *      Put in the IP header and routing stuff. 
         */

        tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
                           sizeof(struct tcphdr),tos,ttl);
        if (tmp < 0) 
        {
                buff->free = 1;
                prot->wfree(NULL, buff);
                return;
        }

        /* Start from a copy of the offending header, then rewrite it. */
        t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
        memcpy(t1, th, sizeof(*t1));

        /*
         *      Swap the send and the receive. 
         */

        t1->dest = th->source;
        t1->source = th->dest;
        t1->rst = 1;  
        t1->window = 0;
  
        /*
         *      Sequence/ack choice per RFC 793 reset generation:
         *      if the incoming segment carried an ACK, our RST takes
         *      its sequence from that ack field and carries no ACK;
         *      otherwise we send seq 0 and ACK everything they sent
         *      (their seq, +1 if it was a SYN).
         */
        if(th->ack)
        {
                t1->ack = 0;
                t1->seq = th->ack_seq;
                t1->ack_seq = 0;
        }
        else
        {
                t1->ack = 1;
                if(!th->syn)
                        t1->ack_seq=htonl(th->seq);
                else
                        t1->ack_seq=htonl(th->seq+1);
                t1->seq=0;
        }

        /* Clear every other flag: a reset is just RST (+ACK). */
        t1->syn = 0;
        t1->urg = 0;
        t1->fin = 0;
        t1->psh = 0;
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
        prot->queue_xmit(NULL, ndev, buff, 1);
        tcp_statistics.TcpOutSegs++;
}
2771 
2772 
2773 /*
2774  *      Look for tcp options. Parses everything but only knows about MSS.
2775  *      This routine is always called with the packet containing the SYN.
2776  *      However it may also be called with the ack to the SYN.  So you
2777  *      can't assume this is always the SYN.  It's always called after
2778  *      we have set up sk->mtu to our own MTU.
2779  *
2780  *      We need at minimum to add PAWS support here. Possibly large windows
2781  *      as Linux gets deployed on 100Mb/sec networks.
2782  */
2783  
2784 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
2785 {
2786         unsigned char *ptr;
2787         int length=(th->doff*4)-sizeof(struct tcphdr);
2788         int mss_seen = 0;
2789     
2790         ptr = (unsigned char *)(th + 1);
2791   
2792         while(length>0)
2793         {
2794                 int opcode=*ptr++;
2795                 int opsize=*ptr++;
2796                 switch(opcode)
2797                 {
2798                         case TCPOPT_EOL:
2799                                 return;
2800                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
2801                                 length--;
2802                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
2803                                 continue;
2804                         
2805                         default:
2806                                 if(opsize<=2)   /* Avoid silly options looping forever */
2807                                         return;
2808                                 switch(opcode)
2809                                 {
2810                                         case TCPOPT_MSS:
2811                                                 if(opsize==4 && th->syn)
2812                                                 {
2813                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2814                                                         mss_seen = 1;
2815                                                 }
2816                                                 break;
2817                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2818                                 }
2819                                 ptr+=opsize-2;
2820                                 length-=opsize;
2821                 }
2822         }
2823         if (th->syn) 
2824         {
2825                 if (! mss_seen)
2826                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2827         }
2828 #ifdef CONFIG_INET_PCTCP
2829         sk->mss = min(sk->max_window >> 1, sk->mtu);
2830 #else    
2831         sk->mss = min(sk->max_window, sk->mtu);
2832 #endif  
2833 }
2834 
/*
 *      Classful default netmask for 'dst' (network byte order in,
 *      network byte order out).
 */
static inline unsigned long default_mask(unsigned long dst)
{
        unsigned long host = ntohl(dst);
        unsigned long mask;

        if (IN_CLASSA(host))
                mask = IN_CLASSA_NET;
        else if (IN_CLASSB(host))
                mask = IN_CLASSB_NET;
        else
                mask = IN_CLASSC_NET;

        return htonl(mask);
}
2844 
2845 /*
2846  *      Default sequence number picking algorithm.
2847  *      As close as possible to RFC 793, which
2848  *      suggests using a 250kHz clock.
2849  *      Further reading shows this assumes 2MB/s networks.
2850  *      For 10MB/s ethernet, a 1MHz clock is appropriate.
2851  *      That's funny, Linux has one built in!  Use it!
2852  */
2853 
2854 extern inline u32 tcp_init_seq(void)
     /* [previous][next][first][last][top][bottom][index][help] */
2855 {
2856         struct timeval tv;
2857         do_gettimeofday(&tv);
2858         return tv.tv_usec+tv.tv_sec*1000000;
2859 }
2860 
2861 /*
2862  *      This routine handles a connection request.
2863  *      It should make sure we haven't already responded.
2864  *      Because of the way BSD works, we have to send a syn/ack now.
2865  *      This also means it will be harder to close a socket which is
2866  *      listening.
2867  */
2868  
2869 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
     /* [previous][next][first][last][top][bottom][index][help] */
2870                  unsigned long daddr, unsigned long saddr,
2871                  struct options *opt, struct device *dev, u32 seq)
2872 {
2873         struct sk_buff *buff;
2874         struct tcphdr *t1;
2875         unsigned char *ptr;
2876         struct sock *newsk;
2877         struct tcphdr *th;
2878         struct device *ndev=NULL;
2879         int tmp;
2880         struct rtable *rt;
2881   
2882         th = skb->h.th;
2883 
2884         /* If the socket is dead, don't accept the connection. */
2885         if (!sk->dead) 
2886         {
2887                 sk->data_ready(sk,0);
2888         }
2889         else 
2890         {
2891                 if(sk->debug)
2892                         printk("Reset on %p: Connect on dead socket.\n",sk);
2893                 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2894                 tcp_statistics.TcpAttemptFails++;
2895                 kfree_skb(skb, FREE_READ);
2896                 return;
2897         }
2898 
2899         /*
2900          * Make sure we can accept more.  This will prevent a
2901          * flurry of syns from eating up all our memory.
2902          */
2903 
2904         if (sk->ack_backlog >= sk->max_ack_backlog) 
2905         {
2906                 tcp_statistics.TcpAttemptFails++;
2907                 kfree_skb(skb, FREE_READ);
2908                 return;
2909         }
2910 
2911         /*
2912          * We need to build a new sock struct.
2913          * It is sort of bad to have a socket without an inode attached
2914          * to it, but the wake_up's will just wake up the listening socket,
2915          * and if the listening socket is destroyed before this is taken
2916          * off of the queue, this will take care of it.
2917          */
2918 
2919         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2920         if (newsk == NULL) 
2921         {
2922                 /* just ignore the syn.  It will get retransmitted. */
2923                 tcp_statistics.TcpAttemptFails++;
2924                 kfree_skb(skb, FREE_READ);
2925                 return;
2926         }
2927 
2928         memcpy(newsk, sk, sizeof(*newsk));
2929         newsk->opt = NULL;
2930         if (opt && opt->optlen) {
2931           sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
2932           if (!sk->opt) {
2933                 kfree_s(newsk, sizeof(struct sock));
2934                 tcp_statistics.TcpAttemptFails++;
2935                 kfree_skb(skb, FREE_READ);
2936                 return;
2937           }
2938           if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {
2939                 kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
2940                 kfree_s(newsk, sizeof(struct sock));
2941                 tcp_statistics.TcpAttemptFails++;
2942                 kfree_skb(skb, FREE_READ);
2943                 return;
2944           }
2945         }
2946         skb_queue_head_init(&newsk->write_queue);
2947         skb_queue_head_init(&newsk->receive_queue);
2948         newsk->send_head = NULL;
2949         newsk->send_tail = NULL;
2950         skb_queue_head_init(&newsk->back_log);
2951         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
2952         newsk->rto = TCP_TIMEOUT_INIT;
2953         newsk->mdev = 0;
2954         newsk->max_window = 0;
2955         newsk->cong_window = 1;
2956         newsk->cong_count = 0;
2957         newsk->ssthresh = 0;
2958         newsk->backoff = 0;
2959         newsk->blog = 0;
2960         newsk->intr = 0;
2961         newsk->proc = 0;
2962         newsk->done = 0;
2963         newsk->partial = NULL;
2964         newsk->pair = NULL;
2965         newsk->wmem_alloc = 0;
2966         newsk->rmem_alloc = 0;
2967         newsk->localroute = sk->localroute;
2968 
2969         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
2970 
2971         newsk->err = 0;
2972         newsk->shutdown = 0;
2973         newsk->ack_backlog = 0;
2974         newsk->acked_seq = skb->h.th->seq+1;
2975         newsk->copied_seq = skb->h.th->seq+1;
2976         newsk->fin_seq = skb->h.th->seq;
2977         newsk->state = TCP_SYN_RECV;
2978         newsk->timeout = 0;
2979         newsk->ip_xmit_timeout = 0;
2980         newsk->write_seq = seq; 
2981         newsk->window_seq = newsk->write_seq;
2982         newsk->rcv_ack_seq = newsk->write_seq;
2983         newsk->urg_data = 0;
2984         newsk->retransmits = 0;
2985         newsk->linger=0;
2986         newsk->destroy = 0;
2987         init_timer(&newsk->timer);
2988         newsk->timer.data = (unsigned long)newsk;
2989         newsk->timer.function = &net_timer;
2990         init_timer(&newsk->retransmit_timer);
2991         newsk->retransmit_timer.data = (unsigned long)newsk;
2992         newsk->retransmit_timer.function=&retransmit_timer;
2993         newsk->dummy_th.source = skb->h.th->dest;
2994         newsk->dummy_th.dest = skb->h.th->source;
2995         
2996         /*
2997          *      Swap these two, they are from our point of view. 
2998          */
2999          
3000         newsk->daddr = saddr;
3001         newsk->saddr = daddr;
3002 
3003         put_sock(newsk->num,newsk);
3004         newsk->dummy_th.res1 = 0;
3005         newsk->dummy_th.doff = 6;
3006         newsk->dummy_th.fin = 0;
3007         newsk->dummy_th.syn = 0;
3008         newsk->dummy_th.rst = 0;        
3009         newsk->dummy_th.psh = 0;
3010         newsk->dummy_th.ack = 0;
3011         newsk->dummy_th.urg = 0;
3012         newsk->dummy_th.res2 = 0;
3013         newsk->acked_seq = skb->h.th->seq + 1;
3014         newsk->copied_seq = skb->h.th->seq + 1;
3015         newsk->socket = NULL;
3016 
3017         /*
3018          *      Grab the ttl and tos values and use them 
3019          */
3020 
3021         newsk->ip_ttl=sk->ip_ttl;
3022         newsk->ip_tos=skb->ip_hdr->tos;
3023 
3024         /*
3025          *      Use 512 or whatever user asked for 
3026          */
3027 
3028         /*
3029          *      Note use of sk->user_mss, since user has no direct access to newsk 
3030          */
3031 
3032         rt=ip_rt_route(saddr, NULL,NULL);
3033         
3034         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3035                 newsk->window_clamp = rt->rt_window;
3036         else
3037                 newsk->window_clamp = 0;
3038                 
3039         if (sk->user_mss)
3040                 newsk->mtu = sk->user_mss;
3041         else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
3042                 newsk->mtu = rt->rt_mss - sizeof(struct iphdr) - sizeof(struct tcphdr);
3043         else 
3044         {
3045 #ifdef CONFIG_INET_SNARL        /* Sub Nets Are Local */
3046                 if ((saddr ^ daddr) & default_mask(saddr))
3047 #else
3048                 if ((saddr ^ daddr) & dev->pa_mask)
3049 #endif
3050                         newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
3051                 else
3052                         newsk->mtu = MAX_WINDOW;
3053         }
3054 
3055         /*
3056          *      But not bigger than device MTU 
3057          */
3058 
3059         newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
3060 
3061         /*
3062          *      This will min with what arrived in the packet 
3063          */
3064 
3065         tcp_options(newsk,skb->h.th);
3066         
3067         tcp_cache_zap();
3068 
3069         buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3070         if (buff == NULL) 
3071         {
3072                 sk->err = ENOMEM;
3073                 newsk->dead = 1;
3074                 newsk->state = TCP_CLOSE;
3075                 /* And this will destroy it */
3076                 release_sock(newsk);
3077                 kfree_skb(skb, FREE_READ);
3078                 tcp_statistics.TcpAttemptFails++;
3079                 return;
3080         }
3081   
3082         buff->sk = newsk;
3083         buff->localroute = newsk->localroute;
3084 
3085         /*
3086          *      Put in the IP header and routing stuff. 
3087          */
3088 
3089         tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3090                                IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
3091 
3092         /*
3093          *      Something went wrong. 
3094          */
3095 
3096         if (tmp < 0) 
3097         {
3098                 sk->err = tmp;
3099                 buff->free = 1;
3100                 kfree_skb(buff,FREE_WRITE);
3101                 newsk->dead = 1;
3102                 newsk->state = TCP_CLOSE;
3103                 release_sock(newsk);
3104                 skb->sk = sk;
3105                 kfree_skb(skb, FREE_READ);
3106                 tcp_statistics.TcpAttemptFails++;
3107                 return;
3108         }
3109 
3110         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
3111   
3112         memcpy(t1, skb->h.th, sizeof(*t1));
3113         buff->h.seq = newsk->write_seq;
3114         /*
3115          *      Swap the send and the receive. 
3116          */
3117         t1->dest = skb->h.th->source;
3118         t1->source = newsk->dummy_th.source;
3119         t1->seq = ntohl(newsk->write_seq++);
3120         t1->ack = 1;
3121         newsk->window = tcp_select_window(newsk);
3122         newsk->sent_seq = newsk->write_seq;
3123         t1->window = ntohs(newsk->window);
3124         t1->res1 = 0;
3125         t1->res2 = 0;
3126         t1->rst = 0;
3127         t1->urg = 0;
3128         t1->psh = 0;
3129         t1->syn = 1;
3130         t1->ack_seq = ntohl(skb->h.th->seq+1);
3131         t1->doff = sizeof(*t1)/4+1;
3132         ptr = skb_put(buff,4);
3133         ptr[0] = 2;
3134         ptr[1] = 4;
3135         ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3136         ptr[3] =(newsk->mtu) & 0xff;
3137 
3138         tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3139         newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3140         reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3141         skb->sk = newsk;
3142 
3143         /*
3144          *      Charge the sock_buff to newsk. 
3145          */
3146          
3147         sk->rmem_alloc -= skb->truesize;
3148         newsk->rmem_alloc += skb->truesize;
3149         
3150         skb_queue_tail(&sk->receive_queue,skb);
3151         sk->ack_backlog++;
3152         release_sock(newsk);
3153         tcp_statistics.TcpOutSegs++;
3154 }
3155 
3156 
3157 static void tcp_close(struct sock *sk, int timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
3158 {
3159         /*
3160          * Close a TCP socket.  We need to grab some memory, put together
3161          * a FIN, and then put it into the queue to be sent.  timeout != 0
3162          * means a forced close: go straight to TCP_CLOSE with no FIN.
3163          */
3164         sk->inuse = 1;  /* mark the socket busy; every exit path below ends in release_sock() */
3165         
3166         if(th_cache_sk==sk)
3167                 tcp_cache_zap();        /* drop the header cache entry that points at this sock */
3168         if(sk->state == TCP_LISTEN)
3169         {
3170                 /* Special case: a listening socket has no peer connection -
                        just close it and reap any pending, unaccepted connections. */
3171                 tcp_set_state(sk, TCP_CLOSE);
3172                 tcp_close_pending(sk);
3173                 release_sock(sk);
3174                 return;
3175         }
3176         
3177         sk->keepopen = 1;       /* NOTE(review): presumably keeps the timers running through the close handshake - confirm */
3178         sk->shutdown = SHUTDOWN_MASK;   /* no further sends or receives on this socket */
3179 
3180         if (!sk->dead) 
3181                 sk->state_change(sk);   /* wake anyone sleeping on a state change */
3182 
3183         if (timeout == 0) 
3184         {
3185                 struct sk_buff *skb;
3186                 
3187                 /*
3188                  *  We need to flush the recv. buffs.  We do this only on the
3189                  *  descriptor close, not protocol-sourced closes, because the
3190                  *  reader process may not have drained the data yet!
3191                  */
3192                  
3193                 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3194                         kfree_skb(skb, FREE_READ);
3195                 /*
3196                  *      Get rid of any half-completed outgoing packets. 
3197                  */
3198 
3199                 if (sk->partial) 
3200                         tcp_send_partial(sk);
3201         }
3202 
3203                 
3204         /*
3205          *      Timeout is not the same thing - however the code likes
3206          *      to send both the same way (sigh).
3207          */
3208          
3209         if(timeout)
3210         {
3211                 tcp_set_state(sk, TCP_CLOSE);   /* Dead */
3212         }
3213         else
3214         {
3215                 /* tcp_close_state() returns 1 when this state transition requires a FIN */
                if(tcp_close_state(sk,1)==1)
3216                 {
3217                         tcp_send_fin(sk);
3218                 }
3219         }
3220         release_sock(sk);
3221 }
3222 
3223 
3224 /*
3225  *      This routine takes stuff off of the write queue,
3226  *      and puts it in the xmit queue. This happens as incoming acks
3227  *      open up the remote window for us.
3228  */
3229  
3230 static void tcp_write_xmit(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
3231 {
3232         struct sk_buff *skb;
3233 
3234         /*
3235          *      The bytes will have to remain here. In time closedown will
3236          *      empty the write queue and all will be happy 
3237          */
3238 
3239         if(sk->zapped)          /* socket was reset - transmit nothing further */
3240                 return;
3241 
3242         /*
3243          *      Anything on the transmit queue that fits the window can
3244          *      be added providing we are not
3245          *
3246          *      a) retransmitting (Nagle's rule)
3247          *      b) exceeding our congestion window.
3248          *
3249          *      before(seq, window_seq + 1) reads as "seq <= right window edge".
              */
3250         while((skb = skb_peek(&sk->write_queue)) != NULL &&
3251                 before(skb->h.seq, sk->window_seq + 1) &&
3252                 (sk->retransmits == 0 ||
3253                  sk->ip_xmit_timeout != TIME_WRITE ||
3254                  before(skb->h.seq, sk->rcv_ack_seq + 1))
3255                 && sk->packets_out < sk->cong_window) 
3256         {
3257                 IS_SKB(skb);
3258                 skb_unlink(skb);        /* take it off the write queue before transmission */
3259                 
3260                 /*
3261                  *      See if we really need to send the packet. 
3262                  */
3263                  
3264                 if (before(skb->h.seq, sk->rcv_ack_seq +1)) 
3265                 {
3266                         /*
3267                          *      This is acked data. We can discard it. This 
3268                          *      cannot currently occur.
3269                          */
3270                          
3271                         sk->retransmits = 0;
3272                         kfree_skb(skb, FREE_WRITE);
3273                         if (!sk->dead) 
3274                                 sk->write_space(sk);    /* buffer space freed: wake the writer */
3275                 } 
3276                 else
3277                 {
3278                         struct tcphdr *th;
3279                         struct iphdr *iph;
3280                         int size;
3281 /*
3282  * put in the ack seq and window at this point rather than earlier,
3283  * in order to keep them monotonic.  We really want to avoid taking
3284  * back window allocations.  That's legal, but RFC1122 says it's frowned on.
3285  * Ack and window will in general have changed since this packet was put
3286  * on the write queue.
3287  */
3288                         iph = skb->ip_hdr;
3289                         th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
3290                         size = skb->len - (((unsigned char *) th) - skb->data); /* TCP header + payload length */
3291                         
                        /* NOTE(review): ntohl/ntohs act as htonl/htons here (the byte-swap is symmetric) */
3292                         th->ack_seq = ntohl(sk->acked_seq);
3293                         th->window = ntohs(tcp_select_window(sk));
3294 
3295                         tcp_send_check(th, sk->saddr, sk->daddr, size, sk);     /* recompute the TCP checksum after the edits above */
3296 
3297                         sk->sent_seq = skb->h.seq;      /* record the newest sequence handed to IP */
3298                         
3299                         /*
3300                          *      IP manages our queue for some crazy reason
3301                          */
3302                          
3303                         sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3304                         
3305                         /*
3306                          *      Again we slide the timer wrongly
3307                          */
3308                          
3309                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3310                 }
3311         }
3312 }
3313 
3314 
3315 /*
3316  *      This routine deals with incoming acks, but not outgoing ones.
3317  */
3318 
3319 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
     /* [previous][next][first][last][top][bottom][index][help] */
3320 {
3321         u32 ack;
3322         int flag = 0;
3323 
3324         /* 
3325          * 1 - there was data in packet as well as ack or new data is sent or 
3326          *     in shutdown state
3327          * 2 - data from retransmit queue was acked and removed
3328          * 4 - window shrunk or data from retransmit queue was acked and removed
3329          */
3330 
3331         if(sk->zapped)
3332                 return(1);      /* Dead, cant ack any more so why bother */
3333 
3334         /*
3335          *      Have we discovered a larger window
3336          */
3337          
3338         ack = ntohl(th->ack_seq);
3339 
3340         if (ntohs(th->window) > sk->max_window) 
3341         {
3342                 sk->max_window = ntohs(th->window);
3343 #ifdef CONFIG_INET_PCTCP
3344                 /* Hack because we don't send partial packets to non SWS
3345                    handling hosts */
3346                 sk->mss = min(sk->max_window>>1, sk->mtu);
3347 #else
3348                 sk->mss = min(sk->max_window, sk->mtu);
3349 #endif  
3350         }
3351 
3352         /*
3353          *      We have dropped back to keepalive timeouts. Thus we have
3354          *      no retransmits pending.
3355          */
3356          
3357         if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3358                 sk->retransmits = 0;
3359 
3360         /*
3361          *      If the ack is newer than sent or older than previous acks
3362          *      then we can probably ignore it.
3363          */
3364          
3365         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
3366         {
3367                 if(sk->debug)
3368                         printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3369                         
3370                 /*
3371                  *      Keepalive processing.
3372                  */
3373                  
3374                 if (after(ack, sk->sent_seq)) 
3375                 {
3376                         return(0);
3377                 }
3378                 
3379                 /*
3380                  *      Restart the keepalive timer.
3381                  */
3382                  
3383                 if (sk->keepopen) 
3384                 {
3385                         if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3386                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3387                 }
3388                 return(1);
3389         }
3390 
3391         /*
3392          *      If there is data set flag 1
3393          */
3394          
3395         if (len != th->doff*4) 
3396                 flag |= 1;
3397 
3398         /*
3399          *      See if our window has been shrunk. 
3400          */
3401 
3402         if (after(sk->window_seq, ack+ntohs(th->window))) 
3403         {
3404                 /*
3405                  * We may need to move packets from the send queue
3406                  * to the write queue, if the window has been shrunk on us.
3407                  * The RFC says you are not allowed to shrink your window
3408                  * like this, but if the other end does, you must be able
3409                  * to deal with it.
3410                  */
3411                 struct sk_buff *skb;
3412                 struct sk_buff *skb2;
3413                 struct sk_buff *wskb = NULL;
3414         
3415                 skb2 = sk->send_head;
3416                 sk->send_head = NULL;
3417                 sk->send_tail = NULL;
3418         
3419                 /*
3420                  *      This is an artifact of a flawed concept. We want one
3421                  *      queue and a smarter send routine when we send all.
3422                  */
3423         
3424                 flag |= 4;      /* Window changed */
3425         
3426                 sk->window_seq = ack + ntohs(th->window);
3427                 cli();
3428                 while (skb2 != NULL) 
3429                 {
3430                         skb = skb2;
3431                         skb2 = skb->link3;
3432                         skb->link3 = NULL;
3433                         if (after(skb->h.seq, sk->window_seq)) 
3434                         {
3435                                 if (sk->packets_out > 0) 
3436                                         sk->packets_out--;
3437                                 /* We may need to remove this from the dev send list. */
3438                                 if (skb->next != NULL) 
3439                                 {
3440                                         skb_unlink(skb);                                
3441                                 }
3442                                 /* Now add it to the write_queue. */
3443                                 if (wskb == NULL)
3444                                         skb_queue_head(&sk->write_queue,skb);
3445                                 else
3446                                         skb_append(wskb,skb);
3447                                 wskb = skb;
3448                         } 
3449                         else 
3450                         {
3451                                 if (sk->send_head == NULL) 
3452                                 {
3453                                         sk->send_head = skb;
3454                                         sk->send_tail = skb;
3455                                 }
3456                                 else
3457                                 {
3458                                         sk->send_tail->link3 = skb;
3459                                         sk->send_tail = skb;
3460                                 }
3461                                 skb->link3 = NULL;
3462                         }
3463                 }
3464                 sti();
3465         }
3466 
3467         /*
3468          *      Pipe has emptied
3469          */
3470          
3471         if (sk->send_tail == NULL || sk->send_head == NULL) 
3472         {
3473                 sk->send_head = NULL;
3474                 sk->send_tail = NULL;
3475                 sk->packets_out= 0;
3476         }
3477 
3478         /*
3479          *      Update the right hand window edge of the host
3480          */
3481          
3482         sk->window_seq = ack + ntohs(th->window);
3483 
3484         /*
3485          *      We don't want too many packets out there. 
3486          */
3487          
3488         if (sk->ip_xmit_timeout == TIME_WRITE && 
3489                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
3490         {
3491                 /* 
3492                  * This is Jacobson's slow start and congestion avoidance. 
3493                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
3494                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
3495                  * counter and increment it once every cwnd times.  It's possible
3496                  * that this should be done only if sk->retransmits == 0.  I'm
3497                  * interpreting "new data is acked" as including data that has
3498                  * been retransmitted but is just now being acked.
3499                  */
3500                 if (sk->cong_window < sk->ssthresh)  
3501                         /* 
3502                          *      In "safe" area, increase
3503                          */
3504                         sk->cong_window++;
3505                 else 
3506                 {
3507                         /*
3508                          *      In dangerous area, increase slowly.  In theory this is
3509                          *      sk->cong_window += 1 / sk->cong_window
3510                          */
3511                         if (sk->cong_count >= sk->cong_window) 
3512                         {
3513                                 sk->cong_window++;
3514                                 sk->cong_count = 0;
3515                         }
3516                         else 
3517                                 sk->cong_count++;
3518                 }
3519         }
3520 
3521         /*
3522          *      Remember the highest ack received.
3523          */
3524          
3525         sk->rcv_ack_seq = ack;
3526 
3527         /*
3528          *      If this ack opens up a zero window, clear backoff.  It was
3529          *      being used to time the probes, and is probably far higher than
3530          *      it needs to be for normal retransmission.
3531          */
3532 
3533         if (sk->ip_xmit_timeout == TIME_PROBE0) 
3534         {
3535                 sk->retransmits = 0;    /* Our probe was answered */
3536                 
3537                 /*
3538                  *      Was it a usable window open ?
3539                  */
3540                  
3541                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
3542                     ! before (sk->window_seq, sk->write_queue.next->h.seq)) 
3543                 {
3544                         sk->backoff = 0;
3545                         
3546                         /*
3547                          *      Recompute rto from rtt.  this eliminates any backoff.
3548                          */
3549 
3550                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3551                         if (sk->rto > 120*HZ)
3552                                 sk->rto = 120*HZ;
3553                         if (sk->rto < 20)       /* Was 1*HZ, then 1 - turns out we must allow about
3554                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3555                                                    .2 of a second is going to need huge windows (SIGH) */
3556                         sk->rto = 20;
3557                 }
3558         }
3559 
3560         /* 
3561          *      See if we can take anything off of the retransmit queue.
3562          */
3563    
3564         while(sk->send_head != NULL) 
3565         {
3566                 /* Check for a bug. */
3567                 if (sk->send_head->link3 &&
3568                     after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) 
3569                         printk("INET: tcp.c: *** bug send_list out of order.\n");
3570                         
3571                 /*
3572                  *      If our packet is before the ack sequence we can
3573          *      discard it as it's confirmed to have arrived at the other end.
3574                  */
3575                  
3576                 if (before(sk->send_head->h.seq, ack+1)) 
3577                 {
3578                         struct sk_buff *oskb;   
3579                         if (sk->retransmits) 
3580                         {       
3581                                 /*
3582                                  *      We were retransmitting.  don't count this in RTT est 
3583                                  */
3584                                 flag |= 2;
3585 
3586                                 /*
3587                                  * even though we've gotten an ack, we're still
3588                                  * retransmitting as long as we're sending from
3589                                  * the retransmit queue.  Keeping retransmits non-zero
3590                                  * prevents us from getting new data interspersed with
3591                                  * retransmissions.
3592                                  */
3593 
3594                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
3595                                         sk->retransmits = 1;
3596                                 else
3597                                         sk->retransmits = 0;
3598                         }
3599                         /*
3600                          * Note that we only reset backoff and rto in the
3601                          * rtt recomputation code.  And that doesn't happen
3602                          * if there were retransmissions in effect.  So the
3603                          * first new packet after the retransmissions is
3604                          * sent with the backoff still in effect.  Not until
3605                          * we get an ack from a non-retransmitted packet do
3606                          * we reset the backoff and rto.  This allows us to deal
3607                          * with a situation where the network delay has increased
3608                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3609                          */
3610 
3611                         /*
3612                          *      We have one less packet out there. 
3613                          */
3614                          
3615                         if (sk->packets_out > 0) 
3616                                 sk->packets_out --;
3617                         /* 
3618                          *      Wake up the process, it can probably write more. 
3619                          */
3620                         if (!sk->dead) 
3621                                 sk->write_space(sk);
3622                         oskb = sk->send_head;
3623 
3624                         if (!(flag&2))  /* Not retransmitting */
3625                         {
3626                                 long m;
3627         
3628                                 /*
3629                                  *      The following amusing code comes from Jacobson's
3630                                  *      article in SIGCOMM '88.  Note that rtt and mdev
3631                                  *      are scaled versions of rtt and mean deviation.
3632                                  *      This is designed to be as fast as possible 
3633                                  *      m stands for "measurement".
3634                                  */
3635         
3636                                 m = jiffies - oskb->when;  /* RTT */
3637                                 if(m<=0)
3638                                         m=1;            /* IS THIS RIGHT FOR <0 ??? */
3639                                 m -= (sk->rtt >> 3);    /* m is now error in rtt est */
3640                                 sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
3641                                 if (m < 0)
3642                                         m = -m;         /* m is now abs(error) */
3643                                 m -= (sk->mdev >> 2);   /* similar update on mdev */
3644                                 sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
3645         
3646                                 /*
3647                                  *      Now update timeout.  Note that this removes any backoff.
3648                                  */
3649                          
3650                                 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3651                                 if (sk->rto > 120*HZ)
3652                                         sk->rto = 120*HZ;
3653                                 if (sk->rto < 20)       /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3654                                         sk->rto = 20;
3655                                 sk->backoff = 0;
3656                         }
3657                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
3658                                            In this case as we just set it up */
3659                         cli();
3660                         oskb = sk->send_head;
3661                         IS_SKB(oskb);
3662                         sk->send_head = oskb->link3;
3663                         if (sk->send_head == NULL) 
3664                         {
3665                                 sk->send_tail = NULL;
3666                         }
3667 
3668                 /*
3669                  *      We may need to remove this from the dev send list. 
3670                  */
3671 
3672                         if (oskb->next)
3673                                 skb_unlink(oskb);
3674                         sti();
3675                         kfree_skb(oskb, FREE_WRITE); /* write. */
3676                         if (!sk->dead) 
3677                                 sk->write_space(sk);
3678                 }
3679                 else
3680                 {
3681                         break;
3682                 }
3683         }
3684 
3685         /*
3686          * XXX someone ought to look at this too.. at the moment, if skb_peek()
3687  * returns non-NULL, we completely ignore the timer stuff in the else
3688          * clause.  We ought to organize the code so that else clause can
3689          * (should) be executed regardless, possibly moving the PROBE timer
3690          * reset over.  The skb_peek() thing should only move stuff to the
3691          * write queue, NOT also manage the timer functions.
3692          */
3693 
3694         /*
3695          * Maybe we can take some stuff off of the write queue,
3696          * and put it onto the xmit queue.
3697          */
3698         if (skb_peek(&sk->write_queue) != NULL) 
3699         {
3700                 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3701                         (sk->retransmits == 0 || 
3702                          sk->ip_xmit_timeout != TIME_WRITE ||
3703                          before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3704                         && sk->packets_out < sk->cong_window) 
3705                 {
3706                         /*
3707                          *      Add more data to the send queue.
3708                          */
3709                         flag |= 1;
3710                         tcp_write_xmit(sk);
3711                 }
3712                 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3713                         sk->send_head == NULL &&
3714                         sk->ack_backlog == 0 &&
3715                         sk->state != TCP_TIME_WAIT) 
3716                 {
3717                         /*
3718                          *      Data to queue but no room.
3719                          */
3720                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3721                 }               
3722         }
3723         else
3724         {
3725                 /*
3726                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3727                  * from TCP_CLOSE we don't do anything
3728                  *
3729                  * from anything else, if there is write data (or fin) pending,
3730                  * we use a TIME_WRITE timeout, else if keepalive we reset to
3731                  * a KEEPALIVE timeout, else we delete the timer.
3732                  *
3733                  * We do not set flag for nominal write data, otherwise we may
3734                  * force a state where we start to write itsy bitsy tidbits
3735                  * of data.
3736                  */
3737 
3738                 switch(sk->state) {
3739                 case TCP_TIME_WAIT:
3740                         /*
3741                          * keep us in TIME_WAIT until we stop getting packets,
3742                          * reset the timeout.
3743                          */
3744                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3745                         break;
3746                 case TCP_CLOSE:
3747                         /*
3748                          * don't touch the timer.
3749                          */
3750                         break;
3751                 default:
3752                         /*
3753                          *      Must check send_head, write_queue, and ack_backlog
3754                          *      to determine which timeout to use.
3755                          */
3756                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3757                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3758                         } else if (sk->keepopen) {
3759                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3760                         } else {
3761                                 del_timer(&sk->retransmit_timer);
3762                                 sk->ip_xmit_timeout = 0;
3763                         }
3764                         break;
3765                 }
3766         }
3767 
3768         /*
3769          *      We have nothing queued but space to send. Send any partial
3770          *      packets immediately (end of Nagle rule application).
3771          */
3772          
3773         if (sk->packets_out == 0 && sk->partial != NULL &&
3774                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
3775         {
3776                 flag |= 1;
3777                 tcp_send_partial(sk);
3778         }
3779 
3780         /*
3781          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
3782          * we are now waiting for an acknowledge to our FIN.  The other end is
3783          * already in TIME_WAIT.
3784          *
3785          * Move to TCP_CLOSE on success.
3786          */
3787 
3788         if (sk->state == TCP_LAST_ACK) 
3789         {
3790                 if (!sk->dead)
3791                         sk->state_change(sk);
3792                 if(sk->debug)
3793                         printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3794                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3795                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
3796                 {
3797                         flag |= 1;
3798                         tcp_set_state(sk,TCP_CLOSE);
3799                         sk->shutdown = SHUTDOWN_MASK;
3800                 }
3801         }
3802 
3803         /*
3804          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
3805          *
3806          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
3807          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3808          */
3809 
3810         if (sk->state == TCP_FIN_WAIT1) 
3811         {
3812 
3813                 if (!sk->dead) 
3814                         sk->state_change(sk);
3815                 if (sk->rcv_ack_seq == sk->write_seq) 
3816                 {
3817                         flag |= 1;
3818                         sk->shutdown |= SEND_SHUTDOWN;
3819                         tcp_set_state(sk, TCP_FIN_WAIT2);
3820                 }
3821         }
3822 
3823         /*
3824          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
3825          *
3826          *      Move to TIME_WAIT
3827          */
3828 
3829         if (sk->state == TCP_CLOSING) 
3830         {
3831 
3832                 if (!sk->dead) 
3833                         sk->state_change(sk);
3834                 if (sk->rcv_ack_seq == sk->write_seq) 
3835                 {
3836                         flag |= 1;
3837                         tcp_time_wait(sk);
3838                 }
3839         }
3840         
3841         /*
3842          *      Final ack of a three way shake 
3843          */
3844          
3845         if(sk->state==TCP_SYN_RECV)
3846         {
3847                 tcp_set_state(sk, TCP_ESTABLISHED);
3848                 tcp_options(sk,th);
3849                 sk->dummy_th.dest=th->source;
3850                 sk->copied_seq = sk->acked_seq;
3851                 if(!sk->dead)
3852                         sk->state_change(sk);
3853                 if(sk->max_window==0)
3854                 {
3855                         sk->max_window=32;      /* Sanity check */
3856                         sk->mss=min(sk->max_window,sk->mtu);
3857                 }
3858         }
3859         
3860         /*
3861          * I make no guarantees about the first clause in the following
3862          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
3863          * what conditions "!flag" would be true.  However I think the rest
3864          * of the conditions would prevent that from causing any
3865          * unnecessary retransmission. 
3866          *   Clearly if the first packet has expired it should be 
3867          * retransmitted.  The other alternative, "flag&2 && retransmits", is
3868          * harder to explain:  You have to look carefully at how and when the
3869          * timer is set and with what timeout.  The most recent transmission always
3870          * sets the timer.  So in general if the most recent thing has timed
3871          * out, everything before it has as well.  So we want to go ahead and
3872          * retransmit some more.  If we didn't explicitly test for this
3873          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3874          * would not be true.  If you look at the pattern of timing, you can
3875          * show that rto is increased fast enough that the next packet would
3876          * almost never be retransmitted immediately.  Then you'd end up
3877          * waiting for a timeout to send each packet on the retransmission
3878          * queue.  With my implementation of the Karn sampling algorithm,
3879          * the timeout would double each time.  The net result is that it would
3880          * take a hideous amount of time to recover from a single dropped packet.
3881          * It's possible that there should also be a test for TIME_WRITE, but
3882          * I think as long as "send_head != NULL" and "retransmit" is on, we've
3883          * got to be in real retransmission mode.
3884          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
3885          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3886          * As long as no further losses occur, this seems reasonable.
3887          */
3888         
3889         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3890                (((flag&2) && sk->retransmits) ||
3891                (sk->send_head->when + sk->rto < jiffies))) 
3892         {
3893                 if(sk->send_head->when + sk->rto < jiffies)
3894                         tcp_retransmit(sk,0);   
3895                 else
3896                 {
3897                         tcp_do_retransmit(sk, 1);
3898                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3899                 }
3900         }
3901 
3902         return(1);
3903 }
3904 
3905 
3906 /*
3907  *      Process the FIN bit. This now behaves as it is supposed to work
3908  *      and the FIN takes effect when it is validly part of sequence
3909  *      space. Not before when we get holes.
3910  *
3911  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3912  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
3913  *      TIME-WAIT)
3914  *
3915  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
3916  *      close and we go into CLOSING (and later onto TIME-WAIT)
3917  *
3918  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3919  *
3920  */
3921  
3922 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
3923 {
3924         sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3925 
3926         if (!sk->dead) 
3927         {
3928                 sk->state_change(sk);
3929                 sock_wake_async(sk->socket, 1);
3930         }
3931 
3932         switch(sk->state) 
3933         {
3934                 case TCP_SYN_RECV:
3935                 case TCP_SYN_SENT:
3936                 case TCP_ESTABLISHED:
3937                         /*
3938                          * move to CLOSE_WAIT, tcp_data() already handled
3939                          * sending the ack.
3940                          */
3941                         tcp_set_state(sk,TCP_CLOSE_WAIT);
3942                         if (th->rst)
3943                                 sk->shutdown = SHUTDOWN_MASK;
3944                         break;
3945 
3946                 case TCP_CLOSE_WAIT:
3947                 case TCP_CLOSING:
3948                         /*
3949                          * received a retransmission of the FIN, do
3950                          * nothing.
3951                          */
3952                         break;
3953                 case TCP_TIME_WAIT:
3954                         /*
3955                          * received a retransmission of the FIN,
3956                          * restart the TIME_WAIT timer.
3957                          */
3958                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3959                         return(0);
3960                 case TCP_FIN_WAIT1:
3961                         /*
3962                          * This case occurs when a simultaneous close
3963                          * happens, we must ack the received FIN and
3964                          * enter the CLOSING state.
3965                          *
3966                          * This causes a WRITE timeout, which will either
3967                          * move on to TIME_WAIT when we timeout, or resend
3968                          * the FIN properly (maybe we get rid of that annoying
3969                          * FIN lost hang). The TIME_WRITE code is already correct
3970                          * for handling this timeout.
3971                          */
3972 
3973                         if(sk->ip_xmit_timeout != TIME_WRITE)
3974                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3975                         tcp_set_state(sk,TCP_CLOSING);
3976                         break;
3977                 case TCP_FIN_WAIT2:
3978                         /*
3979                          * received a FIN -- send ACK and enter TIME_WAIT
3980                          */
3981                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3982                         sk->shutdown|=SHUTDOWN_MASK;
3983                         tcp_set_state(sk,TCP_TIME_WAIT);
3984                         break;
3985                 case TCP_CLOSE:
3986                         /*
3987                          * already in CLOSE
3988                          */
3989                         break;
3990                 default:
3991                         tcp_set_state(sk,TCP_LAST_ACK);
3992         
3993                         /* Start the timers. */
3994                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3995                         return(0);
3996         }
3997 
3998         return(0);
3999 }
4000 
4001 
4002 
4003 /*
4004  *      This routine handles the data.  If there is room in the buffer,
4005  *      it will be have already been moved into it.  If there is no
4006  *      room, then we will just have to discard the packet.
4007  */
4008 
4009 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /* [previous][next][first][last][top][bottom][index][help] */
4010          unsigned long saddr, unsigned short len)
4011 {
4012         struct sk_buff *skb1, *skb2;
4013         struct tcphdr *th;
4014         int dup_dumped=0;
4015         u32 new_seq, shut_seq;
4016 
4017         th = skb->h.th;
4018         skb_pull(skb,th->doff*4);
4019         skb_trim(skb,len-(th->doff*4));
4020 
4021         /*
4022          *      The bytes in the receive read/assembly queue has increased. Needed for the
4023          *      low memory discard algorithm 
4024          */
4025            
4026         sk->bytes_rcv += skb->len;
4027         
4028         if (skb->len == 0 && !th->fin) 
4029         {
4030                 /* 
4031                  *      Don't want to keep passing ack's back and forth. 
4032                  *      (someone sent us dataless, boring frame)
4033                  */
4034                 if (!th->ack)
4035                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4036                 kfree_skb(skb, FREE_READ);
4037                 return(0);
4038         }
4039         
4040         /*
4041          *      We no longer have anyone receiving data on this connection.
4042          */
4043 
4044 #ifndef TCP_DONT_RST_SHUTDOWN            
4045 
4046         if(sk->shutdown & RCV_SHUTDOWN)
4047         {
4048                 /*
4049                  *      FIXME: BSD has some magic to avoid sending resets to
4050                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
4051                  *      BSD stacks still have broken keepalives so we want to
4052                  *      cope with it.
4053                  */
4054 
4055                 if(skb->len)    /* We don't care if it's just an ack or
4056                                    a keepalive/window probe */
4057                 {
4058                         new_seq= th->seq + skb->len + th->syn;  /* Right edge of _data_ part of frame */
4059                         
4060                         /* Do this the way 4.4BSD treats it. Not what I'd
4061                            regard as the meaning of the spec but it's what BSD
4062                            does and clearly they know everything 8) */
4063 
4064                         /*
4065                          *      This is valid because of two things
4066                          *
4067                          *      a) The way tcp_data behaves at the bottom.
4068                          *      b) A fin takes effect when read not when received.
4069                          */
4070                          
4071                         shut_seq=sk->acked_seq+1;       /* Last byte */
4072                         
4073                         if(after(new_seq,shut_seq))
4074                         {
4075                                 if(sk->debug)
4076                                         printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4077                                                 sk, new_seq, shut_seq, sk->blog);
4078                                 if(sk->dead)
4079                                 {
4080                                         sk->acked_seq = new_seq + th->fin;
4081                                         tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4082                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4083                                         tcp_statistics.TcpEstabResets++;
4084                                         tcp_set_state(sk,TCP_CLOSE);
4085                                         sk->err = EPIPE;
4086                                         sk->shutdown = SHUTDOWN_MASK;
4087                                         kfree_skb(skb, FREE_READ);
4088                                         return 0;
4089                                 }
4090                         }
4091                 }
4092         }
4093 
4094 #endif
4095 
4096         /*
4097          *      Now we have to walk the chain, and figure out where this one
4098          *      goes into it.  This is set up so that the last packet we received
4099          *      will be the first one we look at, that way if everything comes
4100          *      in order, there will be no performance loss, and if they come
4101          *      out of order we will be able to fit things in nicely.
4102          *
4103          *      [AC: This is wrong. We should assume in order first and then walk
4104          *       forwards from the first hole based upon real traffic patterns.]
4105          *      
4106          */
4107 
4108         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
4109         {
4110                 skb_queue_head(&sk->receive_queue,skb);
4111                 skb1= NULL;
4112         } 
4113         else
4114         {
4115                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
4116                 {
4117                         if(sk->debug)
4118                         {
4119                                 printk("skb1=%p :", skb1);
4120                                 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
4121                                 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
4122                                 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4123                                                 sk->acked_seq);
4124                         }
4125                         
4126                         /*
4127                          *      Optimisation: Duplicate frame or extension of previous frame from
4128                          *      same sequence point (lost ack case).
4129                          *      The frame contains duplicate data or replaces a previous frame
4130                          *      discard the previous frame (safe as sk->inuse is set) and put
4131                          *      the new one in its place.
4132                          */
4133                          
4134                         if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
4135                         {
4136                                 skb_append(skb1,skb);
4137                                 skb_unlink(skb1);
4138                                 kfree_skb(skb1,FREE_READ);
4139                                 dup_dumped=1;
4140                                 skb1=NULL;
4141                                 break;
4142                         }
4143                         
4144                         /*
4145                          *      Found where it fits
4146                          */
4147                          
4148                         if (after(th->seq+1, skb1->h.th->seq))
4149                         {
4150                                 skb_append(skb1,skb);
4151                                 break;
4152                         }
4153                         
4154                         /*
4155                          *      See if we've hit the start. If so insert.
4156                          */
4157                         if (skb1 == skb_peek(&sk->receive_queue))
4158                         {
4159                                 skb_queue_head(&sk->receive_queue, skb);
4160                                 break;
4161                         }
4162                 }
4163         }
4164 
4165         /*
4166          *      Figure out what the ack value for this frame is
4167          */
4168          
4169         th->ack_seq = th->seq + skb->len;
4170         if (th->syn) 
4171                 th->ack_seq++;
4172         if (th->fin)
4173                 th->ack_seq++;
4174 
4175         if (before(sk->acked_seq, sk->copied_seq)) 
4176         {
4177                 printk("*** tcp.c:tcp_data bug acked < copied\n");
4178                 sk->acked_seq = sk->copied_seq;
4179         }
4180 
4181         /*
4182          *      Now figure out if we can ack anything. This is very messy because we really want two
4183          *      receive queues, a completed and an assembly queue. We also want only one transmit
4184          *      queue.
4185          */
4186 
4187         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) 
4188         {
4189                 if (before(th->seq, sk->acked_seq+1)) 
4190                 {
4191                         int newwindow;
4192 
4193                         if (after(th->ack_seq, sk->acked_seq)) 
4194                         {
4195                                 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
4196                                 if (newwindow < 0)
4197                                         newwindow = 0;  
4198                                 sk->window = newwindow;
4199                                 sk->acked_seq = th->ack_seq;
4200                         }
4201                         skb->acked = 1;
4202 
4203                         /*
4204                          *      When we ack the fin, we do the FIN 
4205                          *      processing.
4206                          */
4207 
4208                         if (skb->h.th->fin) 
4209                         {
4210                                 tcp_fin(skb,sk,skb->h.th);
4211                         }
4212           
4213                         for(skb2 = skb->next;
4214                             skb2 != (struct sk_buff *)&sk->receive_queue;
4215                             skb2 = skb2->next) 
4216                         {
4217                                 if (before(skb2->h.th->seq, sk->acked_seq+1)) 
4218                                 {
4219                                         if (after(skb2->h.th->ack_seq, sk->acked_seq))
4220                                         {
4221                                                 newwindow = sk->window -
4222                                                  (skb2->h.th->ack_seq - sk->acked_seq);
4223                                                 if (newwindow < 0)
4224                                                         newwindow = 0;  
4225                                                 sk->window = newwindow;
4226                                                 sk->acked_seq = skb2->h.th->ack_seq;
4227                                         }
4228                                         skb2->acked = 1;
4229                                         /*
4230                                          *      When we ack the fin, we do
4231                                          *      the fin handling.
4232                                          */
4233                                         if (skb2->h.th->fin) 
4234                                         {
4235                                                 tcp_fin(skb,sk,skb->h.th);
4236                                         }
4237 
4238                                         /*
4239                                          *      Force an immediate ack.
4240                                          */
4241                                          
4242                                         sk->ack_backlog = sk->max_ack_backlog;
4243                                 }
4244                                 else
4245                                 {
4246                                         break;
4247                                 }
4248                         }
4249 
4250                         /*
4251                          *      This also takes care of updating the window.
4252                          *      This if statement needs to be simplified.
4253                          */
4254                         if (!sk->delay_acks ||
4255                             sk->ack_backlog >= sk->max_ack_backlog || 
4256                             sk->bytes_rcv > sk->max_unacked || th->fin) {
4257         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4258                         }
4259                         else 
4260                         {
4261                                 sk->ack_backlog++;
4262                                 if(sk->debug)
4263                                         printk("Ack queued.\n");
4264                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4265                         }
4266                 }
4267         }
4268 
4269         /*
4270          *      If we've missed a packet, send an ack.
4271          *      Also start a timer to send another.
4272          */
4273          
4274         if (!skb->acked) 
4275         {
4276         
4277         /*
4278          *      This is important.  If we don't have much room left,
4279          *      we need to throw out a few packets so we have a good
4280          *      window.  Note that mtu is used, not mss, because mss is really
4281          *      for the send side.  He could be sending us stuff as large as mtu.
4282          */
4283                  
4284                 while (sk->prot->rspace(sk) < sk->mtu) 
4285                 {
4286                         skb1 = skb_peek(&sk->receive_queue);
4287                         if (skb1 == NULL) 
4288                         {
4289                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4290                                 break;
4291                         }
4292 
4293                         /*
4294                          *      Don't throw out something that has been acked. 
4295                          */
4296                  
4297                         if (skb1->acked) 
4298                         {
4299                                 break;
4300                         }
4301                 
4302                         skb_unlink(skb1);
4303                         kfree_skb(skb1, FREE_READ);
4304                 }
4305                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4306                 sk->ack_backlog++;
4307                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4308         }
4309         else
4310         {
4311                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4312         }
4313 
4314         /*
4315          *      Now tell the user we may have some data. 
4316          */
4317          
4318         if (!sk->dead) 
4319         {
4320                 if(sk->debug)
4321                         printk("Data wakeup.\n");
4322                 sk->data_ready(sk,0);
4323         } 
4324         return(0);
4325 }
4326 
4327 
4328 /*
4329  *      This routine is only called when we have urgent data
4330  *      signalled. Its the 'slow' part of tcp_urg. It could be
4331  *      moved inline now as tcp_urg is only called from one
4332  *      place. We handle URGent data wrong. We have to - as
4333  *      BSD still doesn't use the correction from RFC961.
4334  */
4335  
4336 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* [previous][next][first][last][top][bottom][index][help] */
4337 {
4338         u32 ptr = ntohs(th->urg_ptr);
4339 
4340         if (ptr)
4341                 ptr--;
4342         ptr += th->seq;
4343 
4344         /* ignore urgent data that we've already seen and read */
4345         if (after(sk->copied_seq, ptr))
4346                 return;
4347 
4348         /* do we already have a newer (or duplicate) urgent pointer? */
4349         if (sk->urg_data && !after(ptr, sk->urg_seq))
4350                 return;
4351 
4352         /* tell the world about our new urgent pointer */
4353         if (sk->proc != 0) {
4354                 if (sk->proc > 0) {
4355                         kill_proc(sk->proc, SIGURG, 1);
4356                 } else {
4357                         kill_pg(-sk->proc, SIGURG, 1);
4358                 }
4359         }
4360         sk->urg_data = URG_NOTYET;
4361         sk->urg_seq = ptr;
4362 }
4363 
4364 /*
4365  *      This is the 'fast' part of urgent handling.
4366  */
4367  
4368 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
     /* [previous][next][first][last][top][bottom][index][help] */
4369         unsigned long saddr, unsigned long len)
4370 {
4371         u32 ptr;
4372 
4373         /*
4374          *      Check if we get a new urgent pointer - normally not 
4375          */
4376          
4377         if (th->urg)
4378                 tcp_check_urg(sk,th);
4379 
4380         /*
4381          *      Do we wait for any urgent data? - normally not
4382          */
4383          
4384         if (sk->urg_data != URG_NOTYET)
4385                 return 0;
4386 
4387         /*
4388          *      Is the urgent pointer pointing into this packet? 
4389          */
4390          
4391         ptr = sk->urg_seq - th->seq + th->doff*4;
4392         if (ptr >= len)
4393                 return 0;
4394 
4395         /*
4396          *      Ok, got the correct packet, update info 
4397          */
4398          
4399         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4400         if (!sk->dead)
4401                 sk->data_ready(sk,0);
4402         return 0;
4403 }
4404 
4405 /*
4406  *      This will accept the next outstanding connection. 
4407  */
4408  
4409 static struct sock *tcp_accept(struct sock *sk, int flags)
     /* [previous][next][first][last][top][bottom][index][help] */
4410 {
4411         struct sock *newsk;
4412         struct sk_buff *skb;
4413   
4414   /*
4415    * We need to make sure that this socket is listening,
4416    * and that it has something pending.
4417    */
4418 
4419         if (sk->state != TCP_LISTEN) 
4420         {
4421                 sk->err = EINVAL;
4422                 return(NULL); 
4423         }
4424 
4425         /* Avoid the race. */
4426         cli();
4427         sk->inuse = 1;
4428 
4429         while((skb = tcp_dequeue_established(sk)) == NULL) 
4430         {
4431                 if (flags & O_NONBLOCK) 
4432                 {
4433                         sti();
4434                         release_sock(sk);
4435                         sk->err = EAGAIN;
4436                         return(NULL);
4437                 }
4438 
4439                 release_sock(sk);
4440                 interruptible_sleep_on(sk->sleep);
4441                 if (current->signal & ~current->blocked) 
4442                 {
4443                         sti();
4444                         sk->err = ERESTARTSYS;
4445                         return(NULL);
4446                 }
4447                 sk->inuse = 1;
4448         }
4449         sti();
4450 
4451         /*
4452          *      Now all we need to do is return skb->sk. 
4453          */
4454 
4455         newsk = skb->sk;
4456 
4457         kfree_skb(skb, FREE_READ);
4458         sk->ack_backlog--;
4459         release_sock(sk);
4460         return(newsk);
4461 }
4462 
4463 
4464 /*
4465  *      This will initiate an outgoing connection. 
4466  */
4467  
4468 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
     /* [previous][next][first][last][top][bottom][index][help] */
4469 {
4470         struct sk_buff *buff;
4471         struct device *dev=NULL;
4472         unsigned char *ptr;
4473         int tmp;
4474         int atype;
4475         struct tcphdr *t1;
4476         struct rtable *rt;
4477 
4478         if (sk->state != TCP_CLOSE) 
4479         {
4480                 return(-EISCONN);
4481         }
4482         
4483         if (addr_len < 8) 
4484                 return(-EINVAL);
4485 
4486         if (usin->sin_family && usin->sin_family != AF_INET) 
4487                 return(-EAFNOSUPPORT);
4488 
4489         /*
4490          *      connect() to INADDR_ANY means loopback (BSD'ism).
4491          */
4492         
4493         if(usin->sin_addr.s_addr==INADDR_ANY)
4494                 usin->sin_addr.s_addr=ip_my_addr();
4495                   
4496         /*
4497          *      Don't want a TCP connection going to a broadcast address 
4498          */
4499 
4500         if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
4501                 return -ENETUNREACH;
4502   
4503         sk->inuse = 1;
4504         sk->daddr = usin->sin_addr.s_addr;
4505         sk->write_seq = tcp_init_seq();
4506         sk->window_seq = sk->write_seq;
4507         sk->rcv_ack_seq = sk->write_seq -1;
4508         sk->err = 0;
4509         sk->dummy_th.dest = usin->sin_port;
4510         release_sock(sk);
4511 
4512         buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4513         if (buff == NULL) 
4514         {
4515                 return(-ENOMEM);
4516         }
4517         sk->inuse = 1;
4518         buff->sk = sk;
4519         buff->free = 0;
4520         buff->localroute = sk->localroute;
4521         
4522 
4523         /*
4524          *      Put in the IP header and routing stuff. 
4525          */
4526          
4527         if (sk->localroute)
4528           rt=ip_rt_local(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);
4529         else
4530           rt=ip_rt_route(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);
4531 
4532         /*
4533          *      We need to build the routing stuff from the things saved in skb. 
4534          */
4535 
4536         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4537                                         IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
4538         if (tmp < 0) 
4539         {
4540                 sk->prot->wfree(sk, buff);
4541                 release_sock(sk);
4542                 return(-ENETUNREACH);
4543         }
4544 
4545         t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
4546 
4547         memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4548         t1->seq = ntohl(sk->write_seq++);
4549         sk->sent_seq = sk->write_seq;
4550         buff->h.seq = sk->write_seq;
4551         t1->ack = 0;
4552         t1->window = 2;
4553         t1->res1=0;
4554         t1->res2=0;
4555         t1->rst = 0;
4556         t1->urg = 0;
4557         t1->psh = 0;
4558         t1->syn = 1;
4559         t1->urg_ptr = 0;
4560         t1->doff = 6;
4561         /* use 512 or whatever user asked for */
4562         
4563         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4564                 sk->window_clamp=rt->rt_window;
4565         else
4566                 sk->window_clamp=0;
4567 
4568         if (sk->user_mss)
4569                 sk->mtu = sk->user_mss;
4570         else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
4571                 sk->mtu = rt->rt_mss;
4572         else 
4573         {
4574 #ifdef CONFIG_INET_SNARL
4575                 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
4576 #else
4577                 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
4578 #endif
4579                         sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
4580                 else
4581                         sk->mtu = MAX_WINDOW;
4582         }
4583         /*
4584          *      but not bigger than device MTU 
4585          */
4586 
4587         if(sk->mtu <32)
4588                 sk->mtu = 32;   /* Sanity limit */
4589                 
4590         sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
4591         
4592         /*
4593          *      Put in the TCP options to say MTU. 
4594          */
4595 
4596         ptr = skb_put(buff,4);
4597         ptr[0] = 2;
4598         ptr[1] = 4;
4599         ptr[2] = (sk->mtu) >> 8;
4600         ptr[3] = (sk->mtu) & 0xff;
4601         tcp_send_check(t1, sk->saddr, sk->daddr,
4602                   sizeof(struct tcphdr) + 4, sk);
4603 
4604         /*
4605          *      This must go first otherwise a really quick response will get reset. 
4606          */
4607 
4608         tcp_cache_zap();
4609         tcp_set_state(sk,TCP_SYN_SENT);
4610         if(rt&&rt->rt_flags&RTF_IRTT)
4611                 sk->rto = rt->rt_irtt;
4612         else
4613                 sk->rto = TCP_TIMEOUT_INIT;
4614         sk->retransmit_timer.function=&retransmit_timer;
4615         sk->retransmit_timer.data = (unsigned long)sk;
4616         reset_xmit_timer(sk, TIME_WRITE, sk->rto);      /* Timer for repeating the SYN until an answer */
4617         sk->retransmits = 0;    /* Now works the right way instead of a hacked initial setting */
4618 
4619         sk->prot->queue_xmit(sk, dev, buff, 0);  
4620         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4621         tcp_statistics.TcpActiveOpens++;
4622         tcp_statistics.TcpOutSegs++;
4623   
4624         release_sock(sk);
4625         return(0);
4626 }
4627 
4628 
4629 /* This functions checks to see if the tcp header is actually acceptable. */
4630 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /* [previous][next][first][last][top][bottom][index][help] */
4631              struct options *opt, unsigned long saddr, struct device *dev)
4632 {
4633         u32 next_seq;
4634 
4635         next_seq = len - 4*th->doff;
4636         if (th->fin)
4637                 next_seq++;
4638         /* if we have a zero window, we can't have any data in the packet.. */
4639         if (next_seq && !sk->window)
4640                 goto ignore_it;
4641         next_seq += th->seq;
4642 
4643         /*
4644          * This isn't quite right.  sk->acked_seq could be more recent
4645          * than sk->window.  This is however close enough.  We will accept
4646          * slightly more packets than we should, but it should not cause
4647          * problems unless someone is trying to forge packets.
4648          */
4649 
4650         /* have we already seen all of this packet? */
4651         if (!after(next_seq+1, sk->acked_seq))
4652                 goto ignore_it;
4653         /* or does it start beyond the window? */
4654         if (!before(th->seq, sk->acked_seq + sk->window + 1))
4655                 goto ignore_it;
4656 
4657         /* ok, at least part of this packet would seem interesting.. */
4658         return 1;
4659 
4660 ignore_it:
4661         if (th->rst)
4662                 return 0;
4663 
4664         /*
4665          *      Send a reset if we get something not ours and we are
4666          *      unsynchronized. Note: We don't do anything to our end. We
4667          *      are just killing the bogus remote connection then we will
4668          *      connect again and it will work (with luck).
4669          */
4670          
4671         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
4672         {
4673                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4674                 return 1;
4675         }
4676 
4677         /* Try to resync things. */
4678         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4679         return 0;
4680 }
4681 
/*
 *	When we get a reset we do this: record the right user-visible
 *	error for the connection state, move to CLOSE (subject to the
 *	RFC1337 TIME_WAIT protection if configured), wake the owner and
 *	consume the frame.  Always returns 0.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* The states are mutually exclusive, so pick the error directly. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
4713 
/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 *
 *	Main receive entry point: verify the checksum, locate the owning
 *	socket (one-entry cache first), charge the skb's memory to the
 *	socket, then walk the RFC793/RFC1122 incoming-segment state
 *	machine.  The skb is always consumed here - queued on the
 *	backlog, passed on to tcp_data/tcp_conn_request, or freed.
 *	Always returns 0.  'redo' is set on the second pass when the
 *	segment comes back off the socket backlog via release_sock().
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;	/* set when a SYN is legitimate (SYN_SENT handshake) */

	tcp_statistics.TcpInSegs++;

	/* Discard frames not addressed to this host (e.g. promiscuous hits). */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket, using the last hit cache if applicable.
	 */

	if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
	{
		sk=(struct sock *)th_cache_sk;
		/*
		 *	We think this is causing the bug so
		 */
		 if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
			printk("Cache mismatch on TCP.\n");
	}
	else
	{
		sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
		th_cache_saddr=saddr;
		th_cache_daddr=daddr;
		th_cache_dport=th->dest;
		th_cache_sport=th->source;
		th_cache_sk=sk;
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes 
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo) 
	{
		/*
		 *	Pull up the IP header.
		 */
		skb_pull(skb, skb->h.raw-skb->data);
		/*
		 *	Try to use the device checksum if provided.
		 */
		if (
			(skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
			(!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
		    )
		{
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}
		/* From here on th->seq is kept in host byte order. */
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL) 
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 *	Discard frame
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

/*		skb->len = len;*/
		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		/* Stored swapped: skb->saddr/daddr are for the reply direction. */
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		cli();
		if (sk->inuse) 
		{
			/* Socket busy: queue for a later redo pass via release_sock(). */
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Backlog pass: the socket must still exist, else reset the peer. */
		if (sk==NULL) 
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot) 
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.  Over budget: silently drop.
	 */

	if (sk->rmem_alloc + skb->truesize >= sk->rcvbuf) 
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk=sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it 
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*	
			 *	Guess we need to make a new socket up 
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN?  (we already acked it - just drop the dup) */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected 
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/* Reset the ack - its an ack from a 
					   different connection  [ th->rst is checked in tcp_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					/* Wake anyone blocked in connect() / select(). */
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if(sk->max_window==0)
				{
					/* Peer never advertised a window: use a tiny default. */
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}		
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
	 *	a more complex suggestion for fixing these reuse issues in RFC1644
	 *	but not yet ready for general use. Also see RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		/* A fresh SYN on a dead TIME_WAIT socket: kill the old
		   connection and hand the SYN to a matching listener. */
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;	   
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			/* Re-lookup: with the old socket closed this should now
			   find the listener, if any. */
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				/* Seed the new sequence space well past the old one. */
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif	
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs 
	 */

	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);	
	}

	/*
	 *	Process the ACK
	 */


	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	Process urgent data
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}


	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */	

	release_sock(sk);
	return 0;
}
5123 
/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it.
 *
 *	Two cases:  if the peer's window has opened and data is waiting
 *	on the write queue, carve a window-sized chunk off the head
 *	segment and transmit it (probing after sender-side SWS
 *	avoidance held data back).  Otherwise send a zero-length ack
 *	carrying sent_seq-1 purely to provoke an ack from the peer.
 */

static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff,*skb;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states.  If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED && 
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 && 
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	) 
	{
		return;
	}
	if ( before(sk->sent_seq, sk->window_seq) && 
	    (skb=skb_peek(&sk->write_queue)))
	{
		/*
		 * We are probing the opening of a window
		 * but the window size is != 0
		 * must have been a result SWS advoidance ( sender )
		 */

		struct iphdr *iph;
		struct tcphdr *th;	/* headers of the queued (original) frame */
		struct tcphdr *nth;	/* header of the new probe frame */
		unsigned long win_size;
#if 0
		unsigned long ow_size;
#endif
		void * tcp_data_start;

		/*
		 *	How many bytes can we send ?
		 */

		win_size = sk->window_seq - sk->sent_seq;

		/*
		 *	Recover the buffer pointers from the frame already
		 *	built on the write queue.
		 */

		iph = (struct iphdr *)skb->ip_hdr;
		th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));

		/*
		 *	Grab the data for a temporary frame
		 */

		buff = sk->prot->wmalloc(sk, win_size + th->doff * 4 + 
				     (iph->ihl << 2) +
				     sk->prot->max_header + 15, 
				     1, GFP_ATOMIC);
		if ( buff == NULL )
			return;

		/* 
		 *	If we strip the packet on the write queue we must
		 *	be ready to retransmit this one 
		 */

		buff->free = /*0*/1;

		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put headers on the new packet
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					 IPPROTO_TCP, sk->opt, buff->truesize,
					 sk->ip_tos,sk->ip_ttl);
		if (tmp < 0) 
		{
			sk->prot->wfree(sk, buff);
			return;
		}

		/*
		 *	Move the TCP header over
		 */

		buff->dev = dev;

		nth = (struct tcphdr *) skb_put(buff,th->doff*4);

		memcpy(nth, th, th->doff * 4);

		/*
		 *	Correct the new header
		 */

		nth->ack = 1; 
		nth->ack_seq = ntohl(sk->acked_seq);
		nth->window = ntohs(tcp_select_window(sk));
		nth->check = 0;

		/*
		 *	Find the first data byte in the original frame
		 *	(past link, IP and TCP headers).
		 */

		tcp_data_start = skb->data + skb->dev->hard_header_len + 
				(iph->ihl << 2) + th->doff * 4;

		/*
		 *	Add it to our new buffer
		 */
		memcpy(skb_put(buff,win_size), tcp_data_start, win_size);

		/*
		 *	Remember our right edge sequence number.
		 */

		buff->h.seq = sk->sent_seq + win_size;
		sk->sent_seq = buff->h.seq;		/* Hack */
#if 0

		/*
		 *	now: shrink the queue head segment 
		 */

		th->check = 0;
		ow_size = skb->len - win_size - 
			((unsigned long) (tcp_data_start - (void *) skb->data));

		memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
		skb_trim(skb,skb->len-win_size);
		sk->sent_seq += win_size;
		th->seq = htonl(sk->sent_seq);
		if (th->urg)
		{
			unsigned short urg_ptr;

			urg_ptr = ntohs(th->urg_ptr);
			if (urg_ptr <= win_size)
				th->urg = 0;
			else
			{
				urg_ptr -= win_size;
				th->urg_ptr = htons(urg_ptr);
				nth->urg_ptr = htons(win_size);
			}
		}
#else
		/* Clear URG in the copy when the urgent pointer falls inside
		   the probed chunk.  NOTE(review): the <= vs < asymmetry with
		   the disabled branch above looks suspicious - confirm. */
		if(th->urg && ntohs(th->urg_ptr) < win_size)
			nth->urg = 0;
#endif		

		/*
		 *	Checksum the split buffer
		 */

		tcp_send_check(nth, sk->saddr, sk->daddr, 
			   nth->doff * 4 + win_size , sk);
	}
	else
	{	
		/* Nothing sendable: build a bare keepalive-style ack. */
		buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
		if (buff == NULL) 
			return;

		buff->free = 1;
		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put in the IP header and routing stuff. 
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0) 
		{
			sk->prot->wfree(sk, buff);
			return;
		}

		t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
		memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

		/*
		 *	Use a previous sequence.
		 *	This should cause the other end to send an ack.
		 */

		t1->seq = htonl(sk->sent_seq-1);
		t1->ack = 1; 
		t1->res1= 0;
		t1->res2= 0;
		t1->rst = 0;
		t1->urg = 0;
		t1->psh = 0;
		t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
		t1->syn = 0;
		t1->ack_seq = ntohl(sk->acked_seq);
		t1->window = ntohs(tcp_select_window(sk));
		t1->doff = sizeof(*t1)/4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	}		

	/*
	 *	Send it.
	 */

	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
5349 
5350 /*
5351  *      A window probe timeout has occurred.
5352  */
5353 
5354 void tcp_send_probe0(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
5355 {
5356         if (sk->zapped)
5357                 return;         /* After a valid reset we can send no more */
5358 
5359         tcp_write_wakeup(sk);
5360 
5361         sk->backoff++;
5362         sk->rto = min(sk->rto << 1, 120*HZ);
5363         reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5364         sk->retransmits++;
5365         sk->prot->retransmits ++;
5366 }
5367 
5368 /*
5369  *      Socket option code for TCP. 
5370  */
5371   
5372 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5373 {
5374         int val,err;
5375 
5376         if(level!=SOL_TCP)
5377                 return ip_setsockopt(sk,level,optname,optval,optlen);
5378 
5379         if (optval == NULL) 
5380                 return(-EINVAL);
5381 
5382         err=verify_area(VERIFY_READ, optval, sizeof(int));
5383         if(err)
5384                 return err;
5385         
5386         val = get_user((int *)optval);
5387 
5388         switch(optname)
5389         {
5390                 case TCP_MAXSEG:
5391 /*
5392  * values greater than interface MTU won't take effect.  however at
5393  * the point when this call is done we typically don't yet know
5394  * which interface is going to be used
5395  */
5396                         if(val<1||val>MAX_WINDOW)
5397                                 return -EINVAL;
5398                         sk->user_mss=val;
5399                         return 0;
5400                 case TCP_NODELAY:
5401                         sk->nonagle=(val==0)?0:1;
5402                         return 0;
5403                 default:
5404                         return(-ENOPROTOOPT);
5405         }
5406 }
5407 
5408 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5409 {
5410         int val,err;
5411 
5412         if(level!=SOL_TCP)
5413                 return ip_getsockopt(sk,level,optname,optval,optlen);
5414                         
5415         switch(optname)
5416         {
5417                 case TCP_MAXSEG:
5418                         val=sk->user_mss;
5419                         break;
5420                 case TCP_NODELAY:
5421                         val=sk->nonagle;
5422                         break;
5423                 default:
5424                         return(-ENOPROTOOPT);
5425         }
5426         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5427         if(err)
5428                 return err;
5429         put_user(sizeof(int),(int *) optlen);
5430 
5431         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5432         if(err)
5433                 return err;
5434         put_user(val,(int *)optval);
5435 
5436         return(0);
5437 }       
5438 
5439 
/*
 *	Protocol operations vector for TCP. NOTE(review): this is a
 *	positional initializer, so each entry's meaning comes from the
 *	member order of struct proto. The labels below are grounded in
 *	the call sites in this file (sk->prot->wmalloc, ->wfree,
 *	->build_header, ->queue_xmit, ->retransmits) where possible and
 *	marked "presumably" elsewhere - confirm against the struct proto
 *	declaration before relying on them.
 */
struct proto tcp_prot = {
	sock_wmalloc,		/* wmalloc: write-buffer allocation */
	sock_rmalloc,		/* rmalloc: read-buffer allocation */
	sock_wfree,		/* wfree: release a write buffer */
	sock_rfree,		/* rfree: release a read buffer */
	sock_rspace,		/* rspace: presumably free receive space - verify */
	sock_wspace,		/* wspace: presumably free send space - verify */
	tcp_close,
	tcp_read,
	tcp_write,
	tcp_sendto,
	tcp_recvfrom,
	ip_build_header,	/* build_header: IP layer builds our headers */
	tcp_connect,
	tcp_accept,
	ip_queue_xmit,		/* queue_xmit: IP layer transmits our frames */
	tcp_retransmit,
	tcp_write_wakeup,
	tcp_read_wakeup,
	tcp_rcv,		/* receive handler for incoming segments */
	tcp_select,
	tcp_ioctl,
	NULL,			/* presumably an init hook TCP does not need - verify */
	tcp_shutdown,
	tcp_setsockopt,
	tcp_getsockopt,
	128,			/* presumably max header space to reserve - verify */
	0,			/* retransmits: counter bumped via sk->prot->retransmits++ */
	"TCP",			/* protocol name */
	0, 0,			/* presumably usage counters, start at zero - verify */
	{NULL,}			/* trailing socket array, empty at boot */
};

/* [previous][next][first][last][top][bottom][index][help] */