/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	@(#)tcp.c	1.0.16	05/25/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Fixes:
 *	Alan Cox	:	Numerous verify_area() calls
 *	Alan Cox	:	Set the ACK bit on a reset
 *	Alan Cox	:	Stopped it crashing if it closed while
 *				sk->inuse=1 and was trying to connect
 *				(tcp_err()).
 *	Alan Cox	:	All icmp error handling was broken;
 *				pointers passed were wrong and the
 *				socket was looked up backwards. Nobody
 *				tested any icmp error code obviously.
 *	Alan Cox	:	tcp_err() now handled properly. It
 *				wakes people on errors. select
 *				behaves and the icmp error race
 *				has gone by moving it into sock.c
 *	Alan Cox	:	tcp_reset() fixed to work for
 *				everything not just packets for
 *				unknown sockets.
 *	Alan Cox	:	tcp option processing.
 *	Alan Cox	:	Reset tweaked (still not 100%) [Had
 *				syn rule wrong]
 *	Herp Rosmanith	:	More reset fixes
 *	Alan Cox	:	No longer acks invalid rst frames.
 *				Acking any kind of RST is right out.
 *	Alan Cox	:	Sets an ignore me flag on an rst
 *				receive otherwise odd bits of prattle
 *				escape still
 *	Alan Cox	:	Fixed another acking RST frame bug.
 *				Should stop LAN workplace lockups.
 *	Alan Cox	:	Some tidyups using the new skb list
 *				facilities
 *	Alan Cox	:	sk->keepopen now seems to work
 *	Alan Cox	:	Pulls options out correctly on accepts
 *	Alan Cox	:	Fixed assorted sk->rqueue->next errors
 *	Alan Cox	:	PSH doesn't end a TCP read. Switched a
 *				bit to skb ops.
 *	Alan Cox	:	Tidied tcp_data to avoid a potential
 *				nasty.
 *	Alan Cox	:	Added some better commenting, as the
 *				tcp is hard to follow
 *	Alan Cox	:	Removed incorrect check for 20 * psh
 *	Michael O'Reilly:	ack < copied bug fix.
 *	Johannes Stille	:	Misc tcp fixes (not all in yet).
 *	Alan Cox	:	FIN with no memory -> CRASH
 *	Alan Cox	:	Added socket option proto entries.
 *				Also added awareness of them to accept.
 *	Alan Cox	:	Added TCP options (SOL_TCP)
 *	Alan Cox	:	Switched wakeup calls to callbacks,
 *				so the kernel can layer network
 *				sockets.
 *	Alan Cox	:	Use ip_tos/ip_ttl settings.
 *	Alan Cox	:	Handle FIN (more) properly (we hope).
 *	Alan Cox	:	RST frames sent on unsynchronised
 *				state ack error.
 *	Alan Cox	:	Put in missing check for SYN bit.
 *	Alan Cox	:	Added tcp_select_window() aka NET2E
 *				window non shrink trick.
 *	Alan Cox	:	Added a couple of small NET2E timer
 *				fixes
 *	Charles Hedrick	:	TCP fixes
 *	Toomas Tamm	:	TCP window fixes
 *	Alan Cox	:	Small URG fix to rlogin ^C ack fight
 *	Charles Hedrick	:	Rewrote most of it to actually work
 *	Linus		:	Rewrote tcp_read() and URG handling
 *				completely
 *	Gerhard Koerting:	Fixed some missing timer handling
 *	Matthew Dillon	:	Reworked TCP machine states as per RFC
 *	Gerhard Koerting:	PC/TCP workarounds
 *	Adam Caldwell	:	Assorted timer/timing errors
 *	Matthew Dillon	:	Fixed another RST bug
 *	Alan Cox	:	Move to kernel side addressing changes.
 *	Alan Cox	:	Beginning work on TCP fastpathing
 *				(not yet usable)
 *	Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
 *	Alan Cox	:	TCP fast path debugging
 *	Alan Cox	:	Window clamping
 *	Michael Riepe	:	Bug in tcp_check()
 *	Matt Dillon	:	More TCP improvements and RST bug fixes
 *	Matt Dillon	:	Yet more small nasties removed from the
 *				TCP code (Be very nice to this man if
 *				tcp finally works 100%) 8)
 *	Alan Cox	:	BSD accept semantics.
 *	Alan Cox	:	Reset on closedown bug.
 *	Peter De Schrijver:	ENOTCONN check missing in tcp_sendto().
 *	Michael Pall	:	Handle select() after URG properly in
 *				all cases.
 *	Michael Pall	:	Undo the last fix in tcp_read_urg()
 *				(multi URG PUSH broke rlogin).
 *	Michael Pall	:	Fix the multi URG PUSH problem in
 *				tcp_readable(), select() after URG
 *				works now.
 *	Michael Pall	:	recv(...,MSG_OOB) never blocks in the
 *				BSD api.
 *	Alan Cox	:	Changed the semantics of sk->socket to
 *				fix a race and a signal problem with
 *				accept() and async I/O.
 *	Alan Cox	:	Relaxed the rules on tcp_sendto().
 *	Yury Shevchuk	:	Really fixed accept() blocking problem.
 *	Craig I. Hagan	:	Allow for BSD compatible TIME_WAIT for
 *				clients/servers which listen in on
 *				fixed ports.
 *	Alan Cox	:	Cleaned the above up and shrank it to
 *				a sensible code size.
 *	Alan Cox	:	Self connect lockup fix.
 *	Alan Cox	:	No connect to multicast.
 *	Ross Biro	:	Close unaccepted children on master
 *				socket close.
 *	Alan Cox	:	Reset tracing code.
 *	Alan Cox	:	Spurious resets on shutdown.
 *	Alan Cox	:	Giant 15 minute/60 second timer error
 *	Alan Cox	:	Small whoops in selecting before an
 *				accept.
 *	Alan Cox	:	Kept the state trace facility since
 *				it's handy for debugging.
 *	Alan Cox	:	More reset handler fixes.
 *	Alan Cox	:	Started rewriting the code based on
 *				the RFC's; for other useful protocol
 *				references see: Comer, KA9Q NOS, and
 *				for a reference on the difference
 *				between specifications and how BSD
 *				works see the 4.4lite source.
 *	A.N.Kuznetsov	:	Don't time wait on completion of tidy
 *				close.
 *	Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
 *	Linus Torvalds	:	Fixed BSD port reuse to work first syn
 *	Alan Cox	:	Reimplemented timers as per the RFC
 *				and using multiple timers for sanity.
 *	Alan Cox	:	Small bug fixes, and a lot of new
 *				comments.
 *	Alan Cox	:	Fixed dual reader crash by locking
 *				the buffers (much like datagram.c)
 *	Alan Cox	:	Fixed stuck sockets in probe. A probe
 *				now gets fed up of retrying without
 *				(even a no space) answer.
 *	Alan Cox	:	Extracted closing code better
 *	Alan Cox	:	Fixed the closing state machine to
 *				resemble the RFC.
 *	Alan Cox	:	More 'per spec' fixes.
 *	Jorge Cwik	:	Even faster checksumming.
 *	Alan Cox	:	tcp_data() doesn't ack illegal PSH
 *				only frames. At least one pc tcp stack
 *				generates them.
 *	Alan Cox	:	Cache last socket.
 *	Alan Cox	:	Per route irtt.
 *	Matt Day	:	Select() match BSD precisely on error
 *	Alan Cox	:	New buffers
 *	Marc Tamsky	:	Various sk->prot->retransmits and
 *				sk->retransmits misupdating fixed.
 *				Fixed tcp_write_timeout: stuck close,
 *				and TCP syn retries gets used now.
 *	Mark Yarvis	:	In tcp_read_wakeup(), don't send an
 *				ack if state is TCP_CLOSED.
 *	Alan Cox	:	Look up device on a retransmit - routes may
 *				change. Doesn't yet cope with MSS shrink right
 *				but it's a start!
 *	Marc Tamsky	:	Closing in closing fixes.
 *	Mike Shaver	:	RFC1122 verifications.
 *	Alan Cox	:	rcv_saddr errors.
 *	Alan Cox	:	Block double connect()
 *
 *
 * To Fix:
 *		Fast path the code. Two things here - fix the window calculation
 *		so it doesn't iterate over the queue, also spot packets with no funny
 *		options arriving in order and process directly.
 *
 *		Implement RFC 1191 [Path MTU discovery]
 *		Look at the effect of implementing RFC 1337 suggestions and their impact.
 *		Rewrite output state machine to use a single queue and do low window
 *		situations as per the spec (RFC 1122)
 *		Speed up input assembly algorithm.
 *		RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 *		could do with it working on IPv4
 *		User settable/learned rtt/max window/mtu
 *		Cope with MTU/device switches when retransmitting in tcp.
 *		Fix the window handling to use PR's new code.
 *
 *		Change the fundamental structure to a single send queue maintained
 *		by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 *		active routes too]). Cut the queue off in tcp_retransmit/
 *		tcp_transmit.
 *		Change the receive queue to assemble as it goes. This lets us
 *		dispose of most of tcp_sequence, half of tcp_ack and chunks of
 *		tcp_data/tcp_read as well as the window shrink crud.
 *		Separate out duplicated code - tcp_alloc_skb, tcp_build_ack and
 *		tcp_queue_skb seem obvious routines to extract.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING. Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown. There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */

/*
 * RFC1122 status:
 * NOTE: I'm not going to be doing comments in the code for this one except
 * for violations and the like. tcp.c is just too big... If I say something
 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 * with Alan. -- MS 950903
 *
 *   Use of PSH (4.2.2.2)
 *     MAY aggregate data sent without the PSH flag. (does)
 *     MAY queue data received without the PSH flag. (does)
 *     SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 *     MAY implement PSH on send calls. (doesn't, thus:)
 *       MUST NOT buffer data indefinitely (doesn't [1 second])
 *       MUST set PSH on last segment (does)
 *     MAY pass received PSH to application layer (doesn't)
 *     SHOULD send maximum-sized segment whenever possible. (almost always does)
 *
 *   Window Size (4.2.2.3, 4.2.2.16)
 *     MUST treat window size as an unsigned number (does)
 *     SHOULD treat window size as a 32-bit number (does not)
 *     MUST NOT shrink window once it is offered (does not normally)
 *
 *   Urgent Pointer (4.2.2.4)
 *   **MUST point urgent pointer to last byte of urgent data (not right
 *       after). (doesn't, to be like BSD)
 *     MUST inform application layer asynchronously of incoming urgent
 *       data. (does)
 *     MUST provide application with means of determining the amount of
 *       urgent data pending. (does)
 *   **MUST support urgent data sequence of arbitrary length. (doesn't, but
 *       it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 *       [Follows BSD 1 byte of urgent data]
 *
 *   TCP Options (4.2.2.5)
 *     MUST be able to receive TCP options in any segment. (does)
 *     MUST ignore unsupported options (does)
 *
 *   Maximum Segment Size Option (4.2.2.6)
 *     MUST implement both sending and receiving MSS. (does)
 *     SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 *       it always). (does, even when MSS == 536, which is legal)
 *     MUST assume MSS == 536 if no MSS received at connection setup (does)
 *     MUST calculate "effective send MSS" correctly:
 *       min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 *       (does - but allows operator override)
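 *       (A worked example of that formula, with illustrative numbers:
 *       over Ethernet with a physical MTU of 1500, a remote MSS of 1460
 *       and no IP options, min(1500, 1460+20) - 20 - 0 = 1460 data
 *       bytes per segment.)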
 *
 *   TCP Checksum (4.2.2.7)
 *     MUST generate and check TCP checksum. (does)
 *
 *   Initial Sequence Number Selection (4.2.2.8)
 *     MUST use the RFC 793 clock selection mechanism. (doesn't, but it's
 *       OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 *       necessary for 10Mbps networks - and harder than BSD to spoof!)
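 *       (For scale: at 1MHz the 32-bit sequence space wraps in
 *       2^32/10^6 seconds, i.e. roughly 71 minutes - comfortably
 *       longer than any segment lifetime.)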
 *
 *   Simultaneous Open Attempts (4.2.2.10)
 *     MUST support simultaneous open attempts (does)
 *
 *   Recovery from Old Duplicate SYN (4.2.2.11)
 *     MUST keep track of active vs. passive open (does)
 *
 *   RST segment (4.2.2.12)
 *     SHOULD allow an RST segment to contain data (does, but doesn't do
 *       anything with it, which is standard)
 *
 *   Closing a Connection (4.2.2.13)
 *     MUST inform application of whether connection was closed by RST or
 *       normal close. (does)
 *     MAY allow "half-duplex" close (treat connection as closed for the
 *       local app, even before handshake is done). (does)
 *     MUST linger in TIME_WAIT for 2 * MSL (does)
 *
 *   Retransmission Timeout (4.2.2.15)
 *     MUST implement Jacobson's slow start and congestion avoidance
 *       stuff. (does)
 *
 *   Probing Zero Windows (4.2.2.17)
 *     MUST support probing of zero windows. (does)
 *     MAY keep offered window closed indefinitely. (does)
 *     MUST allow remote window to stay closed indefinitely. (does)
 *
 *   Passive Open Calls (4.2.2.18)
 *     MUST NOT let new passive open affect other connections. (doesn't)
 *     MUST support passive opens (LISTENs) concurrently. (does)
 *
 *   Time to Live (4.2.2.19)
 *     MUST make TCP TTL configurable. (does - IP_TTL option)
 *
 *   Event Processing (4.2.2.20)
 *     SHOULD queue out-of-order segments. (does)
 *     MUST aggregate ACK segments whenever possible. (does but badly)
 *
 *   Retransmission Timeout Calculation (4.2.3.1)
 *     MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 *       calculation. (does, or at least explains them in the comments 8*b)
 *     SHOULD initialize RTO to 0 and RTT to 3. (does)
 *
 *   When to Send an ACK Segment (4.2.3.2)
 *     SHOULD implement delayed ACK. (does not)
 *     MUST keep ACK delay < 0.5 sec. (N/A)
 *
 *   When to Send a Window Update (4.2.3.3)
 *     MUST implement receiver-side SWS. (does)
 *
 *   When to Send Data (4.2.3.4)
 *     MUST implement sender-side SWS. (does - imperfectly)
 *     SHOULD implement Nagle algorithm. (does)
 *
 *   TCP Connection Failures (4.2.3.5)
 *     MUST handle excessive retransmissions "properly" (see the RFC). (does)
 *     SHOULD inform application layer of soft errors. (doesn't)
 *
 *   TCP Keep-Alives (4.2.3.6)
 *     MAY provide keep-alives. (does)
 *     MUST make keep-alives configurable on a per-connection basis. (does)
 *     MUST default to no keep-alives. (does)
 *   **MUST make keep-alive interval configurable. (doesn't)
 *   **MUST make default keep-alive interval > 2 hours. (doesn't)
 *     MUST NOT interpret failure to ACK keep-alive packet as dead
 *       connection. (doesn't)
 *     SHOULD send keep-alive with no data. (does)
 *
 *   TCP Multihoming (4.2.3.7)
 *     MUST get source address from IP layer before sending first
 *       SYN. (does)
 *     MUST use same local address for all segments of a connection. (does)
 *
 *   IP Options (4.2.3.8)
 *     (I don't think the IP layer sees the IP options, yet.)
 *     MUST ignore unsupported IP options. (does, I guess 8*b)
 *     MAY support Time Stamp and Record Route. (doesn't)
 *   **MUST allow application to specify a source route. (doesn't?)
 *   **MUST allow received Source Route option to set route for all future
 *       segments on this connection. (doesn't, not that I think it's a
 *       huge problem)
 *
 *   ICMP messages (4.2.3.9)
 *     MUST act on ICMP errors. (does)
 *     MUST slow transmission upon receipt of a Source Quench. (does)
 *     MUST NOT abort connection upon receipt of soft Destination
 *       Unreachables (0, 1, 5), Time Exceededs and Parameter
 *       Problems. (doesn't)
 *     SHOULD report soft Destination Unreachables etc. to the
 *       application. (doesn't)
 *     SHOULD abort connection upon receipt of hard Destination Unreachable
 *       messages (2, 3, 4). (does)
 *
 *   Remote Address Validation (4.2.3.10)
 *     MUST reject as an error OPEN for invalid remote IP address. (does)
 *     MUST ignore SYN with invalid source address. (does)
 *     MUST silently discard incoming SYN for broadcast/multicast
 *       address. (does)
 *
 *   Asynchronous Reports (4.2.4.1)
 *   **MUST provide mechanism for reporting soft errors to application
 *       layer. (doesn't)
 *
 *   Type of Service (4.2.4.2)
 *     MUST allow application layer to set Type of Service. (does IP_TOS)
 *
 * (Whew. -- MS 950903)
 **/

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/time.h>
#include <linux/string.h>
#include <linux/config.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/termios.h>
#include <linux/in.h>
#include <linux/fcntl.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/icmp.h>
#include <net/tcp.h>
#include <net/arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/route.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/segment.h>
#include <net/checksum.h>

/*
 *	The MSL timer is the 'normal' timer.
 */

#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)

#define SEQ_TICK 3
unsigned long seq_offset;
struct tcp_mib tcp_statistics;

/*
 *	Cached last hit socket
 */

volatile unsigned long th_cache_saddr, th_cache_daddr;
volatile unsigned short th_cache_dport, th_cache_sport;
volatile struct sock *th_cache_sk;

void tcp_cache_zap(void)
{
	unsigned long flags;
	save_flags(flags);
	cli();
	th_cache_saddr=0;
	th_cache_daddr=0;
	th_cache_dport=0;
	th_cache_sport=0;
	th_cache_sk=NULL;
	restore_flags(flags);
}

static void tcp_close(struct sock *sk, int timeout);


/*
 *	The less said about this the better, but it works and will do for 1.2
 */

static struct wait_queue *master_select_wakeup;

static __inline__ int min(unsigned int a, unsigned int b)
{
	if (a < b)
		return(a);
	return(b);
}

#undef STATE_TRACE

#ifdef STATE_TRACE
static char *statename[]={
	"Unused","Established","Syn Sent","Syn Recv",
	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
	"Close Wait","Last ACK","Listen","Closing"
};
#endif

static __inline__ void tcp_set_state(struct sock *sk, int state)
{
	if(sk->state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab--;
#ifdef STATE_TRACE
	if(sk->debug)
		printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
#endif
	/* This is a hack but it doesn't occur often and it's going to
	   be a real pain to fix nicely */

	if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
	{
		wake_up_interruptible(&master_select_wakeup);
	}
	sk->state=state;
	if(state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab++;
	if(sk->state==TCP_CLOSE)
		tcp_cache_zap();
}

/*
 *	This routine picks a TCP window for a socket based on
 *	the following constraints:
 *
 *	1. The window can never be shrunk once it is offered (RFC 793)
 *	2. We limit memory per socket
 *
 *	For now we use NET2E3's heuristic of offering half the memory
 *	we have handy. All is not as bad as this seems however because
 *	of two things. Firstly we will bin packets even within the window
 *	in order to get the data we are waiting for into the memory limit.
 *	Secondly we bin common duplicate forms at receive time.
 *	Better heuristics welcome.
 */

int tcp_select_window(struct sock *sk)
{
	int new_window = sock_rspace(sk);

	if(sk->window_clamp)
		new_window=min(sk->window_clamp,new_window);
	/*
	 *	Two things are going on here. First, we don't ever offer a
	 *	window less than min(sk->mss, MAX_WINDOW/2). This is the
	 *	receiver side of SWS as specified in RFC1122.
	 *	Second, we always give them at least the window they
	 *	had before, in order to avoid retracting window. This
	 *	is technically allowed, but RFC1122 advises against it and
	 *	in practice it causes trouble.
	 *
	 *	Fixme: This doesn't correctly handle the case where
	 *	new_window > sk->window but not by enough to allow for the
	 *	shift in sequence space.
	 */
	if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
		return(sk->window);
	return(new_window);
}

/*
 *	Find someone to 'accept'. Must be called with
 *	sk->inuse=1 or cli()
 */

static struct sk_buff *tcp_find_established(struct sock *s)
{
	struct sk_buff *p=skb_peek(&s->receive_queue);
	if(p==NULL)
		return NULL;
	do
	{
		if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
			return p;
		p=p->next;
	}
	while(p!=(struct sk_buff *)&s->receive_queue);
	return NULL;
}

/*
 *	Remove a completed connection and return it. This is used by
 *	tcp_accept() to get connections from the queue.
 */

static struct sk_buff *tcp_dequeue_established(struct sock *s)
{
	struct sk_buff *skb;
	unsigned long flags;
	save_flags(flags);
	cli();
	skb=tcp_find_established(s);
	if(skb!=NULL)
		skb_unlink(skb);	/* Take it off the queue */
	restore_flags(flags);
	return skb;
}

/*
 *	This routine closes sockets which have been at least partially
 *	opened, but not yet accepted. Currently it is only called by
 *	tcp_close, and timeout mirrors the value there.
 */

static void tcp_close_pending (struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
	{
		skb->sk->dead=1;
		tcp_close(skb->sk, 0);
		kfree_skb(skb, FREE_READ);
	}
	return;
}

/*
 *	Enter the time wait state.
 */

static void tcp_time_wait(struct sock *sk)
{
	tcp_set_state(sk,TCP_TIME_WAIT);
	sk->shutdown = SHUTDOWN_MASK;
	if (!sk->dead)
		sk->state_change(sk);
	reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}

/*
 *	A socket has timed out on its send queue and wants to do a
 *	little retransmitting. Currently this means TCP.
 */

void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;
	struct rtable *rt;

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;

		/*
		 *	Discard the surplus MAC header
		 */

		skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);

		/*
		 *	In general it's OK just to use the old packet. However we
		 *	need to use the current ack and window fields. Urg and
		 *	urg_ptr could possibly stand to be updated as well, but we
		 *	don't keep the necessary data. That shouldn't be a problem,
		 *	if the other end is doing the right thing. Since we're
		 *	changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)skb->data;
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = ntohs(iph->tot_len) - (iph->ihl<<2);

		/*
		 *	Note: We ought to check for window limits here but
		 *	currently this is done (less efficiently) elsewhere.
		 */

		iph->id = htons(ip_id_count++);
		ip_send_check(iph);

		/*
		 *	Put a MAC header back on (may cause ARPing)
		 */

		if(skb->localroute)
			rt=ip_rt_local(iph->daddr,NULL,NULL);
		else
			rt=ip_rt_route(iph->daddr,NULL,NULL);

		if(rt==NULL)	/* Deep poo */
		{
			if(skb->sk)
			{
				skb->sk->err=ENETUNREACH;
				skb->sk->error_report(skb->sk);
			}
		}
		else
		{
			dev=rt->rt_dev;
			skb->raddr=rt->rt_gateway;
			if(skb->raddr==0)
				skb->raddr=iph->daddr;
			skb->dev=dev;
			skb->arp=1;
			if(dev->hard_header)
			{
				if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
					skb->arp=0;
			}

			/*
			 *	This is not the right way to handle this. We have to
			 *	issue an up to date window and ack report with this
			 *	retransmit to keep the odd buggy tcp that relies on
			 *	the fact BSD does this happy.
			 *	We don't however need to recalculate the entire
			 *	checksum, so someone wanting a small problem to play
			 *	with might like to implement RFC1141/RFC1624 and speed
			 *	this up by avoiding a full checksum.
			 */

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			/*
			 *	If the interface is (still) up and running, kick it.
			 */

			if (dev->flags & IFF_UP)
			{
				/*
				 *	If the packet is still being sent by the device/protocol
				 *	below then don't retransmit. This is both needed, and good -
				 *	especially with connected mode AX.25 where it stops resends
				 *	of a frame that has not even been sent yet!
				 *	We still add up the counts as the round trip time wants
				 *	adjusting.
				 */
				if (sk && !skb_device_locked(skb))
				{
					/* Remove it from any existing driver queue first! */
					skb_unlink(skb);
					/* Now queue it */
					ip_statistics.IpOutRequests++;
					dev_queue_xmit(skb, dev, sk->priority);
				}
			}
		}

		/*
		 *	Count retransmissions
		 */

		ct++;
		sk->prot->retransmits++;
		tcp_statistics.TcpRetransSegs++;


		/*
		 *	Only one retransmit requested.
		 */

		if (!all)
			break;

		/*
		 *	This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;
	}
}

/*
 *	Reset the retransmission timer
 */

static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
{
	del_timer(&sk->retransmit_timer);
	sk->ip_xmit_timeout = why;
	if((int)when < 0)
	{
		when=3;
		printk("Error: Negative timer in xmit_timer\n");
	}
	sk->retransmit_timer.expires=jiffies+when;
	add_timer(&sk->retransmit_timer);
}

/*
 *	This is the normal code called for timeouts. It does the retransmission
 *	and then does backoff. tcp_do_retransmit is separated out because
 *	tcp_ack needs to send stuff from the retransmit queue without
 *	initiating a backoff.
 */


void tcp_retransmit_time(struct sock *sk, int all)
{
	tcp_do_retransmit(sk, all);

	/*
	 *	Increase the timeout each time we retransmit. Note that
	 *	we do not increase the rtt estimate. rto is initialized
	 *	from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
	 *	that doubling rto each time is the least we can get away with.
	 *	In KA9Q, Karn uses this for the first few times, and then
	 *	goes to quadratic. netBSD doubles, but only goes up to *64,
	 *	and clamps at 1 to 64 sec afterwards. Note that 120 sec is
	 *	defined in the protocol as the maximum possible RTT. I guess
	 *	we'll have to use something other than TCP to talk to the
	 *	University of Mars.
	 *
	 *	PAWS allows us longer timeouts and large windows, so once
	 *	implemented ftp to mars will work nicely. We will have to fix
	 *	the 120 second clamps though!
	 */
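
	/*
	 *	As a concrete illustration (assuming an initial rto of 3
	 *	seconds): the doubling-with-clamp below gives a retransmit
	 *	schedule of roughly 3, 6, 12, 24, 48, 96, 120, 120, ...
	 *	seconds between successive attempts.
	 */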

	sk->retransmits++;
	sk->prot->retransmits++;
	sk->backoff++;
	sk->rto = min(sk->rto << 1, 120*HZ);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
}


/*
 *	A timer event has triggered a tcp retransmit timeout. The
 *	socket xmit queue is ready and set up to send. Because
 *	the ack receive code keeps the queue straight we do
 *	nothing clever here.
 */

static void tcp_retransmit(struct sock *sk, int all)
{
	if (all)
	{
		tcp_retransmit_time(sk, all);
		return;
	}

	sk->ssthresh = sk->cong_window >> 1;	/* remember window where we lost */
	/* sk->ssthresh in theory can be zero. I guess that's OK */
	sk->cong_count = 0;

	sk->cong_window = 1;

	/* Do the actual retransmit. */
	tcp_retransmit_time(sk, all);
}

/*
 *	A write timeout has occurred. Process the after effects.
 */

static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		/*ip_route_check (sk->daddr);*/
	}

	/*
	 *	Have we tried to SYN too many times (repent repent 8))
	 */

	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
	{
		sk->err=ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
		tcp_set_state(sk,TCP_CLOSE);
		/* Don't FIN, we got nothing back */
		release_sock(sk);
		return 0;
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2)
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 *	Time wait the socket
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			release_sock(sk);
			return 0;
		}
	}
	return 1;
}

/*
 *	The TCP retransmit timer. This lacks a few small details.
 *
 *	1.	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 */

static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;

	/*
	 *	Only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
			{
				struct sk_buff *skb;
				unsigned long flags;

				save_flags(flags);
				cli();
				skb = sk->send_head;
				if (!skb)
				{
					restore_flags(flags);
				}
				else
				{
					/*
					 *	Kicked by a delayed ack. Reset timer
					 *	correctly now
					 */
					if (jiffies < skb->when + sk->rto)
					{
						reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
						restore_flags(flags);
						break;
					}
					restore_flags(flags);
					/*
					 *	Retransmission
					 */
					sk->retransmits++;
					sk->prot->retransmits++;
					sk->prot->retransmit (sk, 0);
					tcp_write_timeout(sk);
				}
				break;
			}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 *	this reset_timer() call is a hack, this is not
			 *	how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}

/*
 *	This routine is called by the ICMP module when it gets some
 *	sort of error condition. If err < 0 then the socket should
 *	be closed and the error returned to the user. If err > 0
 *	it's just the icmp type << 8 | icmp code. After adjustment
 *	header points to the first 8 bytes of the tcp header. We need
 *	to find the appropriate port.
 */

void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
	__u32 saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	struct iphdr *iph=(struct iphdr *)header;

	header+=4*iph->ihl;


	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;

	if (type == ICMP_SOURCE_QUENCH)
	{
		/*
		 *	FIXME:
		 *	For now we will just trigger a linear backoff.
		 *	The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	if (type == ICMP_PARAMETERPROB)
	{
		sk->err=EPROTO;
		sk->error_report(sk);
	}

	/*
	 *	If we've already connected we will keep trying
	 *	until we time out, or the user gives up.
	 */

	if (code < 13 && (icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV))
	{
		sk->err = icmp_err_convert[code].errno;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
		{
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);	/* Wake people up to see the error (see connect in sock.c) */
		}
	}
	return;
}


/*
 *	Walk down the receive queue counting readable data until we hit the end
 *	or we find a gap in the received data queue (i.e. a frame missing that
 *	needs sending to us). Not sorting the data into two queues as it
 *	arrives makes life so much harder.
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;
	unsigned long amount;
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->h.th->seq))	/* Found a hole so stop here */
			break;
		sum = skb->len - (counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;
		if (sum > 0)
		{	/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;
			counted += sum;
		}
		/*
		 *	Don't count urg data ... but do it in the right place!
		 *	Consider: "old_data (ptr is here) URG PUSH data"
		 *	The old code would stop at the first push because
		 *	it counted the urg (amount==1) and then does amount--
		 *	*after* the loop. This means tcp_readable() always
		 *	returned zero if any URG PUSH was in the queue, even
		 *	though there was normal data available. If we subtract
		 *	the urg data right here, we even get it to work for more
		 *	than one URG PUSH skb without normal data.
		 *	This means that select() finally works now with urg data
		 *	in the queue. Note that rlogin was never affected
		 *	because it doesn't use select(); it uses two processes
		 *	and a blocking read(). And the queue scan in tcp_read()
		 *	was correct. Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh)
			break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}

/*
 *	LISTEN is a special case for select..
 */
static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sel_type == SEL_IN) {
		int retval;

		sk->inuse = 1;
		retval = (tcp_find_established(sk) != NULL);
		release_sock(sk);
		if (!retval)
			select_wait(&master_select_wakeup,wait);
		return retval;
	}
	return 0;
}


/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to set "sk->inuse", as the upper select layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		if (sk->err)
			return 1;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;

		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;

		if (sk->acked_seq == sk->copied_seq)
			break;

		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->err)
			return 1;
		if (sk->shutdown & SEND_SHUTDOWN)
			return 0;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 *	This is now right thanks to a small fix
		 *	by Matt Dillon.
		 */

		if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;
		return 1;

	case SEL_EX:
		if (sk->urg_data)
			return 1;
		break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int err;
	switch(cmd)
	{
		case TIOCINQ:
#ifdef FIXME	/* FIXME: */
		case FIONREAD:
#endif
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);

			sk->inuse = 1;
			amount = tcp_readable(sk);
			release_sock(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		case SIOCATMARK:
		{
			int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

			err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
			if (err)
				return err;
			put_user(answ,(int *) arg);
			return(0);
		}
		case TIOCOUTQ:
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);
			amount = sock_wspace(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		default:
			return(-EINVAL);
	}
}


/*
 *	This routine computes a TCP checksum.
 *
 *	Modified January 1995 from a go-faster DOS routine by
 *	Jorge Cwik <jorge@laser.satlink.net>
 */

unsigned short tcp_check(struct tcphdr *th, int len,
	unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}
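
/*
 *	Note: csum_tcpudp_magic() folds the TCP pseudo-header (source
 *	address, destination address, protocol and TCP length) into the
 *	checksum; 'base' is the partial checksum already accumulated over
 *	the TCP header and data by csum_partial().
 */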


void tcp_send_check(struct tcphdr *th, unsigned long saddr,
	unsigned long daddr, int len, struct sock *sk)
{
	th->check = 0;
	th->check = tcp_check(th, len, saddr, daddr,
		csum_partial((char *)th,len,0));
	return;
}

/*
 *	This is the main buffer sending routine. We queue the buffer
 *	having checked it is sane seeming.
 */

static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	Length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing.
	 */

	tcp_statistics.TcpOutSegs++;
	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
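
	/*
	 *	skb->h.seq is thus the sequence number of the octet just past
	 *	this segment: the starting sequence number plus the payload
	 *	length (total size minus the 4*doff byte TCP header).
	 */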

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */

	if (after(skb->h.seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit. So
		 * we shouldn't need to set it at all. I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (This is what causes the Cisco and PC/TCP
		 *	grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out
		 */

		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}

/*
 *	Locking problems lead us to a messy situation where we can have
 *	multiple partially complete buffers queued up. This is really bad
 *	as we don't want to be sending partial buffers. Fix this with
 *	a semaphore or similar to lock tcp_write per socket.
 *
 *	These routines are pretty self descriptive.
 */

struct sk_buff * tcp_dequeue_partial(struct sock * sk)
{
	struct sk_buff * skb;
	unsigned long flags;

	save_flags(flags);
	cli();
	skb = sk->partial;
	if (skb) {
		sk->partial = NULL;
		del_timer(&sk->partial_timer);
	}
	restore_flags(flags);
	return skb;
}

/*
 *	Empty the partial queue
 */

static void tcp_send_partial(struct sock *sk)
{
	struct sk_buff *skb;

	if (sk == NULL)
		return;
	while ((skb = tcp_dequeue_partial(sk)) != NULL)
		tcp_send_skb(sk, skb);
}

/*
 *	Queue a partial frame
 */

void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 *	Wait up to 1 second for the buffer to fill.
	 */
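	/*
	 *	(This 1 second bound is what satisfies the RFC1122
	 *	"MUST NOT buffer data indefinitely" note in the status
	 *	list at the top of this file.)
	 */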
	sk->partial_timer.expires = jiffies+HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	if (tmp)
		tcp_send_skb(sk, tmp);
}


/*
 *	This routine sends an ack and also updates the window.
 */

static void tcp_send_ack(u32 sequence, u32 ack,
	struct sock *sk,
	struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack. We don't have to do this
		 *	(ACK is unreliable) but it's much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */

		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */

	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
			&& sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 *	Fill in the packet and send it
	 */

	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}


/*
 *	This routine builds a generic TCP header.
 */

extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{

	memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
	th->seq = htonl(sk->write_seq);
	th->psh =(push == 0) ? 1 : 0;
	th->doff = sizeof(*th)/4;
	th->ack = 1;
	th->fin = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->ack_timed = 0;
	th->ack_seq = htonl(sk->acked_seq);
	sk->window = tcp_select_window(sk);
	th->window = htons(sk->window);

	return(sizeof(*th));
}

/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 */

static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
1650 int len, int nonblock, int flags)
1651 {
1652 int copied = 0;
1653 int copy;
1654 int tmp;
1655 int seglen;
1656 int iovct=0;
1657 struct sk_buff *skb;
1658 struct sk_buff *send_tmp;
1659 struct proto *prot;
1660 struct device *dev = NULL;
1661 unsigned char *from;
1662
1663 /*
1664 * Do sanity checking for sendmsg/sendto/send
1665 */
1666
1667 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1668 return -EINVAL;
1669 if (msg->msg_name)
1670 {
1671 struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
1672 if(sk->state == TCP_CLOSE)
1673 return -ENOTCONN;
1674 if (msg->msg_namelen < sizeof(*addr))
1675 return -EINVAL;
1676 if (addr->sin_family && addr->sin_family != AF_INET)
1677 return -EINVAL;
1678 if (addr->sin_port != sk->dummy_th.dest)
1679 return -EISCONN;
1680 if (addr->sin_addr.s_addr != sk->daddr)
1681 return -EISCONN;
1682 }
1683
1684 /*
1685 * Ok commence sending
1686 */
1687
1688 while(iovct<msg->msg_iovlen)
1689 {
1690 seglen=msg->msg_iov[iovct].iov_len;
1691 from=msg->msg_iov[iovct++].iov_base;
1692 sk->inuse=1;
1693 prot = sk->prot;
1694 while(seglen > 0)
1695 {
1696 if (sk->err)
1697 { /* Stop on an error */
1698 release_sock(sk);
1699 if (copied)
1700 return(copied);
1701 return sock_error(sk);
1702 }
1703
1704 /*
1705 * First thing we do is make sure that we are established.
1706 */
1707
1708 if (sk->shutdown & SEND_SHUTDOWN)
1709 {
1710 release_sock(sk);
1711 sk->err = EPIPE;
1712 if (copied)
1713 return(copied);
1714 sk->err = 0;
1715 return(-EPIPE);
1716 }
1717
1718 /*
1719 * Wait for a connection to finish.
1720 */
1721
1722 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
1723 {
1724 if (sk->err)
1725 {
1726 release_sock(sk);
1727 if (copied)
1728 return(copied);
1729 return sock_error(sk);
1730 }
1731
1732 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
1733 {
1734 release_sock(sk);
1735 if (copied)
1736 return(copied);
1737
1738 if (sk->err)
1739 return sock_error(sk);
1740
1741 if (sk->keepopen)
1742 {
1743 send_sig(SIGPIPE, current, 0);
1744 }
1745 return(-EPIPE);
1746 }
1747
1748 if (nonblock || copied)
1749 {
1750 release_sock(sk);
1751 if (copied)
1752 return(copied);
1753 return(-EAGAIN);
1754 }
1755
1756 release_sock(sk);
1757 cli();
1758
1759 if (sk->state != TCP_ESTABLISHED &&
1760 sk->state != TCP_CLOSE_WAIT && sk->err == 0)
1761 {
1762 interruptible_sleep_on(sk->sleep);
1763 if (current->signal & ~current->blocked)
1764 {
1765 sti();
1766 if (copied)
1767 return(copied);
1768 return(-ERESTARTSYS);
1769 }
1770 }
1771 sk->inuse = 1;
1772 sti();
1773 }
1774
1775 /*
1776 * The following code can result in copy <= 0 if sk->mss is ever
1777 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
1778 * sk->mtu is constant once SYN processing is finished. I.e. we
1779 * had better not get here until we've seen his SYN and at least one
1780 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
1781 * But ESTABLISHED should guarantee that. sk->max_window is by definition
1782 * non-decreasing. Note that any ioctl to set user_mss must be done
1783 * before the exchange of SYN's. If the initial ack from the other
1784 * end has a window of 0, max_window and thus mss will both be 0.
1785 */
1786
1787 /*
1788 * Now we need to check if we have a half built packet.
1789 */
1790
1791 if ((skb = tcp_dequeue_partial(sk)) != NULL)
1792 {
1793 int hdrlen;
1794
1795 /* IP header + TCP header */
1796 hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1797 + sizeof(struct tcphdr);
1798
1799 /* Add more stuff to the end of skb->len */
1800 if (!(flags & MSG_OOB))
1801 {
1802 copy = min(sk->mss - (skb->len - hdrlen), len);
1803 /* FIXME: this is really a bug. */
1804 if (copy <= 0)
1805 {
1806 printk("TCP: **bug**: \"copy\" <= 0!!\n");
1807 copy = 0;
1808 }
1809 memcpy_fromfs(skb_put(skb,copy), from, copy);
1810 from += copy;
1811 copied += copy;
1812 len -= copy;
1813 sk->write_seq += copy;
1814 seglen -= copy;
1815 }
1816 if ((skb->len - hdrlen) >= sk->mss ||
1817 (flags & MSG_OOB) || !sk->packets_out)
1818 tcp_send_skb(sk, skb);
1819 else
1820 tcp_enqueue_partial(skb, sk);
1821 continue;
1822 }
1823
1824 /*
1825 * We also need to worry about the window.
1826 * If window < 1/2 the maximum window we've seen from this
1827 * host, don't use it. This is sender side
1828 * silly window prevention, as specified in RFC1122.
1829 * (Note that this is different from earlier versions of
1830 * SWS prevention, e.g. RFC813.) What we actually do is
1831 * use the whole MSS. Since this results in the right
1832 * edge of the packet being outside the window, it will
1833 * be queued for later rather than sent.
1834 */
1835
1836 copy = sk->window_seq - sk->write_seq;
1837 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1838 copy = sk->mss;
1839 if (copy > len)
1840 copy = len;
1841
1842 /*
1843 * We should really check the window here also.
1844 */
1845
1846 send_tmp = NULL;
1847 if (copy < sk->mss && !(flags & MSG_OOB))
1848 {
1849 /*
1850 * We will release the socket in case we sleep here.
1851 */
1852 release_sock(sk);
1853 /*
1854 * NB: following must be mtu, because mss can be increased.
1855 * mss is always <= mtu
1856 */
1857 skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1858 sk->inuse = 1;
1859 send_tmp = skb;
1860 }
1861 else
1862 {
1863 /*
1864 * We will release the socket in case we sleep here.
1865 */
1866 release_sock(sk);
1867 skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1868 sk->inuse = 1;
1869 }
1870
1871 /*
1872 * If we didn't get any memory, we need to sleep.
1873 */
1874
1875 if (skb == NULL)
1876 {
1877 sk->socket->flags |= SO_NOSPACE;
1878 if (nonblock)
1879 {
1880 release_sock(sk);
1881 if (copied)
1882 return(copied);
1883 return(-EAGAIN);
1884 }
1885
1886 /*
1887 * FIXME: here is another race condition.
1888 */
1889
1890 tmp = sk->wmem_alloc;
1891 release_sock(sk);
1892 cli();
1893 /*
1894 * Again we will try to avoid it.
1895 */
1896 if (tmp <= sk->wmem_alloc &&
1897 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1898 && sk->err == 0)
1899 {
1900 sk->socket->flags &= ~SO_NOSPACE;
1901 interruptible_sleep_on(sk->sleep);
1902 if (current->signal & ~current->blocked)
1903 {
1904 sti();
1905 if (copied)
1906 return(copied);
1907 return(-ERESTARTSYS);
1908 }
1909 }
1910 sk->inuse = 1;
1911 sti();
1912 continue;
1913 }
1914
1915 skb->sk = sk;
1916 skb->free = 0;
1917 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1918
1919 /*
1920 * FIXME: we need to optimize this.
1921 * Perhaps some hints here would be good.
1922 */
1923
1924 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1925 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
1926 if (tmp < 0 )
1927 {
1928 sock_wfree(sk, skb);
1929 release_sock(sk);
1930 if (copied)
1931 return(copied);
1932 return(tmp);
1933 }
1934 skb->dev = dev;
1935 skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
1936 tmp = tcp_build_header(skb->h.th, sk, len-copy);
1937 if (tmp < 0)
1938 {
1939 sock_wfree(sk, skb);
1940 release_sock(sk);
1941 if (copied)
1942 return(copied);
1943 return(tmp);
1944 }
1945
1946 if (flags & MSG_OOB)
1947 {
1948 skb->h.th->urg = 1;
1949 skb->h.th->urg_ptr = ntohs(copy);
1950 }
1951
1952 memcpy_fromfs(skb_put(skb,copy), from, copy);
1953
1954 from += copy;
1955 copied += copy;
1956 len -= copy;
1957 seglen -= copy;
1958 skb->free = 0;
1959 sk->write_seq += copy;
1960
1961 if (send_tmp != NULL && sk->packets_out)
1962 {
1963 tcp_enqueue_partial(send_tmp, sk);
1964 continue;
1965 }
1966 tcp_send_skb(sk, skb);
1967 }
1968 }
1969 sk->err = 0;
1970
1971 /*
1972 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
1973 * interactive fast network servers. It's meant to be on and
1974 * it really improves the throughput though not the echo time
1975 * on my slow slip link - Alan
1976 */
1977
1978 /*
1979 * Avoid possible race on send_tmp - c/o Johannes Stille
1980 */
1981
1982 if(sk->partial && ((!sk->packets_out)
1983 /* If not nagling we can send on the before case too.. */
1984 || (sk->nonagle && before(sk->write_seq , sk->window_seq))
1985 ))
1986 tcp_send_partial(sk);
1987
1988 release_sock(sk);
1989 return(copied);
1990 }
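
/*
 * A minimal standalone restatement of the segment sizing rule in
 * tcp_sendmsg() above, assuming plain wraparound-free arithmetic for
 * clarity: take the usable window, but fall back to a whole MSS when
 * the window is closed or has shrunk below half the largest window
 * the peer ever offered (sender-side SWS avoidance); an oversized
 * segment simply queues until the window opens. The helper name is
 * illustrative and nothing above calls it.
 */
static int tcp_sws_copy_limit(unsigned long window_seq, unsigned long write_seq,
	unsigned long max_window, int mss, int len)
{
	long copy = (long)(window_seq - write_seq);	/* usable window */

	if (copy <= 0 || copy < (long)(max_window >> 1) || copy > mss)
		copy = mss;			/* use a whole MSS instead */
	if (copy > len)
		copy = len;			/* never more than was asked for */
	return (int)copy;
}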
1991
1992 /*
1993 * Send an ack if one is backlogged at this point. Ought to merge
1994 * this with tcp_send_ack().
1995 */
1996
1997 static void tcp_read_wakeup(struct sock *sk)
1998 {
1999 int tmp;
2000 struct device *dev = NULL;
2001 struct tcphdr *t1;
2002 struct sk_buff *buff;
2003
2004 if (!sk->ack_backlog)
2005 return;
2006
2007 /*
2008 * If we're closed, don't send an ack, or we'll get a RST
2009 * from the closed destination.
2010 */
2011 if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
2012 return;
2013
2014 /*
2015 * FIXME: we need to put code here to prevent this routine from
2016 * being called. Being called once in a while is ok, so only check
2017 * if this is the second time in a row.
2018 */
2019
2020 /*
2021 * We need to grab some memory, and put together an ack,
2022 * and then put it into the queue to be sent.
2023 */
2024
2025 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
2026 if (buff == NULL)
2027 {
2028 /* Try again real soon. */
2029 reset_xmit_timer(sk, TIME_WRITE, HZ);
2030 return;
2031 }
2032
2033 buff->sk = sk;
2034 buff->localroute = sk->localroute;
2035
2036 /*
2037 * Put in the IP header and routing stuff.
2038 */
2039
2040 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
2041 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
2042 if (tmp < 0)
2043 {
2044 buff->free = 1;
2045 sock_wfree(sk, buff);
2046 return;
2047 }
2048
2049 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2050
2051 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
2052 t1->seq = htonl(sk->sent_seq);
2053 t1->ack = 1;
2054 t1->res1 = 0;
2055 t1->res2 = 0;
2056 t1->rst = 0;
2057 t1->urg = 0;
2058 t1->syn = 0;
2059 t1->psh = 0;
2060 sk->ack_backlog = 0;
2061 sk->bytes_rcv = 0;
2062 sk->window = tcp_select_window(sk);
2063 t1->window = ntohs(sk->window);
2064 t1->ack_seq = ntohl(sk->acked_seq);
2065 t1->doff = sizeof(*t1)/4;
2066 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2067 sk->prot->queue_xmit(sk, dev, buff, 1);
2068 tcp_statistics.TcpOutSegs++;
2069 }
2070
2071
2072 /*
2073 * FIXME:
2074 * This routine frees used buffers.
2075 * It should consider sending an ACK to let the
2076 * other end know we now have a bigger window.
2077 */
2078
2079 static void cleanup_rbuf(struct sock *sk)
2080 {
2081 unsigned long flags;
2082 unsigned long left;
2083 struct sk_buff *skb;
2084 unsigned long rspace;
2085
2086 if(sk->debug)
2087 printk("cleaning rbuf for sk=%p\n", sk);
2088
2089 save_flags(flags);
2090 cli();
2091
2092 left = sock_rspace(sk);
2093
2094 /*
2095 * We have to loop through all the buffer headers,
2096 * and try to free up all the space we can.
2097 */
2098
2099 while((skb=skb_peek(&sk->receive_queue)) != NULL)
2100 {
2101 if (!skb->used || skb->users)
2102 break;
2103 skb_unlink(skb);
2104 skb->sk = sk;
2105 kfree_skb(skb, FREE_READ);
2106 }
2107
2108 restore_flags(flags);
2109
2110 /*
2111 * FIXME:
2112 * At this point we should send an ack if the difference
2113 * between the window and the amount of space is bigger than
2114 * TCP_WINDOW_DIFF.
2115 */
2116
2117 if(sk->debug)
2118 printk("sk->rspace = %lu, was %lu\n", sock_rspace(sk),
2119 left);
2120 if ((rspace=sock_rspace(sk)) != left)
2121 {
2122 /*
2123 * This area has caused the most trouble. The current strategy
2124 * is to simply do nothing if the other end has room to send at
2125 * least 3 full packets, because the ack from those will auto-
2126 * matically update the window. If the other end doesn't think
2127 * we have much space left, but we have room for at least 1 more
2128 * complete packet than it thinks we do, we will send an ack
2129 * immediately. Otherwise we will wait up to .5 seconds in case
2130 * the user reads some more.
2131 */
2132 sk->ack_backlog++;
2133 /*
2134 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
2135 * if the other end is offering a window smaller than the agreed on MSS
2136 * (called sk->mtu here). In theory there's no connection between send
2137 * and receive, and so no reason to think that they're going to send
2138 * small packets. For the moment I'm using the hack of reducing the mss
2139 * only on the send side, so I'm putting mtu here.
2140 */
2141
2142 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
2143 {
2144 /* Send an ack right now. */
2145 tcp_read_wakeup(sk);
2146 }
2147 else
2148 {
2149 /* Force it to send an ack soon. */
2150 int was_active = del_timer(&sk->retransmit_timer);
2151 if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
2152 {
2153 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
2154 }
2155 else
2156 add_timer(&sk->retransmit_timer);
2157 }
2158 }
2159 }
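
/*
 * The decision cleanup_rbuf() makes once receive space has been freed,
 * condensed into one predicate; it assumes the same fields the routine
 * reads (advertised window, unacked received bytes, sk->mtu). Ack at
 * once if the peer could now be told of at least one more full packet
 * of space; otherwise a delayed ack is enough. Illustrative helper.
 */
static int tcp_should_ack_now(unsigned long rspace, unsigned long window,
	unsigned long bytes_rcv, unsigned long mtu)
{
	/* Free space exceeds what the peer believes it has by more
	   than one maximum sized segment. */
	return rspace > (window - bytes_rcv + mtu);
}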
2160
2161
2162 /*
2163 * Handle reading urgent data. BSD has very simple semantics for
2164 * this, no blocking and very strange errors 8)
2165 */
2166
2167 static int tcp_recv_urg(struct sock * sk, int nonblock,
2168 struct msghdr *msg, int len, int flags, int *addr_len)
2169 {
2170 /*
2171 * No URG data to read
2172 */
2173 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
2174 return -EINVAL; /* Yes this is right ! */
2175
2176 if (sk->err)
2177 return sock_error(sk);
2178
2179 if (sk->state == TCP_CLOSE || sk->done)
2180 {
2181 if (!sk->done)
2182 {
2183 sk->done = 1;
2184 return 0;
2185 }
2186 return -ENOTCONN;
2187 }
2188
2189 if (sk->shutdown & RCV_SHUTDOWN)
2190 {
2191 sk->done = 1;
2192 return 0;
2193 }
2194 sk->inuse = 1;
2195 if (sk->urg_data & URG_VALID)
2196 {
2197 char c = sk->urg_data;
2198 if (!(flags & MSG_PEEK))
2199 sk->urg_data = URG_READ;
2200 memcpy_toiovec(msg->msg_iov, &c, 1);
2201 if(msg->msg_name)
2202 {
2203 struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
2204 sin->sin_family=AF_INET;
2205 sin->sin_addr.s_addr=sk->daddr;
2206 sin->sin_port=sk->dummy_th.dest;
2207 }
2208 if(addr_len)
2209 *addr_len=sizeof(struct sockaddr_in);
2210 release_sock(sk);
2211 return 1;
2212 }
2213 release_sock(sk);
2214
2215 /*
2216 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
2217 * the available implementations agree in this case:
2218 * this call should never block, independent of the
2219 * blocking state of the socket.
2220 * Mike <pall@rz.uni-karlsruhe.de>
2221 */
2222 return -EAGAIN;
2223 }
2224
2225
2226 /*
2227 * This routine copies from a sock struct into the user buffer.
2228 */
2229
2230 static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
2231 int len, int nonblock, int flags, int *addr_len)
2232 {
2233 struct wait_queue wait = { current, NULL };
2234 int copied = 0;
2235 u32 peek_seq;
2236 volatile u32 *seq; /* So gcc doesn't overoptimise */
2237 unsigned long used;
2238
2239 /*
2240 * This error should be checked.
2241 */
2242
2243 if (sk->state == TCP_LISTEN)
2244 return -ENOTCONN;
2245
2246 /*
2247 * Urgent data needs to be handled specially.
2248 */
2249
2250 if (flags & MSG_OOB)
2251 return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
2252
2253 /*
2254 * Copying sequence to update. This is volatile to handle
2255 * the multi-reader case neatly (memcpy_to/fromfs might be
2256 * inline and thus not flush cached variables otherwise).
2257 */
2258
2259 peek_seq = sk->copied_seq;
2260 seq = &sk->copied_seq;
2261 if (flags & MSG_PEEK)
2262 seq = &peek_seq;
2263
2264 add_wait_queue(sk->sleep, &wait);
2265 sk->inuse = 1;
2266 while (len > 0)
2267 {
2268 struct sk_buff * skb;
2269 u32 offset;
2270
2271 /*
2272 * Are we at urgent data? Stop if we have read anything.
2273 */
2274
2275 if (copied && sk->urg_data && sk->urg_seq == *seq)
2276 break;
2277
2278 /*
2279 * Next get a buffer.
2280 */
2281
2282 current->state = TASK_INTERRUPTIBLE;
2283
2284 skb = skb_peek(&sk->receive_queue);
2285 do
2286 {
2287 if (!skb)
2288 break;
2289 if (before(*seq, skb->h.th->seq))
2290 break;
2291 offset = *seq - skb->h.th->seq;
2292 if (skb->h.th->syn)
2293 offset--;
2294 if (offset < skb->len)
2295 goto found_ok_skb;
2296 if (skb->h.th->fin)
2297 goto found_fin_ok;
2298 if (!(flags & MSG_PEEK))
2299 skb->used = 1;
2300 skb = skb->next;
2301 }
2302 while (skb != (struct sk_buff *)&sk->receive_queue);
2303
2304 if (copied)
2305 break;
2306
2307 if (sk->err)
2308 {
2309 copied = -xchg(&sk->err,0);
2310 break;
2311 }
2312
2313 if (sk->state == TCP_CLOSE)
2314 {
2315 if (!sk->done)
2316 {
2317 sk->done = 1;
2318 break;
2319 }
2320 copied = -ENOTCONN;
2321 break;
2322 }
2323
2324 if (sk->shutdown & RCV_SHUTDOWN)
2325 {
2326 sk->done = 1;
2327 break;
2328 }
2329
2330 if (nonblock)
2331 {
2332 copied = -EAGAIN;
2333 break;
2334 }
2335
2336 cleanup_rbuf(sk);
2337 release_sock(sk);
2338 sk->socket->flags |= SO_WAITDATA;
2339 schedule();
2340 sk->socket->flags &= ~SO_WAITDATA;
2341 sk->inuse = 1;
2342
2343 if (current->signal & ~current->blocked)
2344 {
2345 copied = -ERESTARTSYS;
2346 break;
2347 }
2348 continue;
2349
2350 found_ok_skb:
2351 /*
2352 * Lock the buffer. We can be fairly relaxed as
2353 * an interrupt will never steal a buffer we are
2354 * using unless I've missed something serious in
2355 * tcp_data.
2356 */
2357
2358 skb->users++;
2359
2360 /*
2361 * Ok so how much can we use ?
2362 */
2363
2364 used = skb->len - offset;
2365 if (len < used)
2366 used = len;
2367 /*
2368 * Do we have urgent data here?
2369 */
2370
2371 if (sk->urg_data)
2372 {
2373 u32 urg_offset = sk->urg_seq - *seq;
2374 if (urg_offset < used)
2375 {
2376 if (!urg_offset)
2377 {
2378 if (!sk->urginline)
2379 {
2380 ++*seq;
2381 offset++;
2382 used--;
2383 }
2384 }
2385 else
2386 used = urg_offset;
2387 }
2388 }
2389
2390 /*
2391 * Copy it - We _MUST_ update *seq first so that we
2392 * don't ever double read when we have dual readers
2393 */
2394
2395 *seq += used;
2396
2397 /*
2398 * This memcpy_tofs can sleep. If it sleeps and we
2399 * do a second read it relies on the skb->users to avoid
2400 * a crash when cleanup_rbuf() gets called.
2401 */
2402
2403 memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
2404 skb->h.th->doff*4 + offset, used);
2405 copied += used;
2406 len -= used;
2407
2408 /*
2409 * We now will not sleep again until we are finished
2410 * with skb. Sorry if you are doing the SMP port
2411 * but you'll just have to fix it neatly ;)
2412 */
2413
2414 skb->users --;
2415
2416 if (after(sk->copied_seq,sk->urg_seq))
2417 sk->urg_data = 0;
2418 if (used + offset < skb->len)
2419 continue;
2420
2421 /*
2422 * Process the FIN.
2423 */
2424
2425 if (skb->h.th->fin)
2426 goto found_fin_ok;
2427 if (flags & MSG_PEEK)
2428 continue;
2429 skb->used = 1;
2430 continue;
2431
2432 found_fin_ok:
2433 ++*seq;
2434 if (flags & MSG_PEEK)
2435 break;
2436
2437 /*
2438 * All is done
2439 */
2440
2441 skb->used = 1;
2442 sk->shutdown |= RCV_SHUTDOWN;
2443 break;
2444
2445 }
2446
2447 if(copied>0 && msg->msg_name)
2448 {
2449 struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
2450 sin->sin_family=AF_INET;
2451 sin->sin_addr.s_addr=sk->daddr;
2452 sin->sin_port=sk->dummy_th.dest;
2453 }
2454 if(addr_len)
2455 *addr_len=sizeof(struct sockaddr_in);
2456
2457 remove_wait_queue(sk->sleep, &wait);
2458 current->state = TASK_RUNNING;
2459
2460 /* Clean up data we have read: This will do ACK frames */
2461 cleanup_rbuf(sk);
2462 release_sock(sk);
2463 return copied;
2464 }
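
/*
 * A standalone sketch of the urgent-data clipping in the receive loop
 * above, assuming the same sequence-space conventions: a read stops
 * just short of the urgent byte, and when the socket is not in
 * urginline mode the mark itself is stepped over rather than copied.
 * The helper name and the skip out-parameter are illustrative.
 */
static unsigned long tcp_clip_at_urg(unsigned long urg_seq, unsigned long seq,
	unsigned long used, int urginline, int *skip)
{
	unsigned long urg_offset = urg_seq - seq;

	*skip = 0;
	if (urg_offset < used)
	{
		if (urg_offset == 0)
		{
			if (!urginline)
			{
				*skip = 1;	/* advance seq/offset past the mark */
				used--;		/* and copy one byte less */
			}
		}
		else
			used = urg_offset;	/* stop just before the mark */
	}
	return used;
}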
2465
2466
2467
2468 /*
2469 * State processing on a close. This implements the state shift for
2470 * sending our FIN frame. Note that we only send a FIN for some
2471 * states. A shutdown() may have already sent the FIN, or we may be
2472 * closed.
2473 */
2474
2475 static int tcp_close_state(struct sock *sk, int dead)
2476 {
2477 int ns=TCP_CLOSE;
2478 int send_fin=0;
2479 switch(sk->state)
2480 {
2481 case TCP_SYN_SENT: /* No SYN back, no FIN needed */
2482 break;
2483 case TCP_SYN_RECV:
2484 case TCP_ESTABLISHED: /* Closedown begin */
2485 ns=TCP_FIN_WAIT1;
2486 send_fin=1;
2487 break;
2488 case TCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */
2489 case TCP_FIN_WAIT2:
2490 case TCP_CLOSING:
2491 ns=sk->state;
2492 break;
2493 case TCP_CLOSE:
2494 case TCP_LISTEN:
2495 break;
2496 case TCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and
2497 wait only for the ACK */
2498 ns=TCP_LAST_ACK;
2499 send_fin=1;
2500 }
2501
2502 tcp_set_state(sk,ns);
2503
2504 /*
2505 * This is a (useful) BSD violation of the RFC. There is a
2506 * problem with TCP as specified, in that the other end could
2507 * keep a socket open forever with no application left at this end.
2508 * We use a 3 minute timeout (about the same as BSD) and then kill
2509 * our end. If they send after that then tough - BUT it is long
2510 * enough that we avoid the old "4*rto = almost no time - whoops,
2511 * reset" mistake.
2512 */
2513 if(dead && ns==TCP_FIN_WAIT2)
2514 {
2515 int timer_active=del_timer(&sk->timer);
2516 if(timer_active)
2517 add_timer(&sk->timer);
2518 else
2519 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2520 }
2521
2522 return send_fin;
2523 }
2524
2525 /*
2526 * Send a fin.
2527 */
2528
2529 static void tcp_send_fin(struct sock *sk)
2530 {
2531 struct proto *prot =(struct proto *)sk->prot;
2532 struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
2533 struct tcphdr *t1;
2534 struct sk_buff *buff;
2535 struct device *dev=NULL;
2536 int tmp;
2537
2538 release_sock(sk); /* in case the malloc sleeps. */
2539
2540 buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2541 sk->inuse = 1;
2542
2543 if (buff == NULL)
2544 {
2545 /* This is a disaster if it occurs */
2546 printk("tcp_send_fin: Impossible malloc failure");
2547 return;
2548 }
2549
2550 /*
2551 * Administrivia
2552 */
2553
2554 buff->sk = sk;
2555 buff->localroute = sk->localroute;
2556
2557 /*
2558 * Put in the IP header and routing stuff.
2559 */
2560
2561 tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2562 IPPROTO_TCP, sk->opt,
2563 sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
2564 if (tmp < 0)
2565 {
2566 int t;
2567 /*
2568 * Finish anyway, treat this as a send that got lost.
2569 * (Not good).
2570 */
2571
2572 buff->free = 1;
2573 sock_wfree(sk,buff);
2574 sk->write_seq++;
2575 t=del_timer(&sk->timer);
2576 if(t)
2577 add_timer(&sk->timer);
2578 else
2579 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2580 return;
2581 }
2582
2583 /*
2584 * We ought to check if the end of the queue is a buffer and
2585 * if so simply add the fin to that buffer, not send it ahead.
2586 */
2587
2588 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2589 buff->dev = dev;
2590 memcpy(t1, th, sizeof(*t1));
2591 t1->seq = ntohl(sk->write_seq);
2592 sk->write_seq++;
2593 buff->h.seq = sk->write_seq;
2594 t1->ack = 1;
2595 t1->ack_seq = ntohl(sk->acked_seq);
2596 t1->window = ntohs(sk->window=tcp_select_window(sk));
2597 t1->fin = 1;
2598 t1->rst = 0;
2599 t1->doff = sizeof(*t1)/4;
2600 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2601
2602 /*
2603 * If there is data in the write queue, the fin must be appended to
2604 * the write queue.
2605 */
2606
2607 if (skb_peek(&sk->write_queue) != NULL)
2608 {
2609 buff->free = 0;
2610 if (buff->next != NULL)
2611 {
2612 printk("tcp_send_fin: next != NULL\n");
2613 skb_unlink(buff);
2614 }
2615 skb_queue_tail(&sk->write_queue, buff);
2616 }
2617 else
2618 {
2619 sk->sent_seq = sk->write_seq;
2620 sk->prot->queue_xmit(sk, dev, buff, 0);
2621 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2622 }
2623 }
2624
2625 /*
2626 * Shutdown the sending side of a connection. Much like close except
2627 * that we don't receive shut down or set sk->dead=1.
2628 */
2629
2630 void tcp_shutdown(struct sock *sk, int how)
2631 {
2632 /*
2633 * We need to grab some memory, and put together a FIN,
2634 * and then put it into the queue to be sent.
2635 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2636 */
2637
2638 if (!(how & SEND_SHUTDOWN))
2639 return;
2640
2641 /*
2642 * If we've already sent a FIN, or it's a closed state
2643 */
2644
2645 if (sk->state == TCP_FIN_WAIT1 ||
2646 sk->state == TCP_FIN_WAIT2 ||
2647 sk->state == TCP_CLOSING ||
2648 sk->state == TCP_LAST_ACK ||
2649 sk->state == TCP_TIME_WAIT ||
2650 sk->state == TCP_CLOSE ||
2651 sk->state == TCP_LISTEN
2652 )
2653 {
2654 return;
2655 }
2656 sk->inuse = 1;
2657
2658 /*
2659 * flag that the sender has shutdown
2660 */
2661
2662 sk->shutdown |= SEND_SHUTDOWN;
2663
2664 /*
2665 * Clear out any half completed packets.
2666 */
2667
2668 if (sk->partial)
2669 tcp_send_partial(sk);
2670
2671 /*
2672 * FIN if needed
2673 */
2674
2675 if(tcp_close_state(sk,0))
2676 tcp_send_fin(sk);
2677
2678 release_sock(sk);
2679 }
2680
2681 /*
2682 * This routine will send an RST to the other tcp.
2683 */
2684
2685 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
2686 struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2687 {
2688 struct sk_buff *buff;
2689 struct tcphdr *t1;
2690 int tmp;
2691 struct device *ndev=NULL;
2692
2693 /*
2694 * Cannot reset a reset (Think about it).
2695 */
2696
2697 if(th->rst)
2698 return;
2699
2700 /*
2701 * We need to grab some memory, and put together an RST,
2702 * and then put it into the queue to be sent.
2703 */
2704
2705 buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2706 if (buff == NULL)
2707 return;
2708
2709 buff->sk = NULL;
2710 buff->dev = dev;
2711 buff->localroute = 0;
2712
2713 /*
2714 * Put in the IP header and routing stuff.
2715 */
2716
2717 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2718 sizeof(struct tcphdr),tos,ttl);
2719 if (tmp < 0)
2720 {
2721 buff->free = 1;
2722 sock_wfree(NULL, buff);
2723 return;
2724 }
2725
2726 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2727 memcpy(t1, th, sizeof(*t1));
2728
2729 /*
2730 * Swap the send and the receive.
2731 */
2732
2733 t1->dest = th->source;
2734 t1->source = th->dest;
2735 t1->rst = 1;
2736 t1->window = 0;
2737
2738 if(th->ack)
2739 {
2740 t1->ack = 0;
2741 t1->seq = th->ack_seq;
2742 t1->ack_seq = 0;
2743 }
2744 else
2745 {
2746 t1->ack = 1;
2747 if(!th->syn)
2748 t1->ack_seq=htonl(th->seq);
2749 else
2750 t1->ack_seq=htonl(th->seq+1);
2751 t1->seq=0;
2752 }
2753
2754 t1->syn = 0;
2755 t1->urg = 0;
2756 t1->fin = 0;
2757 t1->psh = 0;
2758 t1->doff = sizeof(*t1)/4;
2759 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2760 prot->queue_xmit(NULL, ndev, buff, 1);
2761 tcp_statistics.TcpOutSegs++;
2762 }
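
/*
 * The seq/ack choices above are the RFC 793 reset generation rules as
 * this routine applies them; restated compactly in host byte order for
 * clarity (names illustrative):
 */
static void tcp_rst_fields(int in_ack, unsigned long in_seq, unsigned long in_ack_seq,
	int in_syn, unsigned long *seq, unsigned long *ack_seq, int *ack)
{
	if (in_ack)
	{
		/* The offender carried an ACK: reuse its acknowledgment
		   number as our sequence and send no ACK of our own. */
		*ack = 0;
		*seq = in_ack_seq;
		*ack_seq = 0;
	}
	else
	{
		/* No ACK: sequence zero, and acknowledge the segment,
		   remembering that a SYN consumes one sequence number. */
		*ack = 1;
		*seq = 0;
		*ack_seq = in_seq + (in_syn ? 1 : 0);
	}
}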
2763
2764
2765 /*
2766 * Look for tcp options. Parses everything but only knows about MSS.
2767 * This routine is always called with the packet containing the SYN.
2768 * However it may also be called with the ack to the SYN. So you
2769 * can't assume this is always the SYN. It's always called after
2770 * we have set up sk->mtu to our own MTU.
2771 *
2772 * We need at minimum to add PAWS support here. Possibly large windows
2773 * as Linux gets deployed on 100Mb/sec networks.
2774 */
2775
2776 static void tcp_options(struct sock *sk, struct tcphdr *th)
2777 {
2778 unsigned char *ptr;
2779 int length=(th->doff*4)-sizeof(struct tcphdr);
2780 int mss_seen = 0;
2781
2782 ptr = (unsigned char *)(th + 1);
2783
2784 while(length>0)
2785 {
2786 int opcode=*ptr++;
2787 int opsize=*ptr++;
2788 switch(opcode)
2789 {
2790 case TCPOPT_EOL:
2791 return;
2792 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
2793 length--;
2794 ptr--; /* the opsize=*ptr++ above was a mistake */
2795 continue;
2796
2797 default:
2798 if(opsize<=2) /* Avoid silly options looping forever */
2799 return;
2800 switch(opcode)
2801 {
2802 case TCPOPT_MSS:
2803 if(opsize==4 && th->syn)
2804 {
2805 sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2806 mss_seen = 1;
2807 }
2808 break;
2809 /* Add other options here as people feel the urge to implement stuff like large windows */
2810 }
2811 ptr+=opsize-2;
2812 length-=opsize;
2813 }
2814 }
2815 if (th->syn)
2816 {
2817 if (! mss_seen)
2818 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
2819 }
2820 #ifdef CONFIG_INET_PCTCP
2821 sk->mss = min(sk->max_window >> 1, sk->mtu);
2822 #else
2823 sk->mss = min(sk->max_window, sk->mtu);
2824 #endif
2825 }
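
/*
 * A self-contained sketch of the option walk above, reduced to the one
 * option this file acts on (MSS: kind 2, length 4) and assuming the
 * standard wire layout. It returns the advertised MSS in host order,
 * or 0 if none is found; unlike the code above it also bails out when
 * an option claims to be longer than the option area. Illustrative.
 */
static unsigned short tcp_parse_mss(unsigned char *ptr, int length)
{
	while (length > 0)
	{
		int opcode = *ptr++;
		int opsize;

		if (opcode == TCPOPT_EOL)
			return 0;
		if (opcode == TCPOPT_NOP)	/* one byte, no length field */
		{
			length--;
			continue;
		}
		opsize = *ptr++;
		if (opsize <= 2 || opsize > length)	/* malformed option */
			return 0;
		if (opcode == TCPOPT_MSS && opsize == 4)
			return (unsigned short)((ptr[0] << 8) | ptr[1]);
		ptr += opsize - 2;
		length -= opsize;
	}
	return 0;
}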
2826
2827 static inline unsigned long default_mask(unsigned long dst)
2828 {
2829 dst = ntohl(dst);
2830 if (IN_CLASSA(dst))
2831 return htonl(IN_CLASSA_NET);
2832 if (IN_CLASSB(dst))
2833 return htonl(IN_CLASSB_NET);
2834 return htonl(IN_CLASSC_NET);
2835 }
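
/*
 * Worked examples of the classful fallback above, with addresses and
 * masks written as dotted quads for readability:
 *
 *	10.1.2.3   is class A  ->  255.0.0.0
 *	130.1.2.3  is class B  ->  255.255.0.0
 *	192.0.2.1  is class C  ->  255.255.255.0
 */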
2836
2837 /*
2838 * Default sequence number picking algorithm.
2839 * As close as possible to RFC 793, which
2840 * suggests using a 250kHz clock.
2841 * Further reading shows this assumes 2MB/s networks.
2842 * For 10MB/s ethernet, a 1MHz clock is appropriate.
2843 * That's funny, Linux has one built in! Use it!
2844 */
2845
2846 extern inline u32 tcp_init_seq(void)
2847 {
2848 struct timeval tv;
2849 do_gettimeofday(&tv);
2850 return tv.tv_usec+tv.tv_sec*1000000;
2851 }
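
/*
 * A quick sanity check on the 1 MHz choice: the 32-bit sequence space
 * wraps every 2^32 microseconds, i.e. about 4295 seconds or roughly
 * 71.6 minutes - comfortably longer than any plausible segment
 * lifetime on the networks this code targets.
 */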
2852
2853 /*
2854 * This routine handles a connection request.
2855 * It should make sure we haven't already responded.
2856 * Because of the way BSD works, we have to send a syn/ack now.
2857 * This also means it will be harder to close a socket which is
2858 * listening.
2859 */
2860
2861 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
2862 unsigned long daddr, unsigned long saddr,
2863 struct options *opt, struct device *dev, u32 seq)
2864 {
2865 struct sk_buff *buff;
2866 struct tcphdr *t1;
2867 unsigned char *ptr;
2868 struct sock *newsk;
2869 struct tcphdr *th;
2870 struct device *ndev=NULL;
2871 int tmp;
2872 struct rtable *rt;
2873
2874 th = skb->h.th;
2875
2876 /* If the socket is dead, don't accept the connection. */
2877 if (!sk->dead)
2878 {
2879 sk->data_ready(sk,0);
2880 }
2881 else
2882 {
2883 if(sk->debug)
2884 printk("Reset on %p: Connect on dead socket.\n",sk);
2885 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2886 tcp_statistics.TcpAttemptFails++;
2887 kfree_skb(skb, FREE_READ);
2888 return;
2889 }
2890
2891 /*
2892 * Make sure we can accept more. This will prevent a
2893 * flurry of syns from eating up all our memory.
2894 */
2895
2896 if (sk->ack_backlog >= sk->max_ack_backlog)
2897 {
2898 tcp_statistics.TcpAttemptFails++;
2899 kfree_skb(skb, FREE_READ);
2900 return;
2901 }
2902
2903 /*
2904 * We need to build a new sock struct.
2905 * It is sort of bad to have a socket without an inode attached
2906 * to it, but the wake_up's will just wake up the listening socket,
2907 * and if the listening socket is destroyed before this is taken
2908 * off of the queue, this will take care of it.
2909 */
2910
2911 newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2912 if (newsk == NULL)
2913 {
2914 /* just ignore the syn. It will get retransmitted. */
2915 tcp_statistics.TcpAttemptFails++;
2916 kfree_skb(skb, FREE_READ);
2917 return;
2918 }
2919
2920 memcpy(newsk, sk, sizeof(*newsk));
2921 newsk->opt = NULL;
2922 if (opt && opt->optlen) {
2923 newsk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
2924 if (!newsk->opt) {
2925 kfree_s(newsk, sizeof(struct sock));
2926 tcp_statistics.TcpAttemptFails++;
2927 kfree_skb(skb, FREE_READ);
2928 return;
2929 }
2930 if (ip_options_echo(newsk->opt, opt, daddr, saddr, skb)) {
2931 kfree_s(newsk->opt, sizeof(struct options)+opt->optlen);
2932 kfree_s(newsk, sizeof(struct sock));
2933 tcp_statistics.TcpAttemptFails++;
2934 kfree_skb(skb, FREE_READ);
2935 return;
2936 }
2937 }
2938 skb_queue_head_init(&newsk->write_queue);
2939 skb_queue_head_init(&newsk->receive_queue);
2940 newsk->send_head = NULL;
2941 newsk->send_tail = NULL;
2942 skb_queue_head_init(&newsk->back_log);
2943 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/
2944 newsk->rto = TCP_TIMEOUT_INIT;
2945 newsk->mdev = 0;
2946 newsk->max_window = 0;
2947 newsk->cong_window = 1;
2948 newsk->cong_count = 0;
2949 newsk->ssthresh = 0;
2950 newsk->backoff = 0;
2951 newsk->blog = 0;
2952 newsk->intr = 0;
2953 newsk->proc = 0;
2954 newsk->done = 0;
2955 newsk->partial = NULL;
2956 newsk->pair = NULL;
2957 newsk->wmem_alloc = 0;
2958 newsk->rmem_alloc = 0;
2959 newsk->localroute = sk->localroute;
2960
2961 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
2962
2963 newsk->err = 0;
2964 newsk->shutdown = 0;
2965 newsk->ack_backlog = 0;
2966 newsk->acked_seq = skb->h.th->seq+1;
2967 newsk->copied_seq = skb->h.th->seq+1;
2968 newsk->fin_seq = skb->h.th->seq;
2969 newsk->state = TCP_SYN_RECV;
2970 newsk->timeout = 0;
2971 newsk->ip_xmit_timeout = 0;
2972 newsk->write_seq = seq;
2973 newsk->window_seq = newsk->write_seq;
2974 newsk->rcv_ack_seq = newsk->write_seq;
2975 newsk->urg_data = 0;
2976 newsk->retransmits = 0;
2977 newsk->linger=0;
2978 newsk->destroy = 0;
2979 init_timer(&newsk->timer);
2980 newsk->timer.data = (unsigned long)newsk;
2981 newsk->timer.function = &net_timer;
2982 init_timer(&newsk->retransmit_timer);
2983 newsk->retransmit_timer.data = (unsigned long)newsk;
2984 newsk->retransmit_timer.function=&retransmit_timer;
2985 newsk->dummy_th.source = skb->h.th->dest;
2986 newsk->dummy_th.dest = skb->h.th->source;
2987
2988 /*
2989 * Swap these two, they are from our point of view.
2990 */
2991
2992 newsk->daddr = saddr;
2993 newsk->saddr = daddr;
2994 newsk->rcv_saddr = daddr;
2995
2996 put_sock(newsk->num,newsk);
2997 newsk->dummy_th.res1 = 0;
2998 newsk->dummy_th.doff = 6;
2999 newsk->dummy_th.fin = 0;
3000 newsk->dummy_th.syn = 0;
3001 newsk->dummy_th.rst = 0;
3002 newsk->dummy_th.psh = 0;
3003 newsk->dummy_th.ack = 0;
3004 newsk->dummy_th.urg = 0;
3005 newsk->dummy_th.res2 = 0;
3006 newsk->acked_seq = skb->h.th->seq + 1;
3007 newsk->copied_seq = skb->h.th->seq + 1;
3008 newsk->socket = NULL;
3009
3010 /*
3011 * Grab the ttl and tos values and use them
3012 */
3013
3014 newsk->ip_ttl=sk->ip_ttl;
3015 newsk->ip_tos=skb->ip_hdr->tos;
3016
3017 /*
3018 * Use 512 or whatever user asked for
3019 */
3020
3021 /*
3022 * Note use of sk->user_mss, since user has no direct access to newsk
3023 */
3024
3025 rt=ip_rt_route(saddr, NULL,NULL);
3026
3027 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3028 newsk->window_clamp = rt->rt_window;
3029 else
3030 newsk->window_clamp = 0;
3031
3032 if (sk->user_mss)
3033 newsk->mtu = sk->user_mss;
3034 else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
3035 newsk->mtu = rt->rt_mss - sizeof(struct iphdr) - sizeof(struct tcphdr);
3036 else
3037 {
3038 #ifdef CONFIG_INET_SNARL /* Sub Nets Are Local */
3039 if ((saddr ^ daddr) & default_mask(saddr))
3040 #else
3041 if ((saddr ^ daddr) & dev->pa_mask)
3042 #endif
3043 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
3044 else
3045 newsk->mtu = MAX_WINDOW;
3046 }
3047
3048 /*
3049 * But not bigger than device MTU
3050 */
3051
3052 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
3053
3054 /*
3055 * This will min with what arrived in the packet
3056 */
3057
3058 tcp_options(newsk,skb->h.th);
3059
3060 tcp_cache_zap();
3061
3062 buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3063 if (buff == NULL)
3064 {
3065 sk->err = ENOMEM;
3066 newsk->dead = 1;
3067 newsk->state = TCP_CLOSE;
3068 /* And this will destroy it */
3069 release_sock(newsk);
3070 kfree_skb(skb, FREE_READ);
3071 tcp_statistics.TcpAttemptFails++;
3072 return;
3073 }
3074
3075 buff->sk = newsk;
3076 buff->localroute = newsk->localroute;
3077
3078 /*
3079 * Put in the IP header and routing stuff.
3080 */
3081
3082 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3083 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
3084
3085 /*
3086 * Something went wrong.
3087 */
3088
3089 if (tmp < 0)
3090 {
3091 sk->err = tmp;
3092 buff->free = 1;
3093 kfree_skb(buff,FREE_WRITE);
3094 newsk->dead = 1;
3095 newsk->state = TCP_CLOSE;
3096 release_sock(newsk);
3097 skb->sk = sk;
3098 kfree_skb(skb, FREE_READ);
3099 tcp_statistics.TcpAttemptFails++;
3100 return;
3101 }
3102
3103 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
3104
3105 memcpy(t1, skb->h.th, sizeof(*t1));
3106 buff->h.seq = newsk->write_seq;
3107 /*
3108 * Swap the send and the receive.
3109 */
3110 t1->dest = skb->h.th->source;
3111 t1->source = newsk->dummy_th.source;
3112 t1->seq = ntohl(newsk->write_seq++);
3113 t1->ack = 1;
3114 newsk->window = tcp_select_window(newsk);
3115 newsk->sent_seq = newsk->write_seq;
3116 t1->window = ntohs(newsk->window);
3117 t1->res1 = 0;
3118 t1->res2 = 0;
3119 t1->rst = 0;
3120 t1->urg = 0;
3121 t1->psh = 0;
3122 t1->syn = 1;
3123 t1->ack_seq = ntohl(skb->h.th->seq+1);
3124 t1->doff = sizeof(*t1)/4+1;
3125 ptr = skb_put(buff,4);
3126 ptr[0] = 2;
3127 ptr[1] = 4;
3128 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3129 ptr[3] =(newsk->mtu) & 0xff;
3130
3131 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3132 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3133 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3134 skb->sk = newsk;
3135
3136 /*
3137 * Charge the sock_buff to newsk.
3138 */
3139
3140 sk->rmem_alloc -= skb->truesize;
3141 newsk->rmem_alloc += skb->truesize;
3142
3143 skb_queue_tail(&sk->receive_queue,skb);
3144 sk->ack_backlog++;
3145 release_sock(newsk);
3146 tcp_statistics.TcpOutSegs++;
3147 }
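
/*
 * The MSS selection scattered through tcp_conn_request() above,
 * restated as one helper. It assumes the same inputs (user override,
 * per-route MSS, whether the peer is on the local (sub)net, device
 * MTU), the 40 bytes of IP+TCP header, and MAX_WINDOW being 32767 as
 * in this kernel. The helper itself is illustrative.
 */
static int tcp_choose_mtu(int user_mss, int rt_mss, int local, int dev_mtu)
{
	int hdrs = 40;		/* sizeof(struct iphdr) + sizeof(struct tcphdr) */
	int mtu;

	if (user_mss)
		mtu = user_mss;			/* explicit user setting wins */
	else if (rt_mss)
		mtu = rt_mss - hdrs;		/* per-route MSS */
	else if (!local)
		mtu = 576 - hdrs;		/* conservative off-net default */
	else
		mtu = 32767;			/* MAX_WINDOW: let the device clamp it */

	if (mtu > dev_mtu - hdrs)		/* but never bigger than the device MTU */
		mtu = dev_mtu - hdrs;
	return mtu;
}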
3148
3149
3150 static void tcp_close(struct sock *sk, int timeout)
3151 {
3152 /*
3153 * We need to grab some memory, and put together a FIN,
3154 * and then put it into the queue to be sent.
3155 */
3156
3157 sk->inuse = 1;
3158
3159 if(th_cache_sk==sk)
3160 tcp_cache_zap();
3161 if(sk->state == TCP_LISTEN)
3162 {
3163 /* Special case */
3164 tcp_set_state(sk, TCP_CLOSE);
3165 tcp_close_pending(sk);
3166 release_sock(sk);
3167 return;
3168 }
3169
3170 sk->keepopen = 1;
3171 sk->shutdown = SHUTDOWN_MASK;
3172
3173 if (!sk->dead)
3174 sk->state_change(sk);
3175
3176 if (timeout == 0)
3177 {
3178 struct sk_buff *skb;
3179
3180 /*
3181 * We need to flush the recv. buffs. We do this only on the
3182 * descriptor close, not protocol-sourced closes, because the
3183 * reader process may not have drained the data yet!
3184 */
3185
3186 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3187 kfree_skb(skb, FREE_READ);
3188 /*
3189 * Get rid of any half-completed packets.
3190 */
3191
3192 if (sk->partial)
3193 tcp_send_partial(sk);
3194 }
3195
3196
3197 /*
3198 * Timeout is not the same thing - however the code likes
3199 * to send both the same way (sigh).
3200 */
3201
3202 if(timeout)
3203 {
3204 tcp_set_state(sk, TCP_CLOSE); /* Dead */
3205 }
3206 else
3207 {
3208 if(tcp_close_state(sk,1)==1)
3209 {
3210 tcp_send_fin(sk);
3211 }
3212 }
3213 release_sock(sk);
3214 }
3215
3216
3217 /*
3218 * This routine takes stuff off of the write queue,
3219 * and puts it in the xmit queue. This happens as incoming acks
3220 * open up the remote window for us.
3221 */
3222
3223 static void tcp_write_xmit(struct sock *sk)
3224 {
3225 struct sk_buff *skb;
3226
3227 /*
3228 * The bytes will have to remain here. In time closedown will
3229 * empty the write queue and all will be happy
3230 */
3231
3232 if(sk->zapped)
3233 return;
3234
3235 /*
3236 * Anything on the transmit queue that fits the window can
3237 * be added providing we are not
3238 *
3239 * a) retransmitting (Nagle's rule)
3240 * b) exceeding our congestion window.
3241 */
3242
3243 while((skb = skb_peek(&sk->write_queue)) != NULL &&
3244 before(skb->h.seq, sk->window_seq + 1) &&
3245 (sk->retransmits == 0 ||
3246 sk->ip_xmit_timeout != TIME_WRITE ||
3247 before(skb->h.seq, sk->rcv_ack_seq + 1))
3248 && sk->packets_out < sk->cong_window)
3249 {
3250 IS_SKB(skb);
3251 skb_unlink(skb);
3252
3253 /*
3254 * See if we really need to send the packet.
3255 */
3256
3257 if (before(skb->h.seq, sk->rcv_ack_seq +1))
3258 {
3259 /*
3260 * This is acked data. We can discard it. This
3261 * cannot currently occur.
3262 */
3263
3264 sk->retransmits = 0;
3265 kfree_skb(skb, FREE_WRITE);
3266 if (!sk->dead)
3267 sk->write_space(sk);
3268 }
3269 else
3270 {
3271 struct tcphdr *th;
3272 struct iphdr *iph;
3273 int size;
3274 /*
3275 * put in the ack seq and window at this point rather than earlier,
3276 * in order to keep them monotonic. We really want to avoid taking
3277 * back window allocations. That's legal, but RFC1122 says it's frowned on.
3278 * Ack and window will in general have changed since this packet was put
3279 * on the write queue.
3280 */
3281 iph = skb->ip_hdr;
3282 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
3283 size = skb->len - (((unsigned char *) th) - skb->data);
3284
3285 th->ack_seq = ntohl(sk->acked_seq);
3286 th->window = ntohs(tcp_select_window(sk));
3287
3288 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
3289
3290 sk->sent_seq = skb->h.seq;
3291
3292 /*
3293 * IP manages our queue for some crazy reason
3294 */
3295
3296 sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3297
3298 /*
3299 * Again we slide the timer wrongly
3300 */
3301
3302 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3303 }
3304 }
3305 }
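
/*
 * The gate tcp_write_xmit() applies to each queued buffer, pulled out
 * as a predicate. The signed-difference compares stand in for the
 * before() macro used above; names and the helper are illustrative.
 */
static int tcp_may_send(unsigned long seq, unsigned long window_seq,
	int retransmits, int timeout_is_write,
	unsigned long rcv_ack_seq, unsigned long packets_out,
	unsigned long cong_window)
{
	/* The whole segment must fit inside the offered window. */
	if ((long)(seq - (window_seq + 1)) >= 0)
		return 0;
	/* While retransmitting on the write timer, send nothing beyond
	   data that has already been acked (Nagle rule, clause a). */
	if (retransmits && timeout_is_write &&
	    (long)(seq - (rcv_ack_seq + 1)) >= 0)
		return 0;
	/* And never exceed the congestion window (clause b). */
	return packets_out < cong_window;
}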
3306
3307
3308 /*
3309 * This routine deals with incoming acks, but not outgoing ones.
3310 */
3311
3312 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
3313 {
3314 u32 ack;
3315 int flag = 0;
3316
3317 /*
3318 * 1 - there was data in packet as well as ack or new data is sent or
3319 * in shutdown state
3320 * 2 - data from retransmit queue was acked and removed
3321 * 4 - window shrunk or data from retransmit queue was acked and removed
3322 */
3323
3324 if(sk->zapped)
3325 return(1); /* Dead, can't ack any more so why bother */
3326
3327 /*
3328 * Have we discovered a larger window?
3329 */
3330
3331 ack = ntohl(th->ack_seq);
3332
3333 if (ntohs(th->window) > sk->max_window)
3334 {
3335 sk->max_window = ntohs(th->window);
3336 #ifdef CONFIG_INET_PCTCP
3337 /* Hack because we don't send partial packets to non SWS
3338 handling hosts */
3339 sk->mss = min(sk->max_window>>1, sk->mtu);
3340 #else
3341 sk->mss = min(sk->max_window, sk->mtu);
3342 #endif
3343 }
3344
3345 /*
3346 * We have dropped back to keepalive timeouts. Thus we have
3347 * no retransmits pending.
3348 */
3349
3350 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3351 sk->retransmits = 0;
3352
3353 /*
3354 * If the ack is newer than sent or older than previous acks
3355 * then we can probably ignore it.
3356 */
3357
3358 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3359 {
3360 if(sk->debug)
3361 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3362
3363 /*
3364 * Keepalive processing.
3365 */
3366
3367 if (after(ack, sk->sent_seq))
3368 {
3369 return(0);
3370 }
3371
3372 /*
3373 * Restart the keepalive timer.
3374 */
3375
3376 if (sk->keepopen)
3377 {
3378 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3379 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3380 }
3381 return(1);
3382 }
3383
3384 /*
3385 * If there is data set flag 1
3386 */
3387
3388 if (len != th->doff*4)
3389 flag |= 1;
3390
3391 /*
3392 * See if our window has been shrunk.
3393 */
3394
3395 if (after(sk->window_seq, ack+ntohs(th->window)))
3396 {
3397 /*
3398 * We may need to move packets from the send queue
3399 * to the write queue, if the window has been shrunk on us.
3400 * The RFC says you are not allowed to shrink your window
3401 * like this, but if the other end does, you must be able
3402 * to deal with it.
3403 */
3404 struct sk_buff *skb;
3405 struct sk_buff *skb2;
3406 struct sk_buff *wskb = NULL;
3407
3408 skb2 = sk->send_head;
3409 sk->send_head = NULL;
3410 sk->send_tail = NULL;
3411
3412 /*
3413 * This is an artifact of a flawed concept. We want one
3414 * queue and a smarter send routine when we send all.
3415 */
3416
3417 flag |= 4; /* Window changed */
3418
3419 sk->window_seq = ack + ntohs(th->window);
3420 cli();
3421 while (skb2 != NULL)
3422 {
3423 skb = skb2;
3424 skb2 = skb->link3;
3425 skb->link3 = NULL;
3426 if (after(skb->h.seq, sk->window_seq))
3427 {
3428 if (sk->packets_out > 0)
3429 sk->packets_out--;
3430 /* We may need to remove this from the dev send list. */
3431 if (skb->next != NULL)
3432 {
3433 skb_unlink(skb);
3434 }
3435 /* Now add it to the write_queue. */
3436 if (wskb == NULL)
3437 skb_queue_head(&sk->write_queue,skb);
3438 else
3439 skb_append(wskb,skb);
3440 wskb = skb;
3441 }
3442 else
3443 {
3444 if (sk->send_head == NULL)
3445 {
3446 sk->send_head = skb;
3447 sk->send_tail = skb;
3448 }
3449 else
3450 {
3451 sk->send_tail->link3 = skb;
3452 sk->send_tail = skb;
3453 }
3454 skb->link3 = NULL;
3455 }
3456 }
3457 sti();
3458 }
3459
3460 /*
3461 * Pipe has emptied
3462 */
3463
3464 if (sk->send_tail == NULL || sk->send_head == NULL)
3465 {
3466 sk->send_head = NULL;
3467 sk->send_tail = NULL;
3468 sk->packets_out= 0;
3469 }
3470
3471 /*
3472 * Update the right hand window edge of the host
3473 */
3474
3475 sk->window_seq = ack + ntohs(th->window);
3476
3477 /*
3478 * We don't want too many packets out there.
3479 */
3480
3481 if (sk->ip_xmit_timeout == TIME_WRITE &&
3482 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3483 {
3484 /*
3485 * This is Jacobson's slow start and congestion avoidance.
3486 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
3487 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
3488 * counter and increment it once every cwnd times. It's possible
3489 * that this should be done only if sk->retransmits == 0. I'm
3490 * interpreting "new data is acked" as including data that has
3491 * been retransmitted but is just now being acked.
3492 */
3493 if (sk->cong_window < sk->ssthresh)
3494 /*
3495 * In "safe" area, increase
3496 */
3497 sk->cong_window++;
3498 else
3499 {
3500 /*
3501 * In dangerous area, increase slowly. In theory this is
3502 * sk->cong_window += 1 / sk->cong_window
3503 */
3504 if (sk->cong_count >= sk->cong_window)
3505 {
3506 sk->cong_window++;
3507 sk->cong_count = 0;
3508 }
3509 else
3510 sk->cong_count++;
3511 }
3512 }
3513
3514 /*
3515 * Remember the highest ack received.
3516 */
3517
3518 sk->rcv_ack_seq = ack;
3519
3520 /*
3521 * If this ack opens up a zero window, clear backoff. It was
3522 * being used to time the probes, and is probably far higher than
3523 * it needs to be for normal retransmission.
3524 */
3525
3526 if (sk->ip_xmit_timeout == TIME_PROBE0)
3527 {
3528 sk->retransmits = 0; /* Our probe was answered */
3529
3530 /*
3531 * Was it a usable window open ?
3532 */
3533
3534 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */
3535 ! before (sk->window_seq, sk->write_queue.next->h.seq))
3536 {
3537 sk->backoff = 0;
3538
3539 /*
3540 * Recompute rto from rtt. this eliminates any backoff.
3541 */
3542
3543 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3544 if (sk->rto > 120*HZ)
3545 sk->rto = 120*HZ;
3546 if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about
3547 .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3548 .2 of a second is going to need huge windows (SIGH) */
3549 sk->rto = 20;
3550 }
3551 }
3552
3553 /*
3554 * See if we can take anything off of the retransmit queue.
3555 */
3556
3557 while(sk->send_head != NULL)
3558 {
3559 /* Check for a bug. */
3560 if (sk->send_head->link3 &&
3561 after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
3562 printk("INET: tcp.c: *** bug send_list out of order.\n");
3563
3564 /*
3565 * If our packet is before the ack sequence we can
3566 * discard it, as it's confirmed to have arrived at the other end.
3567 */
3568
3569 if (before(sk->send_head->h.seq, ack+1))
3570 {
3571 struct sk_buff *oskb;
3572 if (sk->retransmits)
3573 {
3574 /*
3575 * We were retransmitting. don't count this in RTT est
3576 */
3577 flag |= 2;
3578
3579 /*
3580 * even though we've gotten an ack, we're still
3581 * retransmitting as long as we're sending from
3582 * the retransmit queue. Keeping retransmits non-zero
3583 * prevents us from getting new data interspersed with
3584 * retransmissions.
3585 */
3586
3587 if (sk->send_head->link3) /* Any more queued retransmits? */
3588 sk->retransmits = 1;
3589 else
3590 sk->retransmits = 0;
3591 }
3592 /*
3593 * Note that we only reset backoff and rto in the
3594 * rtt recomputation code. And that doesn't happen
3595 * if there were retransmissions in effect. So the
3596 * first new packet after the retransmissions is
3597 * sent with the backoff still in effect. Not until
3598 * we get an ack from a non-retransmitted packet do
3599 * we reset the backoff and rto. This allows us to deal
3600 * with a situation where the network delay has increased
3601 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3602 */
3603
3604 /*
3605 * We have one less packet out there.
3606 */
3607
3608 if (sk->packets_out > 0)
3609 sk->packets_out --;
3610 /*
3611 * Wake up the process, it can probably write more.
3612 */
3613 if (!sk->dead)
3614 sk->write_space(sk);
3615 oskb = sk->send_head;
3616
3617 if (!(flag&2)) /* Not retransmitting */
3618 {
3619 long m;
3620
3621 /*
3622 * The following amusing code comes from Jacobson's
3623 * article in SIGCOMM '88. Note that rtt and mdev
3624 * are scaled versions of rtt and mean deviation.
3625 * This is designed to be as fast as possible
3626 * m stands for "measurement".
3627 */
3628
3629 m = jiffies - oskb->when; /* RTT */
3630 if(m<=0)
3631 m=1; /* IS THIS RIGHT FOR <0 ??? */
3632 m -= (sk->rtt >> 3); /* m is now error in rtt est */
3633 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
3634 if (m < 0)
3635 m = -m; /* m is now abs(error) */
3636 m -= (sk->mdev >> 2); /* similar update on mdev */
3637 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
3638
3639 /*
3640 * Now update timeout. Note that this removes any backoff.
3641 */
3642
3643 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3644 if (sk->rto > 120*HZ)
3645 sk->rto = 120*HZ;
3646 if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3647 sk->rto = 20;
3648 sk->backoff = 0;
3649 }
3650 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt
3651 In this case as we just set it up */
3652 cli();
3653 oskb = sk->send_head;
3654 IS_SKB(oskb);
3655 sk->send_head = oskb->link3;
3656 if (sk->send_head == NULL)
3657 {
3658 sk->send_tail = NULL;
3659 }
3660
3661 /*
3662 * We may need to remove this from the dev send list.
3663 */
3664
3665 if (oskb->next)
3666 skb_unlink(oskb);
3667 sti();
3668 kfree_skb(oskb, FREE_WRITE); /* write. */
3669 if (!sk->dead)
3670 sk->write_space(sk);
3671 }
3672 else
3673 {
3674 break;
3675 }
3676 }
3677
3678 /*
3679 * XXX someone ought to look at this too.. at the moment, if skb_peek()
3680 * returns non-NULL, we completely ignore the timer stuff in the else
3681 * clause. We ought to organize the code so that the else clause can
3682 * (should) be executed regardless, possibly moving the PROBE timer
3683 * reset over. The skb_peek() thing should only move stuff to the
3684 * write queue, NOT also manage the timer functions.
3685 */
3686
3687 /*
3688 * Maybe we can take some stuff off of the write queue,
3689 * and put it onto the xmit queue.
3690 */
3691 if (skb_peek(&sk->write_queue) != NULL)
3692 {
3693 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3694 (sk->retransmits == 0 ||
3695 sk->ip_xmit_timeout != TIME_WRITE ||
3696 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3697 && sk->packets_out < sk->cong_window)
3698 {
3699 /*
3700 * Add more data to the send queue.
3701 */
3702 flag |= 1;
3703 tcp_write_xmit(sk);
3704 }
3705 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3706 sk->send_head == NULL &&
3707 sk->ack_backlog == 0 &&
3708 sk->state != TCP_TIME_WAIT)
3709 {
3710 /*
3711 * Data to queue but no room.
3712 */
3713 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3714 }
3715 }
3716 else
3717 {
3718 /*
3719 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets;
3720 * from TCP_CLOSE we don't do anything.
3721 *
3722 * from anything else, if there is write data (or fin) pending,
3723 * we use a TIME_WRITE timeout, else if keepalive we reset to
3724 * a KEEPALIVE timeout, else we delete the timer.
3725 *
3726 * We do not set flag for nominal write data, otherwise we may
3727 * force a state where we start to write itsy bitsy tidbits
3728 * of data.
3729 */
3730
3731 switch(sk->state) {
3732 case TCP_TIME_WAIT:
3733 /*
3734 * keep us in TIME_WAIT until we stop getting packets,
3735 * reset the timeout.
3736 */
3737 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3738 break;
3739 case TCP_CLOSE:
3740 /*
3741 * don't touch the timer.
3742 */
3743 break;
3744 default:
3745 /*
3746 * Must check send_head, write_queue, and ack_backlog
3747 * to determine which timeout to use.
3748 */
3749 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3750 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3751 } else if (sk->keepopen) {
3752 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3753 } else {
3754 del_timer(&sk->retransmit_timer);
3755 sk->ip_xmit_timeout = 0;
3756 }
3757 break;
3758 }
3759 }
3760
3761 /*
3762 * We have nothing queued but space to send. Send any partial
3763 * packets immediately (end of Nagle rule application).
3764 */
3765
3766 if (sk->packets_out == 0 && sk->partial != NULL &&
3767 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3768 {
3769 flag |= 1;
3770 tcp_send_partial(sk);
3771 }
3772
3773 /*
3774 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
3775 * we are now waiting for an acknowledge to our FIN. The other end is
3776 * already in TIME_WAIT.
3777 *
3778 * Move to TCP_CLOSE on success.
3779 */
3780
3781 if (sk->state == TCP_LAST_ACK)
3782 {
3783 if (!sk->dead)
3784 sk->state_change(sk);
3785 if(sk->debug)
3786 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3787 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3788 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
3789 {
3790 flag |= 1;
3791 tcp_set_state(sk,TCP_CLOSE);
3792 sk->shutdown = SHUTDOWN_MASK;
3793 }
3794 }
3795
3796 /*
3797 * Incoming ACK to a FIN we sent in the case of our initiating the close.
3798 *
3799 * Move to FIN_WAIT2 to await a FIN from the other end. Set
3800 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3801 */
3802
3803 if (sk->state == TCP_FIN_WAIT1)
3804 {
3805
3806 if (!sk->dead)
3807 sk->state_change(sk);
3808 if (sk->rcv_ack_seq == sk->write_seq)
3809 {
3810 flag |= 1;
3811 sk->shutdown |= SEND_SHUTDOWN;
3812 tcp_set_state(sk, TCP_FIN_WAIT2);
3813 }
3814 }
3815
3816 /*
3817 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
3818 *
3819 * Move to TIME_WAIT
3820 */
3821
3822 if (sk->state == TCP_CLOSING)
3823 {
3824
3825 if (!sk->dead)
3826 sk->state_change(sk);
3827 if (sk->rcv_ack_seq == sk->write_seq)
3828 {
3829 flag |= 1;
3830 tcp_time_wait(sk);
3831 }
3832 }
3833
3834 /*
3835 * Final ack of a three way shake
3836 */
3837
3838 if(sk->state==TCP_SYN_RECV)
3839 {
3840 tcp_set_state(sk, TCP_ESTABLISHED);
3841 tcp_options(sk,th);
3842 sk->dummy_th.dest=th->source;
3843 sk->copied_seq = sk->acked_seq;
3844 if(!sk->dead)
3845 sk->state_change(sk);
3846 if(sk->max_window==0)
3847 {
3848 sk->max_window=32; /* Sanity check */
3849 sk->mss=min(sk->max_window,sk->mtu);
3850 }
3851 }
3852
3853 /*
3854 * I make no guarantees about the first clause in the following
3855 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
3856 * what conditions "!flag" would be true. However I think the rest
3857 * of the conditions would prevent that from causing any
3858 * unnecessary retransmission.
3859 * Clearly if the first packet has expired it should be
3860 * retransmitted. The other alternative, "flag&2 && retransmits", is
3861 * harder to explain: You have to look carefully at how and when the
3862 * timer is set and with what timeout. The most recent transmission always
3863 * sets the timer. So in general if the most recent thing has timed
3864 * out, everything before it has as well. So we want to go ahead and
3865 * retransmit some more. If we didn't explicitly test for this
3866 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3867 * would not be true. If you look at the pattern of timing, you can
3868 * show that rto is increased fast enough that the next packet would
3869 * almost never be retransmitted immediately. Then you'd end up
3870 * waiting for a timeout to send each packet on the retransmission
3871 * queue. With my implementation of the Karn sampling algorithm,
3872 * the timeout would double each time. The net result is that it would
3873 * take a hideous amount of time to recover from a single dropped packet.
3874 * It's possible that there should also be a test for TIME_WRITE, but
3875 * I think as long as "send_head != NULL" and "retransmit" is on, we've
3876 * got to be in real retransmission mode.
3877 * Note that tcp_do_retransmit is called with all==1. Setting cong_window
3878 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3879 * As long as no further losses occur, this seems reasonable.
3880 */
3881
3882 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3883 (((flag&2) && sk->retransmits) ||
3884 (sk->send_head->when + sk->rto < jiffies)))
3885 {
3886 if(sk->send_head->when + sk->rto < jiffies)
3887 tcp_retransmit(sk,0);
3888 else
3889 {
3890 tcp_do_retransmit(sk, 1);
3891 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3892 }
3893 }
3894
3895 return(1);
3896 }
3897
3898
3899 /*
3900 * Process the FIN bit. This now behaves as it is supposed to:
3901 * the FIN takes effect only when it is validly part of sequence
3902 * space, not earlier, while we still have holes.
3903 *
3904 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3905 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
3906 * TIME-WAIT)
3907 *
3908 * If we are in FINWAIT-1, a received FIN indicates simultaneous
3909 * close and we go into CLOSING (and later onto TIME-WAIT)
3910 *
3911 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3912 *
3913 */
3914
3915 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
3916 {
3917 sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3918
3919 if (!sk->dead)
3920 {
3921 sk->state_change(sk);
3922 sock_wake_async(sk->socket, 1);
3923 }
3924
3925 switch(sk->state)
3926 {
3927 case TCP_SYN_RECV:
3928 case TCP_SYN_SENT:
3929 case TCP_ESTABLISHED:
3930 /*
3931 * move to CLOSE_WAIT, tcp_data() already handled
3932 * sending the ack.
3933 */
3934 tcp_set_state(sk,TCP_CLOSE_WAIT);
3935 if (th->rst)
3936 sk->shutdown = SHUTDOWN_MASK;
3937 break;
3938
3939 case TCP_CLOSE_WAIT:
3940 case TCP_CLOSING:
3941 /*
3942 * received a retransmission of the FIN, do
3943 * nothing.
3944 */
3945 break;
3946 case TCP_TIME_WAIT:
3947 /*
3948 * received a retransmission of the FIN,
3949 * restart the TIME_WAIT timer.
3950 */
3951 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3952 return(0);
3953 case TCP_FIN_WAIT1:
3954 /*
3955 * This case occurs when a simultaneous close
3956 * happens, we must ack the received FIN and
3957 * enter the CLOSING state.
3958 *
3959 * This causes a WRITE timeout, which will either
3960 * move on to TIME_WAIT when we timeout, or resend
3961 * the FIN properly (maybe we get rid of that annoying
3962 * FIN lost hang). The TIME_WRITE code is already correct
3963 * for handling this timeout.
3964 */
3965
3966 if(sk->ip_xmit_timeout != TIME_WRITE)
3967 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3968 tcp_set_state(sk,TCP_CLOSING);
3969 break;
3970 case TCP_FIN_WAIT2:
3971 /*
3972 * received a FIN -- send ACK and enter TIME_WAIT
3973 */
3974 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3975 sk->shutdown|=SHUTDOWN_MASK;
3976 tcp_set_state(sk,TCP_TIME_WAIT);
3977 break;
3978 case TCP_CLOSE:
3979 /*
3980 * already in CLOSE
3981 */
3982 break;
3983 default:
3984 tcp_set_state(sk,TCP_LAST_ACK);
3985
3986 /* Start the timers. */
3987 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3988 return(0);
3989 }
3990
3991 return(0);
3992 }
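
/*
 * The transitions above can be read as a pure mapping from old state to
 * new state on a valid FIN. A minimal sketch, illustration only and not
 * built; the real code above also manages the timers and shutdown flags:
 */
#if 0
static int tcp_fin_next_state(int state)
{
	switch (state) {
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			return TCP_CLOSE_WAIT;	/* passive close begins */
		case TCP_FIN_WAIT1:
			return TCP_CLOSING;	/* simultaneous close */
		case TCP_FIN_WAIT2:
			return TCP_TIME_WAIT;	/* active close completes */
		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
		case TCP_TIME_WAIT:
		case TCP_CLOSE:
			return state;		/* retransmitted FIN, no change */
		default:
			return TCP_LAST_ACK;	/* their FIN crossed our close */
	}
}
#endif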
3993
3994
3995
3996 /*
3997 * This routine handles the data. If there is room in the buffer,
3998 * it will already have been moved into it. If there is no
3999 * room, then we will just have to discard the packet.
4000 */
4001
4002 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk,
4003 unsigned long saddr, unsigned short len)
4004 {
4005 struct sk_buff *skb1, *skb2;
4006 struct tcphdr *th;
4007 int dup_dumped=0;
4008 u32 new_seq, shut_seq;
4009
4010 th = skb->h.th;
4011 skb_pull(skb,th->doff*4);
4012 skb_trim(skb,len-(th->doff*4));
4013
4014 /*
4015 * The bytes in the receive read/assembly queue have increased. Needed for the
4016 * low memory discard algorithm
4017 */
4018
4019 sk->bytes_rcv += skb->len;
4020
4021 if (skb->len == 0 && !th->fin)
4022 {
4023 /*
4024 * Don't want to keep passing ack's back and forth.
4025 * (someone sent us dataless, boring frame)
4026 */
4027 if (!th->ack)
4028 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4029 kfree_skb(skb, FREE_READ);
4030 return(0);
4031 }
4032
4033 /*
4034 * We no longer have anyone receiving data on this connection.
4035 */
4036
4037 #ifndef TCP_DONT_RST_SHUTDOWN
4038
4039 if(sk->shutdown & RCV_SHUTDOWN)
4040 {
4041 /*
4042 * FIXME: BSD has some magic to avoid sending resets to
4043 * broken 4.2 BSD keepalives. Much to my surprise a few non
4044 * BSD stacks still have broken keepalives so we want to
4045 * cope with it.
4046 */
4047
4048 if(skb->len) /* We don't care if it's just an ack or
4049 a keepalive/window probe */
4050 {
4051 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */
4052
4053 /* Do this the way 4.4BSD treats it. Not what I'd
4054 regard as the meaning of the spec but it's what BSD
4055 does and clearly they know everything 8) */
4056
4057 /*
4058 * This is valid because of two things
4059 *
4060 * a) The way tcp_data behaves at the bottom.
4061 * b) A fin takes effect when read not when received.
4062 */
4063
4064 shut_seq=sk->acked_seq+1; /* Last byte */
4065
4066 if(after(new_seq,shut_seq))
4067 {
4068 if(sk->debug)
4069 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4070 sk, new_seq, shut_seq, sk->blog);
4071 if(sk->dead)
4072 {
4073 sk->acked_seq = new_seq + th->fin;
4074 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4075 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4076 tcp_statistics.TcpEstabResets++;
4077 tcp_set_state(sk,TCP_CLOSE);
4078 sk->err = EPIPE;
4079 sk->shutdown = SHUTDOWN_MASK;
4080 kfree_skb(skb, FREE_READ);
4081 return 0;
4082 }
4083 }
4084 }
4085 }
4086
4087 #endif
4088
4089 /*
4090 * Now we have to walk the chain, and figure out where this one
4091 * goes into it. This is set up so that the last packet we received
4092 * will be the first one we look at, that way if everything comes
4093 * in order, there will be no performance loss, and if they come
4094 * out of order we will be able to fit things in nicely.
4095 *
4096 * [AC: This is wrong. We should assume in order first and then walk
4097 * forwards from the first hole based upon real traffic patterns.]
4098 *
4099 */
4100
4101 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */
4102 {
4103 skb_queue_head(&sk->receive_queue,skb);
4104 skb1= NULL;
4105 }
4106 else
4107 {
4108 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
4109 {
4110 if(sk->debug)
4111 {
4112 printk("skb1=%p :", skb1);
4113 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
4114 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
4115 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4116 sk->acked_seq);
4117 }
4118
4119 /*
4120 * Optimisation: Duplicate frame or extension of previous frame from
4121 * same sequence point (lost ack case).
4122 * The frame contains duplicate data or replaces a previous frame:
4123 * discard the previous frame (safe as sk->inuse is set) and put
4124 * the new one in its place.
4125 */
4126
4127 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
4128 {
4129 skb_append(skb1,skb);
4130 skb_unlink(skb1);
4131 kfree_skb(skb1,FREE_READ);
4132 dup_dumped=1;
4133 skb1=NULL;
4134 break;
4135 }
4136
4137 /*
4138 * Found where it fits
4139 */
4140
4141 if (after(th->seq+1, skb1->h.th->seq))
4142 {
4143 skb_append(skb1,skb);
4144 break;
4145 }
4146
4147 /*
4148 * See if we've hit the start. If so insert.
4149 */
4150 if (skb1 == skb_peek(&sk->receive_queue))
4151 {
4152 skb_queue_head(&sk->receive_queue, skb);
4153 break;
4154 }
4155 }
4156 }
4157
4158 /*
4159 * Figure out what the ack value for this frame is
4160 */
4161
4162 th->ack_seq = th->seq + skb->len;
4163 if (th->syn)
4164 th->ack_seq++;
4165 if (th->fin)
4166 th->ack_seq++;
4167
4168 if (before(sk->acked_seq, sk->copied_seq))
4169 {
4170 printk("*** tcp.c:tcp_data bug acked < copied\n");
4171 sk->acked_seq = sk->copied_seq;
4172 }
4173
4174 /*
4175 * Now figure out if we can ack anything. This is very messy because we really want two
4176 * receive queues, a completed and an assembly queue. We also want only one transmit
4177 * queue.
4178 */
4179
4180 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
4181 {
4182 if (before(th->seq, sk->acked_seq+1))
4183 {
4184 int newwindow;
4185
4186 if (after(th->ack_seq, sk->acked_seq))
4187 {
4188 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
4189 if (newwindow < 0)
4190 newwindow = 0;
4191 sk->window = newwindow;
4192 sk->acked_seq = th->ack_seq;
4193 }
4194 skb->acked = 1;
4195
4196 /*
4197 * When we ack the fin, we do the FIN
4198 * processing.
4199 */
4200
4201 if (skb->h.th->fin)
4202 {
4203 tcp_fin(skb,sk,skb->h.th);
4204 }
4205
4206 for(skb2 = skb->next;
4207 skb2 != (struct sk_buff *)&sk->receive_queue;
4208 skb2 = skb2->next)
4209 {
4210 if (before(skb2->h.th->seq, sk->acked_seq+1))
4211 {
4212 if (after(skb2->h.th->ack_seq, sk->acked_seq))
4213 {
4214 newwindow = sk->window -
4215 (skb2->h.th->ack_seq - sk->acked_seq);
4216 if (newwindow < 0)
4217 newwindow = 0;
4218 sk->window = newwindow;
4219 sk->acked_seq = skb2->h.th->ack_seq;
4220 }
4221 skb2->acked = 1;
4222 /*
4223 * When we ack the fin, we do
4224 * the fin handling.
4225 */
4226 if (skb2->h.th->fin)
4227 {
4228 tcp_fin(skb2,sk,skb2->h.th);
4229 }
4230
4231 /*
4232 * Force an immediate ack.
4233 */
4234
4235 sk->ack_backlog = sk->max_ack_backlog;
4236 }
4237 else
4238 {
4239 break;
4240 }
4241 }
4242
4243 /*
4244 * This also takes care of updating the window.
4245 * This if statement needs to be simplified.
4246 */
4247 if (!sk->delay_acks ||
4248 sk->ack_backlog >= sk->max_ack_backlog ||
4249 sk->bytes_rcv > sk->max_unacked || th->fin) {
4250 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4251 }
4252 else
4253 {
4254 sk->ack_backlog++;
4255 if(sk->debug)
4256 printk("Ack queued.\n");
4257 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4258 }
4259 }
4260 }
4261
4262 /*
4263 * If we've missed a packet, send an ack.
4264 * Also start a timer to send another.
4265 */
4266
4267 if (!skb->acked)
4268 {
4269
4270 /*
4271 * This is important. If we don't have much room left,
4272 * we need to throw out a few packets so we have a good
4273 * window. Note that mtu is used, not mss, because mss is really
4274 * for the send side. He could be sending us stuff as large as mtu.
4275 */
4276
4277 while (sock_rspace(sk) < sk->mtu)
4278 {
4279 skb1 = skb_peek(&sk->receive_queue);
4280 if (skb1 == NULL)
4281 {
4282 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4283 break;
4284 }
4285
4286 /*
4287 * Don't throw out something that has been acked.
4288 */
4289
4290 if (skb1->acked)
4291 {
4292 break;
4293 }
4294
4295 skb_unlink(skb1);
4296 kfree_skb(skb1, FREE_READ);
4297 }
4298 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4299 sk->ack_backlog++;
4300 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4301 }
4302 else
4303 {
4304 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4305 }
4306
4307 /*
4308 * Now tell the user we may have some data.
4309 */
4310
4311 if (!sk->dead)
4312 {
4313 if(sk->debug)
4314 printk("Data wakeup.\n");
4315 sk->data_ready(sk,0);
4316 }
4317 return(0);
4318 }
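
/*
 * The queue handling above keeps sk->receive_queue sorted by sequence
 * number by walking backwards from the tail, betting that in-order
 * arrival is the common case. A stripped-down sketch of just that
 * insert, where seq_of() is a hypothetical stand-in for skb->h.th->seq:
 */
#if 0
static void ordered_insert(struct sk_buff_head *list, struct sk_buff *skb)
{
	struct sk_buff *pos;

	if (skb_peek(list) == NULL)
	{
		skb_queue_head(list, skb);		/* empty list is easy */
		return;
	}
	for (pos = list->prev; ; pos = pos->prev)
	{
		if (after(seq_of(skb) + 1, seq_of(pos)))
		{
			skb_append(pos, skb);		/* fits after pos */
			return;
		}
		if (pos == skb_peek(list))
		{
			skb_queue_head(list, skb);	/* new head of queue */
			return;
		}
	}
}
#endif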
4319
4320
4321 /*
4322 * This routine is only called when we have urgent data
4323 * signalled. It's the 'slow' part of tcp_urg. It could be
4324 * moved inline now as tcp_urg is only called from one
4325 * place. We handle URGent data wrong. We have to - as
4326 * BSD still doesn't use the correction from RFC961.
4327 */
4328
4329 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
4330 {
4331 u32 ptr = ntohs(th->urg_ptr);
4332
4333 if (ptr)
4334 ptr--;
4335 ptr += th->seq;
4336
4337 /* ignore urgent data that we've already seen and read */
4338 if (after(sk->copied_seq, ptr))
4339 return;
4340
4341 /* do we already have a newer (or duplicate) urgent pointer? */
4342 if (sk->urg_data && !after(ptr, sk->urg_seq))
4343 return;
4344
4345 /* tell the world about our new urgent pointer */
4346 if (sk->proc != 0) {
4347 if (sk->proc > 0) {
4348 kill_proc(sk->proc, SIGURG, 1);
4349 } else {
4350 kill_pg(-sk->proc, SIGURG, 1);
4351 }
4352 }
4353 sk->urg_data = URG_NOTYET;
4354 sk->urg_seq = ptr;
4355 }
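
/*
 * Worked example of the pointer arithmetic above: with th->seq = 1000
 * and th->urg_ptr = 3 (treated, as BSD does, as pointing one past the
 * urgent byte), ptr becomes 3 - 1 + 1000 = 1002, the absolute sequence
 * number of the urgent byte itself. In isolation (sketch only):
 */
#if 0
	u32 seq = 1000, ptr = 3;

	if (ptr)
		ptr--;			/* back up onto the urgent byte */
	ptr += seq;			/* ptr == 1002 */
#endif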
4356
4357 /*
4358 * This is the 'fast' part of urgent handling.
4359 */
4360
4361 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
4362 unsigned long saddr, unsigned long len)
4363 {
4364 u32 ptr;
4365
4366 /*
4367 * Check if we get a new urgent pointer - normally not
4368 */
4369
4370 if (th->urg)
4371 tcp_check_urg(sk,th);
4372
4373 /*
4374 * Do we wait for any urgent data? - normally not
4375 */
4376
4377 if (sk->urg_data != URG_NOTYET)
4378 return 0;
4379
4380 /*
4381 * Is the urgent pointer pointing into this packet?
4382 */
4383
4384 ptr = sk->urg_seq - th->seq + th->doff*4;
4385 if (ptr >= len)
4386 return 0;
4387
4388 /*
4389 * Ok, got the correct packet, update info
4390 */
4391
4392 sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4393 if (!sk->dead)
4394 sk->data_ready(sk,0);
4395 return 0;
4396 }
4397
4398 /*
4399 * This will accept the next outstanding connection.
4400 */
4401
4402 static struct sock *tcp_accept(struct sock *sk, int flags)
4403 {
4404 struct sock *newsk;
4405 struct sk_buff *skb;
4406
4407 /*
4408 * We need to make sure that this socket is listening,
4409 * and that it has something pending.
4410 */
4411
4412 if (sk->state != TCP_LISTEN)
4413 {
4414 sk->err = EINVAL;
4415 return(NULL);
4416 }
4417
4418 /* Avoid the race. */
4419 cli();
4420 sk->inuse = 1;
4421
4422 while((skb = tcp_dequeue_established(sk)) == NULL)
4423 {
4424 if (flags & O_NONBLOCK)
4425 {
4426 sti();
4427 release_sock(sk);
4428 sk->err = EAGAIN;
4429 return(NULL);
4430 }
4431
4432 release_sock(sk);
4433 interruptible_sleep_on(sk->sleep);
4434 if (current->signal & ~current->blocked)
4435 {
4436 sti();
4437 sk->err = ERESTARTSYS;
4438 return(NULL);
4439 }
4440 sk->inuse = 1;
4441 }
4442 sti();
4443
4444 /*
4445 * Now all we need to do is return skb->sk.
4446 */
4447
4448 newsk = skb->sk;
4449
4450 kfree_skb(skb, FREE_READ);
4451 sk->ack_backlog--;
4452 release_sock(sk);
4453 return(newsk);
4454 }
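
/*
 * Seen from user space this is the usual BSD accept() behaviour: a
 * blocking accept sleeps in the loop above until an established
 * connection is queued, while O_NONBLOCK maps the empty queue to
 * EAGAIN instead. A hypothetical user-level sketch (fd etc. are
 * illustrative):
 */
#if 0
	int fd, newfd;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	/* ... bind() ... */
	listen(fd, 5);
	newfd = accept(fd, NULL, NULL);	/* sleeps until a connection lands */
	if (newfd < 0 && errno == EAGAIN)
		;			/* only possible with O_NONBLOCK set */
#endif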
4455
4456
4457 /*
4458 * This will initiate an outgoing connection.
4459 */
4460
4461 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
4462 {
4463 struct sk_buff *buff;
4464 struct device *dev=NULL;
4465 unsigned char *ptr;
4466 int tmp;
4467 int atype;
4468 struct tcphdr *t1;
4469 struct rtable *rt;
4470
4471 if (sk->state != TCP_CLOSE)
4472 return(-EISCONN);
4473
4474 /*
4475 * Don't allow a double connect.
4476 */
4477
4478 if(sk->daddr)
4479 return -EINVAL;
4480
4481 if (addr_len < 8)
4482 return(-EINVAL);
4483
4484 if (usin->sin_family && usin->sin_family != AF_INET)
4485 return(-EAFNOSUPPORT);
4486
4487 /*
4488 * connect() to INADDR_ANY means loopback (BSD'ism).
4489 */
4490
4491 if(usin->sin_addr.s_addr==INADDR_ANY)
4492 usin->sin_addr.s_addr=ip_my_addr();
4493
4494 /*
4495 * Don't want a TCP connection going to a broadcast address
4496 */
4497
4498 if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
4499 return -ENETUNREACH;
4500
4501 sk->inuse = 1;
4502 sk->daddr = usin->sin_addr.s_addr;
4503 sk->write_seq = tcp_init_seq();
4504 sk->window_seq = sk->write_seq;
4505 sk->rcv_ack_seq = sk->write_seq -1;
4506 sk->err = 0;
4507 sk->dummy_th.dest = usin->sin_port;
4508 release_sock(sk);
4509
4510 buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4511 if (buff == NULL)
4512 {
4513 return(-ENOMEM);
4514 }
4515 sk->inuse = 1;
4516 buff->sk = sk;
4517 buff->free = 0;
4518 buff->localroute = sk->localroute;
4519
4520
4521 /*
4522 * Put in the IP header and routing stuff.
4523 */
4524
4525 if (sk->localroute)
4526 rt=ip_rt_local(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);
4527 else
4528 rt=ip_rt_route(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);
4529
4530 /*
4531 * When we connect we enforce receive requirements too.
4532 */
4533
4534 sk->rcv_saddr=sk->saddr;
4535
4536 /*
4537 * We need to build the routing stuff from the things saved in skb.
4538 */
4539
4540 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4541 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
4542 if (tmp < 0)
4543 {
4544 sock_wfree(sk, buff);
4545 release_sock(sk);
4546 return(-ENETUNREACH);
4547 }
4548
4549 t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
4550
4551 memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4552 t1->seq = ntohl(sk->write_seq++);
4553 sk->sent_seq = sk->write_seq;
4554 buff->h.seq = sk->write_seq;
4555 t1->ack = 0;
4556 t1->window = 2;
4557 t1->res1=0;
4558 t1->res2=0;
4559 t1->rst = 0;
4560 t1->urg = 0;
4561 t1->psh = 0;
4562 t1->syn = 1;
4563 t1->urg_ptr = 0;
4564 t1->doff = 6;
4565 /* use 512 or whatever user asked for */
4566
4567 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4568 sk->window_clamp=rt->rt_window;
4569 else
4570 sk->window_clamp=0;
4571
4572 if (sk->user_mss)
4573 sk->mtu = sk->user_mss;
4574 else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
4575 sk->mtu = rt->rt_mss;
4576 else
4577 {
4578 #ifdef CONFIG_INET_SNARL
4579 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
4580 #else
4581 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
4582 #endif
4583 sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
4584 else
4585 sk->mtu = MAX_WINDOW;
4586 }
4587 /*
4588 * but not bigger than device MTU
4589 */
4590
4591 if(sk->mtu <32)
4592 sk->mtu = 32; /* Sanity limit */
4593
4594 sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
4595
4596 /*
4597 * Put in the TCP options to say MTU.
4598 */
4599
4600 ptr = skb_put(buff,4);
4601 ptr[0] = 2;
4602 ptr[1] = 4;
4603 ptr[2] = (sk->mtu) >> 8;
4604 ptr[3] = (sk->mtu) & 0xff;
4605 tcp_send_check(t1, sk->saddr, sk->daddr,
4606 sizeof(struct tcphdr) + 4, sk);
4607
4608 /*
4609 * This must go first otherwise a really quick response will get reset.
4610 */
4611
4612 tcp_cache_zap();
4613 tcp_set_state(sk,TCP_SYN_SENT);
4614 if(rt&&rt->rt_flags&RTF_IRTT)
4615 sk->rto = rt->rt_irtt;
4616 else
4617 sk->rto = TCP_TIMEOUT_INIT;
4618 sk->retransmit_timer.function=&retransmit_timer;
4619 sk->retransmit_timer.data = (unsigned long)sk;
4620 reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */
4621 sk->retransmits = 0; /* Now works the right way instead of a hacked
4622 initial setting */
4623
4624 sk->prot->queue_xmit(sk, dev, buff, 0);
4625 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4626 tcp_statistics.TcpActiveOpens++;
4627 tcp_statistics.TcpOutSegs++;
4628
4629 release_sock(sk);
4630 return(0);
4631 }
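
/*
 * The four option bytes built above encode the MSS option: kind 2,
 * length 4, then the 16-bit MSS in network byte order. doff is 6
 * because the header becomes 20 + 4 = 24 bytes = 6 32-bit words.
 * Worked example (sketch): an mtu of 1460 yields { 2, 4, 0x05, 0xb4 }.
 */
#if 0
	unsigned char opt[4];
	unsigned short mss = 1460;

	opt[0] = 2;		/* kind: maximum segment size */
	opt[1] = 4;		/* total option length in bytes */
	opt[2] = mss >> 8;	/* 0x05 */
	opt[3] = mss & 0xff;	/* 0xb4 */
#endif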
4632
4633
4634 /*
4635 * This function checks whether the tcp header is actually acceptable.
4636 */
4637
4638 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
4639 struct options *opt, unsigned long saddr, struct device *dev)
4640 {
4641 u32 next_seq;
4642
4643 next_seq = len - 4*th->doff;
4644 if (th->fin)
4645 next_seq++;
4646 /* if we have a zero window, we can't have any data in the packet.. */
4647 if (next_seq && !sk->window)
4648 goto ignore_it;
4649 next_seq += th->seq;
4650
4651 /*
4652 * This isn't quite right. sk->acked_seq could be more recent
4653 * than sk->window. This is however close enough. We will accept
4654 * slightly more packets than we should, but it should not cause
4655 * problems unless someone is trying to forge packets.
4656 */
4657
4658 /* have we already seen all of this packet? */
4659 if (!after(next_seq+1, sk->acked_seq))
4660 goto ignore_it;
4661 /* or does it start beyond the window? */
4662 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4663 goto ignore_it;
4664
4665 /* ok, at least part of this packet would seem interesting.. */
4666 return 1;
4667
4668 ignore_it:
4669 if (th->rst)
4670 return 0;
4671
4672 /*
4673 * Send a reset if we get something not ours and we are
4674 * unsynchronized. Note: We don't do anything to our end. We
4675 * are just killing the bogus remote connection then we will
4676 * connect again and it will work (with luck).
4677 */
4678
4679 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4680 {
4681 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4682 return 1;
4683 }
4684
4685 /* Try to resync things. */
4686 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4687 return 0;
4688 }
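
/*
 * Numerically: with sk->acked_seq = 100 and sk->window = 500, the two
 * tests above accept a segment that still contains something new
 * relative to sequence 100 and whose first byte lies at or before
 * sequence 600. The acceptance test in isolation (sketch):
 */
#if 0
	int ok = after(next_seq + 1, 100)		/* not wholly old data */
	      && before(th->seq, 100 + 500 + 1);	/* not beyond the window */
#endif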
4689
4690 /*
4691 * When we get a reset we do this.
4692 */
4693
4694 static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
4695 {
4696 sk->zapped = 1;
4697 sk->err = ECONNRESET;
4698 if (sk->state == TCP_SYN_SENT)
4699 sk->err = ECONNREFUSED;
4700 if (sk->state == TCP_CLOSE_WAIT)
4701 sk->err = EPIPE;
4702 #ifdef TCP_DO_RFC1337
4703 /*
4704 * Time wait assassination protection [RFC1337]
4705 */
4706 if(sk->state!=TCP_TIME_WAIT)
4707 {
4708 tcp_set_state(sk,TCP_CLOSE);
4709 sk->shutdown = SHUTDOWN_MASK;
4710 }
4711 #else
4712 tcp_set_state(sk,TCP_CLOSE);
4713 sk->shutdown = SHUTDOWN_MASK;
4714 #endif
4715 if (!sk->dead)
4716 sk->state_change(sk);
4717 kfree_skb(skb, FREE_READ);
4718 release_sock(sk);
4719 return(0);
4720 }
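
/*
 * The error the user sees thus depends on where in the handshake the
 * reset landed. A sketch of the mapping implemented above:
 */
#if 0
	err = ECONNRESET;		/* default: connection reset by peer */
	if (state == TCP_SYN_SENT)
		err = ECONNREFUSED;	/* our connect() was refused */
	if (state == TCP_CLOSE_WAIT)
		err = EPIPE;		/* writing after their FIN */
#endif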
4721
4722 /*
4723 * A TCP packet has arrived.
4724 * skb->h.raw is the TCP header.
4725 */
4726
4727 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
4728 __u32 daddr, unsigned short len,
4729 __u32 saddr, int redo, struct inet_protocol * protocol)
4730 {
4731 struct tcphdr *th;
4732 struct sock *sk;
4733 int syn_ok=0;
4734
4735 tcp_statistics.TcpInSegs++;
4736 if(skb->pkt_type!=PACKET_HOST)
4737 {
4738 kfree_skb(skb,FREE_READ);
4739 return(0);
4740 }
4741
4742 th = skb->h.th;
4743
4744 /*
4745 * Find the socket, using the last hit cache if applicable.
4746 */
4747
4748 if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
4749 {
4750 sk=(struct sock *)th_cache_sk;
4751 /*
4752 * We think this cache is causing the bug, so cross-check it against a full lookup.
4753 */
4754 if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
4755 printk("Cache mismatch on TCP.\n");
4756 }
4757 else
4758 {
4759 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4760 th_cache_saddr=saddr;
4761 th_cache_daddr=daddr;
4762 th_cache_dport=th->dest;
4763 th_cache_sport=th->source;
4764 th_cache_sk=sk;
4765 }
4766
4767 /*
4768 * If this socket has got a reset it's to all intents and purposes
4769 * really dead. Count closed sockets as dead.
4770 *
4771 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4772 * simply drops data. This seems incorrect as a 'closed' TCP doesn't
4773 * exist so should cause resets as if the port was unreachable.
4774 */
4775
4776 if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4777 sk=NULL;
4778
4779 if (!redo)
4780 {
4781 /*
4782 * Pull up the IP header.
4783 */
4784 skb_pull(skb, skb->h.raw-skb->data);
4785 /*
4786 * Try to use the device checksum if provided.
4787 */
4788 if (
4789 (skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
4790 (!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
4791 )
4792 {
4793 skb->sk = NULL;
4794 kfree_skb(skb,FREE_READ);
4795 /*
4796 * We don't release the socket because it was
4797 * never marked in use.
4798 */
4799 return(0);
4800 }
4801 th->seq = ntohl(th->seq);
4802
4803 /* See if we know about the socket. */
4804 if (sk == NULL)
4805 {
4806 /*
4807 * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4808 */
4809 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4810 skb->sk = NULL;
4811 /*
4812 * Discard frame
4813 */
4814 kfree_skb(skb, FREE_READ);
4815 return(0);
4816 }
4817
4818 skb->acked = 0;
4819 skb->used = 0;
4820 skb->free = 0;
4821 skb->saddr = daddr;
4822 skb->daddr = saddr;
4823
4824 /* We may need to add it to the backlog here. */
4825 cli();
4826 if (sk->inuse)
4827 {
4828 skb_queue_tail(&sk->back_log, skb);
4829 sti();
4830 return(0);
4831 }
4832 sk->inuse = 1;
4833 sti();
4834 }
4835 else
4836 {
4837 if (sk==NULL)
4838 {
4839 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4840 skb->sk = NULL;
4841 kfree_skb(skb, FREE_READ);
4842 return(0);
4843 }
4844 }
4845
4846
4847 if (!sk->prot)
4848 {
4849 printk("IMPOSSIBLE 3\n");
4850 return(0);
4851 }
4852
4853
4854 /*
4855 * Charge the memory to the socket.
4856 */
4857
4858 skb->sk=sk;
4859 sk->rmem_alloc += skb->truesize;
4860
4861 /*
4862 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4863 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4864 * compatibility. We also set up variables more thoroughly [Karn notes in the
4865 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4866 */
4867
4868 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */
4869 {
4870
4871 /*
4872 * Now deal with unusual cases.
4873 */
4874
4875 if(sk->state==TCP_LISTEN)
4876 {
4877 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */
4878 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4879
4880 /*
4881 * We don't care for RST, and non-SYN segments are absorbed (old
4882 * segments). Broadcast/multicast SYN isn't allowed. Note: there is a
4883 * bug here - if you change the netmask on a running connection it can
4884 * go broadcast. Even Suns have this problem, so I'm ignoring it.
4885 */
4886
4887 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4888 {
4889 kfree_skb(skb, FREE_READ);
4890 release_sock(sk);
4891 return 0;
4892 }
4893
4894 /*
4895 * Guess we need to make a new socket up
4896 */
4897
4898 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4899
4900 /*
4901 * Now we have several options: In theory there is nothing else
4902 * in the frame. KA9Q has an option to send data with the syn,
4903 * BSD accepts data with the syn up to the [to be] advertised window
4904 * and Solaris 2.1 gives you a protocol error. For now we just ignore
4905 * it, that fits the spec precisely and avoids incompatibilities. It
4906 * would be nice in future to drop through and process the data.
4907 */
4908
4909 release_sock(sk);
4910 return 0;
4911 }
4912
4913 /* retransmitted SYN? */
4914 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4915 {
4916 kfree_skb(skb, FREE_READ);
4917 release_sock(sk);
4918 return 0;
4919 }
4920
4921 /*
4922 * SYN sent means we have to look for a suitable ack and either reset
4923 * for bad matches or go to connected
4924 */
4925
4926 if(sk->state==TCP_SYN_SENT)
4927 {
4928 /* Crossed SYN or previous junk segment */
4929 if(th->ack)
4930 {
4931 /* We got an ack, but it's not a good ack */
4932 if(!tcp_ack(sk,th,saddr,len))
4933 {
/* Reset the ack - it's an ack from a
different connection [th->rst is checked in tcp_reset()] */
4936 tcp_statistics.TcpAttemptFails++;
4937 tcp_reset(daddr, saddr, th,
4938 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4939 kfree_skb(skb, FREE_READ);
4940 release_sock(sk);
4941 return(0);
4942 }
4943 if(th->rst)
4944 return tcp_std_reset(sk,skb);
4945 if(!th->syn)
4946 {
4947 /* A valid ack from a different connection
4948 start. Shouldn't happen but cover it */
4949 kfree_skb(skb, FREE_READ);
4950 release_sock(sk);
4951 return 0;
4952 }
4953 /*
4954 * Ok.. it's good. Set up sequence numbers and
4955 * move to established.
4956 */
4957 syn_ok=1; /* Don't reset this connection for the syn */
4958 sk->acked_seq=th->seq+1;
4959 sk->fin_seq=th->seq;
4960 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4961 tcp_set_state(sk, TCP_ESTABLISHED);
4962 tcp_options(sk,th);
4963 sk->dummy_th.dest=th->source;
4964 sk->copied_seq = sk->acked_seq;
4965 if(!sk->dead)
4966 {
4967 sk->state_change(sk);
4968 sock_wake_async(sk->socket, 0);
4969 }
4970 if(sk->max_window==0)
4971 {
4972 sk->max_window = 32;
4973 sk->mss = min(sk->max_window, sk->mtu);
4974 }
4975 }
4976 else
4977 {
4978 /* See if SYN's cross. Drop if boring */
4979 if(th->syn && !th->rst)
4980 {
4981 /* Crossed SYN's are fine - but talking to
4982 yourself is right out... */
4983 if(sk->saddr==saddr && sk->daddr==daddr &&
4984 sk->dummy_th.source==th->source &&
4985 sk->dummy_th.dest==th->dest)
4986 {
4987 tcp_statistics.TcpAttemptFails++;
4988 return tcp_std_reset(sk,skb);
4989 }
4990 tcp_set_state(sk,TCP_SYN_RECV);
4991
4992 /*
4993 * FIXME:
4994 * Must send SYN|ACK here
4995 */
4996 }
4997 /* Discard junk segment */
4998 kfree_skb(skb, FREE_READ);
4999 release_sock(sk);
5000 return 0;
5001 }
5002 /*
5003 * SYN_RECV with data maybe.. drop through
5004 */
5005 goto rfc_step6;
5006 }
5007
5008 /*
5009 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
5010 * a more complex suggestion for fixing these reuse issues in RFC1644
5011 * but not yet ready for general use. Also see RFC1379.
5012 */
5013
5014 #define BSD_TIME_WAIT
5015 #ifdef BSD_TIME_WAIT
5016 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
5017 after(th->seq, sk->acked_seq) && !th->rst)
5018 {
5019 u32 seq = sk->write_seq;
5020 if(sk->debug)
5021 printk("Doing a BSD time wait\n");
5022 tcp_statistics.TcpEstabResets++;
5023 sk->rmem_alloc -= skb->truesize;
5024 skb->sk = NULL;
5025 sk->err=ECONNRESET;
5026 tcp_set_state(sk, TCP_CLOSE);
5027 sk->shutdown = SHUTDOWN_MASK;
5028 release_sock(sk);
5029 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
5030 if (sk && sk->state==TCP_LISTEN)
5031 {
5032 sk->inuse=1;
5033 skb->sk = sk;
5034 sk->rmem_alloc += skb->truesize;
5035 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
5036 release_sock(sk);
5037 return 0;
5038 }
5039 kfree_skb(skb, FREE_READ);
5040 return 0;
5041 }
5042 #endif
5043 }
5044
5045 /*
5046 * We are now in normal data flow (see the step list in the RFC)
5047 * Note most of these are inline now. I'll inline the lot when
5048 * I have time to test it hard and look at what gcc outputs
5049 */
5050
5051 if(!tcp_sequence(sk,th,len,opt,saddr,dev))
5052 {
5053 kfree_skb(skb, FREE_READ);
5054 release_sock(sk);
5055 return 0;
5056 }
5057
5058 if(th->rst)
5059 return tcp_std_reset(sk,skb);
5060
5061 /*
5062 * !syn_ok is effectively the state test in RFC793.
5063 */
5064
5065 if(th->syn && !syn_ok)
5066 {
5067 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
5068 return tcp_std_reset(sk,skb);
5069 }
5070
5071 /*
5072 * Process the ACK
5073 */
5074
5075
5076 if(th->ack && !tcp_ack(sk,th,saddr,len))
5077 {
5078 /*
5079 * Our three way handshake failed.
5080 */
5081
5082 if(sk->state==TCP_SYN_RECV)
5083 {
5084 tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
5085 }
5086 kfree_skb(skb, FREE_READ);
5087 release_sock(sk);
5088 return 0;
5089 }
5090
5091 rfc_step6: /* I'll clean this up later */
5092
5093 /*
5094 * If the accepted buffer put us over our queue size we
5095 * now drop it (we must process the ack first to avoid
5096 * deadlock cases).
5097 */
5098
5099 if (sk->rmem_alloc >= sk->rcvbuf)
5100 {
5101 kfree_skb(skb, FREE_READ);
5102 release_sock(sk);
5103 return(0);
5104 }
5105
5106
5107 /*
5108 * Process urgent data
5109 */
5110
5111 if(tcp_urg(sk, th, saddr, len))
5112 {
5113 kfree_skb(skb, FREE_READ);
5114 release_sock(sk);
5115 return 0;
5116 }
5117
5118 /*
5119 * Process the encapsulated data
5120 */
5121
5122 if(tcp_data(skb,sk, saddr, len))
5123 {
5124 kfree_skb(skb, FREE_READ);
5125 release_sock(sk);
5126 return 0;
5127 }
5128
5129 /*
5130 * And done
5131 */
5132
5133 release_sock(sk);
5134 return 0;
5135 }
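
/*
 * Once a connection is synchronised, the checks above run in a fixed
 * order, loosely the RFC793 segment arrival steps. A sketch of just
 * the dispatch skeleton, where "drop" stands for freeing the skb and
 * releasing the socket:
 */
#if 0
	if (!tcp_sequence(sk, th, len, opt, saddr, dev))
		goto drop;			/* outside the window */
	if (th->rst)
		return tcp_std_reset(sk, skb);	/* reset processing */
	if (th->syn && !syn_ok)
		return tcp_std_reset(sk, skb);	/* SYN out of place */
	if (th->ack && !tcp_ack(sk, th, saddr, len))
		goto drop;			/* unacceptable ack */
	if (tcp_urg(sk, th, saddr, len))
		goto drop;			/* urgent data error */
	if (tcp_data(skb, sk, saddr, len))
		goto drop;			/* data queueing error */
#endif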
5136
5137 /*
5138 * This routine sends a packet with an out of date sequence
5139 * number. It assumes the other end will try to ack it.
5140 */
5141
5142 static void tcp_write_wakeup(struct sock *sk)
5143 {
5144 struct sk_buff *buff,*skb;
5145 struct tcphdr *t1;
5146 struct device *dev=NULL;
5147 int tmp;
5148
5149 if (sk->zapped)
5150 return; /* After a valid reset we can send no more */
5151
5152 /*
5153 * Write data can still be transmitted/retransmitted in the
5154 * following states. If any other state is encountered, return.
5155 * [listen/close will never occur here anyway]
5156 */
5157
5158 if (sk->state != TCP_ESTABLISHED &&
5159 sk->state != TCP_CLOSE_WAIT &&
5160 sk->state != TCP_FIN_WAIT1 &&
5161 sk->state != TCP_LAST_ACK &&
5162 sk->state != TCP_CLOSING
5163 )
5164 {
5165 return;
5166 }
5167 if ( before(sk->sent_seq, sk->window_seq) &&
5168 (skb=skb_peek(&sk->write_queue)))
5169 {
5170 /*
5171 * We are probing the opening of a window
5172 * but the window size is != 0, so the blockage
5173 * must have been a result of sender-side SWS avoidance
5174 */
5175
5176 struct iphdr *iph;
5177 struct tcphdr *th;
5178 struct tcphdr *nth;
5179 unsigned long win_size;
5180 #if 0
5181 unsigned long ow_size;
5182 #endif
5183 void * tcp_data_start;
5184
5185 /*
5186 * How many bytes can we send ?
5187 */
5188
5189 win_size = sk->window_seq - sk->sent_seq;
5190
5191 /*
5192 * Recover the buffer pointers
5193 */
5194
5195 iph = (struct iphdr *)skb->ip_hdr;
5196 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
5197
5198 /*
5199 * Grab the data for a temporary frame
5200 */
5201
5202 buff = sock_wmalloc(sk, win_size + th->doff * 4 +
5203 (iph->ihl << 2) +
5204 sk->prot->max_header + 15,
5205 1, GFP_ATOMIC);
5206 if ( buff == NULL )
5207 return;
5208
5209 /*
5210 * If we strip the packet on the write queue we must
5211 * be ready to retransmit this one
5212 */
5213
5214 buff->free = /*0*/1;
5215
5216 buff->sk = sk;
5217 buff->localroute = sk->localroute;
5218
5219 /*
5220 * Put headers on the new packet
5221 */
5222
5223 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5224 IPPROTO_TCP, sk->opt, buff->truesize,
5225 sk->ip_tos,sk->ip_ttl);
5226 if (tmp < 0)
5227 {
5228 sock_wfree(sk, buff);
5229 return;
5230 }
5231
5232 /*
5233 * Move the TCP header over
5234 */
5235
5236 buff->dev = dev;
5237
5238 nth = (struct tcphdr *) skb_put(buff,th->doff*4);
5239
5240 memcpy(nth, th, th->doff * 4);
5241
5242 /*
5243 * Correct the new header
5244 */
5245
5246 nth->ack = 1;
5247 nth->ack_seq = ntohl(sk->acked_seq);
5248 nth->window = ntohs(tcp_select_window(sk));
5249 nth->check = 0;
5250
5251 /*
5252 * Find the first data byte.
5253 */
5254
5255 tcp_data_start = skb->data + skb->dev->hard_header_len +
5256 (iph->ihl << 2) + th->doff * 4;
5257
5258 /*
5259 * Add it to our new buffer
5260 */
5261 memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
5262
5263 /*
5264 * Remember our right edge sequence number.
5265 */
5266
5267 buff->h.seq = sk->sent_seq + win_size;
5268 sk->sent_seq = buff->h.seq; /* Hack */
5269 #if 0
5270
5271 /*
5272 * now: shrink the queue head segment
5273 */
5274
5275 th->check = 0;
5276 ow_size = skb->len - win_size -
5277 ((unsigned long) (tcp_data_start - (void *) skb->data));
5278
5279 memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
5280 skb_trim(skb,skb->len-win_size);
5281 sk->sent_seq += win_size;
5282 th->seq = htonl(sk->sent_seq);
5283 if (th->urg)
5284 {
5285 unsigned short urg_ptr;
5286
5287 urg_ptr = ntohs(th->urg_ptr);
5288 if (urg_ptr <= win_size)
5289 th->urg = 0;
5290 else
5291 {
5292 urg_ptr -= win_size;
5293 th->urg_ptr = htons(urg_ptr);
5294 nth->urg_ptr = htons(win_size);
5295 }
5296 }
5297 #else
5298 if(th->urg && ntohs(th->urg_ptr) < win_size)
5299 nth->urg = 0;
5300 #endif
5301
5302 /*
5303 * Checksum the split buffer
5304 */
5305
5306 tcp_send_check(nth, sk->saddr, sk->daddr,
5307 nth->doff * 4 + win_size , sk);
5308 }
5309 else
5310 {
5311 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
5312 if (buff == NULL)
5313 return;
5314
5315 buff->free = 1;
5316 buff->sk = sk;
5317 buff->localroute = sk->localroute;
5318
5319 /*
5320 * Put in the IP header and routing stuff.
5321 */
5322
5323 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5324 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
5325 if (tmp < 0)
5326 {
5327 sock_wfree(sk, buff);
5328 return;
5329 }
5330
5331 t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
5332 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5333
5334 /*
5335 * Use a previous sequence.
5336 * This should cause the other end to send an ack.
5337 */
5338
5339 t1->seq = htonl(sk->sent_seq-1);
5340 t1->ack = 1;
5341 t1->res1= 0;
5342 t1->res2= 0;
5343 t1->rst = 0;
5344 t1->urg = 0;
5345 t1->psh = 0;
5346 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
5347 t1->syn = 0;
5348 t1->ack_seq = ntohl(sk->acked_seq);
5349 t1->window = ntohs(tcp_select_window(sk));
5350 t1->doff = sizeof(*t1)/4;
5351 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5352
5353 }
5354
5355 /*
5356 * Send it.
5357 */
5358
5359 sk->prot->queue_xmit(sk, dev, buff, 1);
5360 tcp_statistics.TcpOutSegs++;
5361 }
5362
5363 /*
5364 * A window probe timeout has occurred.
5365 */
5366
5367 void tcp_send_probe0(struct sock *sk)
5368 {
5369 if (sk->zapped)
5370 return; /* After a valid reset we can send no more */
5371
5372 tcp_write_wakeup(sk);
5373
5374 sk->backoff++;
5375 sk->rto = min(sk->rto << 1, 120*HZ);
5376 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5377 sk->retransmits++;
5378 sk->prot->retransmits ++;
5379 }
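
/*
 * Worked example of the backoff above, assuming HZ = 100 and an
 * initial rto of 3*HZ: successive probe intervals are 6, 12, 24, 48
 * and 96 seconds, then pinned at the 120 second ceiling. Sketch:
 */
#if 0
	int rto = 3*HZ;
	int i;

	for (i = 0; i < 8; i++)
	{
		rto = min(rto << 1, 120*HZ);	/* double, capped at 2 min */
		printk("probe interval %d: %d jiffies\n", i, rto);
	}
#endif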
5380
5381 /*
5382 * Socket option code for TCP.
5383 */
5384
5385 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
5386 {
5387 int val,err;
5388
5389 if(level!=SOL_TCP)
5390 return ip_setsockopt(sk,level,optname,optval,optlen);
5391
5392 if (optval == NULL)
5393 return(-EINVAL);
5394
5395 err=verify_area(VERIFY_READ, optval, sizeof(int));
5396 if(err)
5397 return err;
5398
5399 val = get_user((int *)optval);
5400
5401 switch(optname)
5402 {
5403 case TCP_MAXSEG:
5404 /*
5405 * Values greater than the interface MTU won't take effect; however,
5406 * at the point when this call is made we typically don't yet know
5407 * which interface is going to be used.
5408 */
5409 if(val<1||val>MAX_WINDOW)
5410 return -EINVAL;
5411 sk->user_mss=val;
5412 return 0;
5413 case TCP_NODELAY:
5414 sk->nonagle=(val==0)?0:1;
5415 return 0;
5416 default:
5417 return(-ENOPROTOOPT);
5418 }
5419 }
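
/*
 * From user space the two options look like this (hypothetical sketch;
 * SOL_TCP has the same value as IPPROTO_TCP):
 */
#if 0
	int one = 1;
	int mss = 1400;

	setsockopt(fd, SOL_TCP, TCP_NODELAY, (char *) &one, sizeof(one));
	setsockopt(fd, SOL_TCP, TCP_MAXSEG, (char *) &mss, sizeof(mss));
#endif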
5420
5421 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
5422 {
5423 int val,err;
5424
5425 if(level!=SOL_TCP)
5426 return ip_getsockopt(sk,level,optname,optval,optlen);
5427
5428 switch(optname)
5429 {
5430 case TCP_MAXSEG:
5431 val=sk->user_mss;
5432 break;
5433 case TCP_NODELAY:
5434 val=sk->nonagle;
5435 break;
5436 default:
5437 return(-ENOPROTOOPT);
5438 }
5439 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5440 if(err)
5441 return err;
5442 put_user(sizeof(int),(int *) optlen);
5443
5444 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5445 if(err)
5446 return err;
5447 put_user(val,(int *)optval);
5448
5449 return(0);
5450 }
5451
5452
5453 struct proto tcp_prot = {
5454 tcp_close,
5455 ip_build_header,
5456 tcp_connect,
5457 tcp_accept,
5458 ip_queue_xmit,
5459 tcp_retransmit,
5460 tcp_write_wakeup,
5461 tcp_read_wakeup,
5462 tcp_rcv,
5463 tcp_select,
5464 tcp_ioctl,
5465 NULL,
5466 tcp_shutdown,
5467 tcp_setsockopt,
5468 tcp_getsockopt,
5469 tcp_sendmsg,
5470 tcp_recvmsg,
5471 NULL, /* No special bind() */
5472 128,
5473 0,
5474 "TCP",
5475 0, 0,
5476 {NULL,}
5477 };