1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1 26 * and was trying to connect (tcp_err()). 27 * Alan Cox : All icmp error handling was broken 28 * pointers passed where wrong and the 29 * socket was looked up backwards. Nobody 30 * tested any icmp error code obviously. 31 * Alan Cox : tcp_err() now handled properly. It wakes people 32 * on errors. select behaves and the icmp error race 33 * has gone by moving it into sock.c 34 * Alan Cox : tcp_reset() fixed to work for everything not just 35 * packets for unknown sockets. 36 * Alan Cox : tcp option processing. 37 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong] 38 * Herp Rosmanith : More reset fixes 39 * Alan Cox : No longer acks invalid rst frames. Acking 40 * any kind of RST is right out. 41 * Alan Cox : Sets an ignore me flag on an rst receive 42 * otherwise odd bits of prattle escape still 43 * Alan Cox : Fixed another acking RST frame bug. Should stop 44 * LAN workplace lockups. 
45 * Alan Cox : Some tidyups using the new skb list facilities 46 * Alan Cox : sk->keepopen now seems to work 47 * Alan Cox : Pulls options out correctly on accepts 48 * Alan Cox : Fixed assorted sk->rqueue->next errors 49 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops. 50 * Alan Cox : Tidied tcp_data to avoid a potential nasty. 51 * Alan Cox : Added some better commenting, as the tcp is hard to follow 52 * Alan Cox : Removed incorrect check for 20 * psh 53 * Michael O'Reilly : ack < copied bug fix. 54 * Johannes Stille : Misc tcp fixes (not all in yet). 55 * Alan Cox : FIN with no memory -> CRASH 56 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept. 57 * Alan Cox : Added TCP options (SOL_TCP) 58 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets. 59 * Alan Cox : Use ip_tos/ip_ttl settings. 60 * Alan Cox : Handle FIN (more) properly (we hope). 61 * Alan Cox : RST frames sent on unsynchronised state ack error/ 62 * Alan Cox : Put in missing check for SYN bit. 63 * Alan Cox : Added tcp_select_window() aka NET2E 64 * window non shrink trick. 65 * Alan Cox : Added a couple of small NET2E timer fixes 66 * Charles Hedrick : TCP fixes 67 * Toomas Tamm : TCP window fixes 68 * Alan Cox : Small URG fix to rlogin ^C ack fight 69 * Charles Hedrick : Rewrote most of it to actually work 70 * Linus : Rewrote tcp_read() and URG handling 71 * completely 72 * Gerhard Koerting: Fixed some missing timer handling 73 * Matthew Dillon : Reworked TCP machine states as per RFC 74 * Gerhard Koerting: PC/TCP workarounds 75 * Adam Caldwell : Assorted timer/timing errors 76 * Matthew Dillon : Fixed another RST bug 77 * Alan Cox : Move to kernel side addressing changes. 78 * Alan Cox : Beginning work on TCP fastpathing (not yet usable) 79 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
80 * Alan Cox : TCP fast path debugging 81 * Alan Cox : Window clamping 82 * Michael Riepe : Bug in tcp_check() 83 * Matt Dillon : More TCP improvements and RST bug fixes 84 * Matt Dillon : Yet more small nasties remove from the TCP code 85 * (Be very nice to this man if tcp finally works 100%) 8) 86 * Alan Cox : BSD accept semantics. 87 * Alan Cox : Reset on closedown bug. 88 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 89 * Michael Pall : Handle select() after URG properly in all cases. 90 * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin). 91 * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now. 92 * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api. 93 * Alan Cox : Changed the semantics of sk->socket to 94 * fix a race and a signal problem with 95 * accept() and async I/O. 96 * Alan Cox : Relaxed the rules on tcp_sendto(). 97 * Yury Shevchuk : Really fixed accept() blocking problem. 98 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 99 * clients/servers which listen in on 100 * fixed ports. 101 * Alan Cox : Cleaned the above up and shrank it to 102 * a sensible code size. 103 * Alan Cox : Self connect lockup fix. 104 * Alan Cox : No connect to multicast. 105 * Ross Biro : Close unaccepted children on master 106 * socket close. 107 * Alan Cox : Reset tracing code. 108 * Alan Cox : Spurious resets on shutdown. 109 * Alan Cox : Giant 15 minute/60 second timer error 110 * Alan Cox : Small whoops in selecting before an accept. 111 * Alan Cox : Kept the state trace facility since it's 112 * handy for debugging. 113 * Alan Cox : More reset handler fixes. 114 * Alan Cox : Started rewriting the code based on the RFC's 115 * for other useful protocol references see: 116 * Comer, KA9Q NOS, and for a reference on the 117 * difference between specifications and how BSD 118 * works see the 4.4lite source. 
119 * A.N.Kuznetsov : Don't time wait on completion of tidy 120 * close. 121 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 122 * Linus Torvalds : Fixed BSD port reuse to work first syn 123 * Alan Cox : Reimplemented timers as per the RFC and using multiple 124 * timers for sanity. 125 * Alan Cox : Small bug fixes, and a lot of new 126 * comments. 127 * Alan Cox : Fixed dual reader crash by locking 128 * the buffers (much like datagram.c) 129 * Alan Cox : Fixed stuck sockets in probe. A probe 130 * now gets fed up of retrying without 131 * (even a no space) answer. 132 * Alan Cox : Extracted closing code better 133 * Alan Cox : Fixed the closing state machine to 134 * resemble the RFC. 135 * Alan Cox : More 'per spec' fixes. 136 * Jorge Cwik : Even faster checksumming. 137 * Alan Cox : tcp_data() doesn't ack illegal PSH 138 * only frames. At least one pc tcp stack 139 * generates them. 140 * Alan Cox : Cache last socket. 141 * Alan Cox : Per route irtt. 142 * Matt Day : Select() match BSD precisely on error 143 * Alan Cox : New buffers 144 * Mark Tamsky : Various sk->prot->retransmits and 145 * sk->retransmits misupdating fixed. 146 * Fixed tcp_write_timeout: stuck close, 147 * and TCP syn retries gets used now. 148 * 149 * 150 * To Fix: 151 * Fast path the code. Two things here - fix the window calculation 152 * so it doesn't iterate over the queue, also spot packets with no funny 153 * options arriving in order and process directly. 154 * 155 * Implement RFC 1191 [Path MTU discovery] 156 * Look at the effect of implementing RFC 1337 suggestions and their impact. 157 * Rewrite output state machine to use a single queue and do low window 158 * situations as per the spec (RFC 1122) 159 * Speed up input assembly algorithm. 160 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 161 * could do with it working on IPv4 162 * User settable/learned rtt/max window/mtu 163 * Cope with MTU/device switches when retransmitting in tcp. 
164 * Fix the window handling to use PR's new code. 165 * 166 * Change the fundamental structure to a single send queue maintained 167 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 168 * active routes too]). Cut the queue off in tcp_retransmit/ 169 * tcp_transmit. 170 * Change the receive queue to assemble as it goes. This lets us 171 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 172 * tcp_data/tcp_read as well as the window shrink crud. 173 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 174 * tcp_queue_skb seem obvious routines to extract. 175 * 176 * This program is free software; you can redistribute it and/or 177 * modify it under the terms of the GNU General Public License 178 * as published by the Free Software Foundation; either version 179 * 2 of the License, or(at your option) any later version. 180 * 181 * Description of States: 182 * 183 * TCP_SYN_SENT sent a connection request, waiting for ack 184 * 185 * TCP_SYN_RECV received a connection request, sent ack, 186 * waiting for final ack in three-way handshake. 187 * 188 * TCP_ESTABLISHED connection established 189 * 190 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 191 * transmission of remaining buffered data 192 * 193 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 194 * to shutdown 195 * 196 * TCP_CLOSING both sides have shutdown but we still have 197 * data we have to finish sending 198 * 199 * TCP_TIME_WAIT timeout to catch resent junk before entering 200 * closed, can only be entered from FIN_WAIT2 201 * or CLOSING. Required because the other end 202 * may not have gotten our last ACK causing it 203 * to retransmit the data packet (which we ignore) 204 * 205 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 206 * us to finish writing our data and to shutdown 207 * (we have to close() to move on to LAST_ACK) 208 * 209 * TCP_LAST_ACK out side has shutdown after remote has 210 * shutdown. 
There may still be data in our 211 * buffer that we have to finish sending 212 * 213 * TCP_CLOSE socket is finished 214 */ 215
216 #include <linux/types.h>
217 #include <linux/sched.h>
218 #include <linux/mm.h>
219 #include <linux/time.h>
220 #include <linux/string.h>
221 #include <linux/config.h>
222 #include <linux/socket.h>
223 #include <linux/sockios.h>
224 #include <linux/termios.h>
225 #include <linux/in.h>
226 #include <linux/fcntl.h>
227 #include <linux/inet.h>
228 #include <linux/netdevice.h>
229 #include <net/snmp.h>
230 #include <net/ip.h>
231 #include <net/protocol.h>
232 #include <net/icmp.h>
233 #include <net/tcp.h>
234 #include <net/arp.h>
235 #include <linux/skbuff.h>
236 #include <net/sock.h>
237 #include <net/route.h>
238 #include <linux/errno.h>
239 #include <linux/timer.h>
240 #include <asm/system.h>
241 #include <asm/segment.h>
242 #include <linux/mm.h>
243 #include <net/checksum.h>
244
/*
 *	The MSL timer is the 'normal' timer.
 */

#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)

/* Offset added when generating initial sequence numbers from the clock. */
#define SEQ_TICK 3
unsigned long seq_offset;
/* SNMP counters for TCP (TcpOutSegs, TcpRetransSegs, ...). */
struct tcp_mib	tcp_statistics;

/*
 *	Cached last hit socket: a one-entry demultiplex cache keyed on
 *	the source/destination address and port of the last segment.
 *	Fields are volatile as they are written from interrupt context.
 */

volatile unsigned long th_cache_saddr, th_cache_daddr;
volatile unsigned short th_cache_dport, th_cache_sport;
volatile struct sock *th_cache_sk;
262
263 voidtcp_cache_zap(void)
/* */ 264 { 265 unsignedlongflags;
266 save_flags(flags);
267 cli();
268 th_cache_saddr=0;
269 th_cache_daddr=0;
270 th_cache_dport=0;
271 th_cache_sport=0;
272 th_cache_sk=NULL;
273 restore_flags(flags);
274 } 275
/* Forward declaration: tcp_close is defined later in this file. */
static void tcp_close(struct sock *sk, int timeout);

/*
 *	The less said about this the better, but it works and will do for 1.2
 */

/* Wait queue used to wake select() sleepers when a listening socket
 * gains an established child connection (see tcp_set_state). */
static struct wait_queue *master_select_wakeup;
284
/* Return the smaller of two unsigned values (as an int, matching callers). */
static __inline__ int min(unsigned int a, unsigned int b)
{
	return (a < b) ? a : b;
}
/* Define STATE_TRACE to get printk tracing of TCP state transitions. */
#undef STATE_TRACE

#ifdef STATE_TRACE
/* Human-readable names indexed by the TCP_* state constants. */
static char *statename[]={
	"Unused","Established","Syn Sent","Syn Recv",
	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
	"Close Wait","Last ACK","Listen","Closing"
};
#endif
/*
 *	Move a socket to a new TCP state, keeping the TcpCurrEstab gauge
 *	accurate and waking any select() sleepers on a listening master
 *	socket when a child completes the three-way handshake.
 */
static __inline__ void tcp_set_state(struct sock *sk, int state)
{
	if(sk->state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab--;		/* leaving ESTABLISHED */
#ifdef STATE_TRACE
	if(sk->debug)
		printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
#endif
	/* This is a hack but it doesn't occur often and it's going to
	   be a real pain to fix nicely */

	if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
	{
		/* A passively-opened connection completed: wake select()
		 * waiters on the listening socket. */
		wake_up_interruptible(&master_select_wakeup);
	}
	sk->state=state;
	if(state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab++;		/* entering ESTABLISHED */
}
322 /* 323 * This routine picks a TCP windows for a socket based on 324 * the following constraints 325 * 326 * 1. The window can never be shrunk once it is offered (RFC 793) 327 * 2. We limit memory per socket 328 * 329 * For now we use NET2E3's heuristic of offering half the memory 330 * we have handy. All is not as bad as this seems however because 331 * of two things. Firstly we will bin packets even within the window 332 * in order to get the data we are waiting for into the memory limit. 333 * Secondly we bin common duplicate forms at receive time 334 * Better heuristics welcome 335 */ 336
337 inttcp_select_window(structsock *sk)
/* */ 338 { 339 intnew_window = sk->prot->rspace(sk);
340
341 if(sk->window_clamp)
342 new_window=min(sk->window_clamp,new_window);
343 /* 344 * Two things are going on here. First, we don't ever offer a 345 * window less than min(sk->mss, MAX_WINDOW/2). This is the 346 * receiver side of SWS as specified in RFC1122. 347 * Second, we always give them at least the window they 348 * had before, in order to avoid retracting window. This 349 * is technically allowed, but RFC1122 advises against it and 350 * in practice it causes trouble. 351 * 352 * Fixme: This doesn't correctly handle the case where 353 * new_window > sk->window but not by enough to allow for the 354 * shift in sequence space. 355 */ 356 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
357 return(sk->window);
358 return(new_window);
359 } 360
361 /* 362 * Find someone to 'accept'. Must be called with 363 * sk->inuse=1 or cli() 364 */ 365
366 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 367 { 368 structsk_buff *p=skb_peek(&s->receive_queue);
369 if(p==NULL)
370 returnNULL;
371 do 372 { 373 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
374 returnp;
375 p=p->next;
376 } 377 while(p!=(structsk_buff *)&s->receive_queue);
378 returnNULL;
379 } 380
381 /* 382 * Remove a completed connection and return it. This is used by 383 * tcp_accept() to get connections from the queue. 384 */ 385
386 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 387 { 388 structsk_buff *skb;
389 unsignedlongflags;
390 save_flags(flags);
391 cli();
392 skb=tcp_find_established(s);
393 if(skb!=NULL)
394 skb_unlink(skb); /* Take it off the queue */ 395 restore_flags(flags);
396 returnskb;
397 } 398
399 /* 400 * This routine closes sockets which have been at least partially 401 * opened, but not yet accepted. Currently it is only called by 402 * tcp_close, and timeout mirrors the value there. 403 */ 404
405 staticvoidtcp_close_pending (structsock *sk)
/* */ 406 { 407 structsk_buff *skb;
408
409 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
410 { 411 skb->sk->dead=1;
412 tcp_close(skb->sk, 0);
413 kfree_skb(skb, FREE_READ);
414 } 415 return;
416 } 417
418 /* 419 * Enter the time wait state. 420 */ 421
/*
 *	Enter the time wait state: the connection is shut down in both
 *	directions, sleepers are woken, and the 2*MSL timer is started
 *	to eventually reap the socket.
 */
static void tcp_time_wait(struct sock *sk)
{
	tcp_set_state(sk,TCP_TIME_WAIT);
	sk->shutdown = SHUTDOWN_MASK;		/* no more sends or receives */
	if (!sk->dead)
		sk->state_change(sk);		/* wake anyone sleeping on the socket */
	reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}
431 /* 432 * A socket has timed out on its send queue and wants to do a 433 * little retransmitting. Currently this means TCP. 434 */ 435
/*
 *	A socket has timed out on its send queue and wants to do a little
 *	retransmitting.  Resend frames from the head of the send queue:
 *	just the first frame when 'all' is 0, otherwise walk the chain up
 *	to the congestion window.  Live header fields (ack, window, IP id)
 *	are rewritten in place, so the caller must hold the socket.
 */
void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;		/* frames resent during this call */

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;	/* restamp: used by the RTO logic */

		/*
		 * In general it's OK just to use the old packet.  However we
		 * need to use the current ack and window fields.  Urg and
		 * urg_ptr could possibly stand to be updated as well, but we
		 * don't keep the necessary data.  That shouldn't be a problem,
		 * if the other end is doing the right thing.  Since we're
		 * changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)(skb->data + dev->hard_header_len);
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = skb->len - (((unsigned char *) th) - skb->data);

		/*
		 * Note: We ought to check for window limits here but
		 * currently this is done (less efficiently) elsewhere.
		 * We do need to check for a route change but can't handle
		 * that until we have the new 1.3.x buffers in.
		 */

		iph->id = htons(ip_id_count++);
		ip_send_check(iph);

		/*
		 * This is not the right way to handle this. We have to
		 * issue an up to date window and ack report with this
		 * retransmit to keep the odd buggy tcp that relies on
		 * the fact BSD does this happy.
		 * We don't however need to recalculate the entire
		 * checksum, so someone wanting a small problem to play
		 * with might like to implement RFC1141/RFC1624 and speed
		 * this up by avoiding a full checksum.
		 */

		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));
		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		/*
		 * If the interface is (still) up and running, kick it.
		 */

		if (dev->flags & IFF_UP)
		{
			/*
			 * If the packet is still being sent by the device/protocol
			 * below then don't retransmit. This is both needed, and good -
			 * especially with connected mode AX.25 where it stops resends
			 * occurring of an as yet unsent anyway frame!
			 * We still add up the counts as the round trip time wants
			 * adjusting.
			 */
			if (sk && !skb_device_locked(skb))
			{
				/* Remove it from any existing driver queue first! */
				skb_unlink(skb);
				/* Now queue it */
				ip_statistics.IpOutRequests++;
				dev_queue_xmit(skb, dev, sk->priority);
			}
		}

		/*
		 * Count retransmissions
		 */

		ct++;
		sk->prot->retransmits ++;
		tcp_statistics.TcpRetransSegs++;


		/*
		 * Only one retransmit requested.
		 */

		if (!all)
			break;

		/*
		 * This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;	/* next frame on the retransmit chain */
	}
}
545 /* 546 * Reset the retransmission timer 547 */ 548
549 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 550 { 551 del_timer(&sk->retransmit_timer);
552 sk->ip_xmit_timeout = why;
553 if((int)when < 0)
554 { 555 when=3;
556 printk("Error: Negative timer in xmit_timer\n");
557 } 558 sk->retransmit_timer.expires=when;
559 add_timer(&sk->retransmit_timer);
560 } 561
562 /* 563 * This is the normal code called for timeouts. It does the retransmission 564 * and then does backoff. tcp_do_retransmit is separated out because 565 * tcp_ack needs to send stuff from the retransmit queue without 566 * initiating a backoff. 567 */ 568
569
570 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 571 { 572 tcp_do_retransmit(sk, all);
573
574 /* 575 * Increase the timeout each time we retransmit. Note that 576 * we do not increase the rtt estimate. rto is initialized 577 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 578 * that doubling rto each time is the least we can get away with. 579 * In KA9Q, Karn uses this for the first few times, and then 580 * goes to quadratic. netBSD doubles, but only goes up to *64, 581 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 582 * defined in the protocol as the maximum possible RTT. I guess 583 * we'll have to use something other than TCP to talk to the 584 * University of Mars. 585 * 586 * PAWS allows us longer timeouts and large windows, so once 587 * implemented ftp to mars will work nicely. We will have to fix 588 * the 120 second clamps though! 589 */ 590
591 sk->retransmits++;
592 sk->prot->retransmits++;
593 sk->backoff++;
594 sk->rto = min(sk->rto << 1, 120*HZ);
595 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
596 } 597
598
599 /* 600 * A timer event has trigger a tcp retransmit timeout. The 601 * socket xmit queue is ready and set up to send. Because 602 * the ack receive code keeps the queue straight we do 603 * nothing clever here. 604 */ 605
606 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 607 { 608 if (all)
609 { 610 tcp_retransmit_time(sk, all);
611 return;
612 } 613
614 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 615 /* sk->ssthresh in theory can be zero. I guess that's OK */ 616 sk->cong_count = 0;
617
618 sk->cong_window = 1;
619
620 /* Do the actual retransmit. */ 621 tcp_retransmit_time(sk, all);
622 } 623
624 /* 625 * A write timeout has occurred. Process the after effects. 626 */ 627
/*
 *	A write timeout has occurred.  Process the after effects: attempt
 *	ARP recovery on soft timeouts, and give up entirely on sockets
 *	that have retried too long.
 *
 *	Returns 0 if the socket was closed here (release_sock() has
 *	already been called in that case), 1 if the caller still owns it.
 */
static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		/*ip_route_check (sk->daddr);*/
	}

	/*
	 *	Have we tried to SYN too many times (repent repent 8))
	 */

	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
	{
		sk->err=ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
		tcp_set_state(sk,TCP_CLOSE);
		/* Don't FIN, we got nothing back */
		release_sock(sk);
		return 0;
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2)
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 *	Time wait the socket
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			release_sock(sk);
			return 0;
		}
	}
	return 1;
}
688 /* 689 * The TCP retransmit timer. This lacks a few small details. 690 * 691 * 1. An initial rtt timeout on the probe0 should cause what we can 692 * of the first write queue buffer to be split and sent. 693 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report 694 * ETIMEDOUT if we know an additional 'soft' error caused this. 695 * tcp_err should save a 'soft error' for us. 696 */ 697
/*
 *	The TCP retransmit timer.  Dispatches on why the timer was armed
 *	(window probe, retransmission, keepalive).  This lacks a few small
 *	details:
 *
 *	1.	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 *
 *	NOTE(review): tcp_write_timeout() calls release_sock() itself when
 *	it closes the socket, yet we call release_sock() again at the end
 *	of this function - verify this double release is benign.
 */
static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;	/* reason the timer was armed */

	/*
	 *	only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing queued: the timeout was for an ack only. */
				restore_flags(flags);
			}
			else
			{
				/*
				 * Kicked by a delayed ack. Reset timer
				 * correctly now
				 */
				if (jiffies < skb->when + sk->rto)
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission
				 */
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
797 /* 798 * This routine is called by the ICMP module when it gets some 799 * sort of error condition. If err < 0 then the socket should 800 * be closed and the error returned to the user. If err > 0 801 * it's just the icmp type << 8 | icmp code. After adjustment 802 * header points to the first 8 bytes of the tcp header. We need 803 * to find the appropriate port. 804 */ 805
/*
 *	Called by the ICMP module when it gets some sort of error
 *	condition.  If err < 0 the socket should be closed and the error
 *	returned to the user.  If err > 0 it's just the
 *	icmp type << 8 | icmp code.  After adjustment header points to
 *	the first 8 bytes of the tcp header.  We need to find the
 *	appropriate port.
 */
void tcp_err(int err, unsigned char *header, unsigned long daddr,
	unsigned long saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	struct iphdr *iph=(struct iphdr *)header;

	/* Skip the IP header to reach the embedded TCP header. */
	header+=4*iph->ihl;


	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;		/* no matching socket - nothing to report */

	if(err<0)
	{
		/* Hard error: hand it straight to the owner. */
		sk->err = -err;
		sk->error_report(sk);
		return;
	}

	if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8))
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	/* sk->err = icmp_err_convert[err & 0xff].errno;  -- moved as TCP should hide non fatals internally (and does) */

	/*
	 * If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 */

	if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT)
	{
		if (sk->state == TCP_SYN_SENT)
		{
			/* A connect() in progress is aborted outright. */
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);		/* Wake people up to see the error (see connect in sock.c) */
		}
		sk->err = icmp_err_convert[err & 0xff].errno;
	}
	return;
}
861
862 /* 863 * Walk down the receive queue counting readable data until we hit the end or we find a gap 864 * in the received data queue (ie a frame missing that needs sending to us). Not 865 * sorting using two queues as data arrives makes life so much harder. 866 */ 867
/*
 *	Walk down the receive queue counting readable data until we hit
 *	the end or we find a gap in the received data queue (ie a frame
 *	missing that needs sending to us).  Not sorting using two queues
 *	as data arrives makes life so much harder.
 *
 *	Returns the number of bytes a read() could currently consume.
 */
static int tcp_readable(struct sock *sk)
{
	unsigned long counted;		/* sequence position we have counted up to */
	unsigned long amount;		/* readable bytes found so far */
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	/* The queue must not change while we walk it. */
	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->h.th->seq)) 	/* Found a hole so stops here */
			break;
		sum = skb->len -(counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;		/* SYN occupies a sequence number but no data byte */
		if (sum > 0)
		{	/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;	/* don't count the SYN's sequence slot as data */
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop.  This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available.  If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}
939 /* 940 * LISTEN is a special case for select.. 941 */ 942 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */ 943 { 944 if (sel_type == SEL_IN) { 945 intretval;
946
947 sk->inuse = 1;
948 retval = (tcp_find_established(sk) != NULL);
949 release_sock(sk);
950 if (!retval)
951 select_wait(&master_select_wakeup,wait);
952 returnretval;
953 } 954 return 0;
955 } 956
957
/*
 *	Wait for a TCP event.
 *
 *	Returns 1 when the socket is ready for the requested operation,
 *	otherwise registers on the socket's wait queue and returns 0.
 *
 *	Note that we don't need to set "sk->inuse", as the upper select
 *	layers take care of normal races (between the test and the event)
 *	and we don't go look at any of the socket buffers directly.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		if (sk->err)
			return 1;	/* pending error is 'readable' */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;		/* not connected yet - wait */

		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;	/* EOF is readable */

		if (sk->acked_seq == sk->copied_seq)
			break;		/* nothing new to read */

		/* Data present: readable unless the only byte is
		 * out-of-band data that will not be read inline. */
		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->err)
			return 1;
		if (sk->shutdown & SEND_SHUTDOWN)
			return 0;	/* writes can never complete */
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 * This is now right thanks to a small fix
		 * by Matt Dillon.
		 */
		if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;		/* not enough room for a worthwhile write */
		return 1;

	case SEL_EX:
		if (sk->urg_data)
			return 1;	/* out-of-band data pending */
		break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}
/*
 *	TCP socket ioctl handler.
 *
 *	TIOCINQ    - bytes currently readable (invalid on a listener)
 *	SIOCATMARK - nonzero when the read pointer is at the urgent mark
 *	TIOCOUTQ   - free space in the send buffer
 *
 *	Each result is written back to the user-supplied int at 'arg'.
 */
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int err;
	switch(cmd)
	{

		case TIOCINQ:
#ifdef FIXME	/* FIXME: */
		case FIONREAD:
#endif
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);

			sk->inuse = 1;		/* lock out the bottom half while we count */
			amount = tcp_readable(sk);
			release_sock(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		case SIOCATMARK:
		{
			int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

			err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
			if (err)
				return err;
			put_user(answ,(int *) arg);
			return(0);
		}
		case TIOCOUTQ:
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN) return(-EINVAL);
			amount = sk->prot->wspace(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		default:
			return(-EINVAL);
	}
}
1066
1067 /*1068 * This routine computes a TCP checksum. 1069 *1070 * Modified January 1995 from a go-faster DOS routine by1071 * Jorge Cwik <jorge@laser.satlink.net>1072 */1073
/*
 *	Compute the TCP checksum over the pseudo-header and segment.
 *	'base' is the partial checksum already accumulated over the
 *	TCP header and data (see tcp_send_check); the addresses,
 *	length and protocol are folded in by csum_tcpudp_magic().
 */
unsigned short tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}
1080
1081
1082 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1083 unsignedlongdaddr, intlen, structsock *sk)
1084 {1085 th->check = 0;
1086 th->check = tcp_check(th, len, saddr, daddr,
1087 csum_partial((char *)th,len,0));
1088 return;
1089 }1090
1091 /*1092 * This is the main buffer sending routine. We queue the buffer1093 * having checked it is sane seeming.1094 */1095
/*
 *	This is the main buffer sending routine: sanity check a fully
 *	built TCP frame and either transmit it immediately or append it
 *	to the socket's write queue for later transmission.
 */
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing.
	 */

	tcp_statistics.TcpOutSegs++;
	/* Right edge sequence of this frame; SYN/FIN are in 'size' via
	 * the doff subtraction. */
	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */

	if (after(skb->h.seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit.  So
		 * we shouldn't need to set it at all.  I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (This is what causes the Cisco and PC/TCP
		 *	grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out
		 */

		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
1206 /*1207 * Locking problems lead us to a messy situation where we can have1208 * multiple partially complete buffers queued up. This is really bad1209 * as we don't want to be sending partial buffers. Fix this with1210 * a semaphore or similar to lock tcp_write per socket.1211 *1212 * These routines are pretty self descriptive.1213 */1214
/*
 *	Atomically detach the socket's half-built ("partial") buffer, if
 *	any, and cancel its flush timer. Interrupts are disabled so the
 *	timer handler cannot race with us. Returns the buffer or NULL.
 */
struct sk_buff * tcp_dequeue_partial(struct sock * sk)
{
	struct sk_buff * skb;
	unsigned long flags;

	save_flags(flags);
	cli();
	skb = sk->partial;
	if (skb) {
		sk->partial = NULL;
		del_timer(&sk->partial_timer);
	}
	restore_flags(flags);
	return skb;
}
1231 /*1232 * Empty the partial queue1233 */1234
1235 staticvoidtcp_send_partial(structsock *sk)
/* */1236 {1237 structsk_buff *skb;
1238
1239 if (sk == NULL)
1240 return;
1241 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1242 tcp_send_skb(sk, skb);
1243 }1244
1245 /*1246 * Queue a partial frame1247 */1248
/*
 *	Queue a partial (not yet full-MSS) frame on the socket, replacing
 *	any previously queued one. A timer is armed so the buffer is
 *	flushed even if no more data arrives. If an older partial buffer
 *	was displaced, it is sent immediately (outside the cli section).
 */
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 *	Wait up to 1 second for the buffer to fill.
	 *	NOTE(review): 'expires = HZ' is a relative tick count here,
	 *	matching this kernel's timer convention — confirm against
	 *	add_timer() semantics for this tree.
	 */
	sk->partial_timer.expires = HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	if (tmp)
		tcp_send_skb(sk, tmp);
}
1273
1274 /*1275 * This routine sends an ack and also updates the window. 1276 */1277
/*
 *	Send an ACK carrying 'sequence' as its own sequence number and
 *	acknowledging 'ack'. Also updates the advertised window. The
 *	header template 'th' is the received header whose ports are
 *	swapped for the reply.
 */
static void tcp_send_ack(u32 sequence, u32 ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;	/* We have been reset, we may not send again */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack. We don't have to do this
		 *	(ACK is unreliable) but it's much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */

		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route/header: drop the ACK silently (it is unreliable
		 * anyway). */
		buff->free = 1;
		sk->prot->wfree(sk, buff);
		return;
	}
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	/* ntohl here really means htonl; they are the same operation. */
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */

	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
			&& sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 *	Fill in the packet and send it
	 */

	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1389
1390 /* 1391 * This routine builds a generic TCP header. 1392 */1393
/*
 *	This routine builds a generic TCP header from the socket's
 *	template (dummy_th), filling in the current sequence/ack numbers
 *	and advertised window. 'push' == 0 sets the PSH bit. As a side
 *	effect the pending-ACK bookkeeping is cleared, since this header
 *	carries the ACK. Returns the header length in bytes.
 */
extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{

	memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
	th->seq = htonl(sk->write_seq);
	th->psh =(push == 0) ? 1 : 0;
	th->doff = sizeof(*th)/4;
	th->ack = 1;
	th->fin = 0;
	/* This frame acknowledges received data, so drop the backlog. */
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->ack_timed = 0;
	th->ack_seq = htonl(sk->acked_seq);
	sk->window = tcp_select_window(sk);
	th->window = htons(sk->window);

	return(sizeof(*th));
}
1413 /*1414 * This routine copies from a user buffer into a socket,1415 * and starts the transmit system.1416 */1417
/*
 *	This routine copies from a user buffer into a socket, and starts
 *	the transmit system. It blocks (unless 'nonblock') waiting for
 *	the connection to establish and for buffer memory. Returns the
 *	number of bytes queued, or a negative error if nothing was
 *	copied before the error occurred.
 */
static int tcp_write(struct sock *sk, unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;
	int copy;
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{			/* Stop on an error */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

	/*
	 *	First thing we do is make sure that we are established.
	 */

		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

	/*
	 *	Wait for a connection to finish.
	 */

		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/* Not even connecting any more: the pipe is dead. */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);

				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			release_sock(sk);
			cli();

			/* Recheck the state with interrupts off before we
			 * commit to sleeping, to close the wakeup race. */
			if (sk->state != TCP_ESTABLISHED &&
			    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

	/*
	 *	The following code can result in copy <= if sk->mss is ever
	 *	decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
	 *	sk->mtu is constant once SYN processing is finished.  I.e. we
	 *	had better not get here until we've seen his SYN and at least one
	 *	valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
	 *	But ESTABLISHED should guarantee that.  sk->max_window is by definition
	 *	non-decreasing.  Note that any ioctl to set user_mss must be done
	 *	before the exchange of SYN's.  If the initial ack from the other
	 *	end has a window of 0, max_window and thus mss will both be 0.
	 */

	/*
	 *	Now we need to check if we have a half built packet.
	 */

		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			/* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				+ sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB))
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}

				memcpy_fromfs(skb_put(skb,copy), from, copy);
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/* Send now if the frame is full, OOB, or nothing is
			 * in flight; otherwise re-queue it as partial. */
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

	/*
	 *	We also need to worry about the window.
	 *	If window < 1/2 the maximum window we've seen from this
	 *	host, don't use it.  This is sender side
	 *	silly window prevention, as specified in RFC1122.
	 *	(Note that this is different than earlier versions of
	 *	SWS prevention, e.g. RFC813.).  What we actually do is
	 *	use the whole MSS.  Since the results in the right
	 *	edge of the packet being outside the window, it will
	 *	be queued for later rather than sent.
	 */

		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

	/*
	 *	We should really check the window here also.
	 */

		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		}
		else
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep.
		 */

		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition.
			 */

			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it.
			 */
			if (tmp <= sk->wmem_alloc &&
				(sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		/*
		 *	FIXME: we need to optimize this.
		 *	Perhaps some hints here would be good.
		 */

		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 )
		{
			prot->wfree(sk, skb);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		skb->dev = dev;
		skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
		tmp = tcp_build_header(skb->h.th, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB)
		{
			skb->h.th->urg = 1;
			/* ntohs here really means htons; same operation. */
			skb->h.th->urg_ptr = ntohs(copy);
		}

		memcpy_fromfs(skb_put(skb,copy), from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->free = 0;
		sk->write_seq += copy;

		/* A sub-MSS frame while packets are in flight is held back
		 * as a partial buffer (Nagle). */
		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

/*
 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *	interactive fast network servers. It's meant to be on and
 *	it really improves the throughput though not the echo time
 *	on my slow slip link - Alan
 */

/*
 *	Avoid possible race on send_tmp - c/o Johannes Stille
 */

	if(sk->partial && ((!sk->packets_out)
	/* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1735 /*1736 * This is just a wrapper. 1737 */1738
1739 staticinttcp_sendto(structsock *sk, unsignedchar *from,
/* */1740 intlen, intnonblock, unsignedflags,
1741 structsockaddr_in *addr, intaddr_len)
1742 {1743 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1744 return -EINVAL;
1745 if (sk->state == TCP_CLOSE)
1746 return -ENOTCONN;
1747 if (addr_len < sizeof(*addr))
1748 return -EINVAL;
1749 if (addr->sin_family && addr->sin_family != AF_INET)
1750 return -EINVAL;
1751 if (addr->sin_port != sk->dummy_th.dest)
1752 return -EISCONN;
1753 if (addr->sin_addr.s_addr != sk->daddr)
1754 return -EISCONN;
1755 returntcp_write(sk, from, len, nonblock, flags);
1756 }1757
1758
1759 /*1760 * Send an ack if one is backlogged at this point. Ought to merge1761 * this with tcp_send_ack().1762 */1763
/*
 *	Send an ack if one is backlogged at this point. Ought to merge
 *	this with tcp_send_ack(). Builds a bare ACK from the socket's
 *	header template and transmits it immediately.
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	/* Nothing owed to the peer: nothing to do. */
	if (!sk->ack_backlog)
		return;

	/*
	 *	FIXME: we need to put code here to prevent this routine from
	 *	being called. Being called once in a while is ok, so only check
	 *	if this is the second time in a row.
	 */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */

	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		buff->free = 1;
		sk->prot->wfree(sk, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	/* Start from the socket's header template, then fill in the
	 * live sequence/ack/window values. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
1831
1832 /*1833 * FIXME:1834 * This routine frees used buffers.1835 * It should consider sending an ACK to let the1836 * other end know we now have a bigger window.1837 */1838
/*
 *	FIXME:
 *	This routine frees used buffers on the receive queue.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window. If freeing space
 *	changed our advertised window, an ACK is sent now or scheduled
 *	shortly, depending on how much room opened up.
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	/* Remember the receive space before freeing, to detect change. */
	left = sk->prot->rspace(sk);

	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		/* Stop at the first buffer still unread or still in use
		 * by a sleeping reader (see tcp_read's skb->users). */
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
					    left);
	if ((rspace=sk->prot->rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
	/*
	 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
	 * if the other end is offering a window smaller than the agreed on MSS
	 * (called sk->mtu here).  In theory there's no connection between send
	 * and receive, and so no reason to think that they're going to send
	 * small packets.  For the moment I'm using the hack of reducing the mss
	 * only on the send side, so I'm putting mtu here.
	 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);
		}
	}
}
1921
1922 /*1923 * Handle reading urgent data. BSD has very simple semantics for1924 * this, no blocking and very strange errors 8)1925 */1926
/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 *
 *	Returns 1 with the urgent byte copied out, 0 at end-of-data,
 *	-EINVAL when there is no OOB byte to read (or it was already
 *	read, or the socket is in inline-urgent mode), or -EAGAIN when
 *	the urgent byte has been signalled but not yet received.
 */
static int tcp_read_urg(struct sock * sk, int nonblock,
	     unsigned char *to, int len, unsigned flags)
{
	/*
	 *	No URG data to read
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
	{
		int tmp = -sk->err;
		sk->err = 0;
		return tmp;
	}

	if (sk->state == TCP_CLOSE || sk->done)
	{
		if (!sk->done) {
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		/* The low byte of urg_data holds the urgent octet. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;
		put_fs_byte(c, to);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 *	Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 *	the available implementations agree in this case:
	 *	this call should never block, independent of the
	 *	blocking state of the socket.
	 *	Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
1979
1980 /*1981 * This routine copies from a sock struct into the user buffer. 1982 */1983
/*
 *	This routine copies from a sock struct into the user buffer.
 *	It supports MSG_PEEK (via a private copy of the sequence
 *	counter) and MSG_OOB (delegated to tcp_read_urg), blocks unless
 *	'nonblock', and stops at urgent data. Returns bytes copied or a
 *	negative error when nothing was copied first.
 */
static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */

	if (flags & MSG_OOB)
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 */

	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;	/* peek: advance a private copy only */

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 *	Are we at urgent data? Stop if we have read anything.
		 */

		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.
		 */

		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* A hole in the sequence space: must wait. */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies a sequence slot */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed: cleanup_rbuf may free */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		if (copied)
			break;

		if (sk->err)
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Nothing readable: ack what we have, drop the lock and
		 * sleep until data arrives or a signal interrupts us. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */

		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */

		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?
		 */

		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					/* Skip the urgent byte unless reading inline. */
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;	/* read up to the mark only */
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */

		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */

		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */

		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		if (used + offset < skb->len)
			continue;

		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		++*seq;		/* FIN occupies a sequence slot */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */

		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2211 /*2212 * State processing on a close. This implements the state shift for2213 * sending our FIN frame. Note that we only send a FIN for some 2214 * states. A shutdown() may have already sent the FIN, or we may be2215 * closed.2216 */2217
/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed. Returns non-zero if the caller must transmit a FIN.
 */
static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;
	int send_fin=0;
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
			/* no break needed: last case of the switch */
	}

	tcp_set_state(sk,ns);

	/*
	 *	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 */
	if(dead && ns==TCP_FIN_WAIT2)
	{
		/* Keep an already-running timer; otherwise arm the
		 * FIN_WAIT2 reaper. */
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}

	return send_fin;
}
2268 /*2269 * Send a fin.2270 */2271
/*
 *	Send a fin. Builds a FIN frame from the socket's header template
 *	and either transmits it at once or, if data is still queued,
 *	appends it to the write queue so it goes out in order.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost.
		 *	(Not good).
		 */

		buff->free = 1;
		prot->wfree(sk,buff);
		/* Still burn the sequence number the FIN would have used. */
		sk->write_seq++;
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	/* ntohl here really means htonl; same operation. */
	t1->seq = ntohl(sk->write_seq);
	sk->write_seq++;	/* FIN occupies one sequence number */
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 *	If there is data in the write queue, the fin must be appended to
	 *	the write queue.
	 */

	if (skb_peek(&sk->write_queue) != NULL)
	{
		buff->free = 0;
		if (buff->next != NULL)
		{
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	}
	else
	{
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2368 /*2369 * Shutdown the sending side of a connection. Much like close except2370 * that we don't receive shut down or set sk->dead=1.2371 */2372
2373 voidtcp_shutdown(structsock *sk, inthow)
/* */2374 {2375 /*2376 * We need to grab some memory, and put together a FIN,2377 * and then put it into the queue to be sent.2378 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.2379 */2380
2381 if (!(how & SEND_SHUTDOWN))
2382 return;
2383
2384 /*2385 * If we've already sent a FIN, or it's a closed state2386 */2387
2388 if (sk->state == TCP_FIN_WAIT1 ||
2389 sk->state == TCP_FIN_WAIT2 ||
2390 sk->state == TCP_CLOSING ||
2391 sk->state == TCP_LAST_ACK ||
2392 sk->state == TCP_TIME_WAIT ||
2393 sk->state == TCP_CLOSE ||
2394 sk->state == TCP_LISTEN2395 )
2396 {2397 return;
2398 }2399 sk->inuse = 1;
2400
2401 /*2402 * flag that the sender has shutdown2403 */2404
2405 sk->shutdown |= SEND_SHUTDOWN;
2406
2407 /*2408 * Clear out any half completed packets. 2409 */2410
2411 if (sk->partial)
2412 tcp_send_partial(sk);
2413
2414 /*2415 * FIN if needed2416 */2417
2418 if(tcp_close_state(sk,0))
2419 tcp_send_fin(sk);
2420
2421 release_sock(sk);
2422 }2423
2424
2425 staticint2426 tcp_recvfrom(structsock *sk, unsignedchar *to,
/* */2427 intto_len, intnonblock, unsignedflags,
2428 structsockaddr_in *addr, int *addr_len)
2429 {2430 intresult;
2431
2432 /* 2433 * Have to check these first unlike the old code. If 2434 * we check them after we lose data on an error2435 * which is wrong 2436 */2437
2438 if(addr_len)
2439 *addr_len = sizeof(*addr);
2440 result=tcp_read(sk, to, to_len, nonblock, flags);
2441
2442 if (result < 0)
2443 return(result);
2444
2445 if(addr)
2446 {2447 addr->sin_family = AF_INET;
2448 addr->sin_port = sk->dummy_th.dest;
2449 addr->sin_addr.s_addr = sk->daddr;
2450 }2451 return(result);
2452 }2453
2454
2455 /*2456 * This routine will send an RST to the other tcp. 2457 */2458
2459 staticvoidtcp_reset(unsignedlongsaddr, unsignedlongdaddr, structtcphdr *th,
/* */2460 structproto *prot, structoptions *opt, structdevice *dev, inttos, intttl)
2461 {2462 structsk_buff *buff;
2463 structtcphdr *t1;
2464 inttmp;
2465 structdevice *ndev=NULL;
2466
2467 /*2468 * Cannot reset a reset (Think about it).2469 */2470
2471 if(th->rst)
2472 return;
2473
2474 /*2475 * We need to grab some memory, and put together an RST,2476 * and then put it into the queue to be sent.2477 */2478
2479 buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2480 if (buff == NULL)
2481 return;
2482
2483 buff->sk = NULL;
2484 buff->dev = dev;
2485 buff->localroute = 0;
2486
2487 /*2488 * Put in the IP header and routing stuff. 2489 */2490
2491 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2492 sizeof(structtcphdr),tos,ttl);
2493 if (tmp < 0)
2494 {2495 buff->free = 1;
2496 prot->wfree(NULL, buff);
2497 return;
2498 }2499
2500 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2501 memcpy(t1, th, sizeof(*t1));
2502
2503 /*2504 * Swap the send and the receive. 2505 */2506
2507 t1->dest = th->source;
2508 t1->source = th->dest;
2509 t1->rst = 1;
2510 t1->window = 0;
2511
2512 if(th->ack)
2513 {2514 t1->ack = 0;
2515 t1->seq = th->ack_seq;
2516 t1->ack_seq = 0;
2517 }2518 else2519 {2520 t1->ack = 1;
2521 if(!th->syn)
2522 t1->ack_seq=htonl(th->seq);
2523 else2524 t1->ack_seq=htonl(th->seq+1);
2525 t1->seq=0;
2526 }2527
2528 t1->syn = 0;
2529 t1->urg = 0;
2530 t1->fin = 0;
2531 t1->psh = 0;
2532 t1->doff = sizeof(*t1)/4;
2533 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2534 prot->queue_xmit(NULL, ndev, buff, 1);
2535 tcp_statistics.TcpOutSegs++;
2536 }2537
2538
2539 /*2540 * Look for tcp options. Parses everything but only knows about MSS.2541 * This routine is always called with the packet containing the SYN.2542 * However it may also be called with the ack to the SYN. So you2543 * can't assume this is always the SYN. It's always called after2544 * we have set up sk->mtu to our own MTU.2545 *2546 * We need at minimum to add PAWS support here. Possibly large windows2547 * as Linux gets deployed on 100Mb/sec networks.2548 */2549
2550 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2551 {2552 unsignedchar *ptr;
2553 intlength=(th->doff*4)-sizeof(structtcphdr);
2554 intmss_seen = 0;
2555
2556 ptr = (unsignedchar *)(th + 1);
2557
2558 while(length>0)
2559 {2560 intopcode=*ptr++;
2561 intopsize=*ptr++;
2562 switch(opcode)
2563 {2564 caseTCPOPT_EOL:
2565 return;
2566 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2567 length--;
2568 ptr--; /* the opsize=*ptr++ above was a mistake */2569 continue;
2570
2571 default:
2572 if(opsize<=2) /* Avoid silly options looping forever */2573 return;
2574 switch(opcode)
2575 {2576 caseTCPOPT_MSS:
2577 if(opsize==4 && th->syn)
2578 {2579 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2580 mss_seen = 1;
2581 }2582 break;
2583 /* Add other options here as people feel the urge to implement stuff like large windows */2584 }2585 ptr+=opsize-2;
2586 length-=opsize;
2587 }2588 }2589 if (th->syn)
2590 {2591 if (! mss_seen)
2592 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2593 }2594 #ifdefCONFIG_INET_PCTCP2595 sk->mss = min(sk->max_window >> 1, sk->mtu);
2596 #else2597 sk->mss = min(sk->max_window, sk->mtu);
2598 #endif2599 }2600
/*
 *	Return the classful netmask (network byte order) for an IPv4
 *	address given in network byte order.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);

	if (IN_CLASSA(host))
		return htonl(IN_CLASSA_NET);
	return IN_CLASSB(host) ? htonl(IN_CLASSB_NET) : htonl(IN_CLASSC_NET);
}
2611 /*2612 * Default sequence number picking algorithm.2613 * As close as possible to RFC 793, which2614 * suggests using a 250kHz clock.2615 * Further reading shows this assumes 2MB/s networks.2616 * For 10MB/s ethernet, a 1MHz clock is appropriate.2617 * That's funny, Linux has one built in! Use it!2618 */2619
2620 externinlineu32tcp_init_seq(void)
/* */2621 {2622 structtimevaltv;
2623 do_gettimeofday(&tv);
2624 returntv.tv_usec+tv.tv_sec*1000000;
2625 }2626
/*
 *	This routine handles a connection request.
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 *
 *	sk is the listening socket; skb carries the received SYN; daddr/saddr
 *	are our/their addresses from the IP layer; seq is the initial send
 *	sequence chosen by the caller.  A new sock is cloned from the
 *	listener, placed in SYN_RECV, and a SYN/ACK (with an MSS option) is
 *	transmitted.  The SYN skb ends up queued on the LISTENING socket's
 *	receive queue so accept() can find the embryonic connection.
 */

static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev, u32 seq)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	struct device *ndev = NULL;
	int tmp;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection: reset it. */
	if (!sk->dead)
	{
		sk->data_ready(sk, 0);
	}
	else
	{
		if (sk->debug)
			printk("Reset on %p: Connect on dead socket.\n", sk);
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Make sure we can accept more.  This will prevent a
	 *	flurry of syns from eating up all our memory.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	We need to build a new sock struct.
	 *	It is sort of bad to have a socket without an inode attached
	 *	to it, but the wake_up's will just wake up the listening socket,
	 *	and if the listening socket is destroyed before this is taken
	 *	off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn.  It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/* Clone the listener, then reset all per-connection state. */
	memcpy(newsk, sk, sizeof(*newsk));
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;		/* slow start: one segment */
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	/* The peer's SYN consumes one sequence number. */
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger = 0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function = &retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */

	newsk->daddr = saddr;
	newsk->saddr = daddr;

	put_sock(newsk->num, newsk);
	newsk->dummy_th.res1 = 0;
	newsk->dummy_th.doff = 6;	/* NOTE(review): 24 bytes/4 — presumably sized for the MSS option; confirm */
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them
	 */

	newsk->ip_ttl = sk->ip_ttl;
	newsk->ip_tos = skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk
	 */

	rt = ip_rt_route(saddr, NULL, NULL);

	if (rt != NULL && (rt->rt_flags & RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if (rt != NULL && (rt->rt_flags & RTF_MSS))
		newsk->mtu = rt->rt_mss - HEADER_SIZE;
	else
	{
#ifdef CONFIG_INET_SNARL	/* Sub Nets Are Local */
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			/* Off-net destination: be conservative (RFC 879). */
			newsk->mtu = 576 - HEADER_SIZE;
		else
			newsk->mtu = MAX_WINDOW;
	}

	/*
	 *	But not bigger than device MTU
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	This will min with what arrived in the packet
	 */

	tcp_options(newsk, skb->h.th);

	tcp_cache_zap();

	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* NOTE(review): this sets err on the LISTENING socket — looks intentional
		   since newsk is about to be destroyed, but verify. */
		sk->err = ENOMEM;
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		/* And this will destroy it */
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->sk = newsk;
	buff->localroute = newsk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE, sk->ip_tos, sk->ip_ttl);

	/*
	 *	Something went wrong.
	 */

	if (tmp < 0)
	{
		sk->err = tmp;
		buff->free = 1;
		kfree_skb(buff, FREE_WRITE);
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		release_sock(newsk);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	t1 = (struct tcphdr *)skb_put(buff, sizeof(struct tcphdr));

	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->write_seq;
	/*
	 *	Swap the send and the receive.
	 */
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->write_seq++);	/* our SYN consumes a sequence number */
	t1->ack = 1;
	newsk->window = tcp_select_window(newsk);
	newsk->sent_seq = newsk->write_seq;
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->ack_seq = ntohl(skb->h.th->seq + 1);
	t1->doff = sizeof(*t1) / 4 + 1;		/* fixed header + 4 bytes of MSS option */
	/* Append the MSS option: kind 2, length 4, 16-bit MSS big-endian. */
	ptr = skb_put(buff, 4);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] = (newsk->mtu) & 0xff;

	tcp_send_check(t1, daddr, saddr, sizeof(*t1) + 4, newsk);
	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
	reset_xmit_timer(newsk, TIME_WRITE, TCP_TIMEOUT_INIT);
	skb->sk = newsk;

	/*
	 *	Charge the sock_buff to newsk.
	 */

	sk->rmem_alloc -= skb->truesize;
	newsk->rmem_alloc += skb->truesize;

	/* Queue the SYN on the listener so accept() can reap newsk. */
	skb_queue_tail(&sk->receive_queue, skb);
	sk->ack_backlog++;
	release_sock(newsk);
	tcp_statistics.TcpOutSegs++;
}
2905
/*
 *	Close a TCP socket.  timeout != 0 means hard close (go straight to
 *	CLOSE); timeout == 0 is the normal descriptor close, which flushes
 *	the receive queue and runs the FIN state machine.
 */
static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 *	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 */

	sk->inuse = 1;

	/* Invalidate the header-prediction cache if it points at us. */
	if (th_cache_sk == sk)
		tcp_cache_zap();
	if (sk->state == TCP_LISTEN)
	{
		/* Special case: never synchronised — drop pending embryos and go. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;

	if (!sk->dead)
		sk->state_change(sk);

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 *	We need to flush the recv. buffs.  We do this only on the
		 *	descriptor close, not protocol-sourced closes, because the
		 *	reader process may not have drained the data yet!
		 */

		while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *	Get rid off any half-completed packets.
		 */

		if (sk->partial)
			tcp_send_partial(sk);
	}

	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */

	if (timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* tcp_close_state()==1 means a FIN must actually be sent. */
		if (tcp_close_state(sk, 1) == 1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
2972
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue.  This happens as incoming acks
 *	open up the remote window for us.
 */

static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here.  In time closedown will
	 *	empty the write queue and all will be happy.
	 */

	if (sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while ((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->h.seq, sk->rcv_ack_seq + 1))
		{
			/*
			 *	This is acked data.  We can discard it.  This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 *	put in the ack seq and window at this point rather than earlier,
			 *	in order to keep them monotonic.  We really want to avoid taking
			 *	back window allocations.  That's legal, but RFC1122 says it's frowned on.
			 *	Ack and window will in general have changed since this packet was put
			 *	on the write queue.
			 */
			iph = (struct iphdr *)(skb->data +
					       skb->dev->hard_header_len);
			th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Checksum must be recomputed after patching ack/window. */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3064
/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	Returns 0 when the ack is ahead of anything we sent (ignore and
 *	let the caller decide), 1 otherwise.  `flag' bits accumulated:
 *	1 - there was data in packet as well as ack or new data is sent or
 *	    in shutdown state
 *	2 - data from retransmit queue was acked and removed
 *	4 - window shrunk or data from retransmit queue was acked and removed
 */

extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
	u32 ack;
	int flag = 0;

	if (sk->zapped)
		return(1);	/* Dead, cant ack any more so why bother */

	/*
	 *	Have we discovered a larger window
	 */

	ack = ntohl(th->ack_seq);

	if (ntohs(th->window) > sk->max_window)
	{
		sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(sk->max_window >> 1, sk->mtu);
#else
		sk->mss = min(sk->max_window, sk->mtu);
#endif
	}

	/*
	 *	We have dropped back to keepalive timeouts.  Thus we have
	 *	no retransmits pending.
	 */

	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if (sk->debug)
			printk("Ack ignored %u %u\n", ack, sk->sent_seq);

		/*
		 *	Keepalive processing.
		 */

		if (after(ack, sk->sent_seq))
		{
			return(0);
		}

		/*
		 *	Restart the keepalive timer.
		 */

		if (sk->keepopen)
		{
			if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff * 4)
		flag |= 1;

	/*
	 *	See if our window has been shrunk.
	 */

	if (after(sk->window_seq, ack + ntohs(th->window)))
	{
		/*
		 *	We may need to move packets from the send queue
		 *	to the write queue, if the window has been shrunk on us.
		 *	The RFC says you are not allowed to shrink your window
		 *	like this, but if the other end does, you must be able
		 *	to deal with it.
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		/*
		 *	This is an artifact of a flawed concept.  We want one
		 *	queue and a smarter send routine when we send all.
		 */

		flag |= 4;	/* Window changed */

		sk->window_seq = ack + ntohs(th->window);
		cli();		/* walk the retransmit list atomically */
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->h.seq, sk->window_seq))
			{
				/* Now outside the window: push back to write_queue, in order. */
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue, skb);
				else
					skb_append(wskb, skb);
				wskb = skb;
			}
			else
			{
				/* Still in the window: rebuild the retransmit list. */
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/*
	 *	Pipe has emptied
	 */

	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out = 0;
	}

	/*
	 *	Update the right hand window edge of the host
	 */

	sk->window_seq = ack + ntohs(th->window);

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 *	Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 *	If this ack opens up a zero window, clear backoff.  It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open ?
		 */

		if (skb_peek(&sk->write_queue) != NULL &&	/* should always be non-null */
		    ! before(sk->window_seq, sk->write_queue.next->h.seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt.  this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < 20)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = 20;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	while (sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived the other end.
		 */

		if (before(sk->send_head->h.seq, ack + 1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting.  don't count this in RTT est
				 */
				flag |= 2;

				/*
				 *	even though we've gotten an ack, we're still
				 *	retransmitting as long as we're sending from
				 *	the retransmit queue.  Keeping retransmits non-zero
				 *	prevents us from getting new data interspersed with
				 *	retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 *	Note that we only reset backoff and rto in the
			 *	rtt recomputation code.  And that doesn't happen
			 *	if there were retransmissions in effect.  So the
			 *	first new packet after the retransmissions is
			 *	sent with the backoff still in effect.  Not until
			 *	we get an ack from a non-retransmitted packet do
			 *	we reset the backoff and rto.  This allows us to deal
			 *	with a situation where the network delay has increased
			 *	suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out--;
			/*
			 *	Wake up the process, it can probably write more.
			 */
			if (!sk->dead)
				sk->write_space(sk);
			oskb = sk->send_head;

			if (!(flag & 2))	/* Not retransmitting */
			{
				long m;

				/*
				 *	The following amusing code comes from Jacobson's
				 *	article in SIGCOMM '88.  Note that rtt and mdev
				 *	are scaled versions of rtt and mean deviation.
				 *	This is designed to be as fast as possible
				 *	m stands for "measurement".
				 */

				m = jiffies - oskb->when;  /* RTT */
				if (m <= 0)
					m = 1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);	/* m is now error in rtt est */
				sk->rtt += m;		/* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);	/* similar update on mdev */
				sk->mdev += m;		/* mdev = 3/4 mdev + 1/4 new */

				/*
				 *	Now update timeout.  Note that this removes any backoff.
				 */

				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < 20)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = 20;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   In this case as we just set it up */
			cli();
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 *	We may need to remove this from the dev send list.
			 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 *	XXX someone ought to look at this too.. at the moment, if skb_peek()
	 *	returns non-NULL, we complete ignore the timer stuff in the else
	 *	clause.  We ought to organize the code so that else clause can
	 *	(should) be executed regardless, possibly moving the PROBE timer
	 *	reset over.  The skb_peek() thing should only move stuff to the
	 *	write queue, NOT also manage the timer functions.
	 */

	/*
	 *	Maybe we can take some stuff off of the write queue,
	 *	and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after(sk->window_seq + 1, sk->write_queue.next->h.seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room.
			 */
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 *	from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 *	from TCP_CLOSE we don't do anything
		 *
		 *	from anything else, if there is write data (or fin) pending,
		 *	we use a TIME_WRITE timeout, else if keepalive we reset to
		 *	a KEEPALIVE timeout, else we delete the timer.
		 *
		 *	We do not set flag for nominal write data, otherwise we may
		 *	force a state where we start to write itsy bitsy tidbits
		 *	of data.
		 */

		switch (sk->state) {
		case TCP_TIME_WAIT:
			/*
			 *	keep us in TIME_WAIT until we stop getting packets,
			 *	reset the timeout.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 *	don't touch the timer.
			 */
			break;
		default:
			/*
			 *	Must check send_head, write_queue, and ack_backlog
			 *	to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send.  Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 *	In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
	 *	we are now waiting for an acknowledge to our FIN.  The other end is
	 *	already in TIME_WAIT.
	 *
	 *	Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq, sk->write_seq, sk->acked_seq, sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 *	Move to FIN_WAIT2 to await a FIN from the other end.  Set
	 *	SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 *	Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */

	if (sk->state == TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk, th);
		sk->dummy_th.dest = th->source;
		sk->copied_seq = sk->acked_seq;
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->max_window == 0)
		{
			sk->max_window = 32;	/* Sanity check */
			sk->mss = min(sk->max_window, sk->mtu);
		}
	}

	/*
	 *	I make no guarantees about the first clause in the following
	 *	test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 *	what conditions "!flag" would be true.  However I think the rest
	 *	of the conditions would prevent that from causing any
	 *	unnecessary retransmission.
	 *	Clearly if the first packet has expired it should be
	 *	retransmitted.  The other alternative, "flag&2 && retransmits", is
	 *	harder to explain:  You have to look carefully at how and when the
	 *	timer is set and with what timeout.  The most recent transmission always
	 *	sets the timer.  So in general if the most recent thing has timed
	 *	out, everything before it has as well.  So we want to go ahead and
	 *	retransmit some more.  If we didn't explicitly test for this
	 *	condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 *	would not be true.  If you look at the pattern of timing, you can
	 *	show that rto is increased fast enough that the next packet would
	 *	almost never be retransmitted immediately.  Then you'd end up
	 *	waiting for a timeout to send each packet on the retransmission
	 *	queue.  With my implementation of the Karn sampling algorithm,
	 *	the timeout would double each time.  The net result is that it would
	 *	take a hideous amount of time to recover from a single dropped packet.
	 *	It's possible that there should also be a test for TIME_WRITE, but
	 *	I think as long as "send_head != NULL" and "retransmit" is on, we've
	 *	got to be in real retransmission mode.
	 *	Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 *	back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 *	As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag & 4)) && sk->send_head != NULL &&
	    (((flag & 2) && sk->retransmits) ||
	     (sk->send_head->when + sk->rto < jiffies)))
	{
		if (sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk, 0);
		else
		{
			tcp_do_retransmit(sk, 1);
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}
3655
3656 /*3657 * Process the FIN bit. This now behaves as it is supposed to work3658 * and the FIN takes effect when it is validly part of sequence3659 * space. Not before when we get holes.3660 *3661 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT3662 * (and thence onto LAST-ACK and finally, CLOSE, we never enter3663 * TIME-WAIT)3664 *3665 * If we are in FINWAIT-1, a received FIN indicates simultaneous3666 * close and we go into CLOSING (and later onto TIME-WAIT)3667 *3668 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.3669 *3670 */3671
3672 staticinttcp_fin(structsk_buff *skb, structsock *sk, structtcphdr *th)
/* */3673 {3674 sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3675
3676 if (!sk->dead)
3677 {3678 sk->state_change(sk);
3679 sock_wake_async(sk->socket, 1);
3680 }3681
3682 switch(sk->state)
3683 {3684 caseTCP_SYN_RECV:
3685 caseTCP_SYN_SENT:
3686 caseTCP_ESTABLISHED:
3687 /*3688 * move to CLOSE_WAIT, tcp_data() already handled3689 * sending the ack.3690 */3691 tcp_set_state(sk,TCP_CLOSE_WAIT);
3692 if (th->rst)
3693 sk->shutdown = SHUTDOWN_MASK;
3694 break;
3695
3696 caseTCP_CLOSE_WAIT:
3697 caseTCP_CLOSING:
3698 /*3699 * received a retransmission of the FIN, do3700 * nothing.3701 */3702 break;
3703 caseTCP_TIME_WAIT:
3704 /*3705 * received a retransmission of the FIN,3706 * restart the TIME_WAIT timer.3707 */3708 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3709 return(0);
3710 caseTCP_FIN_WAIT1:
3711 /*3712 * This case occurs when a simultaneous close3713 * happens, we must ack the received FIN and3714 * enter the CLOSING state.3715 *3716 * This causes a WRITE timeout, which will either3717 * move on to TIME_WAIT when we timeout, or resend3718 * the FIN properly (maybe we get rid of that annoying3719 * FIN lost hang). The TIME_WRITE code is already correct3720 * for handling this timeout.3721 */3722
3723 if(sk->ip_xmit_timeout != TIME_WRITE)
3724 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3725 tcp_set_state(sk,TCP_CLOSING);
3726 break;
3727 caseTCP_FIN_WAIT2:
3728 /*3729 * received a FIN -- send ACK and enter TIME_WAIT3730 */3731 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3732 sk->shutdown|=SHUTDOWN_MASK;
3733 tcp_set_state(sk,TCP_TIME_WAIT);
3734 break;
3735 caseTCP_CLOSE:
3736 /*3737 * already in CLOSE3738 */3739 break;
3740 default:
3741 tcp_set_state(sk,TCP_LAST_ACK);
3742
3743 /* Start the timers. */3744 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3745 return(0);
3746 }3747
3748 return(0);
3749 }3750
3751
3752
/*
 *	This routine handles the data. If there is room in the buffer,
 *	it will be have already been moved into it. If there is no
 *	room, then we will just have to discard the packet.
 *
 *	Returns 0 in all paths; the skb is either queued on
 *	sk->receive_queue or freed here.
 */

extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk,
	 unsigned long saddr, unsigned short len)
{
	struct sk_buff *skb1, *skb2;
	struct tcphdr *th;
	int dup_dumped=0;
	u32 new_seq, shut_seq;

	th = skb->h.th;
	/* Strip the TCP header so skb->len is the payload length only. */
	skb_pull(skb,th->doff*4);
	skb_trim(skb,len-(th->doff*4));

	/*
	 *	The bytes in the receive read/assembly queue has increased.
	 *	Needed for the low memory discard algorithm.
	 */

	sk->bytes_rcv += skb->len;

	if (skb->len == 0 && !th->fin)
	{
		/*
		 *	Don't want to keep passing ack's back and forth
		 *	(someone sent us a dataless, boring frame).
		 */
		if (!th->ack)
			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
		kfree_skb(skb, FREE_READ);
		return(0);
	}

	/*
	 *	We no longer have anyone receiving data on this connection.
	 */

#ifndef TCP_DONT_RST_SHUTDOWN

	if(sk->shutdown & RCV_SHUTDOWN)
	{
		/*
		 *	FIXME: BSD has some magic to avoid sending resets to
		 *	broken 4.2 BSD keepalives. Much to my surprise a few
		 *	non BSD stacks still have broken keepalives so we want
		 *	to cope with it.
		 */

		if(skb->len)	/* We don't care if it's just an ack or
				   a keepalive/window probe */
		{
			/* Right edge of the _data_ part of the frame. */
			new_seq= th->seq + skb->len + th->syn;

			/*
			 *	Do this the way 4.4BSD treats it. Not what I'd
			 *	regard as the meaning of the spec but it's what
			 *	BSD does and clearly they know everything 8)
			 *
			 *	This is valid because of two things:
			 *	  a) The way tcp_data behaves at the bottom.
			 *	  b) A fin takes effect when read not when
			 *	     received.
			 */

			shut_seq=sk->acked_seq+1;	/* Last byte */

			if(after(new_seq,shut_seq))
			{
				if(sk->debug)
					printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
						sk, new_seq, shut_seq, sk->blog);
				if(sk->dead)
				{
					/* Data after close on a dead socket:
					   reset the connection outright. */
					sk->acked_seq = new_seq + th->fin;
					tcp_reset(sk->saddr, sk->daddr, skb->h.th,
						sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
					tcp_statistics.TcpEstabResets++;
					tcp_set_state(sk,TCP_CLOSE);
					sk->err = EPIPE;
					sk->shutdown = SHUTDOWN_MASK;
					kfree_skb(skb, FREE_READ);
					return 0;
				}
			}
		}
	}

#endif

	/*
	 *	Now we have to walk the chain, and figure out where this one
	 *	goes into it.  This is set up so that the last packet we
	 *	received will be the first one we look at, that way if
	 *	everything comes in order, there will be no performance loss,
	 *	and if they come out of order we will be able to fit things
	 *	in nicely.
	 *
	 *	[AC: This is wrong. We should assume in order first and then
	 *	walk forwards from the first hole based upon real traffic
	 *	patterns.]
	 */

	if (skb_peek(&sk->receive_queue) == NULL)	/* Empty queue is easy case */
	{
		skb_queue_head(&sk->receive_queue,skb);
		skb1= NULL;
	}
	else
	{
		/* Walk backwards from the newest frame looking for the
		   insertion point. */
		for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
		{
			if(sk->debug)
			{
				printk("skb1=%p :", skb1);
				printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
				printk("skb->h.th->seq = %d\n",skb->h.th->seq);
				printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
						sk->acked_seq);
			}

			/*
			 *	Optimisation: Duplicate frame or extension of
			 *	previous frame from the same sequence point
			 *	(lost ack case).  The frame contains duplicate
			 *	data or replaces a previous frame: discard the
			 *	previous frame (safe as sk->inuse is set) and
			 *	put the new one in its place.
			 */

			if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
			{
				skb_append(skb1,skb);
				skb_unlink(skb1);
				kfree_skb(skb1,FREE_READ);
				dup_dumped=1;
				skb1=NULL;
				break;
			}

			/*
			 *	Found where it fits.
			 */

			if (after(th->seq+1, skb1->h.th->seq))
			{
				skb_append(skb1,skb);
				break;
			}

			/*
			 *	See if we've hit the start. If so insert.
			 */
			if (skb1 == skb_peek(&sk->receive_queue))
			{
				skb_queue_head(&sk->receive_queue, skb);
				break;
			}
		}
	}

	/*
	 *	Figure out what the ack value for this frame is
	 *	(SYN and FIN each take one unit of sequence space).
	 */

	th->ack_seq = th->seq + skb->len;
	if (th->syn)
		th->ack_seq++;
	if (th->fin)
		th->ack_seq++;

	if (before(sk->acked_seq, sk->copied_seq))
	{
		printk("*** tcp.c:tcp_data bug acked < copied\n");
		sk->acked_seq = sk->copied_seq;
	}

	/*
	 *	Now figure out if we can ack anything. This is very messy
	 *	because we really want two receive queues, a completed and
	 *	an assembly queue. We also want only one transmit queue.
	 */

	if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
	{
		if (before(th->seq, sk->acked_seq+1))
		{
			int newwindow;

			/* Advance acked_seq and shrink the offered window by
			   the amount of new data consumed. */
			if (after(th->ack_seq, sk->acked_seq))
			{
				newwindow = sk->window-(th->ack_seq - sk->acked_seq);
				if (newwindow < 0)
					newwindow = 0;
				sk->window = newwindow;
				sk->acked_seq = th->ack_seq;
			}
			skb->acked = 1;

			/*
			 *	When we ack the fin, we do the FIN
			 *	processing.
			 */

			if (skb->h.th->fin)
			{
				tcp_fin(skb,sk,skb->h.th);
			}

			/* This frame may have filled a hole: sweep forward
			   acking any now-contiguous queued frames. */
			for(skb2 = skb->next;
			    skb2 != (struct sk_buff *)&sk->receive_queue;
			    skb2 = skb2->next)
			{
				if (before(skb2->h.th->seq, sk->acked_seq+1))
				{
					if (after(skb2->h.th->ack_seq, sk->acked_seq))
					{
						newwindow = sk->window -
						 (skb2->h.th->ack_seq - sk->acked_seq);
						if (newwindow < 0)
							newwindow = 0;
						sk->window = newwindow;
						sk->acked_seq = skb2->h.th->ack_seq;
					}
					skb2->acked = 1;
					/*
					 *	When we ack the fin, we do
					 *	the fin handling.
					 */
					if (skb2->h.th->fin)
					{
						tcp_fin(skb,sk,skb->h.th);
					}

					/*
					 *	Force an immediate ack.
					 */

					sk->ack_backlog = sk->max_ack_backlog;
				}
				else
				{
					break;
				}
			}

			/*
			 *	This also takes care of updating the window.
			 *	This if statement needs to be simplified.
			 *	(The ack in the first branch is sent by the
			 *	common code at the bottom of this function.)
			 */
			if (!sk->delay_acks ||
			    sk->ack_backlog >= sk->max_ack_backlog ||
			    sk->bytes_rcv > sk->max_unacked || th->fin) {
	/*			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
			}
			else
			{
				/* Delay the ack: bump the backlog and arm a
				   timer so it goes out eventually. */
				sk->ack_backlog++;
				if(sk->debug)
					printk("Ack queued.\n");
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
		}
	}

	/*
	 *	If we've missed a packet, send an ack.
	 *	Also start a timer to send another.
	 */

	if (!skb->acked)
	{

	/*
	 *	This is important. If we don't have much room left,
	 *	we need to throw out a few packets so we have a good
	 *	window. Note that mtu is used, not mss, because mss is really
	 *	for the send side. He could be sending us stuff as large
	 *	as mtu.
	 */

		while (sk->prot->rspace(sk) < sk->mtu)
		{
			skb1 = skb_peek(&sk->receive_queue);
			if (skb1 == NULL)
			{
				printk("INET: tcp.c:tcp_data memory leak detected.\n");
				break;
			}

			/*
			 *	Don't throw out something that has been acked.
			 */

			if (skb1->acked)
			{
				break;
			}

			skb_unlink(skb1);
			kfree_skb(skb1, FREE_READ);
		}
		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
		sk->ack_backlog++;
		reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
	}
	else
	{
		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
	}

	/*
	 *	Now tell the user we may have some data.
	 */

	if (!sk->dead)
	{
		if(sk->debug)
			printk("Data wakeup.\n");
		sk->data_ready(sk,0);
	}
	return(0);
}
4077
4078 /*4079 * This routine is only called when we have urgent data4080 * signalled. Its the 'slow' part of tcp_urg. It could be4081 * moved inline now as tcp_urg is only called from one4082 * place. We handle URGent data wrong. We have to - as4083 * BSD still doesn't use the correction from RFC961.4084 */4085
4086 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4087 {4088 u32ptr = ntohs(th->urg_ptr);
4089
4090 if (ptr)
4091 ptr--;
4092 ptr += th->seq;
4093
4094 /* ignore urgent data that we've already seen and read */4095 if (after(sk->copied_seq, ptr))
4096 return;
4097
4098 /* do we already have a newer (or duplicate) urgent pointer? */4099 if (sk->urg_data && !after(ptr, sk->urg_seq))
4100 return;
4101
4102 /* tell the world about our new urgent pointer */4103 if (sk->proc != 0) {4104 if (sk->proc > 0) {4105 kill_proc(sk->proc, SIGURG, 1);
4106 }else{4107 kill_pg(-sk->proc, SIGURG, 1);
4108 }4109 }4110 sk->urg_data = URG_NOTYET;
4111 sk->urg_seq = ptr;
4112 }4113
4114 /*4115 * This is the 'fast' part of urgent handling.4116 */4117
4118 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4119 unsignedlongsaddr, unsignedlonglen)
4120 {4121 u32ptr;
4122
4123 /*4124 * Check if we get a new urgent pointer - normally not 4125 */4126
4127 if (th->urg)
4128 tcp_check_urg(sk,th);
4129
4130 /*4131 * Do we wait for any urgent data? - normally not4132 */4133
4134 if (sk->urg_data != URG_NOTYET)
4135 return 0;
4136
4137 /*4138 * Is the urgent pointer pointing into this packet? 4139 */4140
4141 ptr = sk->urg_seq - th->seq + th->doff*4;
4142 if (ptr >= len)
4143 return 0;
4144
4145 /*4146 * Ok, got the correct packet, update info 4147 */4148
4149 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4150 if (!sk->dead)
4151 sk->data_ready(sk,0);
4152 return 0;
4153 }4154
/*
 *	This will accept the next outstanding connection.
 *
 *	Returns the newly established socket, or NULL with sk->err set
 *	(EINVAL if not listening, EAGAIN if non-blocking and nothing is
 *	pending, ERESTARTSYS if interrupted by a signal).
 */

static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 *	We need to make sure that this socket is listening,
	 *	and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race: disable interrupts before taking the socket. */
	cli();
	sk->inuse = 1;

	/* Wait until an established connection is queued on the
	   listening socket. */
	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Drop the lock before sleeping so the softirq side can
		   queue new connections and wake us. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk (the new socket was
	 *	attached to the queued skb by the connection setup code).
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4213
/*
 *	This will initiate an outgoing connection: validate the address,
 *	pick the initial sequence number and MSS, build and transmit the
 *	SYN, and move the socket to SYN_SENT.
 *
 *	Returns 0 on success or a negative errno (-EISCONN, -EINVAL,
 *	-EAFNOSUPPORT, -ENETUNREACH, -ENOMEM).
 */

static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
	{
		return(-EISCONN);
	}

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address.
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	/* Release before the blocking allocation below. */
	release_sock(sk);

	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;


	/*
	 *	Put in the IP header and routing stuff.
	 */

	rt=ip_rt_route(sk->daddr, NULL, NULL);


	/*
	 *	We need to build the routing stuff from the things saved
	 *	in skb.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	/* Build the SYN segment itself. */
	t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;	/* 20 byte header + 4 bytes of MSS option */

	/* Window clamp from the route, if any. */
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/*
	 *	MSS selection: use 512 or whatever the user asked for,
	 *	else the route's MSS, else guess from whether the peer
	 *	is on our local network.
	 */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
		sk->mtu = rt->rt_mss;
	else
	{
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - HEADER_SIZE;
		else
			sk->mtu = MAX_WINDOW;
	}

	/*
	 *	But not bigger than device MTU.
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	Put in the TCP options to say MTU (kind 2, length 4, MSS).
	 */

	ptr = skb_put(buff,4);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will
	 *	get reset.
	 */

	tcp_cache_zap();
	tcp_set_state(sk,TCP_SYN_SENT);
	/* Initial RTO from the route if it has one, else the default. */
	if(rt&&rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = 0;	/* Now works the right way instead of a hacked
					initial setting */

	sk->prot->queue_xmit(sk, dev, buff, 0);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4376
4377 /* This functions checks to see if the tcp header is actually acceptable. */4378 extern__inline__inttcp_sequence(structsock *sk, structtcphdr *th, shortlen,
/* */4379 structoptions *opt, unsignedlongsaddr, structdevice *dev)
4380 {4381 u32next_seq;
4382
4383 next_seq = len - 4*th->doff;
4384 if (th->fin)
4385 next_seq++;
4386 /* if we have a zero window, we can't have any data in the packet.. */4387 if (next_seq && !sk->window)
4388 gotoignore_it;
4389 next_seq += th->seq;
4390
4391 /*4392 * This isn't quite right. sk->acked_seq could be more recent4393 * than sk->window. This is however close enough. We will accept4394 * slightly more packets than we should, but it should not cause4395 * problems unless someone is trying to forge packets.4396 */4397
4398 /* have we already seen all of this packet? */4399 if (!after(next_seq+1, sk->acked_seq))
4400 gotoignore_it;
4401 /* or does it start beyond the window? */4402 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4403 gotoignore_it;
4404
4405 /* ok, at least part of this packet would seem interesting.. */4406 return 1;
4407
4408 ignore_it:
4409 if (th->rst)
4410 return 0;
4411
4412 /*4413 * Send a reset if we get something not ours and we are4414 * unsynchronized. Note: We don't do anything to our end. We4415 * are just killing the bogus remote connection then we will4416 * connect again and it will work (with luck).4417 */4418
4419 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4420 {4421 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4422 return 1;
4423 }4424
4425 /* Try to resync things. */4426 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4427 return 0;
4428 }4429
/*
 *	When we get a reset we do this: flag the socket as zapped, record
 *	the error the user will see, move to CLOSE (subject to RFC1337
 *	TIME_WAIT protection if configured), wake any sleeper and drop
 *	the frame.  Always returns 0.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Pick the errno the state implies for a peer reset. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk, TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk, TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 *
 *	This is the main input dispatcher: checksum the segment, find
 *	the socket (via the one-entry header cache), queue on the
 *	backlog if the socket is busy, then run the RFC793 step list
 *	(with RFC1122 corrections) against the socket state.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	unsigned long daddr, unsigned short len,
	unsigned long saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	tcp_statistics.TcpInSegs++;
	/* Only frames addressed to this host are processed. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket, using the last hit cache if applicable.
	 */

	if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
		sk=(struct sock *)th_cache_sk;
	else
	{
		sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
		th_cache_saddr=saddr;
		th_cache_daddr=daddr;
		th_cache_dport=th->dest;
		th_cache_sport=th->source;
		th_cache_sk=sk;
	}

	/*
	 *	If this socket has got a reset it's to all intents and
	 *	purposes really dead.  Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here.  A 'closed' TCP in BSD
	 *	simply drops data.  This seems incorrect as a 'closed' TCP
	 *	doesn't exist so should cause resets as if the port was
	 *	unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)	/* First pass (not replayed from the backlog). */
	{
		/*
		 *	Pull up the IP header.
		 */
		skb_pull(skb, skb->h.raw-skb->data);
		/*
		 *	Try to use the device checksum if provided.
		 */
		if (
			(skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
		    	(!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
		    )
		{
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}
		/* Host byte order from here on. */
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset
			 *	(checked in tcp_reset).
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 *	Discard frame.
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		cli();
		if (sk->inuse)
		{
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Backlog replay: the frame was already validated. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket; drop the frame if the
	 *	receive buffer is full.
	 */

	if (sk->rmem_alloc + skb->truesize >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk=sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	This basically follows the flow suggested by RFC793, with
	 *	the corrections in RFC1122.  We don't implement precedence
	 *	and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility.  We also set up variables more thoroughly
	 *	[Karn notes in the KA9Q code the RFC793 incoming segment
	 *	rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)	/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed
			 *	(old segments).  Broadcast/multicast SYN isn't
			 *	allowed.  Note - bug if you change the netmask
			 *	on a running connection it can go broadcast.
			 *	Even Sun's have this problem so I'm ignoring it.
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up.
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is
			 *	nothing else in the frame.  KA9Q has an option
			 *	to send data with the syn, BSD accepts data
			 *	with the syn up to the [to be] advertised
			 *	window and Solaris 2.1 gives you a protocol
			 *	error.  For now we just ignore it, that fits
			 *	the spec precisely and avoids
			 *	incompatibilities.  It would be nice in future
			 *	to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* Retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and
		 *	either reset for bad matches or go to connected.
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/*
					 *	Reset the ack - it's an ack
					 *	from a different connection
					 *	[th->rst is checked in
					 *	tcp_reset()]
					 */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/*
					 *	A valid ack from a different
					 *	connection start.  Shouldn't
					 *	happen but cover it.
					 */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 *	Ok.. it's good.  Set up sequence
				 *	numbers and move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if(sk->max_window==0)
				{
					sk->max_window = 32;	/* Sanity check */
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/*
					 *	Crossed SYN's are fine - but
					 *	talking to yourself is right
					 *	out...
					 */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port.
	 *	There is a more complex suggestion for fixing these reuse
	 *	issues in RFC1644 but not yet ready for general use.  Also
	 *	see RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			/* Kill the old TIME_WAIT socket and hand the SYN to
			   any listener on the same port. */
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC).
	 *	Note most of these are inline now.  I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs.
	 */

	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 *	Process the ACK
	 */


	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	Process urgent data
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}


	/*
	 *	Process the encapsulated data
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;
}
4865 /*4866 * This routine sends a packet with an out of date sequence4867 * number. It assumes the other end will try to ack it.4868 */4869
4870 staticvoidtcp_write_wakeup(structsock *sk)
/* */4871 {4872 structsk_buff *buff,*skb;
4873 structtcphdr *t1;
4874 structdevice *dev=NULL;
4875 inttmp;
4876
4877 if (sk->zapped)
4878 return; /* After a valid reset we can send no more */4879
4880 /*4881 * Write data can still be transmitted/retransmitted in the4882 * following states. If any other state is encountered, return.4883 * [listen/close will never occur here anyway]4884 */4885
4886 if (sk->state != TCP_ESTABLISHED &&
4887 sk->state != TCP_CLOSE_WAIT &&
4888 sk->state != TCP_FIN_WAIT1 &&
4889 sk->state != TCP_LAST_ACK &&
4890 sk->state != TCP_CLOSING4891 )
4892 {4893 return;
4894 }4895 if ( before(sk->sent_seq, sk->window_seq) &&
4896 (skb=skb_peek(&sk->write_queue)))
4897 {4898 /*4899 * We are probing the opening of a window4900 * but the window size is != 04901 * must have been a result SWS advoidance ( sender )4902 */4903
4904 structiphdr *iph;
4905 structtcphdr *th;
4906 structtcphdr *nth;
4907 unsignedlongwin_size, ow_size;
4908 void * tcp_data_start;
4909
4910 /*4911 * How many bytes can we send ?4912 */4913
4914 win_size = sk->window_seq - sk->sent_seq;
4915
4916 /*4917 * Recover the buffer pointers4918 */4919
4920 iph = (structiphdr *)(skb->data + skb->dev->hard_header_len);
4921 th = (structtcphdr *)(((char *)iph) +(iph->ihl << 2));
4922
4923 /*4924 * Grab the data for a temporary frame4925 */4926
4927 buff = sk->prot->wmalloc(sk, win_size + th->doff * 4 +
4928 (iph->ihl << 2) +
4929 skb->dev->hard_header_len + 15,
4930 1, GFP_ATOMIC);
4931 if ( buff == NULL )
4932 return;
4933
4934 /* 4935 * If we strip the packet on the write queue we must4936 * be ready to retransmit this one 4937 */4938
4939 buff->free = /*0*/1;
4940
4941 buff->sk = sk;
4942 buff->localroute = sk->localroute;
4943
4944 /*4945 * Put headers on the new packet4946 */4947
4948 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4949 IPPROTO_TCP, sk->opt, buff->truesize,
4950 sk->ip_tos,sk->ip_ttl);
4951 if (tmp < 0)
4952 {4953 sk->prot->wfree(sk, buff);
4954 return;
4955 }4956
4957 /*4958 * Move the TCP header over4959 */4960
4961 buff->dev = dev;
4962
4963 nth = (structtcphdr *) skb_put(buff,th->doff*4);
4964
4965 memcpy(nth, th, th->doff * 4);
4966
4967 /*4968 * Correct the new header4969 */4970
4971 nth->ack = 1;
4972 nth->ack_seq = ntohl(sk->acked_seq);
4973 nth->window = ntohs(tcp_select_window(sk));
4974 nth->check = 0;
4975
4976 /*4977 * Find the first data byte.4978 */4979
4980 tcp_data_start = skb->data + skb->dev->hard_header_len +
4981 (iph->ihl << 2) + th->doff * 4;
4982
4983 /*4984 * Add it to our new buffer4985 */4986 memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
4987
4988 /*4989 * Remember our right edge sequence number.4990 */4991
4992 buff->h.seq = sk->sent_seq + win_size;
4993 sk->sent_seq = buff->h.seq; /* Hack */4994 #if 0
4995
4996 /*4997 * now: shrink the queue head segment 4998 */4999
5000 th->check = 0;
5001 ow_size = skb->len - win_size -
5002 ((unsignedlong) (tcp_data_start - (void *) skb->data));
5003
5004 memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
5005 skb_trim(skb,skb->len-win_size);
5006 sk->sent_seq += win_size;
5007 th->seq = htonl(sk->sent_seq);
5008 if (th->urg)
5009 {5010 unsignedshorturg_ptr;
5011
5012 urg_ptr = ntohs(th->urg_ptr);
5013 if (urg_ptr <= win_size)
5014 th->urg = 0;
5015 else5016 {5017 urg_ptr -= win_size;
5018 th->urg_ptr = htons(urg_ptr);
5019 nth->urg_ptr = htons(win_size);
5020 }5021 }5022 #else5023 if(th->urg && ntohs(th->urg_ptr) < win_size)
5024 nth->urg = 0;
5025 #endif5026
5027 /*5028 * Checksum the split buffer5029 */5030
5031 tcp_send_check(nth, sk->saddr, sk->daddr,
5032 nth->doff * 4 + win_size , sk);
5033 }5034 else5035 {5036 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
5037 if (buff == NULL)
5038 return;
5039
5040 buff->free = 1;
5041 buff->sk = sk;
5042 buff->localroute = sk->localroute;
5043
5044 /*5045 * Put in the IP header and routing stuff. 5046 */5047
5048 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5049 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
5050 if (tmp < 0)
5051 {5052 sk->prot->wfree(sk, buff);
5053 return;
5054 }5055
5056 t1 = (structtcphdr *)skb_put(buff,sizeof(structtcphdr));
5057 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5058
5059 /*5060 * Use a previous sequence.5061 * This should cause the other end to send an ack.5062 */5063
5064 t1->seq = htonl(sk->sent_seq-1);
5065 t1->ack = 1;
5066 t1->res1= 0;
5067 t1->res2= 0;
5068 t1->rst = 0;
5069 t1->urg = 0;
5070 t1->psh = 0;
5071 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */5072 t1->syn = 0;
5073 t1->ack_seq = ntohl(sk->acked_seq);
5074 t1->window = ntohs(tcp_select_window(sk));
5075 t1->doff = sizeof(*t1)/4;
5076 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5077
5078 }5079
5080 /*5081 * Send it.5082 */5083
5084 sk->prot->queue_xmit(sk, dev, buff, 1);
5085 tcp_statistics.TcpOutSegs++;
5086 }5087
5088 /*5089 * A window probe timeout has occurred.5090 */5091
5092 voidtcp_send_probe0(structsock *sk)
/* */5093 {5094 if (sk->zapped)
5095 return; /* After a valid reset we can send no more */5096
5097 tcp_write_wakeup(sk);
5098
5099 sk->backoff++;
5100 sk->rto = min(sk->rto << 1, 120*HZ);
5101 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5102 sk->retransmits++;
5103 sk->prot->retransmits ++;
5104 }5105
5106 /*5107 * Socket option code for TCP. 5108 */5109
5110 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5111 {5112 intval,err;
5113
5114 if(level!=SOL_TCP)
5115 returnip_setsockopt(sk,level,optname,optval,optlen);
5116
5117 if (optval == NULL)
5118 return(-EINVAL);
5119
5120 err=verify_area(VERIFY_READ, optval, sizeof(int));
5121 if(err)
5122 returnerr;
5123
5124 val = get_user((int *)optval);
5125
5126 switch(optname)
5127 {5128 caseTCP_MAXSEG:
5129 /*5130 * values greater than interface MTU won't take effect. however at5131 * the point when this call is done we typically don't yet know5132 * which interface is going to be used5133 */5134 if(val<1||val>MAX_WINDOW)
5135 return -EINVAL;
5136 sk->user_mss=val;
5137 return 0;
5138 caseTCP_NODELAY:
5139 sk->nonagle=(val==0)?0:1;
5140 return 0;
5141 default:
5142 return(-ENOPROTOOPT);
5143 }5144 }5145
5146 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5147 {5148 intval,err;
5149
5150 if(level!=SOL_TCP)
5151 returnip_getsockopt(sk,level,optname,optval,optlen);
5152
5153 switch(optname)
5154 {5155 caseTCP_MAXSEG:
5156 val=sk->user_mss;
5157 break;
5158 caseTCP_NODELAY:
5159 val=sk->nonagle;
5160 break;
5161 default:
5162 return(-ENOPROTOOPT);
5163 }5164 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5165 if(err)
5166 returnerr;
5167 put_user(sizeof(int),(int *) optlen);
5168
5169 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5170 if(err)
5171 returnerr;
5172 put_user(val,(int *)optval);
5173
5174 return(0);
5175 }5176
5177
5178 structprototcp_prot = {5179 sock_wmalloc,
5180 sock_rmalloc,
5181 sock_wfree,
5182 sock_rfree,
5183 sock_rspace,
5184 sock_wspace,
5185 tcp_close,
5186 tcp_read,
5187 tcp_write,
5188 tcp_sendto,
5189 tcp_recvfrom,
5190 ip_build_header,
5191 tcp_connect,
5192 tcp_accept,
5193 ip_queue_xmit,
5194 tcp_retransmit,
5195 tcp_write_wakeup,
5196 tcp_read_wakeup,
5197 tcp_rcv,
5198 tcp_select,
5199 tcp_ioctl,
5200 NULL,
5201 tcp_shutdown,
5202 tcp_setsockopt,
5203 tcp_getsockopt,
5204 128,
5205 0,
5206 "TCP",
5207 0, 0,
5208 {NULL,}5209 };