net/ipv4/tcp.c

/* */
This source file includes the following definitions.
tcp_cache_zap
min
tcp_set_state
tcp_select_window
tcp_find_established
tcp_dequeue_established
tcp_close_pending
tcp_time_wait
tcp_do_retransmit
reset_xmit_timer
tcp_retransmit_time
tcp_retransmit
tcp_write_timeout
retransmit_timer
tcp_err
tcp_readable
tcp_listen_select
tcp_select
tcp_ioctl
tcp_check
tcp_send_check
tcp_send_skb
tcp_dequeue_partial
tcp_send_partial
tcp_enqueue_partial
tcp_send_ack
tcp_build_header
tcp_write
tcp_sendto
tcp_read_wakeup
cleanup_rbuf
tcp_read_urg
tcp_read
tcp_close_state
tcp_send_fin
tcp_shutdown
tcp_recvfrom
tcp_reset
tcp_options
default_mask
tcp_init_seq
tcp_conn_request
tcp_close
tcp_write_xmit
tcp_ack
tcp_fin
tcp_data
tcp_check_urg
tcp_urg
tcp_accept
tcp_connect
tcp_sequence
tcp_std_reset
tcp_rcv
tcp_write_wakeup
tcp_send_probe0
tcp_setsockopt
tcp_getsockopt
   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:       
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while sk->inuse=1
  26  *                                      and was trying to connect (tcp_err()).
  27  *              Alan Cox        :       All icmp error handling was broken
  28  *                                      pointers passed were wrong and the
  29  *                                      socket was looked up backwards. Nobody
  30  *                                      tested any icmp error code obviously.
  31  *              Alan Cox        :       tcp_err() now handled properly. It wakes people
  32  *                                      on errors. select behaves and the icmp error race
  33  *                                      has gone by moving it into sock.c
  34  *              Alan Cox        :       tcp_reset() fixed to work for everything not just
  35  *                                      packets for unknown sockets.
  36  *              Alan Cox        :       tcp option processing.
  37  *              Alan Cox        :       Reset tweaked (still not 100%) [Had syn rule wrong]
  38  *              Herp Rosmanith  :       More reset fixes
  39  *              Alan Cox        :       No longer acks invalid rst frames. Acking
  40  *                                      any kind of RST is right out.
  41  *              Alan Cox        :       Sets an ignore me flag on an rst receive
  42  *                                      otherwise odd bits of prattle escape still
  43  *              Alan Cox        :       Fixed another acking RST frame bug. Should stop
  44  *                                      LAN workplace lockups.
  45  *              Alan Cox        :       Some tidyups using the new skb list facilities
  46  *              Alan Cox        :       sk->keepopen now seems to work
  47  *              Alan Cox        :       Pulls options out correctly on accepts
  48  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  49  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a bit to skb ops.
  50  *              Alan Cox        :       Tidied tcp_data to avoid a potential nasty.
  51  *              Alan Cox        :       Added some better commenting, as the tcp is hard to follow
  52  *              Alan Cox        :       Removed incorrect check for 20 * psh
  53  *      Michael O'Reilly        :       ack < copied bug fix.
  54  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  55  *              Alan Cox        :       FIN with no memory -> CRASH
  56  *              Alan Cox        :       Added socket option proto entries. Also added awareness of them to accept.
  57  *              Alan Cox        :       Added TCP options (SOL_TCP)
  58  *              Alan Cox        :       Switched wakeup calls to callbacks, so the kernel can layer network sockets.
  59  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  60  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  61  *              Alan Cox        :       RST frames sent on unsynchronised state ack error/
  62  *              Alan Cox        :       Put in missing check for SYN bit.
  63  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  64  *                                      window non shrink trick.
  65  *              Alan Cox        :       Added a couple of small NET2E timer fixes
  66  *              Charles Hedrick :       TCP fixes
  67  *              Toomas Tamm     :       TCP window fixes
  68  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  69  *              Charles Hedrick :       Rewrote most of it to actually work
  70  *              Linus           :       Rewrote tcp_read() and URG handling
  71  *                                      completely
  72  *              Gerhard Koerting:       Fixed some missing timer handling
  73  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  74  *              Gerhard Koerting:       PC/TCP workarounds
  75  *              Adam Caldwell   :       Assorted timer/timing errors
  76  *              Matthew Dillon  :       Fixed another RST bug
  77  *              Alan Cox        :       Move to kernel side addressing changes.
  78  *              Alan Cox        :       Beginning work on TCP fastpathing (not yet usable)
  79  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  80  *              Alan Cox        :       TCP fast path debugging
  81  *              Alan Cox        :       Window clamping
  82  *              Michael Riepe   :       Bug in tcp_check()
  83  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  84  *              Matt Dillon     :       Yet more small nasties removed from the TCP code
  85  *                                      (Be very nice to this man if tcp finally works 100%) 8)
  86  *              Alan Cox        :       BSD accept semantics. 
  87  *              Alan Cox        :       Reset on closedown bug.
  88  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
  89  *              Michael Pall    :       Handle select() after URG properly in all cases.
  90  *              Michael Pall    :       Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin).
  91  *              Michael Pall    :       Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now.
  92  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the BSD api.
  93  *              Alan Cox        :       Changed the semantics of sk->socket to 
  94  *                                      fix a race and a signal problem with
  95  *                                      accept() and async I/O.
  96  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
  97  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
  98  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
  99  *                                      clients/servers which listen in on
 100  *                                      fixed ports.
 101  *              Alan Cox        :       Cleaned the above up and shrank it to
 102  *                                      a sensible code size.
 103  *              Alan Cox        :       Self connect lockup fix.
 104  *              Alan Cox        :       No connect to multicast.
 105  *              Ross Biro       :       Close unaccepted children on master
 106  *                                      socket close.
 107  *              Alan Cox        :       Reset tracing code.
 108  *              Alan Cox        :       Spurious resets on shutdown.
 109  *              Alan Cox        :       Giant 15 minute/60 second timer error
 110  *              Alan Cox        :       Small whoops in selecting before an accept.
 111  *              Alan Cox        :       Kept the state trace facility since it's
 112  *                                      handy for debugging.
 113  *              Alan Cox        :       More reset handler fixes.
 114  *              Alan Cox        :       Started rewriting the code based on the RFC's
 115  *                                      for other useful protocol references see:  
 116  *                                      Comer, KA9Q NOS, and for a reference on the
 117  *                                      difference between specifications and how BSD
 118  *                                      works see the 4.4lite source.
 119  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 120  *                                      close.
 121  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 122  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 123  *              Alan Cox        :       Reimplemented timers as per the RFC and using multiple
 124  *                                      timers for sanity. 
 125  *              Alan Cox        :       Small bug fixes, and a lot of new
 126  *                                      comments.
 127  *              Alan Cox        :       Fixed dual reader crash by locking
 128  *                                      the buffers (much like datagram.c)
 129  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 130  *                                      now gets fed up of retrying without
 131  *                                      (even a no space) answer.
 132  *              Alan Cox        :       Extracted closing code better
 133  *              Alan Cox        :       Fixed the closing state machine to
 134  *                                      resemble the RFC.
 135  *              Alan Cox        :       More 'per spec' fixes.
 136  *              Jorge Cwik      :       Even faster checksumming.
 137  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 138  *                                      only frames. At least one pc tcp stack
 139  *                                      generates them.
 140  *              Alan Cox        :       Cache last socket.
 141  *              Alan Cox        :       Per route irtt.
 142  *              Matt Day        :       Select() match BSD precisely on error
 143  *              Alan Cox        :       New buffers
 144  *              Mark Tamsky     :       Various sk->prot->retransmits and 
 145  *                                      sk->retransmits misupdating fixed.
 146  *                                      Fixed tcp_write_timeout: stuck close,
 147  *                                      and TCP syn retries gets used now.
 148  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 149  *                                      ack if state is TCP_CLOSED.
 150  *              Alan Cox        :       Look up device on a retransmit - routes may
 151  *                                      change. Doesn't yet cope with MSS shrink right
 152  *                                      but it's a start!
 153  *
 154  *
 155  * To Fix:
 156  *              Fast path the code. Two things here - fix the window calculation
 157  *              so it doesn't iterate over the queue, also spot packets with no funny
 158  *              options arriving in order and process directly.
 159  *
 160  *              Implement RFC 1191 [Path MTU discovery]
 161  *              Look at the effect of implementing RFC 1337 suggestions and their impact.
 162  *              Rewrite output state machine to use a single queue and do low window
 163  *              situations as per the spec (RFC 1122)
 164  *              Speed up input assembly algorithm.
 165  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 166  *              could do with it working on IPv4
 167  *              User settable/learned rtt/max window/mtu
 168  *              Cope with MTU/device switches when retransmitting in tcp.
 169  *              Fix the window handling to use PR's new code.
 170  *
 171  *              Change the fundamental structure to a single send queue maintained
 172  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 173  *              active routes too]). Cut the queue off in tcp_retransmit/
 174  *              tcp_transmit.
 175  *              Change the receive queue to assemble as it goes. This lets us
 176  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 177  *              tcp_data/tcp_read as well as the window shrink crud.
 178  *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 179  *              tcp_queue_skb seem obvious routines to extract.
 180  *      
 181  *              This program is free software; you can redistribute it and/or
 182  *              modify it under the terms of the GNU General Public License
 183  *              as published by the Free Software Foundation; either version
 184  *              2 of the License, or(at your option) any later version.
 185  *
 186  * Description of States:
 187  *
 188  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 189  *
 190  *      TCP_SYN_RECV            received a connection request, sent ack,
 191  *                              waiting for final ack in three-way handshake.
 192  *
 193  *      TCP_ESTABLISHED         connection established
 194  *
 195  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 196  *                              transmission of remaining buffered data
 197  *
 198  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 199  *                              to shutdown
 200  *
 201  *      TCP_CLOSING             both sides have shutdown but we still have
 202  *                              data we have to finish sending
 203  *
 204  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 205  *                              closed, can only be entered from FIN_WAIT2
 206  *                              or CLOSING.  Required because the other end
 207  *                              may not have gotten our last ACK causing it
 208  *                              to retransmit the data packet (which we ignore)
 209  *
 210  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 211  *                              us to finish writing our data and to shutdown
 212  *                              (we have to close() to move on to LAST_ACK)
 213  *
 214  *      TCP_LAST_ACK            our side has shutdown after remote has
 215  *                              shutdown.  There may still be data in our
 216  *                              buffer that we have to finish sending
 217  *              
 218  *      TCP_CLOSE               socket is finished
 219  */
 220 
 221 #include <linux/types.h>
 222 #include <linux/sched.h>
 223 #include <linux/mm.h>
 224 #include <linux/time.h>
 225 #include <linux/string.h>
 226 #include <linux/config.h>
 227 #include <linux/socket.h>
 228 #include <linux/sockios.h>
 229 #include <linux/termios.h>
 230 #include <linux/in.h>
 231 #include <linux/fcntl.h>
 232 #include <linux/inet.h>
 233 #include <linux/netdevice.h>
 234 #include <net/snmp.h>
 235 #include <net/ip.h>
 236 #include <net/protocol.h>
 237 #include <net/icmp.h>
 238 #include <net/tcp.h>
 239 #include <net/arp.h>
 240 #include <linux/skbuff.h>
 241 #include <net/sock.h>
 242 #include <net/route.h>
 243 #include <linux/errno.h>
 244 #include <linux/timer.h>
 245 #include <asm/system.h>
 246 #include <asm/segment.h>
 247 #include <linux/mm.h>
 248 #include <net/checksum.h>
 249 
 250 /*
 251  *      The MSL timer is the 'normal' timer: reset_msl_timer() is a
      *      straight alias that passes its three arguments to reset_timer().
 252  */
 253  
 254 #define reset_msl_timer(x,y,z)  reset_timer(x,y,z)
 255 
      /*
       *      SEQ_TICK and seq_offset are only declared here; nothing in this
       *      part of the file reads them.  Presumably they feed initial
       *      sequence number generation (tcp_init_seq) -- TODO confirm.
       */
 256 #define SEQ_TICK 3
 257 unsigned long seq_offset;
      /*
       *      TCP counters.  In this part of the file TcpCurrEstab is updated
       *      by tcp_set_state() and TcpRetransSegs by tcp_do_retransmit().
       */
 258 struct tcp_mib  tcp_statistics;
 259 
 260 /*
 261  *      Cached last hit socket: an address pair, a port pair and the
      *      socket pointer.  tcp_cache_zap() clears all five together.
 262  */
 263  
 264 volatile unsigned long  th_cache_saddr,th_cache_daddr;
 265 volatile unsigned short  th_cache_dport, th_cache_sport;
      /*
       *      NOTE(review): as written, 'volatile' qualifies the struct sock
       *      being pointed at, not the pointer variable th_cache_sk itself.
       */
 266 volatile struct sock *th_cache_sk;
 267 
 268 void tcp_cache_zap(void)
     /*  */
 269 {
 270         unsigned long flags;
 271         save_flags(flags);
 272         cli();
 273         th_cache_saddr=0;
 274         th_cache_daddr=0;
 275         th_cache_dport=0;
 276         th_cache_sport=0;
 277         th_cache_sk=NULL;
 278         restore_flags(flags);
 279 }
 280 
      /*
       *      Forward declaration: tcp_close_pending() calls tcp_close()
       *      before tcp_close() is defined (it is listed further down the
       *      file).
       */
 281 static void tcp_close(struct sock *sk, int timeout);
 282 
 283 
 284 /*
 285  *      The less said about this the better, but it works and will do for 1.2 
      *
      *      One file-wide wait queue: tcp_set_state() wakes it whenever a
      *      socket goes from TCP_SYN_RECV to TCP_ESTABLISHED.
 286  */
 287 
 288 static struct wait_queue *master_select_wakeup;
 289 
 290 static __inline__ int min(unsigned int a, unsigned int b)
     /*  */
 291 {
 292         if (a < b) 
 293                 return(a);
 294         return(b);
 295 }
 296 
      /*
       *      State tracing is compiled out: STATE_TRACE is explicitly
       *      undefined right before the #ifdef that tests it, so the table
       *      below and the printk() in tcp_set_state() are only built if the
       *      #undef is changed.
       */
 297 #undef STATE_TRACE
 298 
 299 #ifdef STATE_TRACE
      /*
       *      Printable state names.  tcp_set_state() indexes this table
       *      directly with the old and new state values.
       */
 300 static char *statename[]={
 301         "Unused","Established","Syn Sent","Syn Recv",
 302         "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
 303         "Close Wait","Last ACK","Listen","Closing"
 304 };
 305 #endif
 306 
 307 static __inline__ void tcp_set_state(struct sock *sk, int state)
     /*  */
 308 {
 309         if(sk->state==TCP_ESTABLISHED)
 310                 tcp_statistics.TcpCurrEstab--;
 311 #ifdef STATE_TRACE
 312         if(sk->debug)
 313                 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
 314 #endif  
 315         /* This is a hack but it doesn't occur often and it's going to
 316            be a real        to fix nicely */
 317            
 318         if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
 319         {
 320                 wake_up_interruptible(&master_select_wakeup);
 321         }
 322         sk->state=state;
 323         if(state==TCP_ESTABLISHED)
 324                 tcp_statistics.TcpCurrEstab++;
 325 }
 326 
 327 /*
 328  *      This routine picks a TCP windows for a socket based on
 329  *      the following constraints
 330  *  
 331  *      1. The window can never be shrunk once it is offered (RFC 793)
 332  *      2. We limit memory per socket
 333  *   
 334  *      For now we use NET2E3's heuristic of offering half the memory
 335  *      we have handy. All is not as bad as this seems however because
 336  *      of two things. Firstly we will bin packets even within the window
 337  *      in order to get the data we are waiting for into the memory limit.
 338  *      Secondly we bin common duplicate forms at receive time
 339  *      Better heuristics welcome
 340  */
 341    
 342 int tcp_select_window(struct sock *sk)
     /*  */
 343 {
 344         int new_window = sk->prot->rspace(sk);
 345         
 346         if(sk->window_clamp)
 347                 new_window=min(sk->window_clamp,new_window);
 348         /*
 349          *      Two things are going on here.  First, we don't ever offer a
 350          *      window less than min(sk->mss, MAX_WINDOW/2).  This is the
 351          *      receiver side of SWS as specified in RFC1122.
 352          *      Second, we always give them at least the window they
 353          *      had before, in order to avoid retracting window.  This
 354          *      is technically allowed, but RFC1122 advises against it and
 355          *      in practice it causes trouble.
 356          *
 357          *      Fixme: This doesn't correctly handle the case where
 358          *      new_window > sk->window but not by enough to allow for the
 359          *      shift in sequence space. 
 360          */
 361         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
 362                 return(sk->window);
 363         return(new_window);
 364 }
 365 
 366 /*
 367  *      Find someone to 'accept'. Must be called with
 368  *      sk->inuse=1 or cli()
 369  */ 
 370 
 371 static struct sk_buff *tcp_find_established(struct sock *s)
     /*  */
 372 {
 373         struct sk_buff *p=skb_peek(&s->receive_queue);
 374         if(p==NULL)
 375                 return NULL;
 376         do
 377         {
 378                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 379                         return p;
 380                 p=p->next;
 381         }
 382         while(p!=(struct sk_buff *)&s->receive_queue);
 383         return NULL;
 384 }
 385 
 386 /*
 387  *      Remove a completed connection and return it. This is used by
 388  *      tcp_accept() to get connections from the queue.
 389  */
 390 
 391 static struct sk_buff *tcp_dequeue_established(struct sock *s)
     /*  */
 392 {
 393         struct sk_buff *skb;
 394         unsigned long flags;
 395         save_flags(flags);
 396         cli(); 
 397         skb=tcp_find_established(s);
 398         if(skb!=NULL)
 399                 skb_unlink(skb);        /* Take it off the queue */
 400         restore_flags(flags);
 401         return skb;
 402 }
 403 
 404 /* 
 405  *      This routine closes sockets which have been at least partially
 406  *      opened, but not yet accepted. Currently it is only called by
 407  *      tcp_close, and timeout mirrors the value there. 
 408  */
 409 
 410 static void tcp_close_pending (struct sock *sk) 
     /*  */
 411 {
 412         struct sk_buff *skb;
 413 
 414         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) 
 415         {
 416                 skb->sk->dead=1;
 417                 tcp_close(skb->sk, 0);
 418                 kfree_skb(skb, FREE_READ);
 419         }
 420         return;
 421 }
 422 
 423 /*
 424  *      Enter the time wait state. 
 425  */
 426 
 427 static void tcp_time_wait(struct sock *sk)
     /*  */
 428 {
 429         tcp_set_state(sk,TCP_TIME_WAIT);
 430         sk->shutdown = SHUTDOWN_MASK;
 431         if (!sk->dead)
 432                 sk->state_change(sk);
 433         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 434 }
 435 
 436 /*
 437  *      A socket has timed out on its send queue and wants to do a
 438  *      little retransmitting. Currently this means TCP.
 439  */
 440 
 441 void tcp_do_retransmit(struct sock *sk, int all)
     /*  */
 442 {
 443         struct sk_buff * skb;
 444         struct proto *prot;
 445         struct device *dev;
 446         int ct=0;
 447         struct rtable *rt;
 448 
 449         prot = sk->prot;
 450         skb = sk->send_head;
 451 
 452         while (skb != NULL)
 453         {
 454                 struct tcphdr *th;
 455                 struct iphdr *iph;
 456                 int size;
 457 
 458                 dev = skb->dev;
 459                 IS_SKB(skb);
 460                 skb->when = jiffies;
 461 
 462                 /*
 463                  *      Discard the surplus MAC header
 464                  */
 465                  
 466                 skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);
 467 
 468                 /*
 469                  * In general it's OK just to use the old packet.  However we
 470                  * need to use the current ack and window fields.  Urg and
 471                  * urg_ptr could possibly stand to be updated as well, but we
 472                  * don't keep the necessary data.  That shouldn't be a problem,
 473                  * if the other end is doing the right thing.  Since we're
 474                  * changing the packet, we have to issue a new IP identifier.
 475                  */
 476 
 477                 iph = (struct iphdr *)skb->data;
 478                 th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
 479                 size = ntohs(iph->tot_len) - (iph->ihl<<2);
 480                 
 481                 /*
 482                  *      Note: We ought to check for window limits here but
 483                  *      currently this is done (less efficiently) elsewhere.
 484                  */
 485 
 486                 iph->id = htons(ip_id_count++);
 487                 ip_send_check(iph);
 488                 
 489                 /*
 490                  *      Put a MAC header back on (may cause ARPing)
 491                  */
 492                  
 493                 if(skb->localroute)
 494                         rt=ip_rt_local(iph->daddr,NULL,NULL);
 495                 else
 496                         rt=ip_rt_route(iph->daddr,NULL,NULL);
 497                         
 498                 if(rt==NULL)    /* Deep poo */
 499                 {
 500                         if(skb->sk)
 501                         {
 502                                 skb->sk->err=ENETUNREACH;
 503                                 skb->sk->error_report(skb->sk);
 504                         }
 505                 }
 506                 else
 507                 {
 508                         dev=rt->rt_dev;
 509                         skb->raddr=rt->rt_gateway;
 510                         if(skb->raddr==0)
 511                                 skb->raddr=iph->daddr;
 512                         skb->dev=dev;
 513                         skb->arp=1;
 514                         if(dev->hard_header)
 515                         {
 516                                 if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
 517                                         skb->arp=0;
 518                         }
 519                 
 520                         /*
 521                          *      This is not the right way to handle this. We have to
 522                          *      issue an up to date window and ack report with this 
 523                          *      retransmit to keep the odd buggy tcp that relies on 
 524                          *      the fact BSD does this happy. 
 525                          *      We don't however need to recalculate the entire 
 526                          *      checksum, so someone wanting a small problem to play
 527                          *      with might like to implement RFC1141/RFC1624 and speed
 528                          *      this up by avoiding a full checksum.
 529                          */
 530                  
 531                         th->ack_seq = ntohl(sk->acked_seq);
 532                         th->window = ntohs(tcp_select_window(sk));
 533                         tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
 534                 
 535                         /*
 536                          *      If the interface is (still) up and running, kick it.
 537                          */
 538         
 539                         if (dev->flags & IFF_UP)
 540                         {
 541                                 /*
 542                                  *      If the packet is still being sent by the device/protocol
 543                                  *      below then don't retransmit. This is both needed, and good -
 544                                  *      especially with connected mode AX.25 where it stops resends
 545                                  *      occurring of an as yet unsent anyway frame!
 546                                  *      We still add up the counts as the round trip time wants
 547                                  *      adjusting.
 548                                  */
 549                                 if (sk && !skb_device_locked(skb))
 550                                 {
 551                                         /* Remove it from any existing driver queue first! */
 552                                         skb_unlink(skb);
 553                                         /* Now queue it */
 554                                         ip_statistics.IpOutRequests++;
 555                                         dev_queue_xmit(skb, dev, sk->priority);
 556                                 }
 557                         }
 558                 }
 559                 
 560                 /*
 561                  *      Count retransmissions
 562                  */
 563                  
 564                 ct++;
 565                 sk->prot->retransmits ++;
 566                 tcp_statistics.TcpRetransSegs++;
 567                 
 568 
 569                 /*
 570                  *      Only one retransmit requested.
 571                  */
 572         
 573                 if (!all)
 574                         break;
 575 
 576                 /*
 577                  *      This should cut it off before we send too many packets.
 578                  */
 579 
 580                 if (ct >= sk->cong_window)
 581                         break;
 582                 skb = skb->link3;
 583         }
 584 }
 585 
 586 /*
 587  *      Reset the retransmission timer
 588  */
 589  
 590 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
     /*  */
 591 {
 592         del_timer(&sk->retransmit_timer);
 593         sk->ip_xmit_timeout = why;
 594         if((int)when < 0)
 595         {
 596                 when=3;
 597                 printk("Error: Negative timer in xmit_timer\n");
 598         }
 599         sk->retransmit_timer.expires=jiffies+when;
 600         add_timer(&sk->retransmit_timer);
 601 }
 602 
 603 /*
 604  *      This is the normal code called for timeouts.  It does the retransmission
 605  *      and then does backoff.  tcp_do_retransmit is separated out because
 606  *      tcp_ack needs to send stuff from the retransmit queue without
 607  *      initiating a backoff.
 608  */
 609 
 610 
 611 void tcp_retransmit_time(struct sock *sk, int all)
     /*  */
 612 {
 613         tcp_do_retransmit(sk, all);
 614 
 615         /*
 616          * Increase the timeout each time we retransmit.  Note that
 617          * we do not increase the rtt estimate.  rto is initialized
 618          * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
 619          * that doubling rto each time is the least we can get away with.
 620          * In KA9Q, Karn uses this for the first few times, and then
 621          * goes to quadratic.  netBSD doubles, but only goes up to *64,
 622          * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
 623          * defined in the protocol as the maximum possible RTT.  I guess
 624          * we'll have to use something other than TCP to talk to the
 625          * University of Mars.
 626          *
 627          * PAWS allows us longer timeouts and large windows, so once
 628          * implemented ftp to mars will work nicely. We will have to fix
 629          * the 120 second clamps though!
 630          */
 631 
 632         sk->retransmits++;
 633         sk->prot->retransmits++;
 634         sk->backoff++;
 635         sk->rto = min(sk->rto << 1, 120*HZ);
 636         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
 637 }
 638 
 639 
 640 /*
 641  *      A timer event has trigger a tcp retransmit timeout. The
 642  *      socket xmit queue is ready and set up to send. Because
 643  *      the ack receive code keeps the queue straight we do
 644  *      nothing clever here.
 645  */
 646 
 647 static void tcp_retransmit(struct sock *sk, int all)
     /*  */
 648 {
 649         if (all) 
 650         {
 651                 tcp_retransmit_time(sk, all);
 652                 return;
 653         }
 654 
 655         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 656         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 657         sk->cong_count = 0;
 658 
 659         sk->cong_window = 1;
 660 
 661         /* Do the actual retransmit. */
 662         tcp_retransmit_time(sk, all);
 663 }
 664 
 665 /*
 666  *      A write timeout has occurred. Process the after effects.
 667  */
 668 
 669 static int tcp_write_timeout(struct sock *sk)
     /*  */
 670 {
 671         /*
 672          *      Look for a 'soft' timeout.
 673          */
 674         if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
 675                 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
 676         {
 677                 /*
 678                  *      Attempt to recover if arp has changed (unlikely!) or
 679                  *      a route has shifted (not supported prior to 1.3).
 680                  */
 681                 arp_destroy (sk->daddr, 0);
 682                 /*ip_route_check (sk->daddr);*/
 683         }
 684         
 685         /*
 686          *      Have we tried to SYN too many times (repent repent 8))
 687          */
 688          
 689         if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
 690         {
 691                 sk->err=ETIMEDOUT;
 692                 sk->error_report(sk);
 693                 del_timer(&sk->retransmit_timer);
 694                 tcp_statistics.TcpAttemptFails++;       /* Is this right ??? - FIXME - */
 695                 tcp_set_state(sk,TCP_CLOSE);
 696                 /* Don't FIN, we got nothing back */
 697                 release_sock(sk);
 698                 return 0;
 699         }
 700         /*
 701          *      Has it gone just too far ?
 702          */
 703         if (sk->retransmits > TCP_RETR2) 
 704         {
 705                 sk->err = ETIMEDOUT;
 706                 sk->error_report(sk);
 707                 del_timer(&sk->retransmit_timer);
 708                 /*
 709                  *      Time wait the socket 
 710                  */
 711                 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) 
 712                 {
 713                         tcp_set_state(sk,TCP_TIME_WAIT);
 714                         reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 715                 }
 716                 else
 717                 {
 718                         /*
 719                          *      Clean up time.
 720                          */
 721                         tcp_set_state(sk, TCP_CLOSE);
 722                         release_sock(sk);
 723                         return 0;
 724                 }
 725         }
 726         return 1;
 727 }
 728 
 729 /*
 730  *      The TCP retransmit timer. This lacks a few small details.
 731  *
 732  *      1.      An initial rtt timeout on the probe0 should cause what we can
 733  *              of the first write queue buffer to be split and sent.
 734  *      2.      On a 'major timeout' as defined by RFC1122 we shouldn't report
 735  *              ETIMEDOUT if we know an additional 'soft' error caused this.
 736  *              tcp_err should save a 'soft error' for us.
 737  */
 738 
 739 static void retransmit_timer(unsigned long data)
     /*  */
 740 {
 741         struct sock *sk = (struct sock*)data;
 742         int why = sk->ip_xmit_timeout;
 743 
 744         /* 
 745          * only process if socket is not in use
 746          */
 747 
 748         cli();
 749         if (sk->inuse || in_bh) 
 750         {
 751                 /* Try again in 1 second */
 752                 sk->retransmit_timer.expires = jiffies+HZ;
 753                 add_timer(&sk->retransmit_timer);
 754                 sti();
 755                 return;
 756         }
 757 
 758         sk->inuse = 1;
 759         sti();
 760 
 761         /* Always see if we need to send an ack. */
 762 
 763         if (sk->ack_backlog && !sk->zapped) 
 764         {
 765                 sk->prot->read_wakeup (sk);
 766                 if (! sk->dead)
 767                         sk->data_ready(sk,0);
 768         }
 769 
 770         /* Now we need to figure out why the socket was on the timer. */
 771 
 772         switch (why) 
 773         {
 774                 /* Window probing */
 775                 case TIME_PROBE0:
 776                         tcp_send_probe0(sk);
 777                         tcp_write_timeout(sk);
 778                         break;
 779                 /* Retransmitting */
 780                 case TIME_WRITE:
 781                         /* It could be we got here because we needed to send an ack.
 782                          * So we need to check for that.
 783                          */
 784                 {
 785                         struct sk_buff *skb;
 786                         unsigned long flags;
 787 
 788                         save_flags(flags);
 789                         cli();
 790                         skb = sk->send_head;
 791                         if (!skb) 
 792                         {
 793                                 restore_flags(flags);
 794                         } 
 795                         else 
 796                         {
 797                                 /*
 798                                  *      Kicked by a delayed ack. Reset timer
 799                                  *      correctly now
 800                                  */
 801                                 if (jiffies < skb->when + sk->rto) 
 802                                 {
 803                                         reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
 804                                         restore_flags(flags);
 805                                         break;
 806                                 }
 807                                 restore_flags(flags);
 808                                 /*
 809                                  *      Retransmission
 810                                  */
 811                                 sk->prot->retransmit (sk, 0);
 812                                 tcp_write_timeout(sk);
 813                         }
 814                         break;
 815                 }
 816                 /* Sending Keepalives */
 817                 case TIME_KEEPOPEN:
 818                         /* 
 819                          * this reset_timer() call is a hack, this is not
 820                          * how KEEPOPEN is supposed to work.
 821                          */
 822                         reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
 823 
 824                         /* Send something to keep the connection open. */
 825                         if (sk->prot->write_wakeup)
 826                                   sk->prot->write_wakeup (sk);
 827                         sk->retransmits++;
 828                         sk->prot->retransmits++;
 829                         tcp_write_timeout(sk);
 830                         break;
 831                 default:
 832                         printk ("rexmit_timer: timer expired - reason unknown\n");
 833                         break;
 834         }
 835         release_sock(sk);
 836 }
 837 
 838 /*
 839  * This routine is called by the ICMP module when it gets some
 840  * sort of error condition.  If err < 0 then the socket should
 841  * be closed and the error returned to the user.  If err > 0
 842  * it's just the icmp type << 8 | icmp code.  After adjustment
 843  * header points to the first 8 bytes of the tcp header.  We need
 844  * to find the appropriate port.
 845  */
 846 
 847 void tcp_err(int err, unsigned char *header, unsigned long daddr,
     /*  */
 848         unsigned long saddr, struct inet_protocol *protocol)
 849 {
 850         struct tcphdr *th;
 851         struct sock *sk;
 852         struct iphdr *iph=(struct iphdr *)header;
 853   
 854         header+=4*iph->ihl;
 855    
 856 
 857         th =(struct tcphdr *)header;
 858         sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
 859 
 860         if (sk == NULL) 
 861                 return;
 862   
 863         if(err<0)
 864         {
 865                 sk->err = -err;
 866                 sk->error_report(sk);
 867                 return;
 868         }
 869 
 870         if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8)) 
 871         {
 872                 /*
 873                  * FIXME:
 874                  * For now we will just trigger a linear backoff.
 875                  * The slow start code should cause a real backoff here.
 876                  */
 877                 if (sk->cong_window > 4)
 878                         sk->cong_window--;
 879                 return;
 880         }
 881 
 882 /*      sk->err = icmp_err_convert[err & 0xff].errno;  -- moved as TCP should hide non fatals internally (and does) */
 883 
 884         /*
 885          * If we've already connected we will keep trying
 886          * until we time out, or the user gives up.
 887          */
 888 
 889         if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT) 
 890         {
 891                 if (sk->state == TCP_SYN_SENT) 
 892                 {
 893                         tcp_statistics.TcpAttemptFails++;
 894                         tcp_set_state(sk,TCP_CLOSE);
 895                         sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
 896                 }
 897                 sk->err = icmp_err_convert[err & 0xff].errno;           
 898         }
 899         return;
 900 }
 901 
 902 
 903 /*
 904  *      Walk down the receive queue counting readable data until we hit the end or we find a gap
 905  *      in the received data queue (ie a frame missing that needs sending to us). Not
 906  *      sorting using two queues as data arrives makes life so much harder.
 907  */
 908 
 909 static int tcp_readable(struct sock *sk)
     /*  */
 910 {
 911         unsigned long counted;
 912         unsigned long amount;
 913         struct sk_buff *skb;
 914         int sum;
 915         unsigned long flags;
 916 
 917         if(sk && sk->debug)
 918                 printk("tcp_readable: %p - ",sk);
 919 
 920         save_flags(flags);
 921         cli();
 922         if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
 923         {
 924                 restore_flags(flags);
 925                 if(sk && sk->debug) 
 926                         printk("empty\n");
 927                 return(0);
 928         }
 929   
 930         counted = sk->copied_seq;       /* Where we are at the moment */
 931         amount = 0;
 932   
 933         /* 
 934          *      Do until a push or until we are out of data. 
 935          */
 936          
 937         do 
 938         {
 939                 if (before(counted, skb->h.th->seq))    /* Found a hole so stops here */
 940                         break;
 941                 sum = skb->len -(counted - skb->h.th->seq);     /* Length - header but start from where we are up to (avoid overlaps) */
 942                 if (skb->h.th->syn)
 943                         sum++;
 944                 if (sum > 0) 
 945                 {                                       /* Add it up, move on */
 946                         amount += sum;
 947                         if (skb->h.th->syn) 
 948                                 amount--;
 949                         counted += sum;
 950                 }
 951                 /*
 952                  * Don't count urg data ... but do it in the right place!
 953                  * Consider: "old_data (ptr is here) URG PUSH data"
 954                  * The old code would stop at the first push because
 955                  * it counted the urg (amount==1) and then does amount--
 956                  * *after* the loop.  This means tcp_readable() always
 957                  * returned zero if any URG PUSH was in the queue, even
 958                  * though there was normal data available. If we subtract
 959                  * the urg data right here, we even get it to work for more
 960                  * than one URG PUSH skb without normal data.
 961                  * This means that select() finally works now with urg data
 962                  * in the queue.  Note that rlogin was never affected
 963                  * because it doesn't use select(); it uses two processes
 964                  * and a blocking read().  And the queue scan in tcp_read()
 965                  * was correct.  Mike <pall@rz.uni-karlsruhe.de>
 966                  */
 967                 if (skb->h.th->urg)
 968                         amount--;       /* don't count urg data */
 969                 if (amount && skb->h.th->psh) break;
 970                 skb = skb->next;
 971         }
 972         while(skb != (struct sk_buff *)&sk->receive_queue);
 973 
 974         restore_flags(flags);
 975         if(sk->debug)
 976                 printk("got %lu bytes.\n",amount);
 977         return(amount);
 978 }
 979 
 980 /*
 981  * LISTEN is a special case for select..
 982  */
 983 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /*  */
 984 {
 985         if (sel_type == SEL_IN) {
 986                 int retval;
 987 
 988                 sk->inuse = 1;
 989                 retval = (tcp_find_established(sk) != NULL);
 990                 release_sock(sk);
 991                 if (!retval)
 992                         select_wait(&master_select_wakeup,wait);
 993                 return retval;
 994         }
 995         return 0;
 996 }
 997 
 998 
 999 /*
1000  *      Wait for a TCP event.
1001  *
1002  *      Note that we don't need to set "sk->inuse", as the upper select layers
1003  *      take care of normal races (between the test and the event) and we don't
1004  *      go look at any of the socket buffers directly.
1005  */
1006 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
     /*  */
1007 {
1008         if (sk->state == TCP_LISTEN)
1009                 return tcp_listen_select(sk, sel_type, wait);
1010 
1011         switch(sel_type) {
1012         case SEL_IN:
1013                 if (sk->err)
1014                         return 1;
1015                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1016                         break;
1017 
1018                 if (sk->shutdown & RCV_SHUTDOWN)
1019                         return 1;
1020                         
1021                 if (sk->acked_seq == sk->copied_seq)
1022                         break;
1023 
1024                 if (sk->urg_seq != sk->copied_seq ||
1025                     sk->acked_seq != sk->copied_seq+1 ||
1026                     sk->urginline || !sk->urg_data)
1027                         return 1;
1028                 break;
1029 
1030         case SEL_OUT:
1031                 if (sk->err)
1032                         return 1;
1033                 if (sk->shutdown & SEND_SHUTDOWN) 
1034                         return 0;
1035                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1036                         break;
1037                 /*
1038                  * This is now right thanks to a small fix
1039                  * by Matt Dillon.
1040                  */
1041 
1042                 if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
1043                         break;
1044                 return 1;
1045 
1046         case SEL_EX:
1047                 if (sk->urg_data)
1048                         return 1;
1049                 break;
1050         }
1051         select_wait(sk->sleep, wait);
1052         return 0;
1053 }
1054 
1055 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
     /*  */
1056 {
1057         int err;
1058         switch(cmd) 
1059         {
1060 
1061                 case TIOCINQ:
1062 #ifdef FIXME    /* FIXME: */
1063                 case FIONREAD:
1064 #endif
1065                 {
1066                         unsigned long amount;
1067 
1068                         if (sk->state == TCP_LISTEN) 
1069                                 return(-EINVAL);
1070 
1071                         sk->inuse = 1;
1072                         amount = tcp_readable(sk);
1073                         release_sock(sk);
1074                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1075                         if(err)
1076                                 return err;
1077                         put_user(amount, (int *)arg);
1078                         return(0);
1079                 }
1080                 case SIOCATMARK:
1081                 {
1082                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
1083 
1084                         err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1085                         if (err)
1086                                 return err;
1087                         put_user(answ,(int *) arg);
1088                         return(0);
1089                 }
1090                 case TIOCOUTQ:
1091                 {
1092                         unsigned long amount;
1093 
1094                         if (sk->state == TCP_LISTEN) return(-EINVAL);
1095                         amount = sk->prot->wspace(sk);
1096                         err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1097                         if(err)
1098                                 return err;
1099                         put_user(amount, (int *)arg);
1100                         return(0);
1101                 }
1102                 default:
1103                         return(-EINVAL);
1104         }
1105 }
1106 
1107 
1108 /*
1109  *      This routine computes a TCP checksum. 
1110  *
1111  *      Modified January 1995 from a go-faster DOS routine by
1112  *      Jorge Cwik <jorge@laser.satlink.net>
1113  */
1114  
1115 unsigned short tcp_check(struct tcphdr *th, int len,
     /*  */
1116           unsigned long saddr, unsigned long daddr, unsigned long base)
1117 {     
1118         return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
1119 }
1120 
1121 
1122 
1123 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
     /*  */
1124                 unsigned long daddr, int len, struct sock *sk)
1125 {
1126         th->check = 0;
1127         th->check = tcp_check(th, len, saddr, daddr,
1128                 csum_partial((char *)th,len,0));
1129         return;
1130 }
1131 
1132 /*
1133  *      This is the main buffer sending routine. We queue the buffer
1134  *      having checked it is sane seeming.
1135  */
1136  
1137 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
     /*  */
1138 {
1139         int size;
1140         struct tcphdr * th = skb->h.th;
1141 
1142         /*
1143          *      length of packet (not counting length of pre-tcp headers) 
1144          */
1145          
1146         size = skb->len - ((unsigned char *) th - skb->data);
1147 
1148         /*
1149          *      Sanity check it.. 
1150          */
1151          
1152         if (size < sizeof(struct tcphdr) || size > skb->len) 
1153         {
1154                 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1155                         skb, skb->data, th, skb->len);
1156                 kfree_skb(skb, FREE_WRITE);
1157                 return;
1158         }
1159 
1160         /*
1161          *      If we have queued a header size packet.. (these crash a few
1162          *      tcp stacks if ack is not set)
1163          */
1164          
1165         if (size == sizeof(struct tcphdr)) 
1166         {
1167                 /* If it's got a syn or fin it's notionally included in the size..*/
1168                 if(!th->syn && !th->fin) 
1169                 {
1170                         printk("tcp_send_skb: attempt to queue a bogon.\n");
1171                         kfree_skb(skb,FREE_WRITE);
1172                         return;
1173                 }
1174         }
1175 
1176         /*
1177          *      Actual processing.
1178          */
1179          
1180         tcp_statistics.TcpOutSegs++;  
1181         skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1182         
1183         /*
1184          *      We must queue if
1185          *
1186          *      a) The right edge of this frame exceeds the window
1187          *      b) We are retransmitting (Nagle's rule)
1188          *      c) We have too many packets 'in flight'
1189          */
1190          
1191         if (after(skb->h.seq, sk->window_seq) ||
1192             (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1193              sk->packets_out >= sk->cong_window) 
1194         {
1195                 /* checksum will be supplied by tcp_write_xmit.  So
1196                  * we shouldn't need to set it at all.  I'm being paranoid */
1197                 th->check = 0;
1198                 if (skb->next != NULL) 
1199                 {
1200                         printk("tcp_send_partial: next != NULL\n");
1201                         skb_unlink(skb);
1202                 }
1203                 skb_queue_tail(&sk->write_queue, skb);
1204                 
1205                 /*
1206                  *      If we don't fit we have to start the zero window
1207                  *      probes. This is broken - we really need to do a partial
1208                  *      send _first_ (This is what causes the Cisco and PC/TCP
1209                  *      grief).
1210                  */
1211                  
1212                 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1213                     sk->send_head == NULL && sk->ack_backlog == 0)
1214                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1215         } 
1216         else 
1217         {
1218                 /*
1219                  *      This is going straight out
1220                  */
1221                  
1222                 th->ack_seq = ntohl(sk->acked_seq);
1223                 th->window = ntohs(tcp_select_window(sk));
1224 
1225                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1226 
1227                 sk->sent_seq = sk->write_seq;
1228                 
1229                 /*
1230                  *      This is mad. The tcp retransmit queue is put together
1231                  *      by the ip layer. This causes half the problems with
1232                  *      unroutable FIN's and other things.
1233                  */
1234                  
1235                 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1236                 
1237                 /*
1238                  *      Set for next retransmit based on expected ACK time.
1239                  *      FIXME: We set this every time which means our 
1240                  *      retransmits are really about a window behind.
1241                  */
1242 
1243                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1244         }
1245 }
1246 
1247 /*
1248  *      Locking problems lead us to a messy situation where we can have
1249  *      multiple partially complete buffers queued up. This is really bad
1250  *      as we don't want to be sending partial buffers. Fix this with
1251  *      a semaphore or similar to lock tcp_write per socket.
1252  *
1253  *      These routines are pretty self descriptive.
1254  */
1255  
1256 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
     /*  */
1257 {
1258         struct sk_buff * skb;
1259         unsigned long flags;
1260 
1261         save_flags(flags);
1262         cli();
1263         skb = sk->partial;
1264         if (skb) {
1265                 sk->partial = NULL;
1266                 del_timer(&sk->partial_timer);
1267         }
1268         restore_flags(flags);
1269         return skb;
1270 }
1271 
1272 /*
1273  *      Empty the partial queue
1274  */
1275  
1276 static void tcp_send_partial(struct sock *sk)
     /*  */
1277 {
1278         struct sk_buff *skb;
1279 
1280         if (sk == NULL)
1281                 return;
1282         while ((skb = tcp_dequeue_partial(sk)) != NULL)
1283                 tcp_send_skb(sk, skb);
1284 }
1285 
1286 /*
1287  *      Queue a partial frame
1288  */
1289  
1290 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
     /*  */
1291 {
1292         struct sk_buff * tmp;
1293         unsigned long flags;
1294 
1295         save_flags(flags);
1296         cli();
1297         tmp = sk->partial;
1298         if (tmp)
1299                 del_timer(&sk->partial_timer);
1300         sk->partial = skb;
1301         init_timer(&sk->partial_timer);
1302         /*
1303          *      Wait up to 1 second for the buffer to fill.
1304          */
1305         sk->partial_timer.expires = jiffies+HZ;
1306         sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
1307         sk->partial_timer.data = (unsigned long) sk;
1308         add_timer(&sk->partial_timer);
1309         restore_flags(flags);
1310         if (tmp)
1311                 tcp_send_skb(sk, tmp);
1312 }
1313 
1314 
1315 /*
1316  *      This routine sends an ack and also updates the window. 
1317  */
1318  
1319 static void tcp_send_ack(u32 sequence, u32 ack,
     /*  */
1320              struct sock *sk,
1321              struct tcphdr *th, unsigned long daddr)
1322 {
1323         struct sk_buff *buff;
1324         struct tcphdr *t1;
1325         struct device *dev = NULL;
1326         int tmp;
1327 
1328         if(sk->zapped)
1329                 return;         /* We have been reset, we may not send again */
1330                 
1331         /*
1332          * We need to grab some memory, and put together an ack,
1333          * and then put it into the queue to be sent.
1334          */
1335 
1336         buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1337         if (buff == NULL) 
1338         {
1339                 /* 
1340                  *      Force it to send an ack. We don't have to do this
1341                  *      (ACK is unreliable) but it's much better use of 
1342                  *      bandwidth on slow links to send a spare ack than
1343                  *      resend packets. 
1344                  */
1345                  
1346                 sk->ack_backlog++;
1347                 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) 
1348                 {
1349                         reset_xmit_timer(sk, TIME_WRITE, HZ);
1350                 }
1351                 return;
1352         }
1353 
1354         /*
1355          *      Assemble a suitable TCP frame
1356          */
1357          
1358         buff->sk = sk;
1359         buff->localroute = sk->localroute;
1360 
1361         /* 
1362          *      Put in the IP header and routing stuff. 
1363          */
1364          
1365         tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1366                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1367         if (tmp < 0) 
1368         {
1369                 buff->free = 1;
1370                 sk->prot->wfree(sk, buff);
1371                 return;
1372         }
1373         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
1374 
1375         memcpy(t1, th, sizeof(*t1));
1376 
1377         /*
1378          *      Swap the send and the receive. 
1379          */
1380          
1381         t1->dest = th->source;
1382         t1->source = th->dest;
1383         t1->seq = ntohl(sequence);
1384         t1->ack = 1;
1385         sk->window = tcp_select_window(sk);
1386         t1->window = ntohs(sk->window);
1387         t1->res1 = 0;
1388         t1->res2 = 0;
1389         t1->rst = 0;
1390         t1->urg = 0;
1391         t1->syn = 0;
1392         t1->psh = 0;
1393         t1->fin = 0;
1394         
1395         /*
1396          *      If we have nothing queued for transmit and the transmit timer
1397          *      is on we are just doing an ACK timeout and need to switch
1398          *      to a keepalive.
1399          */
1400          
1401         if (ack == sk->acked_seq) 
1402         {
1403                 sk->ack_backlog = 0;
1404                 sk->bytes_rcv = 0;
1405                 sk->ack_timed = 0;
1406                 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
1407                                   && sk->ip_xmit_timeout == TIME_WRITE) 
1408                 {
1409                         if(sk->keepopen) {
1410                                 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1411                         } else {
1412                                 delete_timer(sk);
1413                         }
1414                 }
1415         }
1416         
1417         /*
1418          *      Fill in the packet and send it
1419          */
1420          
1421         t1->ack_seq = ntohl(ack);
1422         t1->doff = sizeof(*t1)/4;
1423         tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1424         if (sk->debug)
1425                  printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1426         tcp_statistics.TcpOutSegs++;
1427         sk->prot->queue_xmit(sk, dev, buff, 1);
1428 }
1429 
1430 
1431 /* 
1432  *      This routine builds a generic TCP header. 
1433  */
1434  
1435 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
     /*  */
1436 {
1437 
1438         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1439         th->seq = htonl(sk->write_seq);
1440         th->psh =(push == 0) ? 1 : 0;
1441         th->doff = sizeof(*th)/4;
1442         th->ack = 1;
1443         th->fin = 0;
1444         sk->ack_backlog = 0;
1445         sk->bytes_rcv = 0;
1446         sk->ack_timed = 0;
1447         th->ack_seq = htonl(sk->acked_seq);
1448         sk->window = tcp_select_window(sk);
1449         th->window = htons(sk->window);
1450 
1451         return(sizeof(*th));
1452 }
1453 
1454 /*
1455  *      This routine copies from a user buffer into a socket,
1456  *      and starts the transmit system.
 *
 *      sk       - socket to send on. It is marked in use on entry and
 *                 release_sock() is called before every return.
 *      from     - user-space source buffer (read with memcpy_fromfs).
 *      len      - number of bytes the caller wants to send.
 *      nonblock - if non-zero, return -EAGAIN instead of sleeping while
 *                 the connection is still being set up or while no
 *                 buffer memory is available.
 *      flags    - MSG_OOB marks each newly built segment urgent and
 *                 forces a pending partial packet out unchanged;
 *                 MSG_DONTROUTE is or'ed into skb->localroute.
 *
 *      Returns the number of bytes queued. If nothing was queued it
 *      returns a negative errno instead: the pending sk->err, -EPIPE,
 *      -EAGAIN, -ERESTARTSYS, or the failure code of build_header().
1457  */
1458 
1459 static int tcp_write(struct sock *sk, const unsigned char *from,
     /*  */
1460           int len, int nonblock, unsigned flags)
1461 {
1462         int copied = 0;
1463         int copy;
1464         int tmp;
1465         struct sk_buff *skb;
1466         struct sk_buff *send_tmp;
1467         struct proto *prot;
1468         struct device *dev = NULL;
1469 
1470         sk->inuse=1;
1471         prot = sk->prot;
1472         while(len > 0) 
1473         {
1474                 if (sk->err) 
1475                 {                       /* Stop on an error */
1476                         release_sock(sk);
1477                         if (copied) 
1478                                 return(copied);
1479                         tmp = -sk->err;
1480                         sk->err = 0;
1481                         return(tmp);
1482                 }
1483 
1484                 /*
1485                  *      First thing we do is make sure that we are established. 
1486                  */
1487         
                /*
                 *      Sending side already shut down. If some data was
                 *      queued, sk->err is left at EPIPE so that the next
                 *      call reports it; otherwise it is cleared and
                 *      -EPIPE is returned now.
                 *      NOTE(review): sk->err is written after
                 *      release_sock() here - confirm this is intended.
                 */
1488                 if (sk->shutdown & SEND_SHUTDOWN) 
1489                 {
1490                         release_sock(sk);
1491                         sk->err = EPIPE;
1492                         if (copied) 
1493                                 return(copied);
1494                         sk->err = 0;
1495                         return(-EPIPE);
1496                 }
1497 
1498                 /* 
1499                  *      Wait for a connection to finish.
1500                  */
1501         
1502                 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
1503                 {
1504                         if (sk->err) 
1505                         {
1506                                 release_sock(sk);
1507                                 if (copied) 
1508                                         return(copied);
1509                                 tmp = -sk->err;
1510                                 sk->err = 0;
1511                                 return(tmp);
1512                         }
1513 
                        /*
                         *      Neither connected nor in the middle of
                         *      connecting: the write fails. SIGPIPE is
                         *      only raised when sk->keepopen is set.
                         */
1514                         if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
1515                         {
1516                                 release_sock(sk);
1517                                 if (copied) 
1518                                         return(copied);
1519 
1520                                 if (sk->err) 
1521                                 {
1522                                         tmp = -sk->err;
1523                                         sk->err = 0;
1524                                         return(tmp);
1525                                 }
1526 
1527                                 if (sk->keepopen) 
1528                                 {
1529                                         send_sig(SIGPIPE, current, 0);
1530                                 }
1531                                 return(-EPIPE);
1532                         }
1533 
1534                         if (nonblock || copied) 
1535                         {
1536                                 release_sock(sk);
1537                                 if (copied) 
1538                                         return(copied);
1539                                 return(-EAGAIN);
1540                         }
1541 
                        /*
                         *      Release the socket, then re-test the state
                         *      and sleep with interrupts disabled -
                         *      presumably so that a wakeup arriving
                         *      between the test and the sleep is not
                         *      lost (TODO confirm).
                         */
1542                         release_sock(sk);
1543                         cli();
1544                 
1545                         if (sk->state != TCP_ESTABLISHED &&
1546                                 sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
1547                         {
1548                                 interruptible_sleep_on(sk->sleep);
1549                                 if (current->signal & ~current->blocked) 
1550                                 {
1551                                         sti();
1552                                         if (copied) 
1553                                                 return(copied);
1554                                         return(-ERESTARTSYS);
1555                                 }
1556                         }
1557                         sk->inuse = 1;
1558                         sti();
1559                 }
1560 
1561         /*
1562          * The following code can result in copy <= 0 if sk->mss is ever
1563          * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
1564          * sk->mtu is constant once SYN processing is finished.  I.e. we
1565          * had better not get here until we've seen his SYN and at least one
1566          * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
1567          * But ESTABLISHED should guarantee that.  sk->max_window is by definition
1568          * non-decreasing.  Note that any ioctl to set user_mss must be done
1569          * before the exchange of SYN's.  If the initial ack from the other
1570          * end has a window of 0, max_window and thus mss will both be 0.
1571          */
1572 
1573         /* 
1574          *      Now we need to check if we have a half built packet. 
1575          */
1576 
1577                 if ((skb = tcp_dequeue_partial(sk)) != NULL) 
1578                 {
1579                         int hdrlen;
1580 
1581                          /* IP header + TCP header */
1582                         hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1583                                  + sizeof(struct tcphdr);
1584         
1585                         /* Add more stuff to the end of skb->len */
1586                         if (!(flags & MSG_OOB)) 
1587                         {
                                /* Room left in the partial packet, capped by what the caller still has. */
1588                                 copy = min(sk->mss - (skb->len - hdrlen), len);
1589                                 /* FIXME: this is really a bug. */
1590                                 if (copy <= 0) 
1591                                 {
1592                                         printk("TCP: **bug**: \"copy\" <= 0!!\n");
1593                                         copy = 0;
1594                                 }
1595           
1596                                 memcpy_fromfs(skb_put(skb,copy), from, copy);
1597                                 from += copy;
1598                                 copied += copy;
1599                                 len -= copy;
1600                                 sk->write_seq += copy;
1601                         }
                        /*
                         *      Send the packet now if it is full, if urgent
                         *      data is about to follow, or if nothing is in
                         *      flight; otherwise keep it as the partial
                         *      packet.
                         */
1602                         if ((skb->len - hdrlen) >= sk->mss ||
1603                                 (flags & MSG_OOB) || !sk->packets_out)
1604                                 tcp_send_skb(sk, skb);
1605                         else
1606                                 tcp_enqueue_partial(skb, sk);
1607                         continue;
1608                 }
1609 
1610         /*
1611          * We also need to worry about the window.
1612          * If window < 1/2 the maximum window we've seen from this
1613          *   host, don't use it.  This is sender side
1614          *   silly window prevention, as specified in RFC1122.
1615          *   (Note that this is different than earlier versions of
1616          *   SWS prevention, e.g. RFC813.).  What we actually do is 
1617          *   use the whole MSS.  Since this results in the right
1618          *   edge of the packet being outside the window, it will
1619          *   be queued for later rather than sent.
1620          */
1621 
1622                 copy = sk->window_seq - sk->write_seq;
1623                 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1624                         copy = sk->mss;
1625                 if (copy > len)
1626                         copy = len;
1627 
1628         /*
1629          *      We should really check the window here also. 
1630          */
1631          
                /*
                 *      A short, non-urgent segment gets a buffer sized for
                 *      a full packet and is remembered in send_tmp, so
                 *      that it can become the partial packet and be
                 *      appended to later.
                 */
1632                 send_tmp = NULL;
1633                 if (copy < sk->mss && !(flags & MSG_OOB)) 
1634                 {
1635                         /*
1636                          *      We will release the socket in case we sleep here. 
1637                          */
1638                         release_sock(sk);
1639                         /*
1640                          *      NB: following must be mtu, because mss can be increased.
1641                          *      mss is always <= mtu 
1642                          */
1643                         skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1644                         sk->inuse = 1;
1645                         send_tmp = skb;
1646                 } 
1647                 else 
1648                 {
1649                         /*
1650                          *      We will release the socket in case we sleep here. 
1651                          */
1652                         release_sock(sk);
1653                         skb = prot->wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1654                         sk->inuse = 1;
1655                 }
1656 
1657                 /*
1658                  *      If we didn't get any memory, we need to sleep. 
1659                  */
1660 
1661                 if (skb == NULL) 
1662                 {
1663                         sk->socket->flags |= SO_NOSPACE;
1664                         if (nonblock) 
1665                         {
1666                                 release_sock(sk);
1667                                 if (copied) 
1668                                         return(copied);
1669                                 return(-EAGAIN);
1670                         }
1671 
1672                         /*
1673                          *      FIXME: here is another race condition. 
1674                          */
1675 
                        /*
                         *      Remember the write memory in use, and only
                         *      sleep if it has not dropped by the time
                         *      interrupts are off.
                         */
1676                         tmp = sk->wmem_alloc;
1677                         release_sock(sk);
1678                         cli();
1679                         /*
1680                          *      Again we will try to avoid it. 
1681                          */
1682                         if (tmp <= sk->wmem_alloc &&
1683                                   (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1684                                 && sk->err == 0) 
1685                         {
1686                                 sk->socket->flags &= ~SO_NOSPACE;
1687                                 interruptible_sleep_on(sk->sleep);
1688                                 if (current->signal & ~current->blocked) 
1689                                 {
1690                                         sti();
1691                                         if (copied) 
1692                                                 return(copied);
1693                                         return(-ERESTARTSYS);
1694                                 }
1695                         }
1696                         sk->inuse = 1;
1697                         sti();
1698                         continue;
1699                 }
1700 
1701                 skb->sk = sk;
1702                 skb->free = 0;
1703                 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1704         
1705                 /*
1706                  * FIXME: we need to optimize this.
1707                  * Perhaps some hints here would be good.
1708                  */
1709                 
1710                 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1711                                  IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
1712                 if (tmp < 0 ) 
1713                 {
1714                         prot->wfree(sk, skb);
1715                         release_sock(sk);
1716                         if (copied) 
1717                                 return(copied);
1718                         return(tmp);
1719                 }
1720                 skb->dev = dev;
1721                 skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
                /*
                 *      The last argument is the data left over after this
                 *      segment; tcp_build_header() sets PSH when it is 0.
                 *      NOTE(review): tcp_build_header() as defined in this
                 *      file always returns sizeof(*th), so the error
                 *      branch below looks unreachable.
                 */
1722                 tmp = tcp_build_header(skb->h.th, sk, len-copy);
1723                 if (tmp < 0) 
1724                 {
1725                         prot->wfree(sk, skb);
1726                         release_sock(sk);
1727                         if (copied) 
1728                                 return(copied);
1729                         return(tmp);
1730                 }
1731 
                /*
                 *      NOTE(review): a host value is being stored in the
                 *      wire header, so htons() would be the conventional
                 *      spelling of the conversion below.
                 */
1732                 if (flags & MSG_OOB) 
1733                 {
1734                         skb->h.th->urg = 1;
1735                         skb->h.th->urg_ptr = ntohs(copy);
1736                 }
1737 
1738                 memcpy_fromfs(skb_put(skb,copy), from, copy);
1739                 
1740                 from += copy;
1741                 copied += copy;
1742                 len -= copy;
1743                 skb->free = 0;
1744                 sk->write_seq += copy;
1745         
                /* A short segment is held back as the partial packet while data is in flight. */
1746                 if (send_tmp != NULL && sk->packets_out) 
1747                 {
1748                         tcp_enqueue_partial(send_tmp, sk);
1749                         continue;
1750                 }
1751                 tcp_send_skb(sk, skb);
1752         }
1753         sk->err = 0;
1754 
1755 /*
1756  *      Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
1757  *      interactive fast network servers. It's meant to be on and
1758  *      it really improves the throughput though not the echo time
1759  *      on my slow slip link - Alan
1760  */
1761 
1762 /*
1763  *      Avoid possible race on send_tmp - c/o Johannes Stille 
1764  */
1765  
1766         if(sk->partial && ((!sk->packets_out) 
1767      /* If not nagling we can send on the before case too.. */
1768               || (sk->nonagle && before(sk->write_seq , sk->window_seq))
1769         ))
1770                 tcp_send_partial(sk);
1771 
1772         release_sock(sk);
1773         return(copied);
1774 }
1775 
1776 /*
1777  *      This is just a wrapper. 
1778  */
1779 
1780 static int tcp_sendto(struct sock *sk, const unsigned char *from,
     /*  */
1781            int len, int nonblock, unsigned flags,
1782            struct sockaddr_in *addr, int addr_len)
1783 {
1784         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1785                 return -EINVAL;
1786         if (sk->state == TCP_CLOSE)
1787                 return -ENOTCONN;
1788         if (addr_len < sizeof(*addr))
1789                 return -EINVAL;
1790         if (addr->sin_family && addr->sin_family != AF_INET) 
1791                 return -EINVAL;
1792         if (addr->sin_port != sk->dummy_th.dest) 
1793                 return -EISCONN;
1794         if (addr->sin_addr.s_addr != sk->daddr) 
1795                 return -EISCONN;
1796         return tcp_write(sk, from, len, nonblock, flags);
1797 }
1798 
1799 
1800 /*
1801  *      Send an ack if one is backlogged at this point. Ought to merge
1802  *      this with tcp_send_ack().
1803  */
1804  
1805 static void tcp_read_wakeup(struct sock *sk)
     /*  */
1806 {
1807         int tmp;
1808         struct device *dev = NULL;
1809         struct tcphdr *t1;
1810         struct sk_buff *buff;
1811 
1812         if (!sk->ack_backlog) 
1813                 return;
1814 
1815         /*
1816          * If we're closed, don't send an ack, or we'll get a RST
1817          * from the closed destination.
1818          */
1819         if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
1820                 return; 
1821 
1822         /*
1823          * FIXME: we need to put code here to prevent this routine from
1824          * being called.  Being called once in a while is ok, so only check
1825          * if this is the second time in a row.
1826          */
1827 
1828         /*
1829          * We need to grab some memory, and put together an ack,
1830          * and then put it into the queue to be sent.
1831          */
1832 
1833         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
1834         if (buff == NULL) 
1835         {
1836                 /* Try again real soon. */
1837                 reset_xmit_timer(sk, TIME_WRITE, HZ);
1838                 return;
1839         }
1840 
1841         buff->sk = sk;
1842         buff->localroute = sk->localroute;
1843         
1844         /*
1845          *      Put in the IP header and routing stuff. 
1846          */
1847 
1848         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1849                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1850         if (tmp < 0) 
1851         {
1852                 buff->free = 1;
1853                 sk->prot->wfree(sk, buff);
1854                 return;
1855         }
1856 
1857         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
1858 
1859         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
1860         t1->seq = htonl(sk->sent_seq);
1861         t1->ack = 1;
1862         t1->res1 = 0;
1863         t1->res2 = 0;
1864         t1->rst = 0;
1865         t1->urg = 0;
1866         t1->syn = 0;
1867         t1->psh = 0;
1868         sk->ack_backlog = 0;
1869         sk->bytes_rcv = 0;
1870         sk->window = tcp_select_window(sk);
1871         t1->window = ntohs(sk->window);
1872         t1->ack_seq = ntohl(sk->acked_seq);
1873         t1->doff = sizeof(*t1)/4;
1874         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1875         sk->prot->queue_xmit(sk, dev, buff, 1);
1876         tcp_statistics.TcpOutSegs++;
1877 }
1878 
1879 
1880 /*
1881  *      FIXME:
1882  *      This routine frees used buffers.
1883  *      It should consider sending an ACK to let the
1884  *      other end know we now have a bigger window.
1885  */
1886 
1887 static void cleanup_rbuf(struct sock *sk)
     /*  */
1888 {
1889         unsigned long flags;
1890         unsigned long left;
1891         struct sk_buff *skb;
1892         unsigned long rspace;
1893 
1894         if(sk->debug)
1895                 printk("cleaning rbuf for sk=%p\n", sk);
1896   
1897         save_flags(flags);
1898         cli();
1899   
1900         left = sk->prot->rspace(sk);
1901  
1902         /*
1903          *      We have to loop through all the buffer headers,
1904          *      and try to free up all the space we can.
1905          */
1906 
1907         while((skb=skb_peek(&sk->receive_queue)) != NULL) 
1908         {
1909                 if (!skb->used || skb->users) 
1910                         break;
1911                 skb_unlink(skb);
1912                 skb->sk = sk;
1913                 kfree_skb(skb, FREE_READ);
1914         }
1915 
1916         restore_flags(flags);
1917 
1918         /*
1919          *      FIXME:
1920          *      At this point we should send an ack if the difference
1921          *      in the window, and the amount of space is bigger than
1922          *      TCP_WINDOW_DIFF.
1923          */
1924 
1925         if(sk->debug)
1926                 printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
1927                                             left);
1928         if ((rspace=sk->prot->rspace(sk)) != left) 
1929         {
1930                 /*
1931                  * This area has caused the most trouble.  The current strategy
1932                  * is to simply do nothing if the other end has room to send at
1933                  * least 3 full packets, because the ack from those will auto-
1934                  * matically update the window.  If the other end doesn't think
1935                  * we have much space left, but we have room for at least 1 more
1936                  * complete packet than it thinks we do, we will send an ack
1937                  * immediately.  Otherwise we will wait up to .5 seconds in case
1938                  * the user reads some more.
1939                  */
1940                 sk->ack_backlog++;
1941         /*
1942          * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
1943          * if the other end is offering a window smaller than the agreed on MSS
1944          * (called sk->mtu here).  In theory there's no connection between send
1945          * and receive, and so no reason to think that they're going to send
1946          * small packets.  For the moment I'm using the hack of reducing the mss
1947          * only on the send side, so I'm putting mtu here.
1948          */
1949 
1950                 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
1951                 {
1952                         /* Send an ack right now. */
1953                         tcp_read_wakeup(sk);
1954                 } 
1955                 else 
1956                 {
1957                         /* Force it to send an ack soon. */
1958                         int was_active = del_timer(&sk->retransmit_timer);
1959                         if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires) 
1960                         {
1961                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
1962                         } 
1963                         else
1964                                 add_timer(&sk->retransmit_timer);
1965                 }
1966         }
1967 } 
1968 
1969 
1970 /*
1971  *      Handle reading urgent data. BSD has very simple semantics for
1972  *      this, no blocking and very strange errors 8)
1973  */
1974  
1975 static int tcp_read_urg(struct sock * sk, int nonblock,
     /*  */
1976              unsigned char *to, int len, unsigned flags)
1977 {
1978         /*
1979          *      No URG data to read
1980          */
1981         if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
1982                 return -EINVAL; /* Yes this is right ! */
1983                 
1984         if (sk->err) 
1985         {
1986                 int tmp = -sk->err;
1987                 sk->err = 0;
1988                 return tmp;
1989         }
1990 
1991         if (sk->state == TCP_CLOSE || sk->done) 
1992         {
1993                 if (!sk->done) {
1994                         sk->done = 1;
1995                         return 0;
1996                 }
1997                 return -ENOTCONN;
1998         }
1999 
2000         if (sk->shutdown & RCV_SHUTDOWN) 
2001         {
2002                 sk->done = 1;
2003                 return 0;
2004         }
2005         sk->inuse = 1;
2006         if (sk->urg_data & URG_VALID) 
2007         {
2008                 char c = sk->urg_data;
2009                 if (!(flags & MSG_PEEK))
2010                         sk->urg_data = URG_READ;
2011                 put_fs_byte(c, to);
2012                 release_sock(sk);
2013                 return 1;
2014         }
2015         release_sock(sk);
2016         
2017         /*
2018          * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
2019          * the available implementations agree in this case:
2020          * this call should never block, independent of the
2021          * blocking state of the socket.
2022          * Mike <pall@rz.uni-karlsruhe.de>
2023          */
2024         return -EAGAIN;
2025 }
2026 
2027 
2028 /*
2029  *      This routine copies from a sock struct into the user buffer. 
 *
 *      sk       - socket to read from.
 *      to       - user-space destination (written with memcpy_tofs).
 *      len      - maximum number of bytes to copy.
 *      nonblock - if non-zero, return -EAGAIN instead of sleeping when
 *                 no data is queued.
 *      flags    - MSG_OOB hands the whole request to tcp_read_urg();
 *                 MSG_PEEK copies data without advancing
 *                 sk->copied_seq and without marking buffers used.
 *
 *      Returns the number of bytes copied. When nothing was copied it
 *      returns 0 (connection closed, receive side shut down, or a FIN
 *      reached) or a negative errno: -ENOTCONN, the pending sk->err,
 *      -EAGAIN or -ERESTARTSYS.
2030  */
2031  
2032 static int tcp_read(struct sock *sk, unsigned char *to,
     /*  */
2033         int len, int nonblock, unsigned flags)
2034 {
2035         struct wait_queue wait = { current, NULL };
2036         int copied = 0;
2037         u32 peek_seq;
2038         volatile u32 *seq;      /* So gcc doesn't overoptimise */
2039         unsigned long used;
2040 
2041         /* 
2042          *      This error should be checked. 
2043          */
2044          
2045         if (sk->state == TCP_LISTEN)
2046                 return -ENOTCONN;
2047 
2048         /*
2049          *      Urgent data needs to be handled specially. 
2050          */
2051          
2052         if (flags & MSG_OOB)
2053                 return tcp_read_urg(sk, nonblock, to, len, flags);
2054 
2055         /*
2056          *      Copying sequence to update. This is volatile to handle
2057          *      the multi-reader case neatly (memcpy_to/fromfs might be 
2058          *      inline and thus not flush cached variables otherwise).
2059          */
2060          
        /* When peeking, a private copy is advanced so the socket's read position stays put. */
2061         peek_seq = sk->copied_seq;
2062         seq = &sk->copied_seq;
2063         if (flags & MSG_PEEK)
2064                 seq = &peek_seq;
2065 
2066         add_wait_queue(sk->sleep, &wait);
2067         sk->inuse = 1;
2068         while (len > 0) 
2069         {
2070                 struct sk_buff * skb;
2071                 u32 offset;
2072         
2073                 /*
2074                  * Are we at urgent data? Stop if we have read anything.
2075                  */
2076                  
2077                 if (copied && sk->urg_data && sk->urg_seq == *seq)
2078                         break;
2079 
2080                 /*
2081                  *      Next get a buffer.
2082                  */
2083                  
                /*
                 *      The task state is set before the queue is looked
                 *      at - presumably so that a wakeup arriving between
                 *      the scan and the schedule() below is not lost
                 *      (TODO confirm).
                 */
2084                 current->state = TASK_INTERRUPTIBLE;
2085 
                /*
                 *      Walk the receive queue for the buffer holding
                 *      sequence number *seq. Buffers that lie wholly
                 *      before it are marked used (unless peeking) so that
                 *      cleanup_rbuf() can free them.
                 */
2086                 skb = skb_peek(&sk->receive_queue);
2087                 do 
2088                 {
2089                         if (!skb)
2090                                 break;
2091                         if (before(*seq, skb->h.th->seq))
2092                                 break;
2093                         offset = *seq - skb->h.th->seq;
                        /* The SYN flag takes up one sequence number but no data byte. */
2094                         if (skb->h.th->syn)
2095                                 offset--;
2096                         if (offset < skb->len)
2097                                 goto found_ok_skb;
2098                         if (skb->h.th->fin)
2099                                 goto found_fin_ok;
2100                         if (!(flags & MSG_PEEK))
2101                                 skb->used = 1;
2102                         skb = skb->next;
2103                 }
2104                 while (skb != (struct sk_buff *)&sk->receive_queue);
2105 
                /*
                 *      No data at *seq. Return what we have, if anything;
                 *      otherwise report errors and end of stream, or
                 *      sleep until more arrives.
                 */
2106                 if (copied)
2107                         break;
2108 
2109                 if (sk->err) 
2110                 {
2111                         copied = -sk->err;
2112                         sk->err = 0;
2113                         break;
2114                 }
2115 
                /* The first read on a closed socket returns 0, later ones -ENOTCONN. */
2116                 if (sk->state == TCP_CLOSE) 
2117                 {
2118                         if (!sk->done) 
2119                         {
2120                                 sk->done = 1;
2121                                 break;
2122                         }
2123                         copied = -ENOTCONN;
2124                         break;
2125                 }
2126 
2127                 if (sk->shutdown & RCV_SHUTDOWN) 
2128                 {
2129                         sk->done = 1;
2130                         break;
2131                 }
2132                         
2133                 if (nonblock) 
2134                 {
2135                         copied = -EAGAIN;
2136                         break;
2137                 }
2138 
                /* Free what has been read and release the socket before sleeping. */
2139                 cleanup_rbuf(sk);
2140                 release_sock(sk);
2141                 sk->socket->flags |= SO_WAITDATA;
2142                 schedule();
2143                 sk->socket->flags &= ~SO_WAITDATA;
2144                 sk->inuse = 1;
2145 
2146                 if (current->signal & ~current->blocked) 
2147                 {
2148                         copied = -ERESTARTSYS;
2149                         break;
2150                 }
2151                 continue;
2152 
2153         found_ok_skb:
2154                 /*
2155                  *      Lock the buffer. We can be fairly relaxed as
2156                  *      an interrupt will never steal a buffer we are 
2157                  *      using unless I've missed something serious in
2158                  *      tcp_data.
2159                  */
2160                 
2161                 skb->users++;
2162                 
2163                 /*
2164                  *      Ok so how much can we use ? 
2165                  */
2166                  
2167                 used = skb->len - offset;
2168                 if (len < used)
2169                         used = len;
2170                 /*
2171                  *      Do we have urgent data here? 
2172                  */
2173                 
                /*
                 *      If the urgent byte lies inside the range about to
                 *      be copied: stop just short of it, or - when it is
                 *      the very next byte and is not delivered inline -
                 *      step over it.
                 */
2174                 if (sk->urg_data) 
2175                 {
2176                         u32 urg_offset = sk->urg_seq - *seq;
2177                         if (urg_offset < used) 
2178                         {
2179                                 if (!urg_offset) 
2180                                 {
2181                                         if (!sk->urginline) 
2182                                         {
2183                                                 ++*seq;
2184                                                 offset++;
2185                                                 used--;
2186                                         }
2187                                 }
2188                                 else
2189                                         used = urg_offset;
2190                         }
2191                 }
2192                 
2193                 /*
2194                  *      Copy it - We _MUST_ update *seq first so that we
2195                  *      don't ever double read when we have dual readers
2196                  */
2197                  
2198                 *seq += used;
2199 
2200                 /*
2201                  *      This memcpy_tofs can sleep. If it sleeps and we
2202                  *      do a second read it relies on the skb->users to avoid
2203                  *      a crash when cleanup_rbuf() gets called.
2204                  */
2205                  
                /* The payload starts doff*4 bytes past the start of the TCP header. */
2206                 memcpy_tofs(to,((unsigned char *)skb->h.th) +
2207                         skb->h.th->doff*4 + offset, used);
2208                 copied += used;
2209                 len -= used;
2210                 to += used;
2211                 
2212                 /*
2213                  *      We now will not sleep again until we are finished
2214                  *      with skb. Sorry if you are doing the SMP port
2215                  *      but you'll just have to fix it neatly ;)
2216                  */
2217                  
2218                 skb->users --;
2219                 
                /* Once the socket's read position has passed the urgent byte, forget it. */
2220                 if (after(sk->copied_seq,sk->urg_seq))
2221                         sk->urg_data = 0;
                /* Buffer not exhausted yet: go round again. */
2222                 if (used + offset < skb->len)
2223                         continue;
2224                 
2225                 /*
2226                  *      Process the FIN.
2227                  */
2228 
2229                 if (skb->h.th->fin)
2230                         goto found_fin_ok;
2231                 if (flags & MSG_PEEK)
2232                         continue;
2233                 skb->used = 1;
2234                 continue;
2235 
2236         found_fin_ok:
                /* The FIN takes up one sequence number of its own. */
2237                 ++*seq;
2238                 if (flags & MSG_PEEK)
2239                         break;
2240                         
2241                 /*
2242                  *      All is done
2243                  */
2244                  
2245                 skb->used = 1;
2246                 sk->shutdown |= RCV_SHUTDOWN;
2247                 break;
2248 
2249         }
2250         remove_wait_queue(sk->sleep, &wait);
2251         current->state = TASK_RUNNING;
2252 
2253         /* Clean up data we have read: This will do ACK frames */
2254         cleanup_rbuf(sk);
2255         release_sock(sk);
2256         return copied;
2257 }
2258 
2259 /*
2260  *      State processing on a close. This implements the state shift for
2261  *      sending our FIN frame. Note that we only send a FIN for some 
2262  *      states. A shutdown() may have already sent the FIN, or we may be
2263  *      closed.
2264  */
2265  
2266 static int tcp_close_state(struct sock *sk, int dead)
     /*  */
2267 {
2268         int ns=TCP_CLOSE;
2269         int send_fin=0;
2270         switch(sk->state)
2271         {
2272                 case TCP_SYN_SENT:      /* No SYN back, no FIN needed */
2273                         break;
2274                 case TCP_SYN_RECV:
2275                 case TCP_ESTABLISHED:   /* Closedown begin */
2276                         ns=TCP_FIN_WAIT1;
2277                         send_fin=1;
2278                         break;
2279                 case TCP_FIN_WAIT1:     /* Already closing, or FIN sent: no change */
2280                 case TCP_FIN_WAIT2:
2281                 case TCP_CLOSING:
2282                         ns=sk->state;
2283                         break;
2284                 case TCP_CLOSE:
2285                 case TCP_LISTEN:
2286                         break;
2287                 case TCP_CLOSE_WAIT:    /* They have FIN'd us. We send our FIN and
2288                                            wait only for the ACK */
2289                         ns=TCP_LAST_ACK;
2290                         send_fin=1;
2291         }
2292         
2293         tcp_set_state(sk,ns);
2294                 
2295         /*
2296          *      This is a (useful) BSD violating of the RFC. There is a
2297          *      problem with TCP as specified in that the other end could
2298          *      keep a socket open forever with no application left this end.
2299          *      We use a 3 minute timeout (about the same as BSD) then kill
2300          *      our end. If they send after that then tough - BUT: long enough
2301          *      that we won't make the old 4*rto = almost no time - whoops
2302          *      reset mistake.
2303          */
2304         if(dead && ns==TCP_FIN_WAIT2)
2305         {
2306                 int timer_active=del_timer(&sk->timer);
2307                 if(timer_active)
2308                         add_timer(&sk->timer);
2309                 else
2310                         reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2311         }
2312         
2313         return send_fin;
2314 }
2315 
2316 /*
2317  *      Send a fin.
2318  */
2319 
2320 static void tcp_send_fin(struct sock *sk)
     /*  */
2321 {
2322         struct proto *prot =(struct proto *)sk->prot;
2323         struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
2324         struct tcphdr *t1;
2325         struct sk_buff *buff;
2326         struct device *dev=NULL;
2327         int tmp;
2328                 
2329         release_sock(sk); /* in case the malloc sleeps. */
2330         
2331         buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2332         sk->inuse = 1;
2333 
2334         if (buff == NULL)
2335         {
2336                 /* This is a disaster if it occurs */
2337                 printk("tcp_send_fin: Impossible malloc failure");
2338                 return;
2339         }
2340 
2341         /*
2342          *      Administrivia
2343          */
2344          
2345         buff->sk = sk;
2346         buff->localroute = sk->localroute;
2347 
2348         /*
2349          *      Put in the IP header and routing stuff. 
2350          */
2351 
2352         tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2353                            IPPROTO_TCP, sk->opt,
2354                            sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
2355         if (tmp < 0) 
2356         {
2357                 int t;
2358                 /*
2359                  *      Finish anyway, treat this as a send that got lost. 
2360                  *      (Not good).
2361                  */
2362                  
2363                 buff->free = 1;
2364                 prot->wfree(sk,buff);
2365                 sk->write_seq++;
2366                 t=del_timer(&sk->timer);
2367                 if(t)
2368                         add_timer(&sk->timer);
2369                 else
2370                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2371                 return;
2372         }
2373         
2374         /*
2375          *      We ought to check if the end of the queue is a buffer and
2376          *      if so simply add the fin to that buffer, not send it ahead.
2377          */
2378 
2379         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2380         buff->dev = dev;
2381         memcpy(t1, th, sizeof(*t1));
2382         t1->seq = ntohl(sk->write_seq);
2383         sk->write_seq++;
2384         buff->h.seq = sk->write_seq;
2385         t1->ack = 1;
2386         t1->ack_seq = ntohl(sk->acked_seq);
2387         t1->window = ntohs(sk->window=tcp_select_window(sk));
2388         t1->fin = 1;
2389         t1->rst = 0;
2390         t1->doff = sizeof(*t1)/4;
2391         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2392 
2393         /*
2394          * If there is data in the write queue, the fin must be appended to
2395          * the write queue.
2396          */
2397         
2398         if (skb_peek(&sk->write_queue) != NULL) 
2399         {
2400                 buff->free = 0;
2401                 if (buff->next != NULL) 
2402                 {
2403                         printk("tcp_send_fin: next != NULL\n");
2404                         skb_unlink(buff);
2405                 }
2406                 skb_queue_tail(&sk->write_queue, buff);
2407         } 
2408         else 
2409         {
2410                 sk->sent_seq = sk->write_seq;
2411                 sk->prot->queue_xmit(sk, dev, buff, 0);
2412                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2413         }
2414 }
2415 
2416 /*
2417  *      Shutdown the sending side of a connection. Much like close except
2418  *      that we don't receive shut down or set sk->dead=1.
2419  */
2420 
2421 void tcp_shutdown(struct sock *sk, int how)
     /*  */
2422 {
2423         /*
2424          *      We need to grab some memory, and put together a FIN,
2425          *      and then put it into the queue to be sent.
2426          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2427          */
2428 
2429         if (!(how & SEND_SHUTDOWN)) 
2430                 return;
2431          
2432         /*
2433          *      If we've already sent a FIN, or it's a closed state
2434          */
2435          
2436         if (sk->state == TCP_FIN_WAIT1 ||
2437             sk->state == TCP_FIN_WAIT2 ||
2438             sk->state == TCP_CLOSING ||
2439             sk->state == TCP_LAST_ACK ||
2440             sk->state == TCP_TIME_WAIT || 
2441             sk->state == TCP_CLOSE ||
2442             sk->state == TCP_LISTEN
2443           )
2444         {
2445                 return;
2446         }
2447         sk->inuse = 1;
2448 
2449         /*
2450          * flag that the sender has shutdown
2451          */
2452 
2453         sk->shutdown |= SEND_SHUTDOWN;
2454 
2455         /*
2456          *  Clear out any half completed packets. 
2457          */
2458 
2459         if (sk->partial)
2460                 tcp_send_partial(sk);
2461                 
2462         /*
2463          *      FIN if needed
2464          */
2465          
2466         if(tcp_close_state(sk,0))
2467                 tcp_send_fin(sk);
2468                 
2469         release_sock(sk);
2470 }
2471 
2472 
2473 static int
2474 tcp_recvfrom(struct sock *sk, unsigned char *to,
     /*  */
2475              int to_len, int nonblock, unsigned flags,
2476              struct sockaddr_in *addr, int *addr_len)
2477 {
2478         int result;
2479   
2480         /* 
2481          *      Have to check these first unlike the old code. If 
2482          *      we check them after we lose data on an error
2483          *      which is wrong 
2484          */
2485 
2486         if(addr_len)
2487                 *addr_len = sizeof(*addr);
2488         result=tcp_read(sk, to, to_len, nonblock, flags);
2489 
2490         if (result < 0) 
2491                 return(result);
2492   
2493         if(addr)
2494         {
2495                 addr->sin_family = AF_INET;
2496                 addr->sin_port = sk->dummy_th.dest;
2497                 addr->sin_addr.s_addr = sk->daddr;
2498         }
2499         return(result);
2500 }
2501 
2502 
2503 /*
2504  *      This routine will send an RST to the other tcp. 
2505  */
2506  
2507 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
     /*  */
2508           struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2509 {
2510         struct sk_buff *buff;
2511         struct tcphdr *t1;
2512         int tmp;
2513         struct device *ndev=NULL;
2514 
2515         /*
2516          *      Cannot reset a reset (Think about it).
2517          */
2518          
2519         if(th->rst)
2520                 return;
2521   
2522         /*
2523          * We need to grab some memory, and put together an RST,
2524          * and then put it into the queue to be sent.
2525          */
2526 
2527         buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2528         if (buff == NULL) 
2529                 return;
2530 
2531         buff->sk = NULL;
2532         buff->dev = dev;
2533         buff->localroute = 0;
2534 
2535         /*
2536          *      Put in the IP header and routing stuff. 
2537          */
2538 
2539         tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2540                            sizeof(struct tcphdr),tos,ttl);
2541         if (tmp < 0) 
2542         {
2543                 buff->free = 1;
2544                 prot->wfree(NULL, buff);
2545                 return;
2546         }
2547 
2548         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2549         memcpy(t1, th, sizeof(*t1));
2550 
2551         /*
2552          *      Swap the send and the receive. 
2553          */
2554 
2555         t1->dest = th->source;
2556         t1->source = th->dest;
2557         t1->rst = 1;  
2558         t1->window = 0;
2559   
2560         if(th->ack)
2561         {
2562                 t1->ack = 0;
2563                 t1->seq = th->ack_seq;
2564                 t1->ack_seq = 0;
2565         }
2566         else
2567         {
2568                 t1->ack = 1;
2569                 if(!th->syn)
2570                         t1->ack_seq=htonl(th->seq);
2571                 else
2572                         t1->ack_seq=htonl(th->seq+1);
2573                 t1->seq=0;
2574         }
2575 
2576         t1->syn = 0;
2577         t1->urg = 0;
2578         t1->fin = 0;
2579         t1->psh = 0;
2580         t1->doff = sizeof(*t1)/4;
2581         tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2582         prot->queue_xmit(NULL, ndev, buff, 1);
2583         tcp_statistics.TcpOutSegs++;
2584 }
2585 
2586 
2587 /*
2588  *      Look for tcp options. Parses everything but only knows about MSS.
2589  *      This routine is always called with the packet containing the SYN.
2590  *      However it may also be called with the ack to the SYN.  So you
2591  *      can't assume this is always the SYN.  It's always called after
2592  *      we have set up sk->mtu to our own MTU.
2593  *
2594  *      We need at minimum to add PAWS support here. Possibly large windows
2595  *      as Linux gets deployed on 100Mb/sec networks.
2596  */
2597  
2598 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /*  */
2599 {
2600         unsigned char *ptr;
2601         int length=(th->doff*4)-sizeof(struct tcphdr);
2602         int mss_seen = 0;
2603     
2604         ptr = (unsigned char *)(th + 1);
2605   
2606         while(length>0)
2607         {
2608                 int opcode=*ptr++;
2609                 int opsize=*ptr++;
2610                 switch(opcode)
2611                 {
2612                         case TCPOPT_EOL:
2613                                 return;
2614                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
2615                                 length--;
2616                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
2617                                 continue;
2618                         
2619                         default:
2620                                 if(opsize<=2)   /* Avoid silly options looping forever */
2621                                         return;
2622                                 switch(opcode)
2623                                 {
2624                                         case TCPOPT_MSS:
2625                                                 if(opsize==4 && th->syn)
2626                                                 {
2627                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2628                                                         mss_seen = 1;
2629                                                 }
2630                                                 break;
2631                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2632                                 }
2633                                 ptr+=opsize-2;
2634                                 length-=opsize;
2635                 }
2636         }
2637         if (th->syn) 
2638         {
2639                 if (! mss_seen)
2640                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2641         }
2642 #ifdef CONFIG_INET_PCTCP
2643         sk->mss = min(sk->max_window >> 1, sk->mtu);
2644 #else    
2645         sk->mss = min(sk->max_window, sk->mtu);
2646 #endif  
2647 }
2648 
/*
 *	Classful network mask for an address. 'dst' is given in network
 *	byte order and so is the result: the class A mask, the class B
 *	mask, or the class C mask for everything else.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);
	unsigned long net;

	if (IN_CLASSA(host))
		net = IN_CLASSA_NET;
	else if (IN_CLASSB(host))
		net = IN_CLASSB_NET;
	else
		net = IN_CLASSC_NET;

	return htonl(net);
}
2658 
2659 /*
2660  *      Default sequence number picking algorithm.
2661  *      As close as possible to RFC 793, which
2662  *      suggests using a 250kHz clock.
2663  *      Further reading shows this assumes 2MB/s networks.
2664  *      For 10MB/s ethernet, a 1MHz clock is appropriate.
2665  *      That's funny, Linux has one built in!  Use it!
2666  */
2667 
2668 extern inline u32 tcp_init_seq(void)
     /*  */
2669 {
2670         struct timeval tv;
2671         do_gettimeofday(&tv);
2672         return tv.tv_usec+tv.tv_sec*1000000;
2673 }
2674 
2675 /*
2676  *      This routine handles a connection request.
2677  *      It should make sure we haven't already responded.
2678  *      Because of the way BSD works, we have to send a syn/ack now.
2679  *      This also means it will be harder to close a socket which is
2680  *      listening.
      *
      *      sk    - socket the request arrived on; a new sock is allocated
      *              and initialised from a copy of it.
      *      skb   - the received segment (header at skb->h.th). On success
      *              it is charged to the new sock and queued on
      *              sk->receive_queue; on every failure path it is freed.
      *      daddr - stored as the new sock's saddr (our side).
      *      saddr - stored as the new sock's daddr (the peer).
      *      opt   - only used when a reset is sent for a dead socket.
      *      dev   - device used for the MTU limit and the local-net test.
      *      seq   - initial write_seq of the new sock; sent in the SYN/ACK.
      *
      *      Every failure path increments TcpAttemptFails.
2681  */
2682  
2683 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
     /*  */
2684                  unsigned long daddr, unsigned long saddr,
2685                  struct options *opt, struct device *dev, u32 seq)
2686 {
2687         struct sk_buff *buff;
2688         struct tcphdr *t1;
2689         unsigned char *ptr;
2690         struct sock *newsk;
2691         struct tcphdr *th;
2692         struct device *ndev=NULL;
2693         int tmp;
2694         struct rtable *rt;
2695   
2696         th = skb->h.th;
2697 
2698         /*
              * A live socket gets its data_ready() callback; a dead one
              * answers with a reset and does not accept the connection.
              */
2699         if (!sk->dead) 
2700         {
2701                 sk->data_ready(sk,0);
2702         }
2703         else 
2704         {
2705                 if(sk->debug)
2706                         printk("Reset on %p: Connect on dead socket.\n",sk);
2707                 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2708                 tcp_statistics.TcpAttemptFails++;
2709                 kfree_skb(skb, FREE_READ);
2710                 return;
2711         }
2712 
2713         /*
2714          * Make sure we can accept more.  This will prevent a
2715          * flurry of syns from eating up all our memory.
              * The request is dropped without any reply.
2716          */
2717 
2718         if (sk->ack_backlog >= sk->max_ack_backlog) 
2719         {
2720                 tcp_statistics.TcpAttemptFails++;
2721                 kfree_skb(skb, FREE_READ);
2722                 return;
2723         }
2724 
2725         /*
2726          * We need to build a new sock struct.
2727          * It is sort of bad to have a socket without an inode attached
2728          * to it, but the wake_up's will just wake up the listening socket,
2729          * and if the listening socket is destroyed before this is taken
2730          * off of the queue, this will take care of it.
2731          */
2732 
2733         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2734         if (newsk == NULL) 
2735         {
2736                 /* just ignore the syn.  It will get retransmitted. */
2737                 tcp_statistics.TcpAttemptFails++;
2738                 kfree_skb(skb, FREE_READ);
2739                 return;
2740         }
2741 
             /*
              * Start from a byte copy of sk, then reset every field that
              * must not be shared with it: queues, timers, counters,
              * sequence numbers and state.
              */
2742         memcpy(newsk, sk, sizeof(*newsk));
2743         skb_queue_head_init(&newsk->write_queue);
2744         skb_queue_head_init(&newsk->receive_queue);
2745         newsk->send_head = NULL;
2746         newsk->send_tail = NULL;
2747         skb_queue_head_init(&newsk->back_log);
2748         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
2749         newsk->rto = TCP_TIMEOUT_INIT;
2750         newsk->mdev = 0;
2751         newsk->max_window = 0;
2752         newsk->cong_window = 1;
2753         newsk->cong_count = 0;
2754         newsk->ssthresh = 0;
2755         newsk->backoff = 0;
2756         newsk->blog = 0;
2757         newsk->intr = 0;
2758         newsk->proc = 0;
2759         newsk->done = 0;
2760         newsk->partial = NULL;
2761         newsk->pair = NULL;
2762         newsk->wmem_alloc = 0;
2763         newsk->rmem_alloc = 0;
2764         newsk->localroute = sk->localroute;
2765 
2766         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
2767 
2768         newsk->err = 0;
2769         newsk->shutdown = 0;
2770         newsk->ack_backlog = 0;
             /* The received sequence number plus one is the next one expected. */
2771         newsk->acked_seq = skb->h.th->seq+1;
2772         newsk->copied_seq = skb->h.th->seq+1;
2773         newsk->fin_seq = skb->h.th->seq;
2774         newsk->state = TCP_SYN_RECV;
2775         newsk->timeout = 0;
2776         newsk->ip_xmit_timeout = 0;
2777         newsk->write_seq = seq; 
2778         newsk->window_seq = newsk->write_seq;
2779         newsk->rcv_ack_seq = newsk->write_seq;
2780         newsk->urg_data = 0;
2781         newsk->retransmits = 0;
2782         newsk->linger=0;
2783         newsk->destroy = 0;
             /* Both timers get newsk itself as their callback argument. */
2784         init_timer(&newsk->timer);
2785         newsk->timer.data = (unsigned long)newsk;
2786         newsk->timer.function = &net_timer;
2787         init_timer(&newsk->retransmit_timer);
2788         newsk->retransmit_timer.data = (unsigned long)newsk;
2789         newsk->retransmit_timer.function=&retransmit_timer;
             /* Header template ports: our port is the one the request was sent to. */
2790         newsk->dummy_th.source = skb->h.th->dest;
2791         newsk->dummy_th.dest = skb->h.th->source;
2792         
2793         /*
2794          *      Swap these two, they are from our point of view. 
2795          */
2796          
2797         newsk->daddr = saddr;
2798         newsk->saddr = daddr;
2799 
             /*
              * NOTE(review): presumably enters newsk into the protocol's
              * socket table under the port number copied from sk - confirm
              * against put_sock().
              */
2800         put_sock(newsk->num,newsk);
2801         newsk->dummy_th.res1 = 0;
2802         newsk->dummy_th.doff = 6;
2803         newsk->dummy_th.fin = 0;
2804         newsk->dummy_th.syn = 0;
2805         newsk->dummy_th.rst = 0;        
2806         newsk->dummy_th.psh = 0;
2807         newsk->dummy_th.ack = 0;
2808         newsk->dummy_th.urg = 0;
2809         newsk->dummy_th.res2 = 0;
             /* Same values as already stored in these two fields earlier in this function. */
2810         newsk->acked_seq = skb->h.th->seq + 1;
2811         newsk->copied_seq = skb->h.th->seq + 1;
2812         newsk->socket = NULL;
2813 
2814         /*
2815          *      Grab the ttl and tos values and use them 
              *      (ttl comes from sk, tos from the received IP header)
2816          */
2817 
2818         newsk->ip_ttl=sk->ip_ttl;
2819         newsk->ip_tos=skb->ip_hdr->tos;
2820 
2821         /*
2822          *      Pick the starting mtu: the user's value if one was set,
              *      else the route's when the route carries RTF_MSS, else
              *      576 - HEADER_SIZE when the two addresses differ under
              *      the mask tested below, or MAX_WINDOW when they do not.
              *      The route may also supply a window clamp (RTF_WINDOW).
2823          */
2824 
2825         /*
2826          *      Note use of sk->user_mss, since user has no direct access to newsk 
2827          */
2828 
2829         rt=ip_rt_route(saddr, NULL,NULL);
2830         
2831         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
2832                 newsk->window_clamp = rt->rt_window;
2833         else
2834                 newsk->window_clamp = 0;
2835                 
2836         if (sk->user_mss)
2837                 newsk->mtu = sk->user_mss;
2838         else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
2839                 newsk->mtu = rt->rt_mss - HEADER_SIZE;
2840         else 
2841         {
2842 #ifdef CONFIG_INET_SNARL        /* Sub Nets Are Local */
2843                 if ((saddr ^ daddr) & default_mask(saddr))
2844 #else
2845                 if ((saddr ^ daddr) & dev->pa_mask)
2846 #endif
2847                         newsk->mtu = 576 - HEADER_SIZE;
2848                 else
2849                         newsk->mtu = MAX_WINDOW;
2850         }
2851 
2852         /*
2853          *      But not bigger than device MTU 
2854          */
2855 
2856         newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);
2857 
2858         /*
2859          *      This will min with what arrived in the packet 
2860          */
2861 
2862         tcp_options(newsk,skb->h.th);
2863         
2864         tcp_cache_zap();
2865 
             /* Buffer for the SYN/ACK reply, charged to newsk. */
2866         buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
2867         if (buff == NULL) 
2868         {
2869                 sk->err = ENOMEM;
2870                 newsk->dead = 1;
2871                 newsk->state = TCP_CLOSE;
2872                 /* And this will destroy it */
2873                 release_sock(newsk);
2874                 kfree_skb(skb, FREE_READ);
2875                 tcp_statistics.TcpAttemptFails++;
2876                 return;
2877         }
2878   
2879         buff->sk = newsk;
2880         buff->localroute = newsk->localroute;
2881 
2882         /*
2883          *      Put in the IP header and routing stuff. 
2884          */
2885 
2886         tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
2887                                IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
2888 
2889         /*
2890          *      Something went wrong. 
              *      Report the error on sk, free the reply buffer and the
              *      request, and mark newsk dead and closed before releasing it.
2891          */
2892 
2893         if (tmp < 0) 
2894         {
2895                 sk->err = tmp;
2896                 buff->free = 1;
2897                 kfree_skb(buff,FREE_WRITE);
2898                 newsk->dead = 1;
2899                 newsk->state = TCP_CLOSE;
2900                 release_sock(newsk);
2901                 skb->sk = sk;
2902                 kfree_skb(skb, FREE_READ);
2903                 tcp_statistics.TcpAttemptFails++;
2904                 return;
2905         }
2906 
2907         t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2908   
             /* Start from a copy of the received header and overwrite what differs. */
2909         memcpy(t1, skb->h.th, sizeof(*t1));
2910         buff->h.seq = newsk->write_seq;
2911         /*
2912          *      Swap the send and the receive. 
2913          */
2914         t1->dest = skb->h.th->source;
2915         t1->source = newsk->dummy_th.source;
             /* The SYN uses up one sequence number, hence the post-increment. */
2916         t1->seq = ntohl(newsk->write_seq++);
2917         t1->ack = 1;
2918         newsk->window = tcp_select_window(newsk);
2919         newsk->sent_seq = newsk->write_seq;
2920         t1->window = ntohs(newsk->window);
2921         t1->res1 = 0;
2922         t1->res2 = 0;
2923         t1->rst = 0;
2924         t1->urg = 0;
2925         t1->psh = 0;
2926         t1->syn = 1;
2927         t1->ack_seq = ntohl(skb->h.th->seq+1);
             /* One extra 32-bit word of header for the option written below. */
2928         t1->doff = sizeof(*t1)/4+1;
             /*
              * MSS option: kind 2, length 4, then newsk->mtu as a 16-bit
              * value, high byte first.
              */
2929         ptr = skb_put(buff,4);
2930         ptr[0] = 2;
2931         ptr[1] = 4;
2932         ptr[2] = ((newsk->mtu) >> 8) & 0xff;
2933         ptr[3] =(newsk->mtu) & 0xff;
2934 
2935         tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
2936         newsk->prot->queue_xmit(newsk, ndev, buff, 0);
2937         reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
2938         skb->sk = newsk;
2939 
2940         /*
2941          *      Charge the sock_buff to newsk. 
2942          */
2943          
2944         sk->rmem_alloc -= skb->truesize;
2945         newsk->rmem_alloc += skb->truesize;
2946         
             /*
              * The request skb, which now points at newsk, waits on the
              * original socket's receive queue and is counted in its backlog.
              */
2947         skb_queue_tail(&sk->receive_queue,skb);
2948         sk->ack_backlog++;
2949         release_sock(newsk);
2950         tcp_statistics.TcpOutSegs++;
2951 }
2952 
2953 
2954 static void tcp_close(struct sock *sk, int timeout)
     /*  */
2955 {
2956         /*
2957          * We need to grab some memory, and put together a FIN, 
2958          * and then put it into the queue to be sent.
2959          */
2960         
2961         sk->inuse = 1;
2962         
2963         if(th_cache_sk==sk)
2964                 tcp_cache_zap();
2965         if(sk->state == TCP_LISTEN)
2966         {
2967                 /* Special case */
2968                 tcp_set_state(sk, TCP_CLOSE);
2969                 tcp_close_pending(sk);
2970                 release_sock(sk);
2971                 return;
2972         }
2973         
2974         sk->keepopen = 1;
2975         sk->shutdown = SHUTDOWN_MASK;
2976 
2977         if (!sk->dead) 
2978                 sk->state_change(sk);
2979 
2980         if (timeout == 0) 
2981         {
2982                 struct sk_buff *skb;
2983                 
2984                 /*
2985                  *  We need to flush the recv. buffs.  We do this only on the
2986                  *  descriptor close, not protocol-sourced closes, because the
2987                  *  reader process may not have drained the data yet!
2988                  */
2989                  
2990                 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
2991                         kfree_skb(skb, FREE_READ);
2992                 /*
2993                  *      Get rid off any half-completed packets. 
2994                  */
2995 
2996                 if (sk->partial) 
2997                         tcp_send_partial(sk);
2998         }
2999 
3000                 
3001         /*
3002          *      Timeout is not the same thing - however the code likes
3003          *      to send both the same way (sigh).
3004          */
3005          
3006         if(timeout)
3007         {
3008                 tcp_set_state(sk, TCP_CLOSE);   /* Dead */
3009         }
3010         else
3011         {
3012                 if(tcp_close_state(sk,1)==1)
3013                 {
3014                         tcp_send_fin(sk);
3015                 }
3016         }
3017         release_sock(sk);
3018 }
3019 
3020 
3021 /*
3022  *      This routine takes stuff off of the write queue,
3023  *      and puts it in the xmit queue. This happens as incoming acks
3024  *      open up the remote window for us.
3025  */
3026  
3027 static void tcp_write_xmit(struct sock *sk)
     /*  */
3028 {
3029         struct sk_buff *skb;
3030 
3031         /*
3032          *      The bytes will have to remain here. In time closedown will
3033          *      empty the write queue and all will be happy 
3034          */
3035 
3036         if(sk->zapped)
3037                 return;
3038 
3039         /*
3040          *      Anything on the transmit queue that fits the window can
3041          *      be added providing we are not
3042          *
3043          *      a) retransmitting (Nagle's rule)
3044          *      b) exceeding our congestion window.
3045          */
3046          
3047         while((skb = skb_peek(&sk->write_queue)) != NULL &&
3048                 before(skb->h.seq, sk->window_seq + 1) &&
3049                 (sk->retransmits == 0 ||
3050                  sk->ip_xmit_timeout != TIME_WRITE ||
3051                  before(skb->h.seq, sk->rcv_ack_seq + 1))
3052                 && sk->packets_out < sk->cong_window) 
3053         {
3054                 IS_SKB(skb);
3055                 skb_unlink(skb);
3056                 
3057                 /*
3058                  *      See if we really need to send the packet. 
3059                  */
3060                  
3061                 if (before(skb->h.seq, sk->rcv_ack_seq +1)) 
3062                 {
3063                         /*
3064                          *      This is acked data. We can discard it. This 
3065                          *      cannot currently occur.
3066                          */
3067                          
3068                         sk->retransmits = 0;
3069                         kfree_skb(skb, FREE_WRITE);
3070                         if (!sk->dead) 
3071                                 sk->write_space(sk);
3072                 } 
3073                 else
3074                 {
3075                         struct tcphdr *th;
3076                         struct iphdr *iph;
3077                         int size;
3078 /*
3079  * put in the ack seq and window at this point rather than earlier,
3080  * in order to keep them monotonic.  We really want to avoid taking
3081  * back window allocations.  That's legal, but RFC1122 says it's frowned on.
3082  * Ack and window will in general have changed since this packet was put
3083  * on the write queue.
3084  */
3085                         iph = skb->ip_hdr;
3086                         th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
3087                         size = skb->len - (((unsigned char *) th) - skb->data);
3088                         
3089                         th->ack_seq = ntohl(sk->acked_seq);
3090                         th->window = ntohs(tcp_select_window(sk));
3091 
3092                         tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
3093 
3094                         sk->sent_seq = skb->h.seq;
3095                         
3096                         /*
3097                          *      IP manages our queue for some crazy reason
3098                          */
3099                          
3100                         sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3101                         
3102                         /*
3103                          *      Again we slide the timer wrongly
3104                          */
3105                          
3106                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3107                 }
3108         }
3109 }
3110 
3111 
3112 /*
3113  *      This routine deals with incoming acks, but not outgoing ones.
3114  */
3115 
3116 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
     /*  */
3117 {
3118         u32 ack;
3119         int flag = 0;
3120 
3121         /* 
3122          * 1 - there was data in packet as well as ack or new data is sent or 
3123          *     in shutdown state
3124          * 2 - data from retransmit queue was acked and removed
3125          * 4 - window shrunk or data from retransmit queue was acked and removed
3126          */
3127 
3128         if(sk->zapped)
3129                 return(1);      /* Dead, cant ack any more so why bother */
3130 
3131         /*
3132          *      Have we discovered a larger window
3133          */
3134          
3135         ack = ntohl(th->ack_seq);
3136 
3137         if (ntohs(th->window) > sk->max_window) 
3138         {
3139                 sk->max_window = ntohs(th->window);
3140 #ifdef CONFIG_INET_PCTCP
3141                 /* Hack because we don't send partial packets to non SWS
3142                    handling hosts */
3143                 sk->mss = min(sk->max_window>>1, sk->mtu);
3144 #else
3145                 sk->mss = min(sk->max_window, sk->mtu);
3146 #endif  
3147         }
3148 
3149         /*
3150          *      We have dropped back to keepalive timeouts. Thus we have
3151          *      no retransmits pending.
3152          */
3153          
3154         if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3155                 sk->retransmits = 0;
3156 
3157         /*
3158          *      If the ack is newer than sent or older than previous acks
3159          *      then we can probably ignore it.
3160          */
3161          
3162         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
3163         {
3164                 if(sk->debug)
3165                         printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3166                         
3167                 /*
3168                  *      Keepalive processing.
3169                  */
3170                  
3171                 if (after(ack, sk->sent_seq)) 
3172                 {
3173                         return(0);
3174                 }
3175                 
3176                 /*
3177                  *      Restart the keepalive timer.
3178                  */
3179                  
3180                 if (sk->keepopen) 
3181                 {
3182                         if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3183                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3184                 }
3185                 return(1);
3186         }
3187 
3188         /*
3189          *      If there is data set flag 1
3190          */
3191          
3192         if (len != th->doff*4) 
3193                 flag |= 1;
3194 
3195         /*
3196          *      See if our window has been shrunk. 
3197          */
3198 
3199         if (after(sk->window_seq, ack+ntohs(th->window))) 
3200         {
3201                 /*
3202                  * We may need to move packets from the send queue
3203                  * to the write queue, if the window has been shrunk on us.
3204                  * The RFC says you are not allowed to shrink your window
3205                  * like this, but if the other end does, you must be able
3206                  * to deal with it.
3207                  */
3208                 struct sk_buff *skb;
3209                 struct sk_buff *skb2;
3210                 struct sk_buff *wskb = NULL;
3211         
3212                 skb2 = sk->send_head;
3213                 sk->send_head = NULL;
3214                 sk->send_tail = NULL;
3215         
3216                 /*
3217                  *      This is an artifact of a flawed concept. We want one
3218                  *      queue and a smarter send routine when we send all.
3219                  */
3220         
3221                 flag |= 4;      /* Window changed */
3222         
3223                 sk->window_seq = ack + ntohs(th->window);
3224                 cli();
3225                 while (skb2 != NULL) 
3226                 {
3227                         skb = skb2;
3228                         skb2 = skb->link3;
3229                         skb->link3 = NULL;
3230                         if (after(skb->h.seq, sk->window_seq)) 
3231                         {
3232                                 if (sk->packets_out > 0) 
3233                                         sk->packets_out--;
3234                                 /* We may need to remove this from the dev send list. */
3235                                 if (skb->next != NULL) 
3236                                 {
3237                                         skb_unlink(skb);                                
3238                                 }
3239                                 /* Now add it to the write_queue. */
3240                                 if (wskb == NULL)
3241                                         skb_queue_head(&sk->write_queue,skb);
3242                                 else
3243                                         skb_append(wskb,skb);
3244                                 wskb = skb;
3245                         } 
3246                         else 
3247                         {
3248                                 if (sk->send_head == NULL) 
3249                                 {
3250                                         sk->send_head = skb;
3251                                         sk->send_tail = skb;
3252                                 }
3253                                 else
3254                                 {
3255                                         sk->send_tail->link3 = skb;
3256                                         sk->send_tail = skb;
3257                                 }
3258                                 skb->link3 = NULL;
3259                         }
3260                 }
3261                 sti();
3262         }
3263 
3264         /*
3265          *      Pipe has emptied
3266          */
3267          
3268         if (sk->send_tail == NULL || sk->send_head == NULL) 
3269         {
3270                 sk->send_head = NULL;
3271                 sk->send_tail = NULL;
3272                 sk->packets_out= 0;
3273         }
3274 
3275         /*
3276          *      Update the right hand window edge of the host
3277          */
3278          
3279         sk->window_seq = ack + ntohs(th->window);
3280 
3281         /*
3282          *      We don't want too many packets out there. 
3283          */
3284          
3285         if (sk->ip_xmit_timeout == TIME_WRITE && 
3286                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
3287         {
3288                 /* 
3289                  * This is Jacobson's slow start and congestion avoidance. 
3290                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
3291                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
3292                  * counter and increment it once every cwnd times.  It's possible
3293                  * that this should be done only if sk->retransmits == 0.  I'm
3294                  * interpreting "new data is acked" as including data that has
3295                  * been retransmitted but is just now being acked.
3296                  */
3297                 if (sk->cong_window < sk->ssthresh)  
3298                         /* 
3299                          *      In "safe" area, increase
3300                          */
3301                         sk->cong_window++;
3302                 else 
3303                 {
3304                         /*
3305                          *      In dangerous area, increase slowly.  In theory this is
3306                          *      sk->cong_window += 1 / sk->cong_window
3307                          */
3308                         if (sk->cong_count >= sk->cong_window) 
3309                         {
3310                                 sk->cong_window++;
3311                                 sk->cong_count = 0;
3312                         }
3313                         else 
3314                                 sk->cong_count++;
3315                 }
3316         }
3317 
3318         /*
3319          *      Remember the highest ack received.
3320          */
3321          
3322         sk->rcv_ack_seq = ack;
3323 
3324         /*
3325          *      If this ack opens up a zero window, clear backoff.  It was
3326          *      being used to time the probes, and is probably far higher than
3327          *      it needs to be for normal retransmission.
3328          */
3329 
3330         if (sk->ip_xmit_timeout == TIME_PROBE0) 
3331         {
3332                 sk->retransmits = 0;    /* Our probe was answered */
3333                 
3334                 /*
3335                  *      Was it a usable window open ?
3336                  */
3337                  
3338                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
3339                     ! before (sk->window_seq, sk->write_queue.next->h.seq)) 
3340                 {
3341                         sk->backoff = 0;
3342                         
3343                         /*
3344                          *      Recompute rto from rtt.  this eliminates any backoff.
3345                          */
3346 
3347                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3348                         if (sk->rto > 120*HZ)
3349                                 sk->rto = 120*HZ;
3350                         if (sk->rto < 20)       /* Was 1*HZ, then 1 - turns out we must allow about
3351                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3352                                                    .2 of a second is going to need huge windows (SIGH) */
3353                         sk->rto = 20;
3354                 }
3355         }
3356 
3357         /* 
3358          *      See if we can take anything off of the retransmit queue.
3359          */
3360    
3361         while(sk->send_head != NULL) 
3362         {
3363                 /* Check for a bug. */
3364                 if (sk->send_head->link3 &&
3365                     after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) 
3366                         printk("INET: tcp.c: *** bug send_list out of order.\n");
3367                         
3368                 /*
3369                  *      If our packet is before the ack sequence we can
3370                  *      discard it as it's confirmed to have arrived the other end.
3371                  */
3372                  
3373                 if (before(sk->send_head->h.seq, ack+1)) 
3374                 {
3375                         struct sk_buff *oskb;   
3376                         if (sk->retransmits) 
3377                         {       
3378                                 /*
3379                                  *      We were retransmitting.  don't count this in RTT est 
3380                                  */
3381                                 flag |= 2;
3382 
3383                                 /*
3384                                  * even though we've gotten an ack, we're still
3385                                  * retransmitting as long as we're sending from
3386                                  * the retransmit queue.  Keeping retransmits non-zero
3387                                  * prevents us from getting new data interspersed with
3388                                  * retransmissions.
3389                                  */
3390 
3391                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
3392                                         sk->retransmits = 1;
3393                                 else
3394                                         sk->retransmits = 0;
3395                         }
3396                         /*
3397                          * Note that we only reset backoff and rto in the
3398                          * rtt recomputation code.  And that doesn't happen
3399                          * if there were retransmissions in effect.  So the
3400                          * first new packet after the retransmissions is
3401                          * sent with the backoff still in effect.  Not until
3402                          * we get an ack from a non-retransmitted packet do
3403                          * we reset the backoff and rto.  This allows us to deal
3404                          * with a situation where the network delay has increased
3405                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3406                          */
3407 
3408                         /*
3409                          *      We have one less packet out there. 
3410                          */
3411                          
3412                         if (sk->packets_out > 0) 
3413                                 sk->packets_out --;
3414                         /* 
3415                          *      Wake up the process, it can probably write more. 
3416                          */
3417                         if (!sk->dead) 
3418                                 sk->write_space(sk);
3419                         oskb = sk->send_head;
3420 
3421                         if (!(flag&2))  /* Not retransmitting */
3422                         {
3423                                 long m;
3424         
3425                                 /*
3426                                  *      The following amusing code comes from Jacobson's
3427                                  *      article in SIGCOMM '88.  Note that rtt and mdev
3428                                  *      are scaled versions of rtt and mean deviation.
3429                                  *      This is designed to be as fast as possible 
3430                                  *      m stands for "measurement".
3431                                  */
3432         
3433                                 m = jiffies - oskb->when;  /* RTT */
3434                                 if(m<=0)
3435                                         m=1;            /* IS THIS RIGHT FOR <0 ??? */
3436                                 m -= (sk->rtt >> 3);    /* m is now error in rtt est */
3437                                 sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
3438                                 if (m < 0)
3439                                         m = -m;         /* m is now abs(error) */
3440                                 m -= (sk->mdev >> 2);   /* similar update on mdev */
3441                                 sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
3442         
3443                                 /*
3444                                  *      Now update timeout.  Note that this removes any backoff.
3445                                  */
3446                          
3447                                 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3448                                 if (sk->rto > 120*HZ)
3449                                         sk->rto = 120*HZ;
3450                                 if (sk->rto < 20)       /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3451                                         sk->rto = 20;
3452                                 sk->backoff = 0;
3453                         }
3454                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
3455                                            In this case as we just set it up */
3456                         cli();
3457                         oskb = sk->send_head;
3458                         IS_SKB(oskb);
3459                         sk->send_head = oskb->link3;
3460                         if (sk->send_head == NULL) 
3461                         {
3462                                 sk->send_tail = NULL;
3463                         }
3464 
3465                 /*
3466                  *      We may need to remove this from the dev send list. 
3467                  */
3468 
3469                         if (oskb->next)
3470                                 skb_unlink(oskb);
3471                         sti();
3472                         kfree_skb(oskb, FREE_WRITE); /* write. */
3473                         if (!sk->dead) 
3474                                 sk->write_space(sk);
3475                 }
3476                 else
3477                 {
3478                         break;
3479                 }
3480         }
3481 
3482         /*
3483          * XXX someone ought to look at this too.. at the moment, if skb_peek()
3484          * returns non-NULL, we complete ignore the timer stuff in the else
3485          * clause.  We ought to organize the code so that else clause can
3486          * (should) be executed regardless, possibly moving the PROBE timer
3487          * reset over.  The skb_peek() thing should only move stuff to the
3488          * write queue, NOT also manage the timer functions.
3489          */
3490 
3491         /*
3492          * Maybe we can take some stuff off of the write queue,
3493          * and put it onto the xmit queue.
3494          */
3495         if (skb_peek(&sk->write_queue) != NULL) 
3496         {
3497                 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3498                         (sk->retransmits == 0 || 
3499                          sk->ip_xmit_timeout != TIME_WRITE ||
3500                          before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3501                         && sk->packets_out < sk->cong_window) 
3502                 {
3503                         /*
3504                          *      Add more data to the send queue.
3505                          */
3506                         flag |= 1;
3507                         tcp_write_xmit(sk);
3508                 }
3509                 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3510                         sk->send_head == NULL &&
3511                         sk->ack_backlog == 0 &&
3512                         sk->state != TCP_TIME_WAIT) 
3513                 {
3514                         /*
3515                          *      Data to queue but no room.
3516                          */
3517                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3518                 }               
3519         }
3520         else
3521         {
3522                 /*
3523                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3524                  * from TCP_CLOSE we don't do anything
3525                  *
3526                  * from anything else, if there is write data (or fin) pending,
3527                  * we use a TIME_WRITE timeout, else if keepalive we reset to
3528                  * a KEEPALIVE timeout, else we delete the timer.
3529                  *
3530                  * We do not set flag for nominal write data, otherwise we may
3531                  * force a state where we start to write itsy bitsy tidbits
3532                  * of data.
3533                  */
3534 
3535                 switch(sk->state) {
3536                 case TCP_TIME_WAIT:
3537                         /*
3538                          * keep us in TIME_WAIT until we stop getting packets,
3539                          * reset the timeout.
3540                          */
3541                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3542                         break;
3543                 case TCP_CLOSE:
3544                         /*
3545                          * don't touch the timer.
3546                          */
3547                         break;
3548                 default:
3549                         /*
3550                          *      Must check send_head, write_queue, and ack_backlog
3551                          *      to determine which timeout to use.
3552                          */
3553                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3554                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3555                         } else if (sk->keepopen) {
3556                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3557                         } else {
3558                                 del_timer(&sk->retransmit_timer);
3559                                 sk->ip_xmit_timeout = 0;
3560                         }
3561                         break;
3562                 }
3563         }
3564 
3565         /*
3566          *      We have nothing queued but space to send. Send any partial
3567          *      packets immediately (end of Nagle rule application).
3568          */
3569          
3570         if (sk->packets_out == 0 && sk->partial != NULL &&
3571                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
3572         {
3573                 flag |= 1;
3574                 tcp_send_partial(sk);
3575         }
3576 
3577         /*
3578          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
3579          * we are now waiting for an acknowledge to our FIN.  The other end is
3580          * already in TIME_WAIT.
3581          *
3582          * Move to TCP_CLOSE on success.
3583          */
3584 
3585         if (sk->state == TCP_LAST_ACK) 
3586         {
3587                 if (!sk->dead)
3588                         sk->state_change(sk);
3589                 if(sk->debug)
3590                         printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3591                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3592                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
3593                 {
3594                         flag |= 1;
3595                         tcp_set_state(sk,TCP_CLOSE);
3596                         sk->shutdown = SHUTDOWN_MASK;
3597                 }
3598         }
3599 
3600         /*
3601          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
3602          *
3603          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
3604          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3605          */
3606 
3607         if (sk->state == TCP_FIN_WAIT1) 
3608         {
3609 
3610                 if (!sk->dead) 
3611                         sk->state_change(sk);
3612                 if (sk->rcv_ack_seq == sk->write_seq) 
3613                 {
3614                         flag |= 1;
3615                         sk->shutdown |= SEND_SHUTDOWN;
3616                         tcp_set_state(sk, TCP_FIN_WAIT2);
3617                 }
3618         }
3619 
3620         /*
3621          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
3622          *
3623          *      Move to TIME_WAIT
3624          */
3625 
3626         if (sk->state == TCP_CLOSING) 
3627         {
3628 
3629                 if (!sk->dead) 
3630                         sk->state_change(sk);
3631                 if (sk->rcv_ack_seq == sk->write_seq) 
3632                 {
3633                         flag |= 1;
3634                         tcp_time_wait(sk);
3635                 }
3636         }
3637         
3638         /*
3639          *      Final ack of a three way shake 
3640          */
3641          
3642         if(sk->state==TCP_SYN_RECV)
3643         {
3644                 tcp_set_state(sk, TCP_ESTABLISHED);
3645                 tcp_options(sk,th);
3646                 sk->dummy_th.dest=th->source;
3647                 sk->copied_seq = sk->acked_seq;
3648                 if(!sk->dead)
3649                         sk->state_change(sk);
3650                 if(sk->max_window==0)
3651                 {
3652                         sk->max_window=32;      /* Sanity check */
3653                         sk->mss=min(sk->max_window,sk->mtu);
3654                 }
3655         }
3656         
3657         /*
3658          * I make no guarantees about the first clause in the following
3659          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
3660          * what conditions "!flag" would be true.  However I think the rest
3661          * of the conditions would prevent that from causing any
3662          * unnecessary retransmission. 
3663          *   Clearly if the first packet has expired it should be 
3664          * retransmitted.  The other alternative, "flag&2 && retransmits", is
3665          * harder to explain:  You have to look carefully at how and when the
3666          * timer is set and with what timeout.  The most recent transmission always
3667          * sets the timer.  So in general if the most recent thing has timed
3668          * out, everything before it has as well.  So we want to go ahead and
3669          * retransmit some more.  If we didn't explicitly test for this
3670          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3671          * would not be true.  If you look at the pattern of timing, you can
3672          * show that rto is increased fast enough that the next packet would
3673          * almost never be retransmitted immediately.  Then you'd end up
3674          * waiting for a timeout to send each packet on the retransmission
3675          * queue.  With my implementation of the Karn sampling algorithm,
3676          * the timeout would double each time.  The net result is that it would
3677          * take a hideous amount of time to recover from a single dropped packet.
3678          * It's possible that there should also be a test for TIME_WRITE, but
3679          * I think as long as "send_head != NULL" and "retransmit" is on, we've
3680          * got to be in real retransmission mode.
3681          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
3682          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3683          * As long as no further losses occur, this seems reasonable.
3684          */
3685         
3686         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3687                (((flag&2) && sk->retransmits) ||
3688                (sk->send_head->when + sk->rto < jiffies))) 
3689         {
3690                 if(sk->send_head->when + sk->rto < jiffies)
3691                         tcp_retransmit(sk,0);   
3692                 else
3693                 {
3694                         tcp_do_retransmit(sk, 1);
3695                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3696                 }
3697         }
3698 
3699         return(1);
3700 }
3701 
3702 
3703 /*
3704  *      Process the FIN bit. This now behaves as it is supposed to work
3705  *      and the FIN takes effect when it is validly part of sequence
3706  *      space. Not before when we get holes.
3707  *
3708  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3709  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
3710  *      TIME-WAIT)
      *      The code below treats SYN-RECV and SYN-SENT exactly like
      *      ESTABLISHED for this purpose.
3711  *
3712  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
3713  *      close and we go into CLOSING (and later onto TIME-WAIT)
3714  *
3715  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3716  *
      *      Parameters:
      *        skb - the received segment carrying the FIN; only skb->len
      *              is read here.
      *        sk  - the socket the segment arrived on.
      *        th  - TCP header of the segment; seq, syn, fin and rst are
      *              read.
      *
      *      Side effects: sets sk->fin_seq, notifies a socket that is not
      *      dead through sk->state_change() and sock_wake_async(), and,
      *      depending on sk->state, changes the state, sk->shutdown and
      *      the socket timers.
      *
      *      Returns 0 on every path.
      *
3717  */
3718  
3719 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /*  */
3720 {
             /*
              * Sequence number following this segment: its start plus the
              * payload length plus one for each of SYN and FIN that is set.
              * NOTE(review): th->seq is used without ntohl() here, so it is
              * presumably already in host order -- confirm against the caller.
              */
3721         sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3722 
             /* Only a socket that is not dead gets the notifications. */
3723         if (!sk->dead) 
3724         {
3725                 sk->state_change(sk);
3726                 sock_wake_async(sk->socket, 1);
3727         }
3728 
             /* What the FIN means depends on the state it arrives in. */
3729         switch(sk->state) 
3730         {
3731                 case TCP_SYN_RECV:
3732                 case TCP_SYN_SENT:
3733                 case TCP_ESTABLISHED:
3734                         /*
3735                          * move to CLOSE_WAIT, tcp_data() already handled
3736                          * sending the ack.
3737                          */
3738                         tcp_set_state(sk,TCP_CLOSE_WAIT);
                             /*
                              * A segment that also carries RST sets the whole
                              * shutdown mask (presumably both directions --
                              * confirm SHUTDOWN_MASK's definition).
                              */
3739                         if (th->rst)
3740                                 sk->shutdown = SHUTDOWN_MASK;
3741                         break;
3742 
3743                 case TCP_CLOSE_WAIT:
3744                 case TCP_CLOSING:
3745                         /*
3746                          * received a retransmission of the FIN, do
3747                          * nothing.
3748                          */
3749                         break;
3750                 case TCP_TIME_WAIT:
3751                         /*
3752                          * received a retransmission of the FIN,
3753                          * restart the TIME_WAIT timer.
3754                          */
3755                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                             /* Same value as the common return at the bottom. */
3756                         return(0);
3757                 case TCP_FIN_WAIT1:
3758                         /*
3759                          * This case occurs when a simultaneous close
3760                          * happens, we must ack the received FIN and
3761                          * enter the CLOSING state.
3762                          *
3763                          * This causes a WRITE timeout, which will either
3764                          * move on to TIME_WAIT when we timeout, or resend
3765                          * the FIN properly (maybe we get rid of that annoying
3766                          * FIN lost hang). The TIME_WRITE code is already correct
3767                          * for handling this timeout.
3768                          */
3769 
                             /*
                              * The timer is armed before the state is changed,
                              * and only if a TIME_WRITE timeout is not already
                              * the current one.
                              */
3770                         if(sk->ip_xmit_timeout != TIME_WRITE)
3771                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3772                         tcp_set_state(sk,TCP_CLOSING);
3773                         break;
3774                 case TCP_FIN_WAIT2:
3775                         /*
3776                          * received a FIN -- send ACK and enter TIME_WAIT
                              * (no ACK is sent from this function itself; the
                              * code here only arms the timer, sets the shutdown
                              * mask and changes state)
3777                          */
3778                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3779                         sk->shutdown|=SHUTDOWN_MASK;
3780                         tcp_set_state(sk,TCP_TIME_WAIT);
3781                         break;
3782                 case TCP_CLOSE:
3783                         /*
3784                          * already in CLOSE
3785                          */
3786                         break;
3787                 default:
                             /*
                              * Every state not listed above lands here
                              * (TCP_LAST_ACK itself is one of them): the socket
                              * is put into LAST_ACK and the TIME_CLOSE timer is
                              * started.
                              */
3788                         tcp_set_state(sk,TCP_LAST_ACK);
3789         
3790                         /* Start the timers. */
3791                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3792                         return(0);
3793         }
3794 
3795         return(0);
3796 }
3797 
3798 
3799 
3800 /*
3801  *      This routine handles the data.  If there is room in the buffer,
3802  *      it will already have been moved into it.  If there is no
3803  *      room, then we will just have to discard the packet.
3804  */
3805 
3806 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /*  */
3807          unsigned long saddr, unsigned short len)
3808 {
3809         struct sk_buff *skb1, *skb2;
3810         struct tcphdr *th;
3811         int dup_dumped=0;
3812         u32 new_seq, shut_seq;
3813 
3814         th = skb->h.th;
3815         skb_pull(skb,th->doff*4);
3816         skb_trim(skb,len-(th->doff*4));
3817 
3818         /*
3819          *      The bytes in the receive read/assembly queue has increased. Needed for the
3820          *      low memory discard algorithm 
3821          */
3822            
3823         sk->bytes_rcv += skb->len;
3824         
3825         if (skb->len == 0 && !th->fin) 
3826         {
3827                 /* 
3828                  *      Don't want to keep passing ack's back and forth. 
3829                  *      (someone sent us dataless, boring frame)
3830                  */
3831                 if (!th->ack)
3832                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3833                 kfree_skb(skb, FREE_READ);
3834                 return(0);
3835         }
3836         
3837         /*
3838          *      We no longer have anyone receiving data on this connection.
3839          */
3840 
3841 #ifndef TCP_DONT_RST_SHUTDOWN            
3842 
3843         if(sk->shutdown & RCV_SHUTDOWN)
3844         {
3845                 /*
3846                  *      FIXME: BSD has some magic to avoid sending resets to
3847                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
3848                  *      BSD stacks still have broken keepalives so we want to
3849                  *      cope with it.
3850                  */
3851 
3852                 if(skb->len)    /* We don't care if it's just an ack or
3853                                    a keepalive/window probe */
3854                 {
3855                         new_seq= th->seq + skb->len + th->syn;  /* Right edge of _data_ part of frame */
3856                         
3857                         /* Do this the way 4.4BSD treats it. Not what I'd
3858                            regard as the meaning of the spec but it's what BSD
3859                            does and clearly they know everything 8) */
3860 
3861                         /*
3862                          *      This is valid because of two things
3863                          *
3864                          *      a) The way tcp_data behaves at the bottom.
3865                          *      b) A fin takes effect when read not when received.
3866                          */
3867                          
3868                         shut_seq=sk->acked_seq+1;       /* Last byte */
3869                         
3870                         if(after(new_seq,shut_seq))
3871                         {
3872                                 if(sk->debug)
3873                                         printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
3874                                                 sk, new_seq, shut_seq, sk->blog);
3875                                 if(sk->dead)
3876                                 {
3877                                         sk->acked_seq = new_seq + th->fin;
3878                                         tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3879                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3880                                         tcp_statistics.TcpEstabResets++;
3881                                         tcp_set_state(sk,TCP_CLOSE);
3882                                         sk->err = EPIPE;
3883                                         sk->shutdown = SHUTDOWN_MASK;
3884                                         kfree_skb(skb, FREE_READ);
3885                                         return 0;
3886                                 }
3887                         }
3888                 }
3889         }
3890 
3891 #endif
3892 
3893         /*
3894          *      Now we have to walk the chain, and figure out where this one
3895          *      goes into it.  This is set up so that the last packet we received
3896          *      will be the first one we look at, that way if everything comes
3897          *      in order, there will be no performance loss, and if they come
3898          *      out of order we will be able to fit things in nicely.
3899          *
3900          *      [AC: This is wrong. We should assume in order first and then walk
3901          *       forwards from the first hole based upon real traffic patterns.]
3902          *      
3903          */
3904 
3905         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
3906         {
3907                 skb_queue_head(&sk->receive_queue,skb);
3908                 skb1= NULL;
3909         } 
3910         else
3911         {
3912                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
3913                 {
3914                         if(sk->debug)
3915                         {
3916                                 printk("skb1=%p :", skb1);
3917                                 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
3918                                 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
3919                                 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
3920                                                 sk->acked_seq);
3921                         }
3922                         
3923                         /*
3924                          *      Optimisation: Duplicate frame or extension of previous frame from
3925                          *      same sequence point (lost ack case).
3926                          *      The frame contains duplicate data or replaces a previous frame
3927                          *      discard the previous frame (safe as sk->inuse is set) and put
3928                          *      the new one in its place.
3929                          */
3930                          
3931                         if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3932                         {
3933                                 skb_append(skb1,skb);
3934                                 skb_unlink(skb1);
3935                                 kfree_skb(skb1,FREE_READ);
3936                                 dup_dumped=1;
3937                                 skb1=NULL;
3938                                 break;
3939                         }
3940                         
3941                         /*
3942                          *      Found where it fits
3943                          */
3944                          
3945                         if (after(th->seq+1, skb1->h.th->seq))
3946                         {
3947                                 skb_append(skb1,skb);
3948                                 break;
3949                         }
3950                         
3951                         /*
3952                          *      See if we've hit the start. If so insert.
3953                          */
3954                         if (skb1 == skb_peek(&sk->receive_queue))
3955                         {
3956                                 skb_queue_head(&sk->receive_queue, skb);
3957                                 break;
3958                         }
3959                 }
3960         }
3961 
3962         /*
3963          *      Figure out what the ack value for this frame is
3964          */
3965          
3966         th->ack_seq = th->seq + skb->len;
3967         if (th->syn) 
3968                 th->ack_seq++;
3969         if (th->fin)
3970                 th->ack_seq++;
3971 
3972         if (before(sk->acked_seq, sk->copied_seq)) 
3973         {
3974                 printk("*** tcp.c:tcp_data bug acked < copied\n");
3975                 sk->acked_seq = sk->copied_seq;
3976         }
3977 
3978         /*
3979          *      Now figure out if we can ack anything. This is very messy because we really want two
3980          *      receive queues, a completed and an assembly queue. We also want only one transmit
3981          *      queue.
3982          */
3983 
3984         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) 
3985         {
3986                 if (before(th->seq, sk->acked_seq+1)) 
3987                 {
3988                         int newwindow;
3989 
3990                         if (after(th->ack_seq, sk->acked_seq)) 
3991                         {
3992                                 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3993                                 if (newwindow < 0)
3994                                         newwindow = 0;  
3995                                 sk->window = newwindow;
3996                                 sk->acked_seq = th->ack_seq;
3997                         }
3998                         skb->acked = 1;
3999 
4000                         /*
4001                          *      When we ack the fin, we do the FIN 
4002                          *      processing.
4003                          */
4004 
4005                         if (skb->h.th->fin) 
4006                         {
4007                                 tcp_fin(skb,sk,skb->h.th);
4008                         }
4009           
4010                         for(skb2 = skb->next;
4011                             skb2 != (struct sk_buff *)&sk->receive_queue;
4012                             skb2 = skb2->next) 
4013                         {
4014                                 if (before(skb2->h.th->seq, sk->acked_seq+1)) 
4015                                 {
4016                                         if (after(skb2->h.th->ack_seq, sk->acked_seq))
4017                                         {
4018                                                 newwindow = sk->window -
4019                                                  (skb2->h.th->ack_seq - sk->acked_seq);
4020                                                 if (newwindow < 0)
4021                                                         newwindow = 0;  
4022                                                 sk->window = newwindow;
4023                                                 sk->acked_seq = skb2->h.th->ack_seq;
4024                                         }
4025                                         skb2->acked = 1;
4026                                         /*
4027                                          *      When we ack the fin, we do
4028                                          *      the fin handling.
4029                                          */
4030                                         if (skb2->h.th->fin) 
4031                                         {
4032                                                 tcp_fin(skb,sk,skb->h.th);
4033                                         }
4034 
4035                                         /*
4036                                          *      Force an immediate ack.
4037                                          */
4038                                          
4039                                         sk->ack_backlog = sk->max_ack_backlog;
4040                                 }
4041                                 else
4042                                 {
4043                                         break;
4044                                 }
4045                         }
4046 
4047                         /*
4048                          *      This also takes care of updating the window.
4049                          *      This if statement needs to be simplified.
4050                          */
4051                         if (!sk->delay_acks ||
4052                             sk->ack_backlog >= sk->max_ack_backlog || 
4053                             sk->bytes_rcv > sk->max_unacked || th->fin) {
4054         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4055                         }
4056                         else 
4057                         {
4058                                 sk->ack_backlog++;
4059                                 if(sk->debug)
4060                                         printk("Ack queued.\n");
4061                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4062                         }
4063                 }
4064         }
4065 
4066         /*
4067          *      If we've missed a packet, send an ack.
4068          *      Also start a timer to send another.
4069          */
4070          
4071         if (!skb->acked) 
4072         {
4073         
4074         /*
4075          *      This is important.  If we don't have much room left,
4076          *      we need to throw out a few packets so we have a good
4077          *      window.  Note that mtu is used, not mss, because mss is really
4078          *      for the send side.  He could be sending us stuff as large as mtu.
4079          */
4080                  
4081                 while (sk->prot->rspace(sk) < sk->mtu) 
4082                 {
4083                         skb1 = skb_peek(&sk->receive_queue);
4084                         if (skb1 == NULL) 
4085                         {
4086                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4087                                 break;
4088                         }
4089 
4090                         /*
4091                          *      Don't throw out something that has been acked. 
4092                          */
4093                  
4094                         if (skb1->acked) 
4095                         {
4096                                 break;
4097                         }
4098                 
4099                         skb_unlink(skb1);
4100                         kfree_skb(skb1, FREE_READ);
4101                 }
4102                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4103                 sk->ack_backlog++;
4104                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4105         }
4106         else
4107         {
4108                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4109         }
4110 
4111         /*
4112          *      Now tell the user we may have some data. 
4113          */
4114          
4115         if (!sk->dead) 
4116         {
4117                 if(sk->debug)
4118                         printk("Data wakeup.\n");
4119                 sk->data_ready(sk,0);
4120         } 
4121         return(0);
4122 }
4123 
4124 
4125 /*
4126  *      This routine is only called when we have urgent data
4127  *      signalled. Its the 'slow' part of tcp_urg. It could be
4128  *      moved inline now as tcp_urg is only called from one
4129  *      place. We handle URGent data wrong. We have to - as
4130  *      BSD still doesn't use the correction from RFC961.
4131  */
4132  
4133 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /*  */
4134 {
4135         u32 ptr = ntohs(th->urg_ptr);
4136 
4137         if (ptr)
4138                 ptr--;
4139         ptr += th->seq;
4140 
4141         /* ignore urgent data that we've already seen and read */
4142         if (after(sk->copied_seq, ptr))
4143                 return;
4144 
4145         /* do we already have a newer (or duplicate) urgent pointer? */
4146         if (sk->urg_data && !after(ptr, sk->urg_seq))
4147                 return;
4148 
4149         /* tell the world about our new urgent pointer */
4150         if (sk->proc != 0) {
4151                 if (sk->proc > 0) {
4152                         kill_proc(sk->proc, SIGURG, 1);
4153                 } else {
4154                         kill_pg(-sk->proc, SIGURG, 1);
4155                 }
4156         }
4157         sk->urg_data = URG_NOTYET;
4158         sk->urg_seq = ptr;
4159 }
4160 
4161 /*
4162  *      This is the 'fast' part of urgent handling.
4163  */
4164  
4165 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
     /*  */
4166         unsigned long saddr, unsigned long len)
4167 {
4168         u32 ptr;
4169 
4170         /*
4171          *      Check if we get a new urgent pointer - normally not 
4172          */
4173          
4174         if (th->urg)
4175                 tcp_check_urg(sk,th);
4176 
4177         /*
4178          *      Do we wait for any urgent data? - normally not
4179          */
4180          
4181         if (sk->urg_data != URG_NOTYET)
4182                 return 0;
4183 
4184         /*
4185          *      Is the urgent pointer pointing into this packet? 
4186          */
4187          
4188         ptr = sk->urg_seq - th->seq + th->doff*4;
4189         if (ptr >= len)
4190                 return 0;
4191 
4192         /*
4193          *      Ok, got the correct packet, update info 
4194          */
4195          
4196         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4197         if (!sk->dead)
4198                 sk->data_ready(sk,0);
4199         return 0;
4200 }
4201 
4202 /*
4203  *      This will accept the next outstanding connection. 
4204  */
4205  
4206 static struct sock *tcp_accept(struct sock *sk, int flags)
     /*  */
4207 {
4208         struct sock *newsk;
4209         struct sk_buff *skb;
4210   
4211   /*
4212    * We need to make sure that this socket is listening,
4213    * and that it has something pending.
4214    */
4215 
4216         if (sk->state != TCP_LISTEN) 
4217         {
4218                 sk->err = EINVAL;
4219                 return(NULL); 
4220         }
4221 
4222         /* Avoid the race. */
4223         cli();
4224         sk->inuse = 1;
4225 
4226         while((skb = tcp_dequeue_established(sk)) == NULL) 
4227         {
4228                 if (flags & O_NONBLOCK) 
4229                 {
4230                         sti();
4231                         release_sock(sk);
4232                         sk->err = EAGAIN;
4233                         return(NULL);
4234                 }
4235 
4236                 release_sock(sk);
4237                 interruptible_sleep_on(sk->sleep);
4238                 if (current->signal & ~current->blocked) 
4239                 {
4240                         sti();
4241                         sk->err = ERESTARTSYS;
4242                         return(NULL);
4243                 }
4244                 sk->inuse = 1;
4245         }
4246         sti();
4247 
4248         /*
4249          *      Now all we need to do is return skb->sk. 
4250          */
4251 
4252         newsk = skb->sk;
4253 
4254         kfree_skb(skb, FREE_READ);
4255         sk->ack_backlog--;
4256         release_sock(sk);
4257         return(newsk);
4258 }
4259 
4260 
4261 /*
4262  *      This will initiate an outgoing connection. 
4263  */
4264  
4265 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
     /*  */
4266 {
4267         struct sk_buff *buff;
4268         struct device *dev=NULL;
4269         unsigned char *ptr;
4270         int tmp;
4271         int atype;
4272         struct tcphdr *t1;
4273         struct rtable *rt;
4274 
4275         if (sk->state != TCP_CLOSE) 
4276         {
4277                 return(-EISCONN);
4278         }
4279         
4280         if (addr_len < 8) 
4281                 return(-EINVAL);
4282 
4283         if (usin->sin_family && usin->sin_family != AF_INET) 
4284                 return(-EAFNOSUPPORT);
4285 
4286         /*
4287          *      connect() to INADDR_ANY means loopback (BSD'ism).
4288          */
4289         
4290         if(usin->sin_addr.s_addr==INADDR_ANY)
4291                 usin->sin_addr.s_addr=ip_my_addr();
4292                   
4293         /*
4294          *      Don't want a TCP connection going to a broadcast address 
4295          */
4296 
4297         if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
4298                 return -ENETUNREACH;
4299   
4300         sk->inuse = 1;
4301         sk->daddr = usin->sin_addr.s_addr;
4302         sk->write_seq = tcp_init_seq();
4303         sk->window_seq = sk->write_seq;
4304         sk->rcv_ack_seq = sk->write_seq -1;
4305         sk->err = 0;
4306         sk->dummy_th.dest = usin->sin_port;
4307         release_sock(sk);
4308 
4309         buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4310         if (buff == NULL) 
4311         {
4312                 return(-ENOMEM);
4313         }
4314         sk->inuse = 1;
4315         buff->sk = sk;
4316         buff->free = 0;
4317         buff->localroute = sk->localroute;
4318         
4319 
4320         /*
4321          *      Put in the IP header and routing stuff. 
4322          */
4323          
4324         rt=ip_rt_route(sk->daddr, NULL, NULL);
4325         
4326 
4327         /*
4328          *      We need to build the routing stuff from the things saved in skb. 
4329          */
4330 
4331         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4332                                         IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
4333         if (tmp < 0) 
4334         {
4335                 sk->prot->wfree(sk, buff);
4336                 release_sock(sk);
4337                 return(-ENETUNREACH);
4338         }
4339 
4340         t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
4341 
4342         memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4343         t1->seq = ntohl(sk->write_seq++);
4344         sk->sent_seq = sk->write_seq;
4345         buff->h.seq = sk->write_seq;
4346         t1->ack = 0;
4347         t1->window = 2;
4348         t1->res1=0;
4349         t1->res2=0;
4350         t1->rst = 0;
4351         t1->urg = 0;
4352         t1->psh = 0;
4353         t1->syn = 1;
4354         t1->urg_ptr = 0;
4355         t1->doff = 6;
4356         /* use 512 or whatever user asked for */
4357         
4358         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4359                 sk->window_clamp=rt->rt_window;
4360         else
4361                 sk->window_clamp=0;
4362 
4363         if (sk->user_mss)
4364                 sk->mtu = sk->user_mss;
4365         else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
4366                 sk->mtu = rt->rt_mss;
4367         else 
4368         {
4369 #ifdef CONFIG_INET_SNARL
4370                 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
4371 #else
4372                 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
4373 #endif
4374                         sk->mtu = 576 - HEADER_SIZE;
4375                 else
4376                         sk->mtu = MAX_WINDOW;
4377         }
4378         /*
4379          *      but not bigger than device MTU 
4380          */
4381 
4382         if(sk->mtu <32)
4383                 sk->mtu = 32;   /* Sanity limit */
4384                 
4385         sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
4386         
4387         /*
4388          *      Put in the TCP options to say MTU. 
4389          */
4390 
4391         ptr = skb_put(buff,4);
4392         ptr[0] = 2;
4393         ptr[1] = 4;
4394         ptr[2] = (sk->mtu) >> 8;
4395         ptr[3] = (sk->mtu) & 0xff;
4396         tcp_send_check(t1, sk->saddr, sk->daddr,
4397                   sizeof(struct tcphdr) + 4, sk);
4398 
4399         /*
4400          *      This must go first otherwise a really quick response will get reset. 
4401          */
4402 
4403         tcp_cache_zap();
4404         tcp_set_state(sk,TCP_SYN_SENT);
4405         if(rt&&rt->rt_flags&RTF_IRTT)
4406                 sk->rto = rt->rt_irtt;
4407         else
4408                 sk->rto = TCP_TIMEOUT_INIT;
4409         sk->retransmit_timer.function=&retransmit_timer;
4410         sk->retransmit_timer.data = (unsigned long)sk;
4411         reset_xmit_timer(sk, TIME_WRITE, sk->rto);      /* Timer for repeating the SYN until an answer */
4412         sk->retransmits = 0;    /* Now works the right way instead of a hacked initial setting */
4413 
4414         sk->prot->queue_xmit(sk, dev, buff, 0);  
4415         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4416         tcp_statistics.TcpActiveOpens++;
4417         tcp_statistics.TcpOutSegs++;
4418   
4419         release_sock(sk);
4420         return(0);
4421 }
4422 
4423 
4424 /* This functions checks to see if the tcp header is actually acceptable. */
4425 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /*  */
4426              struct options *opt, unsigned long saddr, struct device *dev)
4427 {
4428         u32 next_seq;
4429 
4430         next_seq = len - 4*th->doff;
4431         if (th->fin)
4432                 next_seq++;
4433         /* if we have a zero window, we can't have any data in the packet.. */
4434         if (next_seq && !sk->window)
4435                 goto ignore_it;
4436         next_seq += th->seq;
4437 
4438         /*
4439          * This isn't quite right.  sk->acked_seq could be more recent
4440          * than sk->window.  This is however close enough.  We will accept
4441          * slightly more packets than we should, but it should not cause
4442          * problems unless someone is trying to forge packets.
4443          */
4444 
4445         /* have we already seen all of this packet? */
4446         if (!after(next_seq+1, sk->acked_seq))
4447                 goto ignore_it;
4448         /* or does it start beyond the window? */
4449         if (!before(th->seq, sk->acked_seq + sk->window + 1))
4450                 goto ignore_it;
4451 
4452         /* ok, at least part of this packet would seem interesting.. */
4453         return 1;
4454 
4455 ignore_it:
4456         if (th->rst)
4457                 return 0;
4458 
4459         /*
4460          *      Send a reset if we get something not ours and we are
4461          *      unsynchronized. Note: We don't do anything to our end. We
4462          *      are just killing the bogus remote connection then we will
4463          *      connect again and it will work (with luck).
4464          */
4465          
4466         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
4467         {
4468                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4469                 return 1;
4470         }
4471 
4472         /* Try to resync things. */
4473         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4474         return 0;
4475 }
4476 
/*
 *	Standard handling of a received reset.
 *
 *	Marks the socket zapped and records the error the user will see:
 *	ECONNREFUSED in TCP_SYN_SENT, EPIPE in TCP_CLOSE_WAIT, ECONNRESET
 *	in every other state. The socket is then closed and fully shut
 *	down (with TCP_DO_RFC1337 defined, a socket in TCP_TIME_WAIT is
 *	left in that state), a live socket gets its state_change callback,
 *	and the buffer is freed and the socket released. Always returns 0.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	else if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
	else
		sk->err = ECONNRESET;

#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if (sk->state != TCP_TIME_WAIT)
#endif
	{
		tcp_set_state(sk, TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}

	/* Only a socket that still has an owner is told about the change. */
	if (!sk->dead)
		sk->state_change(sk);

	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return 0;
}
4508 
4509 /*
4510  *      A TCP packet has arrived.
4511  *              skb->h.raw is the TCP header.
4512  */
4513  
4514 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /*  */
4515         unsigned long daddr, unsigned short len,
4516         unsigned long saddr, int redo, struct inet_protocol * protocol)
4517 {
4518         struct tcphdr *th;
4519         struct sock *sk;
4520         int syn_ok=0;
4521         
4522         tcp_statistics.TcpInSegs++;
4523         if(skb->pkt_type!=PACKET_HOST)
4524         {
4525                 kfree_skb(skb,FREE_READ);
4526                 return(0);
4527         }
4528   
4529         th = skb->h.th;
4530 
4531         /*
4532          *      Find the socket, using the last hit cache if applicable.
4533          */
4534 
4535         if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
4536                 sk=(struct sock *)th_cache_sk;
4537         else
4538         {
4539                 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4540                 th_cache_saddr=saddr;
4541                 th_cache_daddr=daddr;
4542                 th_cache_dport=th->dest;
4543                 th_cache_sport=th->source;
4544                 th_cache_sk=sk;
4545         }               
4546 
4547         /*
4548          *      If this socket has got a reset it's to all intents and purposes 
4549          *      really dead. Count closed sockets as dead.
4550          *
4551          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4552          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
4553          *      exist so should cause resets as if the port was unreachable.
4554          */
4555          
4556         if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4557                 sk=NULL;
4558 
4559         if (!redo) 
4560         {
4561                 /*
4562                  *      Pull up the IP header.
4563                  */
4564                 skb_pull(skb, skb->h.raw-skb->data);
4565                 /*
4566                  *      Try to use the device checksum if provided.
4567                  */
4568                 if (
4569                         (skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
4570                         (!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
4571                     )
4572                 {
4573                         skb->sk = NULL;
4574                         kfree_skb(skb,FREE_READ);
4575                         /*
4576                          *      We don't release the socket because it was
4577                          *      never marked in use.
4578                          */
4579                         return(0);
4580                 }
4581                 th->seq = ntohl(th->seq);
4582 
4583                 /* See if we know about the socket. */
4584                 if (sk == NULL) 
4585                 {
4586                         /*
4587                          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4588                          */
4589                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4590                         skb->sk = NULL;
4591                         /*
4592                          *      Discard frame
4593                          */
4594                         kfree_skb(skb, FREE_READ);
4595                         return(0);
4596                 }
4597 
4598 /*              skb->len = len;*/
4599                 skb->acked = 0;
4600                 skb->used = 0;
4601                 skb->free = 0;
4602                 skb->saddr = daddr;
4603                 skb->daddr = saddr;
4604         
4605                 /* We may need to add it to the backlog here. */
4606                 cli();
4607                 if (sk->inuse) 
4608                 {
4609                         skb_queue_tail(&sk->back_log, skb);
4610                         sti();
4611                         return(0);
4612                 }
4613                 sk->inuse = 1;
4614                 sti();
4615         }
4616         else
4617         {
4618                 if (sk==NULL) 
4619                 {
4620                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4621                         skb->sk = NULL;
4622                         kfree_skb(skb, FREE_READ);
4623                         return(0);
4624                 }
4625         }
4626 
4627 
4628         if (!sk->prot) 
4629         {
4630                 printk("IMPOSSIBLE 3\n");
4631                 return(0);
4632         }
4633 
4634 
4635         /*
4636          *      Charge the memory to the socket. 
4637          */
4638          
4639         if (sk->rmem_alloc + skb->truesize >= sk->rcvbuf) 
4640         {
4641                 kfree_skb(skb, FREE_READ);
4642                 release_sock(sk);
4643                 return(0);
4644         }
4645 
4646         skb->sk=sk;
4647         sk->rmem_alloc += skb->truesize;
4648 
4649         /*
4650          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4651          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4652          *      compatibility. We also set up variables more thoroughly [Karn notes in the
4653          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4654          */
4655 
4656         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
4657         {
4658         
4659                 /*
4660                  *      Now deal with unusual cases.
4661                  */
4662          
4663                 if(sk->state==TCP_LISTEN)
4664                 {
4665                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
4666                                 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4667 
4668                         /*
4669                          *      We don't care for RST, and non SYN are absorbed (old segments)
4670                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
4671                          *      netmask on a running connection it can go broadcast. Even Sun's have
4672                          *      this problem so I'm ignoring it 
4673                          */
4674                            
4675                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4676                         {
4677                                 kfree_skb(skb, FREE_READ);
4678                                 release_sock(sk);
4679                                 return 0;
4680                         }
4681                 
4682                         /*      
4683                          *      Guess we need to make a new socket up 
4684                          */
4685                 
4686                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4687                 
4688                         /*
4689                          *      Now we have several options: In theory there is nothing else
4690                          *      in the frame. KA9Q has an option to send data with the syn,
4691                          *      BSD accepts data with the syn up to the [to be] advertised window
4692                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
4693                          *      it, that fits the spec precisely and avoids incompatibilities. It
4694                          *      would be nice in future to drop through and process the data.
4695                          */
4696                          
4697                         release_sock(sk);
4698                         return 0;
4699                 }
4700         
4701                 /* retransmitted SYN? */
4702                 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4703                 {
4704                         kfree_skb(skb, FREE_READ);
4705                         release_sock(sk);
4706                         return 0;
4707                 }
4708                 
4709                 /*
4710                  *      SYN sent means we have to look for a suitable ack and either reset
4711                  *      for bad matches or go to connected 
4712                  */
4713            
4714                 if(sk->state==TCP_SYN_SENT)
4715                 {
4716                         /* Crossed SYN or previous junk segment */
4717                         if(th->ack)
4718                         {
4719                                 /* We got an ack, but it's not a good ack */
4720                                 if(!tcp_ack(sk,th,saddr,len))
4721                                 {
4722                                         /* Reset the ack - its an ack from a 
4723                                            different connection  [ th->rst is checked in tcp_reset()] */
4724                                         tcp_statistics.TcpAttemptFails++;
4725                                         tcp_reset(daddr, saddr, th,
4726                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4727                                         kfree_skb(skb, FREE_READ);
4728                                         release_sock(sk);
4729                                         return(0);
4730                                 }
4731                                 if(th->rst)
4732                                         return tcp_std_reset(sk,skb);
4733                                 if(!th->syn)
4734                                 {
4735                                         /* A valid ack from a different connection
4736                                            start. Shouldn't happen but cover it */
4737                                         kfree_skb(skb, FREE_READ);
4738                                         release_sock(sk);
4739                                         return 0;
4740                                 }
4741                                 /*
4742                                  *      Ok.. it's good. Set up sequence numbers and
4743                                  *      move to established.
4744                                  */
4745                                 syn_ok=1;       /* Don't reset this connection for the syn */
4746                                 sk->acked_seq=th->seq+1;
4747                                 sk->fin_seq=th->seq;
4748                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4749                                 tcp_set_state(sk, TCP_ESTABLISHED);
4750                                 tcp_options(sk,th);
4751                                 sk->dummy_th.dest=th->source;
4752                                 sk->copied_seq = sk->acked_seq;
4753                                 if(!sk->dead)
4754                                 {
4755                                         sk->state_change(sk);
4756                                         sock_wake_async(sk->socket, 0);
4757                                 }
4758                                 if(sk->max_window==0)
4759                                 {
4760                                         sk->max_window = 32;
4761                                         sk->mss = min(sk->max_window, sk->mtu);
4762                                 }
4763                         }
4764                         else
4765                         {
4766                                 /* See if SYN's cross. Drop if boring */
4767                                 if(th->syn && !th->rst)
4768                                 {
4769                                         /* Crossed SYN's are fine - but talking to
4770                                            yourself is right out... */
4771                                         if(sk->saddr==saddr && sk->daddr==daddr &&
4772                                                 sk->dummy_th.source==th->source &&
4773                                                 sk->dummy_th.dest==th->dest)
4774                                         {
4775                                                 tcp_statistics.TcpAttemptFails++;
4776                                                 return tcp_std_reset(sk,skb);
4777                                         }
4778                                         tcp_set_state(sk,TCP_SYN_RECV);
4779                                         
4780                                         /*
4781                                          *      FIXME:
4782                                          *      Must send SYN|ACK here
4783                                          */
4784                                 }               
4785                                 /* Discard junk segment */
4786                                 kfree_skb(skb, FREE_READ);
4787                                 release_sock(sk);
4788                                 return 0;
4789                         }
4790                         /*
4791                          *      SYN_RECV with data maybe.. drop through
4792                          */
4793                         goto rfc_step6;
4794                 }
4795 
4796         /*
4797          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
4798          *      a more complex suggestion for fixing these reuse issues in RFC1644
4799          *      but not yet ready for general use. Also see RFC1379.
4800          */
4801         
4802 #define BSD_TIME_WAIT
4803 #ifdef BSD_TIME_WAIT
4804                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
4805                         after(th->seq, sk->acked_seq) && !th->rst)
4806                 {
4807                         u32 seq = sk->write_seq;
4808                         if(sk->debug)
4809                                 printk("Doing a BSD time wait\n");
4810                         tcp_statistics.TcpEstabResets++;           
4811                         sk->rmem_alloc -= skb->truesize;
4812                         skb->sk = NULL;
4813                         sk->err=ECONNRESET;
4814                         tcp_set_state(sk, TCP_CLOSE);
4815                         sk->shutdown = SHUTDOWN_MASK;
4816                         release_sock(sk);
4817                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4818                         if (sk && sk->state==TCP_LISTEN)
4819                         {
4820                                 sk->inuse=1;
4821                                 skb->sk = sk;
4822                                 sk->rmem_alloc += skb->truesize;
4823                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
4824                                 release_sock(sk);
4825                                 return 0;
4826                         }
4827                         kfree_skb(skb, FREE_READ);
4828                         return 0;
4829                 }
4830 #endif  
4831         }
4832 
4833         /*
4834          *      We are now in normal data flow (see the step list in the RFC)
4835          *      Note most of these are inline now. I'll inline the lot when
4836          *      I have time to test it hard and look at what gcc outputs 
4837          */
4838         
4839         if(!tcp_sequence(sk,th,len,opt,saddr,dev))
4840         {
4841                 kfree_skb(skb, FREE_READ);
4842                 release_sock(sk);
4843                 return 0;
4844         }
4845 
4846         if(th->rst)
4847                 return tcp_std_reset(sk,skb);
4848         
4849         /*
4850          *      !syn_ok is effectively the state test in RFC793.
4851          */
4852          
4853         if(th->syn && !syn_ok)
4854         {
4855                 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
4856                 return tcp_std_reset(sk,skb);   
4857         }
4858 
4859         /*
4860          *      Process the ACK
4861          */
4862          
4863 
4864         if(th->ack && !tcp_ack(sk,th,saddr,len))
4865         {
4866                 /*
4867                  *      Our three way handshake failed.
4868                  */
4869                  
4870                 if(sk->state==TCP_SYN_RECV)
4871                 {
4872                         tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
4873                 }
4874                 kfree_skb(skb, FREE_READ);
4875                 release_sock(sk);
4876                 return 0;
4877         }
4878         
4879 rfc_step6:              /* I'll clean this up later */
4880 
4881         /*
4882          *      Process urgent data
4883          */
4884                 
4885         if(tcp_urg(sk, th, saddr, len))
4886         {
4887                 kfree_skb(skb, FREE_READ);
4888                 release_sock(sk);
4889                 return 0;
4890         }
4891         
4892         
4893         /*
4894          *      Process the encapsulated data
4895          */
4896         
4897         if(tcp_data(skb,sk, saddr, len))
4898         {
4899                 kfree_skb(skb, FREE_READ);
4900                 release_sock(sk);
4901                 return 0;
4902         }
4903 
4904         /*
4905          *      And done
4906          */     
4907         
4908         release_sock(sk);
4909         return 0;
4910 }
4911 
4912 /*
4913  *      This routine sends a packet with an out of date sequence
4914  *      number. It assumes the other end will try to ack it.
4915  */
4916 
4917 static void tcp_write_wakeup(struct sock *sk)
     /*  */
4918 {
4919         struct sk_buff *buff,*skb;
4920         struct tcphdr *t1;
4921         struct device *dev=NULL;
4922         int tmp;
4923 
4924         if (sk->zapped)
4925                 return; /* After a valid reset we can send no more */
4926 
4927         /*
4928          *      Write data can still be transmitted/retransmitted in the
4929          *      following states.  If any other state is encountered, return.
4930          *      [listen/close will never occur here anyway]
4931          */
4932 
4933         if (sk->state != TCP_ESTABLISHED && 
4934             sk->state != TCP_CLOSE_WAIT &&
4935             sk->state != TCP_FIN_WAIT1 && 
4936             sk->state != TCP_LAST_ACK &&
4937             sk->state != TCP_CLOSING
4938         ) 
4939         {
4940                 return;
4941         }
4942         if ( before(sk->sent_seq, sk->window_seq) && 
4943             (skb=skb_peek(&sk->write_queue)))
4944         {
4945                 /*
4946                  * We are probing the opening of a window
4947                  * but the window size is != 0
4948                  * must have been a result SWS advoidance ( sender )
4949                  */
4950             
4951                 struct iphdr *iph;
4952                 struct tcphdr *th;
4953                 struct tcphdr *nth;
4954                 unsigned long win_size, ow_size;
4955                 void * tcp_data_start;
4956         
4957                 /*
4958                  *      How many bytes can we send ?
4959                  */
4960                  
4961                 win_size = sk->window_seq - sk->sent_seq;
4962 
4963                 /*
4964                  *      Recover the buffer pointers
4965                  */
4966                  
4967                 iph = (struct iphdr *)skb->ip_hdr;
4968                 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
4969 
4970                 /*
4971                  *      Grab the data for a temporary frame
4972                  */
4973                  
4974                 buff = sk->prot->wmalloc(sk, win_size + th->doff * 4 + 
4975                                      (iph->ihl << 2) +
4976                                      sk->prot->max_header + 15, 
4977                                      1, GFP_ATOMIC);
4978                 if ( buff == NULL )
4979                         return;
4980 
4981                 /* 
4982                  *      If we strip the packet on the write queue we must
4983                  *      be ready to retransmit this one 
4984                  */
4985             
4986                 buff->free = /*0*/1;
4987 
4988                 buff->sk = sk;
4989                 buff->localroute = sk->localroute;
4990                 
4991                 /*
4992                  *      Put headers on the new packet
4993                  */
4994 
4995                 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4996                                          IPPROTO_TCP, sk->opt, buff->truesize,
4997                                          sk->ip_tos,sk->ip_ttl);
4998                 if (tmp < 0) 
4999                 {
5000                         sk->prot->wfree(sk, buff);
5001                         return;
5002                 }
5003                 
5004                 /*
5005                  *      Move the TCP header over
5006                  */
5007 
5008                 buff->dev = dev;
5009 
5010                 nth = (struct tcphdr *) skb_put(buff,th->doff*4);
5011 
5012                 memcpy(nth, th, th->doff * 4);
5013                 
5014                 /*
5015                  *      Correct the new header
5016                  */
5017                  
5018                 nth->ack = 1; 
5019                 nth->ack_seq = ntohl(sk->acked_seq);
5020                 nth->window = ntohs(tcp_select_window(sk));
5021                 nth->check = 0;
5022 
5023                 /*
5024                  *      Find the first data byte.
5025                  */
5026                  
5027                 tcp_data_start = skb->data + skb->dev->hard_header_len + 
5028                                 (iph->ihl << 2) + th->doff * 4;
5029 
5030                 /*
5031                  *      Add it to our new buffer
5032                  */
5033                 memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
5034                 
5035                 /*
5036                  *      Remember our right edge sequence number.
5037                  */
5038                  
5039                 buff->h.seq = sk->sent_seq + win_size;
5040                 sk->sent_seq = buff->h.seq;             /* Hack */
5041 #if 0
5042 
5043                 /*
5044                  *      now: shrink the queue head segment 
5045                  */
5046                  
5047                 th->check = 0;
5048                 ow_size = skb->len - win_size - 
5049                         ((unsigned long) (tcp_data_start - (void *) skb->data));
5050 
5051                 memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
5052                 skb_trim(skb,skb->len-win_size);
5053                 sk->sent_seq += win_size;
5054                 th->seq = htonl(sk->sent_seq);
5055                 if (th->urg)
5056                 {
5057                         unsigned short urg_ptr;
5058         
5059                         urg_ptr = ntohs(th->urg_ptr);
5060                         if (urg_ptr <= win_size)
5061                                 th->urg = 0;
5062                         else
5063                         {
5064                                 urg_ptr -= win_size;
5065                                 th->urg_ptr = htons(urg_ptr);
5066                                 nth->urg_ptr = htons(win_size);
5067                         }
5068                 }
5069 #else
5070                 if(th->urg && ntohs(th->urg_ptr) < win_size)
5071                         nth->urg = 0;
5072 #endif          
5073 
5074                 /*
5075                  *      Checksum the split buffer
5076                  */
5077                  
5078                 tcp_send_check(nth, sk->saddr, sk->daddr, 
5079                            nth->doff * 4 + win_size , sk);
5080         }
5081         else
5082         {       
5083                 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
5084                 if (buff == NULL) 
5085                         return;
5086 
5087                 buff->free = 1;
5088                 buff->sk = sk;
5089                 buff->localroute = sk->localroute;
5090 
5091                 /*
5092                  *      Put in the IP header and routing stuff. 
5093                  */
5094                  
5095                 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5096                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
5097                 if (tmp < 0) 
5098                 {
5099                         sk->prot->wfree(sk, buff);
5100                         return;
5101                 }
5102 
5103                 t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
5104                 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5105 
5106                 /*
5107                  *      Use a previous sequence.
5108                  *      This should cause the other end to send an ack.
5109                  */
5110          
5111                 t1->seq = htonl(sk->sent_seq-1);
5112                 t1->ack = 1; 
5113                 t1->res1= 0;
5114                 t1->res2= 0;
5115                 t1->rst = 0;
5116                 t1->urg = 0;
5117                 t1->psh = 0;
5118                 t1->fin = 0;    /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
5119                 t1->syn = 0;
5120                 t1->ack_seq = ntohl(sk->acked_seq);
5121                 t1->window = ntohs(tcp_select_window(sk));
5122                 t1->doff = sizeof(*t1)/4;
5123                 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5124 
5125         }               
5126 
5127         /*
5128          *      Send it.
5129          */
5130         
5131         sk->prot->queue_xmit(sk, dev, buff, 1);
5132         tcp_statistics.TcpOutSegs++;
5133 }
5134 
5135 /*
5136  *      A window probe timeout has occurred.
5137  */
5138 
5139 void tcp_send_probe0(struct sock *sk)
     /*  */
5140 {
5141         if (sk->zapped)
5142                 return;         /* After a valid reset we can send no more */
5143 
5144         tcp_write_wakeup(sk);
5145 
5146         sk->backoff++;
5147         sk->rto = min(sk->rto << 1, 120*HZ);
5148         reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5149         sk->retransmits++;
5150         sk->prot->retransmits ++;
5151 }
5152 
5153 /*
5154  *      Socket option code for TCP. 
5155  */
5156   
5157 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /*  */
5158 {
5159         int val,err;
5160 
5161         if(level!=SOL_TCP)
5162                 return ip_setsockopt(sk,level,optname,optval,optlen);
5163 
5164         if (optval == NULL) 
5165                 return(-EINVAL);
5166 
5167         err=verify_area(VERIFY_READ, optval, sizeof(int));
5168         if(err)
5169                 return err;
5170         
5171         val = get_user((int *)optval);
5172 
5173         switch(optname)
5174         {
5175                 case TCP_MAXSEG:
5176 /*
5177  * values greater than interface MTU won't take effect.  however at
5178  * the point when this call is done we typically don't yet know
5179  * which interface is going to be used
5180  */
5181                         if(val<1||val>MAX_WINDOW)
5182                                 return -EINVAL;
5183                         sk->user_mss=val;
5184                         return 0;
5185                 case TCP_NODELAY:
5186                         sk->nonagle=(val==0)?0:1;
5187                         return 0;
5188                 default:
5189                         return(-ENOPROTOOPT);
5190         }
5191 }
5192 
5193 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /*  */
5194 {
5195         int val,err;
5196 
5197         if(level!=SOL_TCP)
5198                 return ip_getsockopt(sk,level,optname,optval,optlen);
5199                         
5200         switch(optname)
5201         {
5202                 case TCP_MAXSEG:
5203                         val=sk->user_mss;
5204                         break;
5205                 case TCP_NODELAY:
5206                         val=sk->nonagle;
5207                         break;
5208                 default:
5209                         return(-ENOPROTOOPT);
5210         }
5211         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5212         if(err)
5213                 return err;
5214         put_user(sizeof(int),(int *) optlen);
5215 
5216         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5217         if(err)
5218                 return err;
5219         put_user(val,(int *)optval);
5220 
5221         return(0);
5222 }       
5223 
5224 
5225 struct proto tcp_prot = {	/* TCP's protocol operations table. Positional initializer: the slot notes below are inferred from the handler names and from the sk->prot->... uses in this file -- TODO confirm order against the struct proto declaration */
5226         sock_wmalloc,	/* write buffer allocator; tcp_write_wakeup calls sk->prot->wmalloc */
5227         sock_rmalloc,	/* presumably the read buffer allocator */
5228         sock_wfree,	/* write buffer release; tcp_write_wakeup calls sk->prot->wfree */
5229         sock_rfree,	/* presumably the read buffer release */
5230         sock_rspace,	/* presumably free read space query */
5231         sock_wspace,	/* presumably free write space query */
5232         tcp_close,
5233         tcp_read,
5234         tcp_write,
5235         tcp_sendto,
5236         tcp_recvfrom,
5237         ip_build_header,	/* TCP relies on IP to build headers; used as sk->prot->build_header */
5238         tcp_connect,
5239         tcp_accept,
5240         ip_queue_xmit,	/* transmit goes through IP; used as sk->prot->queue_xmit */
5241         tcp_retransmit,
5242         tcp_write_wakeup,
5243         tcp_read_wakeup,
5244         tcp_rcv,	/* receive entry point for incoming segments */
5245         tcp_select,
5246         tcp_ioctl,
5247         NULL,	/* no handler installed in this slot (presumably init -- confirm) */
5248         tcp_shutdown,
5249         tcp_setsockopt,
5250         tcp_getsockopt,
5251         128,	/* presumably max_header, read as sk->prot->max_header when sizing probe buffers -- confirm */
5252         0,	/* presumably the retransmits counter bumped in tcp_send_probe0 -- confirm */
5253         "TCP",	/* protocol name */
5254         0, 0,	/* NOTE(review): two counters starting at zero; meaning not visible here */
5255         {NULL,}	/* NOTE(review): table that starts out empty; presumably the per-protocol socket array -- confirm */
5256 };
/* */
root/net/ipv4/tcp.c

DEFINITIONS