1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: @(#)tcp.c 1.0.16 05/25/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1 26 * and was trying to connect (tcp_err()). 27 * Alan Cox : All icmp error handling was broken 28 * pointers passed where wrong and the 29 * socket was looked up backwards. Nobody 30 * tested any icmp error code obviously. 31 * Alan Cox : tcp_err() now handled properly. It wakes people 32 * on errors. select behaves and the icmp error race 33 * has gone by moving it into sock.c 34 * Alan Cox : tcp_reset() fixed to work for everything not just 35 * packets for unknown sockets. 36 * Alan Cox : tcp option processing. 37 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong] 38 * Herp Rosmanith : More reset fixes 39 * Alan Cox : No longer acks invalid rst frames. Acking 40 * any kind of RST is right out. 41 * Alan Cox : Sets an ignore me flag on an rst receive 42 * otherwise odd bits of prattle escape still 43 * Alan Cox : Fixed another acking RST frame bug. Should stop 44 * LAN workplace lockups. 
45 * Alan Cox : Some tidyups using the new skb list facilities 46 * Alan Cox : sk->keepopen now seems to work 47 * Alan Cox : Pulls options out correctly on accepts 48 * Alan Cox : Fixed assorted sk->rqueue->next errors 49 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops. 50 * Alan Cox : Tidied tcp_data to avoid a potential nasty. 51 * Alan Cox : Added some better commenting, as the tcp is hard to follow 52 * Alan Cox : Removed incorrect check for 20 * psh 53 * Michael O'Reilly : ack < copied bug fix. 54 * Johannes Stille : Misc tcp fixes (not all in yet). 55 * Alan Cox : FIN with no memory -> CRASH 56 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept. 57 * Alan Cox : Added TCP options (SOL_TCP) 58 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets. 59 * Alan Cox : Use ip_tos/ip_ttl settings. 60 * Alan Cox : Handle FIN (more) properly (we hope). 61 * Alan Cox : RST frames sent on unsynchronised state ack error/ 62 * Alan Cox : Put in missing check for SYN bit. 63 * Alan Cox : Added tcp_select_window() aka NET2E 64 * window non shrink trick. 65 * Alan Cox : Added a couple of small NET2E timer fixes 66 * Charles Hedrick : TCP fixes 67 * Toomas Tamm : TCP window fixes 68 * Alan Cox : Small URG fix to rlogin ^C ack fight 69 * Charles Hedrick : Rewrote most of it to actually work 70 * Linus : Rewrote tcp_read() and URG handling 71 * completely 72 * Gerhard Koerting: Fixed some missing timer handling 73 * Matthew Dillon : Reworked TCP machine states as per RFC 74 * Gerhard Koerting: PC/TCP workarounds 75 * Adam Caldwell : Assorted timer/timing errors 76 * Matthew Dillon : Fixed another RST bug 77 * Alan Cox : Move to kernel side addressing changes. 78 * Alan Cox : Beginning work on TCP fastpathing (not yet usable) 79 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
80 * Alan Cox : TCP fast path debugging 81 * Alan Cox : Window clamping 82 * Michael Riepe : Bug in tcp_check() 83 * Matt Dillon : More TCP improvements and RST bug fixes 84 * Matt Dillon : Yet more small nasties remove from the TCP code 85 * (Be very nice to this man if tcp finally works 100%) 8) 86 * Alan Cox : BSD accept semantics. 87 * Alan Cox : Reset on closedown bug. 88 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 89 * Michael Pall : Handle select() after URG properly in all cases. 90 * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin). 91 * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now. 92 * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api. 93 * Alan Cox : Changed the semantics of sk->socket to 94 * fix a race and a signal problem with 95 * accept() and async I/O. 96 * Alan Cox : Relaxed the rules on tcp_sendto(). 97 * Yury Shevchuk : Really fixed accept() blocking problem. 98 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 99 * clients/servers which listen in on 100 * fixed ports. 101 * Alan Cox : Cleaned the above up and shrank it to 102 * a sensible code size. 103 * Alan Cox : Self connect lockup fix. 104 * Alan Cox : No connect to multicast. 105 * Ross Biro : Close unaccepted children on master 106 * socket close. 107 * Alan Cox : Reset tracing code. 108 * Alan Cox : Spurious resets on shutdown. 109 * Alan Cox : Giant 15 minute/60 second timer error 110 * Alan Cox : Small whoops in selecting before an accept. 111 * Alan Cox : Kept the state trace facility since it's 112 * handy for debugging. 113 * Alan Cox : More reset handler fixes. 114 * Alan Cox : Started rewriting the code based on the RFC's 115 * for other useful protocol references see: 116 * Comer, KA9Q NOS, and for a reference on the 117 * difference between specifications and how BSD 118 * works see the 4.4lite source. 
119 * A.N.Kuznetsov : Don't time wait on completion of tidy 120 * close. 121 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 122 * Linus Torvalds : Fixed BSD port reuse to work first syn 123 * Alan Cox : Reimplemented timers as per the RFC and using multiple 124 * timers for sanity. 125 * Alan Cox : Small bug fixes, and a lot of new 126 * comments. 127 * Alan Cox : Fixed dual reader crash by locking 128 * the buffers (much like datagram.c) 129 * Alan Cox : Fixed stuck sockets in probe. A probe 130 * now gets fed up of retrying without 131 * (even a no space) answer. 132 * Alan Cox : Extracted closing code better 133 * Alan Cox : Fixed the closing state machine to 134 * resemble the RFC. 135 * Alan Cox : More 'per spec' fixes. 136 * Jorge Cwik : Even faster checksumming. 137 * Alan Cox : tcp_data() doesn't ack illegal PSH 138 * only frames. At least one pc tcp stack 139 * generates them. 140 * Alan Cox : Cache last socket. 141 * Alan Cox : Per route irtt. 142 * Matt Day : Select() match BSD precisely on error 143 * Alan Cox : New buffers 144 * Mark Tamsky : Various sk->prot->retransmits and 145 * sk->retransmits misupdating fixed. 146 * Fixed tcp_write_timeout: stuck close, 147 * and TCP syn retries gets used now. 148 * Mark Yarvis : In tcp_read_wakeup(), don't send an 149 * ack if stat is TCP_CLOSED. 150 * Alan Cox : Look up device on a retransmit - routes may 151 * change. Doesn't yet cope with MSS shrink right 152 * but its a start! 153 * 154 * 155 * To Fix: 156 * Fast path the code. Two things here - fix the window calculation 157 * so it doesn't iterate over the queue, also spot packets with no funny 158 * options arriving in order and process directly. 159 * 160 * Implement RFC 1191 [Path MTU discovery] 161 * Look at the effect of implementing RFC 1337 suggestions and their impact. 162 * Rewrite output state machine to use a single queue and do low window 163 * situations as per the spec (RFC 1122) 164 * Speed up input assembly algorithm. 
165 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we 166 * could do with it working on IPv4 167 * User settable/learned rtt/max window/mtu 168 * Cope with MTU/device switches when retransmitting in tcp. 169 * Fix the window handling to use PR's new code. 170 * 171 * Change the fundamental structure to a single send queue maintained 172 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on 173 * active routes too]). Cut the queue off in tcp_retransmit/ 174 * tcp_transmit. 175 * Change the receive queue to assemble as it goes. This lets us 176 * dispose of most of tcp_sequence, half of tcp_ack and chunks of 177 * tcp_data/tcp_read as well as the window shrink crud. 178 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack 179 * tcp_queue_skb seem obvious routines to extract. 180 * 181 * This program is free software; you can redistribute it and/or 182 * modify it under the terms of the GNU General Public License 183 * as published by the Free Software Foundation; either version 184 * 2 of the License, or(at your option) any later version. 185 * 186 * Description of States: 187 * 188 * TCP_SYN_SENT sent a connection request, waiting for ack 189 * 190 * TCP_SYN_RECV received a connection request, sent ack, 191 * waiting for final ack in three-way handshake. 192 * 193 * TCP_ESTABLISHED connection established 194 * 195 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 196 * transmission of remaining buffered data 197 * 198 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 199 * to shutdown 200 * 201 * TCP_CLOSING both sides have shutdown but we still have 202 * data we have to finish sending 203 * 204 * TCP_TIME_WAIT timeout to catch resent junk before entering 205 * closed, can only be entered from FIN_WAIT2 206 * or CLOSING. 
Required because the other end 207 * may not have gotten our last ACK causing it 208 * to retransmit the data packet (which we ignore) 209 * 210 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 211 * us to finish writing our data and to shutdown 212 * (we have to close() to move on to LAST_ACK) 213 * 214 * TCP_LAST_ACK out side has shutdown after remote has 215 * shutdown. There may still be data in our 216 * buffer that we have to finish sending 217 * 218 * TCP_CLOSE socket is finished 219 */ 220
221 #include <linux/types.h>
222 #include <linux/sched.h>
223 #include <linux/mm.h>
224 #include <linux/time.h>
225 #include <linux/string.h>
226 #include <linux/config.h>
227 #include <linux/socket.h>
228 #include <linux/sockios.h>
229 #include <linux/termios.h>
230 #include <linux/in.h>
231 #include <linux/fcntl.h>
232 #include <linux/inet.h>
233 #include <linux/netdevice.h>
234 #include <net/snmp.h>
235 #include <net/ip.h>
236 #include <net/protocol.h>
237 #include <net/icmp.h>
238 #include <net/tcp.h>
239 #include <net/arp.h>
240 #include <linux/skbuff.h>
241 #include <net/sock.h>
242 #include <net/route.h>
243 #include <linux/errno.h>
244 #include <linux/timer.h>
245 #include <asm/system.h>
246 #include <asm/segment.h>
247 #include <linux/mm.h>
248 #include <net/checksum.h>
249
/*
 *	The MSL timer is the 'normal' timer.
 */

/* Alias: the MSL (maximum segment lifetime) timer is just the generic
   socket timer; kept as a macro so call sites document intent. */
#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)

#define SEQ_TICK 3
unsigned long seq_offset;		/* base offset for initial sequence numbers */
struct tcp_mib tcp_statistics;		/* SNMP TCP statistics (RFC 1213 MIB) */

/*
 *	Cached last hit socket
 *
 *	A one-entry demultiplexing cache keyed on the full 4-tuple
 *	(saddr, daddr, sport, dport).  Volatile: updated from both
 *	interrupt (bottom half) and process context.
 */

volatile unsigned long th_cache_saddr, th_cache_daddr;
volatile unsigned short th_cache_dport, th_cache_sport;
volatile struct sock *th_cache_sk;
268 voidtcp_cache_zap(void)
/* */ 269 { 270 unsignedlongflags;
271 save_flags(flags);
272 cli();
273 th_cache_saddr=0;
274 th_cache_daddr=0;
275 th_cache_dport=0;
276 th_cache_sport=0;
277 th_cache_sk=NULL;
278 restore_flags(flags);
279 } 280
/* Forward declaration: tcp_close() is defined later but needed by
   tcp_close_pending() below. */
static void tcp_close(struct sock *sk, int timeout);


/*
 *	The less said about this the better, but it works and will do for 1.2
 *
 *	A single global wait queue on which select() on a listening socket
 *	sleeps; tcp_set_state() wakes it when any connection completes the
 *	handshake (SYN_RECV -> ESTABLISHED).
 */

static struct wait_queue *master_select_wakeup;

/*
 *	Return the smaller of two unsigned quantities.
 *	NOTE(review): the return type is (int) while the operands are
 *	unsigned — preserved exactly as the original callers expect.
 */
static __inline__ int min(unsigned int a, unsigned int b)
{
	return (a < b) ? a : b;
}
/* State-transition tracing is compiled out by default; define STATE_TRACE
   to get a printk on every tcp_set_state() call for sockets with
   sk->debug set. */
#undef STATE_TRACE

#ifdef STATE_TRACE
/* Human-readable names indexed by the TCP_* state constants. */
static char *statename[]={
	"Unused","Established","Syn Sent","Syn Recv",
	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
	"Close Wait","Last ACK","Listen","Closing"
};
#endif

307 static__inline__voidtcp_set_state(structsock *sk, intstate)
/* */ 308 { 309 if(sk->state==TCP_ESTABLISHED)
310 tcp_statistics.TcpCurrEstab--;
311 #ifdefSTATE_TRACE 312 if(sk->debug)
313 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
314 #endif 315 /* This is a hack but it doesn't occur often and it's going to 316 be a real to fix nicely */ 317
318 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
319 { 320 wake_up_interruptible(&master_select_wakeup);
321 } 322 sk->state=state;
323 if(state==TCP_ESTABLISHED)
324 tcp_statistics.TcpCurrEstab++;
325 } 326
327 /* 328 * This routine picks a TCP windows for a socket based on 329 * the following constraints 330 * 331 * 1. The window can never be shrunk once it is offered (RFC 793) 332 * 2. We limit memory per socket 333 * 334 * For now we use NET2E3's heuristic of offering half the memory 335 * we have handy. All is not as bad as this seems however because 336 * of two things. Firstly we will bin packets even within the window 337 * in order to get the data we are waiting for into the memory limit. 338 * Secondly we bin common duplicate forms at receive time 339 * Better heuristics welcome 340 */ 341
342 inttcp_select_window(structsock *sk)
/* */ 343 { 344 intnew_window = sk->prot->rspace(sk);
345
346 if(sk->window_clamp)
347 new_window=min(sk->window_clamp,new_window);
348 /* 349 * Two things are going on here. First, we don't ever offer a 350 * window less than min(sk->mss, MAX_WINDOW/2). This is the 351 * receiver side of SWS as specified in RFC1122. 352 * Second, we always give them at least the window they 353 * had before, in order to avoid retracting window. This 354 * is technically allowed, but RFC1122 advises against it and 355 * in practice it causes trouble. 356 * 357 * Fixme: This doesn't correctly handle the case where 358 * new_window > sk->window but not by enough to allow for the 359 * shift in sequence space. 360 */ 361 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
362 return(sk->window);
363 return(new_window);
364 } 365
366 /* 367 * Find someone to 'accept'. Must be called with 368 * sk->inuse=1 or cli() 369 */ 370
371 staticstructsk_buff *tcp_find_established(structsock *s)
/* */ 372 { 373 structsk_buff *p=skb_peek(&s->receive_queue);
374 if(p==NULL)
375 returnNULL;
376 do 377 { 378 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
379 returnp;
380 p=p->next;
381 } 382 while(p!=(structsk_buff *)&s->receive_queue);
383 returnNULL;
384 } 385
386 /* 387 * Remove a completed connection and return it. This is used by 388 * tcp_accept() to get connections from the queue. 389 */ 390
391 staticstructsk_buff *tcp_dequeue_established(structsock *s)
/* */ 392 { 393 structsk_buff *skb;
394 unsignedlongflags;
395 save_flags(flags);
396 cli();
397 skb=tcp_find_established(s);
398 if(skb!=NULL)
399 skb_unlink(skb); /* Take it off the queue */ 400 restore_flags(flags);
401 returnskb;
402 } 403
404 /* 405 * This routine closes sockets which have been at least partially 406 * opened, but not yet accepted. Currently it is only called by 407 * tcp_close, and timeout mirrors the value there. 408 */ 409
410 staticvoidtcp_close_pending (structsock *sk)
/* */ 411 { 412 structsk_buff *skb;
413
414 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
415 { 416 skb->sk->dead=1;
417 tcp_close(skb->sk, 0);
418 kfree_skb(skb, FREE_READ);
419 } 420 return;
421 } 422
423 /* 424 * Enter the time wait state. 425 */ 426
427 staticvoidtcp_time_wait(structsock *sk)
/* */ 428 { 429 tcp_set_state(sk,TCP_TIME_WAIT);
430 sk->shutdown = SHUTDOWN_MASK;
431 if (!sk->dead)
432 sk->state_change(sk);
433 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
434 } 435
/*
 *	A socket has timed out on its send queue and wants to do a
 *	little retransmitting.  Walks sk->send_head, refreshing each
 *	queued packet (new IP id, current ack/window) and handing it back
 *	to the device.  If 'all' is zero only the first packet is resent;
 *	otherwise we stop after sk->cong_window packets.
 */

void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;		/* packets retransmitted this call */
	struct rtable *rt;

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;	/* restart the rtt clock for this packet */

		/*
		 *	Discard the surplus MAC header so the buffer starts
		 *	at the IP header; the route may have changed and a
		 *	fresh link-level header is built below.
		 */

		skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);

		/*
		 *	In general it's OK just to use the old packet.  However we
		 *	need to use the current ack and window fields.  Urg and
		 *	urg_ptr could possibly stand to be updated as well, but we
		 *	don't keep the necessary data.  That shouldn't be a problem,
		 *	if the other end is doing the right thing.  Since we're
		 *	changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)skb->data;
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = ntohs(iph->tot_len) - (iph->ihl<<2);	/* TCP segment length */

		/*
		 *	Note: We ought to check for window limits here but
		 *	currently this is done (less efficiently) elsewhere.
		 */

		iph->id = htons(ip_id_count++);
		ip_send_check(iph);	/* recompute IP header checksum after the id change */

		/*
		 *	Put a MAC header back on (may cause ARPing).  Re-route
		 *	each time: the route may have changed since the packet
		 *	was first sent.
		 */

		if(skb->localroute)
			rt=ip_rt_local(iph->daddr,NULL,NULL);
		else
			rt=ip_rt_route(iph->daddr,NULL,NULL);

		if(rt==NULL)	/* Deep poo: no route at all — report and skip sending */
		{
			if(skb->sk)
			{
				skb->sk->err=ENETUNREACH;
				skb->sk->error_report(skb->sk);
			}
		}
		else
		{
			dev=rt->rt_dev;
			skb->raddr=rt->rt_gateway;
			if(skb->raddr==0)	/* directly connected: next hop is the destination */
				skb->raddr=iph->daddr;
			skb->dev=dev;
			skb->arp=1;
			if(dev->hard_header)
			{
				/* hard_header < 0 means the MAC address isn't resolved yet;
				   clear skb->arp so the driver waits for ARP. */
				if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
					skb->arp=0;
			}

			/*
			 *	This is not the right way to handle this. We have to
			 *	issue an up to date window and ack report with this
			 *	retransmit to keep the odd buggy tcp that relies on
			 *	the fact BSD does this happy.
			 *	We don't however need to recalculate the entire
			 *	checksum, so someone wanting a small problem to play
			 *	with might like to implement RFC1141/RFC1624 and speed
			 *	this up by avoiding a full checksum.
			 */

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			/*
			 *	If the interface is (still) up and running, kick it.
			 */

			if (dev->flags & IFF_UP)
			{
				/*
				 *	If the packet is still being sent by the device/protocol
				 *	below then don't retransmit. This is both needed, and good -
				 *	especially with connected mode AX.25 where it stops resends
				 *	occurring of an as yet unsent anyway frame!
				 *	We still add up the counts as the round trip time wants
				 *	adjusting.
				 */
				if (sk && !skb_device_locked(skb))
				{
					/* Remove it from any existing driver queue first! */
					skb_unlink(skb);
					/* Now queue it */
					ip_statistics.IpOutRequests++;
					dev_queue_xmit(skb, dev, sk->priority);
				}
			}
		}

		/*
		 *	Count retransmissions (counted even when the device was
		 *	locked or unreachable — the rtt estimator wants them).
		 */

		ct++;
		sk->prot->retransmits ++;
		tcp_statistics.TcpRetransSegs++;


		/*
		 *	Only one retransmit requested.
		 */

		if (!all)
			break;

		/*
		 *	This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;	/* next packet on the retransmit queue */
	}
}
586 /* 587 * Reset the retransmission timer 588 */ 589
590 staticvoidreset_xmit_timer(structsock *sk, intwhy, unsignedlongwhen)
/* */ 591 { 592 del_timer(&sk->retransmit_timer);
593 sk->ip_xmit_timeout = why;
594 if((int)when < 0)
595 { 596 when=3;
597 printk("Error: Negative timer in xmit_timer\n");
598 } 599 sk->retransmit_timer.expires=jiffies+when;
600 add_timer(&sk->retransmit_timer);
601 } 602
603 /* 604 * This is the normal code called for timeouts. It does the retransmission 605 * and then does backoff. tcp_do_retransmit is separated out because 606 * tcp_ack needs to send stuff from the retransmit queue without 607 * initiating a backoff. 608 */ 609
610
611 voidtcp_retransmit_time(structsock *sk, intall)
/* */ 612 { 613 tcp_do_retransmit(sk, all);
614
615 /* 616 * Increase the timeout each time we retransmit. Note that 617 * we do not increase the rtt estimate. rto is initialized 618 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests 619 * that doubling rto each time is the least we can get away with. 620 * In KA9Q, Karn uses this for the first few times, and then 621 * goes to quadratic. netBSD doubles, but only goes up to *64, 622 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is 623 * defined in the protocol as the maximum possible RTT. I guess 624 * we'll have to use something other than TCP to talk to the 625 * University of Mars. 626 * 627 * PAWS allows us longer timeouts and large windows, so once 628 * implemented ftp to mars will work nicely. We will have to fix 629 * the 120 second clamps though! 630 */ 631
632 sk->retransmits++;
633 sk->prot->retransmits++;
634 sk->backoff++;
635 sk->rto = min(sk->rto << 1, 120*HZ);
636 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
637 } 638
639
640 /* 641 * A timer event has trigger a tcp retransmit timeout. The 642 * socket xmit queue is ready and set up to send. Because 643 * the ack receive code keeps the queue straight we do 644 * nothing clever here. 645 */ 646
647 staticvoidtcp_retransmit(structsock *sk, intall)
/* */ 648 { 649 if (all)
650 { 651 tcp_retransmit_time(sk, all);
652 return;
653 } 654
655 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ 656 /* sk->ssthresh in theory can be zero. I guess that's OK */ 657 sk->cong_count = 0;
658
659 sk->cong_window = 1;
660
661 /* Do the actual retransmit. */ 662 tcp_retransmit_time(sk, all);
663 } 664
/*
 *	A write timeout has occurred.  Process the after effects.
 *
 *	Returns 1 if the caller may keep using the socket, 0 if the socket
 *	was closed here (in which case release_sock() has already been
 *	called and the caller must not touch it again).
 */

static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout: every 8th retransmit while
	 *	established, or past TCP_RETR1 otherwise.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		/*ip_route_check (sk->daddr);*/
	}

	/*
	 *	Have we tried to SYN too many times (repent repent 8))
	 */

	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
	{
		sk->err=ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
		tcp_set_state(sk,TCP_CLOSE);
		/* Don't FIN, we got nothing back */
		release_sock(sk);
		return 0;
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2)
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 *	Time wait the socket: if we were closing anyway, park in
		 *	TIME_WAIT; otherwise just drop straight to CLOSE.
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			release_sock(sk);
			return 0;
		}
	}
	return 1;
}
/*
 *	The TCP retransmit timer.  This lacks a few small details.
 *
 *	1.	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 *
 *	'data' is the struct sock this timer belongs to; the reason the
 *	timer was armed is in sk->ip_xmit_timeout.
 */

static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;

	/*
	 *	Only process if the socket is not in use (by a syscall or a
	 *	bottom half); otherwise retry in 1 second.
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;	/* lock the socket for ourselves */
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing left to retransmit — the ack above was all. */
				restore_flags(flags);
			}
			else
			{
				/*
				 *	Kicked by a delayed ack. Reset timer
				 *	correctly now: the head packet hasn't been
				 *	out for a full rto yet.
				 */
				if (jiffies < skb->when + sk->rto)
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission
				 */
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}
/*
 *	This routine is called by the ICMP module when it gets some
 *	sort of error condition.  If err < 0 then the socket should
 *	be closed and the error returned to the user.  If err > 0
 *	it's just the icmp type << 8 | icmp code.  After adjustment
 *	header points to the first 8 bytes of the tcp header.  We need
 *	to find the appropriate port.
 */

void tcp_err(int err, unsigned char *header, unsigned long daddr,
	unsigned long saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	struct iphdr *iph=(struct iphdr *)header;

	/* Skip the embedded IP header to reach the quoted TCP header. */
	header+=4*iph->ihl;


	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)	/* no matching socket — nothing to report */
		return;

	if(err<0)	/* hard error: hand it to the socket directly */
	{
		sk->err = -err;
		sk->error_report(sk);
		return;
	}

	if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8))
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	/*
	 *	If we've already connected we will keep trying
	 *	until we time out, or the user gives up.  Only a fatal
	 *	ICMP code — or any code during SYN_SENT — aborts.
	 */

	err &= 0xff;	/* keep only the ICMP code for the conversion table */
	if (err < 13 && (icmp_err_convert[err].fatal || sk->state == TCP_SYN_SENT))
	{
		sk->err = icmp_err_convert[err].errno;
		if (sk->state == TCP_SYN_SENT)
		{
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);	/* Wake people up to see the error (see connect in sock.c) */
		}
	}
	return;
}
901
902 /* 903 * Walk down the receive queue counting readable data until we hit the end or we find a gap 904 * in the received data queue (ie a frame missing that needs sending to us). Not 905 * sorting using two queues as data arrives makes life so much harder. 906 */ 907
908 staticinttcp_readable(structsock *sk)
/* */ 909 { 910 unsignedlongcounted;
911 unsignedlongamount;
912 structsk_buff *skb;
913 intsum;
914 unsignedlongflags;
915
916 if(sk && sk->debug)
917 printk("tcp_readable: %p - ",sk);
918
919 save_flags(flags);
920 cli();
921 if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
922 { 923 restore_flags(flags);
924 if(sk && sk->debug)
925 printk("empty\n");
926 return(0);
927 } 928
929 counted = sk->copied_seq; /* Where we are at the moment */ 930 amount = 0;
931
932 /* 933 * Do until a push or until we are out of data. 934 */ 935
936 do 937 { 938 if (before(counted, skb->h.th->seq)) /* Found a hole so stops here */ 939 break;
940 sum = skb->len -(counted - skb->h.th->seq); /* Length - header but start from where we are up to (avoid overlaps) */ 941 if (skb->h.th->syn)
942 sum++;
943 if (sum > 0)
944 {/* Add it up, move on */ 945 amount += sum;
946 if (skb->h.th->syn)
947 amount--;
948 counted += sum;
949 } 950 /* 951 * Don't count urg data ... but do it in the right place! 952 * Consider: "old_data (ptr is here) URG PUSH data" 953 * The old code would stop at the first push because 954 * it counted the urg (amount==1) and then does amount-- 955 * *after* the loop. This means tcp_readable() always 956 * returned zero if any URG PUSH was in the queue, even 957 * though there was normal data available. If we subtract 958 * the urg data right here, we even get it to work for more 959 * than one URG PUSH skb without normal data. 960 * This means that select() finally works now with urg data 961 * in the queue. Note that rlogin was never affected 962 * because it doesn't use select(); it uses two processes 963 * and a blocking read(). And the queue scan in tcp_read() 964 * was correct. Mike <pall@rz.uni-karlsruhe.de> 965 */ 966 if (skb->h.th->urg)
967 amount--; /* don't count urg data */ 968 if (amount && skb->h.th->psh) break;
969 skb = skb->next;
970 } 971 while(skb != (structsk_buff *)&sk->receive_queue);
972
973 restore_flags(flags);
974 if(sk->debug)
975 printk("got %lu bytes.\n",amount);
976 return(amount);
977 } 978
979 /* 980 * LISTEN is a special case for select.. 981 */ 982 staticinttcp_listen_select(structsock *sk, intsel_type, select_table *wait)
/* */ 983 { 984 if (sel_type == SEL_IN) { 985 intretval;
986
987 sk->inuse = 1;
988 retval = (tcp_find_established(sk) != NULL);
989 release_sock(sk);
990 if (!retval)
991 select_wait(&master_select_wakeup,wait);
992 returnretval;
993 } 994 return 0;
995 } 996
997
998 /* 999 * Wait for a TCP event.1000 *1001 * Note that we don't need to set "sk->inuse", as the upper select layers1002 * take care of normal races (between the test and the event) and we don't1003 * go look at any of the socket buffers directly.1004 */1005 staticinttcp_select(structsock *sk, intsel_type, select_table *wait)
/* */1006 {1007 if (sk->state == TCP_LISTEN)
1008 returntcp_listen_select(sk, sel_type, wait);
1009
1010 switch(sel_type) {1011 caseSEL_IN:
1012 if (sk->err)
1013 return 1;
1014 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1015 break;
1016
1017 if (sk->shutdown & RCV_SHUTDOWN)
1018 return 1;
1019
1020 if (sk->acked_seq == sk->copied_seq)
1021 break;
1022
1023 if (sk->urg_seq != sk->copied_seq ||
1024 sk->acked_seq != sk->copied_seq+1 ||
1025 sk->urginline || !sk->urg_data)
1026 return 1;
1027 break;
1028
1029 caseSEL_OUT:
1030 if (sk->err)
1031 return 1;
1032 if (sk->shutdown & SEND_SHUTDOWN)
1033 return 0;
1034 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1035 break;
1036 /*1037 * This is now right thanks to a small fix1038 * by Matt Dillon.1039 */1040
1041 if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
1042 break;
1043 return 1;
1044
1045 caseSEL_EX:
1046 if (sk->urg_data)
1047 return 1;
1048 break;
1049 }1050 select_wait(sk->sleep, wait);
1051 return 0;
1052 }1053
/*
 *	ioctl() handler for TCP sockets.
 *
 *	TIOCINQ  - bytes readable right now (invalid on a listening socket)
 *	SIOCATMARK - non-zero if the read pointer is at the urgent mark
 *	TIOCOUTQ - free space in the send buffer (invalid when listening)
 *
 *	All three write an int back to user space at 'arg' after a
 *	verify_area() check; anything else is -EINVAL.
 */
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int err;
	switch(cmd)
	{

		case TIOCINQ:
#ifdef FIXME	/* FIXME: */
		case FIONREAD:
#endif
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);

			/* Lock the socket while tcp_readable walks the queue. */
			sk->inuse = 1;
			amount = tcp_readable(sk);
			release_sock(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		case SIOCATMARK:
		{
			/* At the mark when urgent data exists and the urgent
			   sequence number equals where the reader is. */
			int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

			err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
			if (err)
				return err;
			put_user(answ,(int *) arg);
			return(0);
		}
		case TIOCOUTQ:
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN) return(-EINVAL);
			amount = sk->prot->wspace(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		default:
			return(-EINVAL);
	}
}
1106
/*
 *	This routine computes a TCP checksum.
 *
 *	Modified January 1995 from a go-faster DOS routine by
 *	Jorge Cwik <jorge@laser.satlink.net>
 *
 *	Thin wrapper around csum_tcpudp_magic(): folds the pseudo header
 *	(source/destination address, length, protocol) into the partial
 *	checksum 'base'.  The 'th' argument is unused here but kept so the
 *	signature matches existing callers.
 */
unsigned short tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}
1120
1121
1122 voidtcp_send_check(structtcphdr *th, unsignedlongsaddr,
/* */1123 unsignedlongdaddr, intlen, structsock *sk)
1124 {1125 th->check = 0;
1126 th->check = tcp_check(th, len, saddr, daddr,
1127 csum_partial((char *)th,len,0));
1128 return;
1129 }1130
1131 /*1132 * This is the main buffer sending routine. We queue the buffer1133 * having checked it is sane seeming.1134 */1135
1136 staticvoidtcp_send_skb(structsock *sk, structsk_buff *skb)
/* */1137 {1138 intsize;
1139 structtcphdr * th = skb->h.th;
1140
1141 /*1142 * length of packet (not counting length of pre-tcp headers) 1143 */1144
1145 size = skb->len - ((unsignedchar *) th - skb->data);
1146
1147 /*1148 * Sanity check it.. 1149 */1150
1151 if (size < sizeof(structtcphdr) || size > skb->len)
1152 {1153 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1154 skb, skb->data, th, skb->len);
1155 kfree_skb(skb, FREE_WRITE);
1156 return;
1157 }1158
1159 /*1160 * If we have queued a header size packet.. (these crash a few1161 * tcp stacks if ack is not set)1162 */1163
1164 if (size == sizeof(structtcphdr))
1165 {1166 /* If it's got a syn or fin it's notionally included in the size..*/1167 if(!th->syn && !th->fin)
1168 {1169 printk("tcp_send_skb: attempt to queue a bogon.\n");
1170 kfree_skb(skb,FREE_WRITE);
1171 return;
1172 }1173 }1174
1175 /*1176 * Actual processing.1177 */1178
1179 tcp_statistics.TcpOutSegs++;
1180 skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1181
1182 /*1183 * We must queue if1184 *1185 * a) The right edge of this frame exceeds the window1186 * b) We are retransmitting (Nagle's rule)1187 * c) We have too many packets 'in flight'1188 */1189
1190 if (after(skb->h.seq, sk->window_seq) ||
1191 (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1192 sk->packets_out >= sk->cong_window)
1193 {1194 /* checksum will be supplied by tcp_write_xmit. So1195 * we shouldn't need to set it at all. I'm being paranoid */1196 th->check = 0;
1197 if (skb->next != NULL)
1198 {1199 printk("tcp_send_partial: next != NULL\n");
1200 skb_unlink(skb);
1201 }1202 skb_queue_tail(&sk->write_queue, skb);
1203
1204 /*1205 * If we don't fit we have to start the zero window1206 * probes. This is broken - we really need to do a partial1207 * send _first_ (This is what causes the Cisco and PC/TCP1208 * grief).1209 */1210
1211 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1212 sk->send_head == NULL && sk->ack_backlog == 0)
1213 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1214 }1215 else1216 {1217 /*1218 * This is going straight out1219 */1220
1221 th->ack_seq = ntohl(sk->acked_seq);
1222 th->window = ntohs(tcp_select_window(sk));
1223
1224 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1225
1226 sk->sent_seq = sk->write_seq;
1227
1228 /*1229 * This is mad. The tcp retransmit queue is put together1230 * by the ip layer. This causes half the problems with1231 * unroutable FIN's and other things.1232 */1233
1234 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1235
1236 /*1237 * Set for next retransmit based on expected ACK time.1238 * FIXME: We set this every time which means our 1239 * retransmits are really about a window behind.1240 */1241
1242 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1243 }1244 }1245
1246 /*1247 * Locking problems lead us to a messy situation where we can have1248 * multiple partially complete buffers queued up. This is really bad1249 * as we don't want to be sending partial buffers. Fix this with1250 * a semaphore or similar to lock tcp_write per socket.1251 *1252 * These routines are pretty self descriptive.1253 */1254
1255 structsk_buff * tcp_dequeue_partial(structsock * sk)
/* */1256 {1257 structsk_buff * skb;
1258 unsignedlongflags;
1259
1260 save_flags(flags);
1261 cli();
1262 skb = sk->partial;
1263 if (skb) {1264 sk->partial = NULL;
1265 del_timer(&sk->partial_timer);
1266 }1267 restore_flags(flags);
1268 returnskb;
1269 }1270
1271 /*1272 * Empty the partial queue1273 */1274
1275 staticvoidtcp_send_partial(structsock *sk)
/* */1276 {1277 structsk_buff *skb;
1278
1279 if (sk == NULL)
1280 return;
1281 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1282 tcp_send_skb(sk, skb);
1283 }1284
/*
 *	Queue a partial frame
 *
 *	Installs skb as the socket's pending partial buffer and arms a one
 *	second timer that will flush it via tcp_send_partial().  If another
 *	partial buffer was already pending it is sent immediately (after
 *	interrupts are re-enabled) so at most one partial frame is queued.
 */
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	/* Swap in the new partial under interrupt protection. */
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 *	Wait up to 1 second for the buffer to fill.
	 */
	sk->partial_timer.expires = jiffies+HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	/* Old partial (if any) is transmitted outside the cli() region. */
	if (tmp)
		tcp_send_skb(sk, tmp);
}
1313
/*
 *	This routine sends an ack and also updates the window.
 *
 *	'sequence'/'ack' are the sequence and acknowledgement numbers to
 *	place in the frame (host order); 'th' is the received header the
 *	reply is modelled on; 'daddr' the destination address.  If no
 *	memory is available the ack is deferred via the backlog counter
 *	and write timer instead of being dropped silently.
 */
static void tcp_send_ack(u32 sequence, u32 ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */

	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack. We don't have to do this
		 *	(ACK is unreliable) but it's much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */

		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: give the buffer back and forget the ack. */
		buff->free = 1;
		sk->prot->wfree(sk, buff);
		return;
	}
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	/* Start from the received header, then rewrite what differs. */
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */

	if (ack == sk->acked_seq)
	{
		/* This ack covers everything received: clear the backlog. */
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
				  && sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 *	Fill in the packet and send it
	 */

	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}
1429
1430 /* 1431 * This routine builds a generic TCP header. 1432 */1433
1434 extern__inlineinttcp_build_header(structtcphdr *th, structsock *sk, intpush)
/* */1435 {1436
1437 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1438 th->seq = htonl(sk->write_seq);
1439 th->psh =(push == 0) ? 1 : 0;
1440 th->doff = sizeof(*th)/4;
1441 th->ack = 1;
1442 th->fin = 0;
1443 sk->ack_backlog = 0;
1444 sk->bytes_rcv = 0;
1445 sk->ack_timed = 0;
1446 th->ack_seq = htonl(sk->acked_seq);
1447 sk->window = tcp_select_window(sk);
1448 th->window = htons(sk->window);
1449
1450 return(sizeof(*th));
1451 }1452
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	Returns the number of bytes copied, or a negative errno.  Blocks
 *	(unless 'nonblock') while the connection completes or while buffer
 *	memory is unavailable.  If some data was copied before an error or
 *	would-block condition, the byte count is returned instead of the
 *	error.  MSG_OOB in 'flags' sends the data as urgent.
 */
static int tcp_write(struct sock *sk, const unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;
	int copy;
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{			/* Stop on an error */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established.
		 */

		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/*
		 *	Wait for a connection to finish.
		 */

		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/* Not connecting either: the connection is dead. */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);

				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/* Sleep until the handshake finishes or fails. */
			release_sock(sk);
			cli();

			if (sk->state != TCP_ESTABLISHED &&
			    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

	/*
	 *	The following code can result in copy <= if sk->mss is ever
	 *	decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
	 *	sk->mtu is constant once SYN processing is finished.  I.e. we
	 *	had better not get here until we've seen his SYN and at least one
	 *	valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
	 *	But ESTABLISHED should guarantee that.  sk->max_window is by definition
	 *	non-decreasing.  Note that any ioctl to set user_mss must be done
	 *	before the exchange of SYN's.  If the initial ack from the other
	 *	end has a window of 0, max_window and thus mss will both be 0.
	 */

	/*
	 *	Now we need to check if we have a half built packet.
	 */

		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			/* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				 + sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB))
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}

				memcpy_fromfs(skb_put(skb,copy), from, copy);
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/* Send if full, urgent, or nothing is in flight. */
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

	/*
	 *	We also need to worry about the window.
	 *	If window < 1/2 the maximum window we've seen from this
	 *	host, don't use it.  This is sender side
	 *	silly window prevention, as specified in RFC1122.
	 *	(Note that this is different than earlier versions of
	 *	SWS prevention, e.g. RFC813.).  What we actually do is
	 *	use the whole MSS.  Since the results in the right
	 *	edge of the packet being outside the window, it will
	 *	be queued for later rather than sent.
	 */

		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

	/*
	 *	We should really check the window here also.
	 */

		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
			sk->inuse = 1;
			/* Remember this as a partial-frame candidate. */
			send_tmp = skb;
		}
		else
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep.
		 */

		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition.
			 */

			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it.
			 *	Only sleep if no memory was freed meanwhile.
			 */
			if (tmp <= sk->wmem_alloc &&
			  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		/*
		 *	FIXME: we need to optimize this.
		 *	Perhaps some hints here would be good.
		 */

		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 )
		{
			prot->wfree(sk, skb);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		skb->dev = dev;
		skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
		tmp = tcp_build_header(skb->h.th, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB)
		{
			skb->h.th->urg = 1;
			skb->h.th->urg_ptr = ntohs(copy);
		}

		memcpy_fromfs(skb_put(skb,copy), from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->free = 0;
		sk->write_seq += copy;

		/* Sub-MSS frame while data is in flight: hold it as a partial. */
		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

	/*
	 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
	 *	interactive fast network servers. It's meant to be on and
	 *	it really improves the throughput though not the echo time
	 *	on my slow slip link - Alan
	 */

	/*
	 *	Avoid possible race on send_tmp - c/o Johannes Stille
	 */

	if(sk->partial && ((!sk->packets_out)
     /* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1775 /*1776 * This is just a wrapper. 1777 */1778
1779 staticinttcp_sendto(structsock *sk, constunsignedchar *from,
/* */1780 intlen, intnonblock, unsignedflags,
1781 structsockaddr_in *addr, intaddr_len)
1782 {1783 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1784 return -EINVAL;
1785 if (sk->state == TCP_CLOSE)
1786 return -ENOTCONN;
1787 if (addr_len < sizeof(*addr))
1788 return -EINVAL;
1789 if (addr->sin_family && addr->sin_family != AF_INET)
1790 return -EINVAL;
1791 if (addr->sin_port != sk->dummy_th.dest)
1792 return -EISCONN;
1793 if (addr->sin_addr.s_addr != sk->daddr)
1794 return -EISCONN;
1795 returntcp_write(sk, from, len, nonblock, flags);
1796 }1797
1798
/*
 *	Send an ack if one is backlogged at this point. Ought to merge
 *	this with tcp_send_ack().
 *
 *	Builds a bare ACK segment from the socket's template header with
 *	the current window and acked sequence.  No-op when no ack is
 *	pending or the connection is closed/closing down.
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog)
		return;

	/*
	 *	If we're closed, don't send an ack, or we'll get a RST
	 *	from the closed destination.
	 */
	if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
		return;

	/*
	 *	FIXME: we need to put code here to prevent this routine from
	 *	being called.  Being called once in a while is ok, so only check
	 *	if this is the second time in a row.
	 */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */

	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* No route: drop the ack (it is unreliable anyway). */
		buff->free = 1;
		sk->prot->wfree(sk, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	/* Template header, then the ACK-specific fields. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
1878
/*
 *	FIXME:
 *	This routine frees used buffers.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window.
 *
 *	Frees all fully consumed skbs at the head of the receive queue,
 *	then decides whether the freed space warrants an immediate window
 *	update ack or merely a delayed one.
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	/* Space available before we free anything, for comparison below. */
	left = sk->prot->rspace(sk);

	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		/* Stop at the first buffer still unread or in use by a reader. */
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
					    left);
	if ((rspace=sk->prot->rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
	/*
	 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
	 * if the other end is offering a window smaller than the agreed on MSS
	 * (called sk->mtu here).  In theory there's no connection between send
	 * and receive, and so no reason to think that they're going to send
	 * small packets.  For the moment I'm using the hack of reducing the mss
	 * only on the send side, so I'm putting mtu here.
	 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				/* An earlier timer was pending: put it back. */
				add_timer(&sk->retransmit_timer);
		}
	}
}
1968
/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 *
 *	Returns 1 with the single OOB byte copied out, 0 at EOF-like
 *	states, or a negative errno.  Never blocks (see note at bottom).
 */
static int tcp_read_urg(struct sock * sk, int nonblock,
	     unsigned char *to, int len, unsigned flags)
{
	/*
	 *	No URG data to read: either none arrived, it was already
	 *	consumed, or it is being delivered inline with normal data.
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
	{
		int tmp = -sk->err;
		sk->err = 0;
		return tmp;
	}

	if (sk->state == TCP_CLOSE || sk->done)
	{
		/* First read after close reports EOF, subsequent ones error. */
		if (!sk->done) {
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		/* Low byte of urg_data holds the out-of-band byte itself. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;
		put_fs_byte(c, to);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2026
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Returns bytes copied or a negative errno.  MSG_PEEK reads without
 *	consuming (via a private sequence counter); MSG_OOB is diverted to
 *	tcp_read_urg().  Blocks waiting for data unless 'nonblock'.
 */
static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */

	if (flags & MSG_OOB)
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 */

	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;	/* PEEK consumes nothing */

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 *	Are we at urgent data? Stop if we have read anything.
		 */

		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.  State must be set before the queue
		 *	scan so a wakeup between scan and schedule() is not lost.
		 */

		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* Gap before this buffer: nothing contiguous to read. */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies a sequence number but no data */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed: cleanup_rbuf may free it */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* No more contiguous data; return what we have, or decide how to wait. */
		if (copied)
			break;

		if (sk->err)
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;	/* first read after close: EOF */
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Ack consumed data, drop the lock and sleep for more. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */

		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */

		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?
		 */

		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					/* Skip the urgent byte unless delivered inline. */
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					/* Read only up to the urgent mark. */
					used = urg_offset;
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */

		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */

		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */

		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;	/* urgent byte is behind us now */
		if (used + offset < skb->len)
			continue;

		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		++*seq;		/* FIN consumes a sequence number */
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */

		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed.
 *
 *	Returns 1 if the caller should transmit a FIN, 0 otherwise.
 *	'dead' is set when no process holds the socket any more.
 */
static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;
	int send_fin=0;
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
	}

	tcp_set_state(sk,ns);

	/*
	 *	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 */
	if(dead && ns==TCP_FIN_WAIT2)
	{
		/* Only start the FIN_WAIT2 timeout if no timer is running. */
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}

	return send_fin;
}
2315 /*2316 * Send a fin.2317 */2318
2319 staticvoidtcp_send_fin(structsock *sk)
/* */2320 {2321 structproto *prot =(structproto *)sk->prot;
2322 structtcphdr *th =(structtcphdr *)&sk->dummy_th;
2323 structtcphdr *t1;
2324 structsk_buff *buff;
2325 structdevice *dev=NULL;
2326 inttmp;
2327
2328 release_sock(sk); /* in case the malloc sleeps. */2329
2330 buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2331 sk->inuse = 1;
2332
2333 if (buff == NULL)
2334 {2335 /* This is a disaster if it occurs */2336 printk("tcp_send_fin: Impossible malloc failure");
2337 return;
2338 }2339
2340 /*2341 * Administrivia2342 */2343
2344 buff->sk = sk;
2345 buff->localroute = sk->localroute;
2346
2347 /*2348 * Put in the IP header and routing stuff. 2349 */2350
2351 tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2352 IPPROTO_TCP, sk->opt,
2353 sizeof(structtcphdr),sk->ip_tos,sk->ip_ttl);
2354 if (tmp < 0)
2355 {2356 intt;
2357 /*2358 * Finish anyway, treat this as a send that got lost. 2359 * (Not good).2360 */2361
2362 buff->free = 1;
2363 prot->wfree(sk,buff);
2364 sk->write_seq++;
2365 t=del_timer(&sk->timer);
2366 if(t)
2367 add_timer(&sk->timer);
2368 else2369 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2370 return;
2371 }2372
2373 /*2374 * We ought to check if the end of the queue is a buffer and2375 * if so simply add the fin to that buffer, not send it ahead.2376 */2377
2378 t1 =(structtcphdr *)skb_put(buff,sizeof(structtcphdr));
2379 buff->dev = dev;
2380 memcpy(t1, th, sizeof(*t1));
2381 t1->seq = ntohl(sk->write_seq);
2382 sk->write_seq++;
2383 buff->h.seq = sk->write_seq;
2384 t1->ack = 1;
2385 t1->ack_seq = ntohl(sk->acked_seq);
2386 t1->window = ntohs(sk->window=tcp_select_window(sk));
2387 t1->fin = 1;
2388 t1->rst = 0;
2389 t1->doff = sizeof(*t1)/4;
2390 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2391
2392 /*2393 * If there is data in the write queue, the fin must be appended to2394 * the write queue.2395 */2396
2397 if (skb_peek(&sk->write_queue) != NULL)
2398 {2399 buff->free = 0;
2400 if (buff->next != NULL)
2401 {2402 printk("tcp_send_fin: next != NULL\n");
2403 skb_unlink(buff);
2404 }2405 skb_queue_tail(&sk->write_queue, buff);
2406 }2407 else2408 {2409 sk->sent_seq = sk->write_seq;
2410 sk->prot->queue_xmit(sk, dev, buff, 0);
2411 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2412 }2413 }2414
/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or set sk->dead=1.
 *
 *	sk:	connection to shut down (locked here via sk->inuse and
 *		released with release_sock() before returning)
 *	how:	direction mask; only SEND_SHUTDOWN is acted on — a
 *		receive-only shutdown request is a no-op for TCP here.
 */

void tcp_shutdown(struct sock *sk, int how)
{
	/*
	 *	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 *	Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
	 */

	if (!(how & SEND_SHUTDOWN))
		return;

	/*
	 *	If we've already sent a FIN, or it's a closed state,
	 *	there is nothing more to send — bail out before taking
	 *	the socket lock.
	 */

	if (sk->state == TCP_FIN_WAIT1 ||
	    sk->state == TCP_FIN_WAIT2 ||
	    sk->state == TCP_CLOSING ||
	    sk->state == TCP_LAST_ACK ||
	    sk->state == TCP_TIME_WAIT ||
	    sk->state == TCP_CLOSE ||
	    sk->state == TCP_LISTEN
	  )
	{
		return;
	}
	sk->inuse = 1;	/* lock the socket for the duration */

	/*
	 *	flag that the sender has shutdown
	 */

	sk->shutdown |= SEND_SHUTDOWN;

	/*
	 *	Clear out any half completed packets: flush the partial
	 *	send buffer so the FIN goes out after all queued data.
	 */

	if (sk->partial)
		tcp_send_partial(sk);

	/*
	 *	FIN if needed: tcp_close_state() moves the state machine
	 *	and tells us whether a FIN segment must actually be sent.
	 */

	if (tcp_close_state(sk,0))
		tcp_send_fin(sk);

	release_sock(sk);
}
2471
2472 staticint2473 tcp_recvfrom(structsock *sk, unsignedchar *to,
/* */2474 intto_len, intnonblock, unsignedflags,
2475 structsockaddr_in *addr, int *addr_len)
2476 {2477 intresult;
2478
2479 /* 2480 * Have to check these first unlike the old code. If 2481 * we check them after we lose data on an error2482 * which is wrong 2483 */2484
2485 if(addr_len)
2486 *addr_len = sizeof(*addr);
2487 result=tcp_read(sk, to, to_len, nonblock, flags);
2488
2489 if (result < 0)
2490 return(result);
2491
2492 if(addr)
2493 {2494 addr->sin_family = AF_INET;
2495 addr->sin_port = sk->dummy_th.dest;
2496 addr->sin_addr.s_addr = sk->daddr;
2497 }2498 return(result);
2499 }2500
2501
/*
 *	This routine will send an RST to the other tcp, in reply to the
 *	segment *th that arrived from saddr/daddr. The reply is built on
 *	an anonymous (sk == NULL) buffer and transmitted immediately.
 *
 *	saddr/daddr: our address / peer address, as seen in the offending
 *	             packet (already network byte order).
 *	prot/opt/dev/tos/ttl: protocol ops, IP options and routing inputs
 *	             used to build the outgoing IP header.
 */

static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (Think about it).
	 */

	if(th->rst)
		return;

	/*
	 *	We need to grab some memory, and put together an RST,
	 *	and then put it into the queue to be sent. GFP_ATOMIC:
	 *	we may be on the softirq/receive path, no sleeping.
	 */

	buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
		return;	/* out of memory: silently drop, peer will retry */

	buff->sk = NULL;	/* not charged to any socket */
	buff->dev = dev;
	buff->localroute = 0;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl);
	if (tmp < 0)
	{
		/* No route/header — release the buffer and give up. */
		buff->free = 1;
		prot->wfree(NULL, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
	/* Start from a copy of the offending header, then rewrite it. */
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive ports.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;
	t1->window = 0;

	/*
	 *	RFC 793 reset generation: if the incoming segment carried an
	 *	ACK, our RST takes its sequence from that ack (no ACK bit);
	 *	otherwise we send seq 0 and ack the segment's own sequence
	 *	(+1 for the SYN, which occupies sequence space).
	 */
	if(th->ack)
	{
		t1->ack = 0;
		t1->seq = th->ack_seq;	/* already network byte order */
		t1->ack_seq = 0;
	}
	else
	{
		t1->ack = 1;
		if(!th->syn)
			t1->ack_seq=htonl(th->seq);
		else
			t1->ack_seq=htonl(th->seq+1);
		t1->seq=0;
	}

	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;	/* header only, no options */
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2585
2586 /*2587 * Look for tcp options. Parses everything but only knows about MSS.2588 * This routine is always called with the packet containing the SYN.2589 * However it may also be called with the ack to the SYN. So you2590 * can't assume this is always the SYN. It's always called after2591 * we have set up sk->mtu to our own MTU.2592 *2593 * We need at minimum to add PAWS support here. Possibly large windows2594 * as Linux gets deployed on 100Mb/sec networks.2595 */2596
2597 staticvoidtcp_options(structsock *sk, structtcphdr *th)
/* */2598 {2599 unsignedchar *ptr;
2600 intlength=(th->doff*4)-sizeof(structtcphdr);
2601 intmss_seen = 0;
2602
2603 ptr = (unsignedchar *)(th + 1);
2604
2605 while(length>0)
2606 {2607 intopcode=*ptr++;
2608 intopsize=*ptr++;
2609 switch(opcode)
2610 {2611 caseTCPOPT_EOL:
2612 return;
2613 caseTCPOPT_NOP: /* Ref: RFC 793 section 3.1 */2614 length--;
2615 ptr--; /* the opsize=*ptr++ above was a mistake */2616 continue;
2617
2618 default:
2619 if(opsize<=2) /* Avoid silly options looping forever */2620 return;
2621 switch(opcode)
2622 {2623 caseTCPOPT_MSS:
2624 if(opsize==4 && th->syn)
2625 {2626 sk->mtu=min(sk->mtu,ntohs(*(unsignedshort *)ptr));
2627 mss_seen = 1;
2628 }2629 break;
2630 /* Add other options here as people feel the urge to implement stuff like large windows */2631 }2632 ptr+=opsize-2;
2633 length-=opsize;
2634 }2635 }2636 if (th->syn)
2637 {2638 if (! mss_seen)
2639 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */2640 }2641 #ifdefCONFIG_INET_PCTCP2642 sk->mss = min(sk->max_window >> 1, sk->mtu);
2643 #else2644 sk->mss = min(sk->max_window, sk->mtu);
2645 #endif2646 }2647
/*
 *	Return the classful netmask implied by a destination address.
 *	Both the argument and the result are in network byte order.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);	/* classify in host order */
	unsigned long mask;

	if (IN_CLASSA(host))
		mask = IN_CLASSA_NET;
	else if (IN_CLASSB(host))
		mask = IN_CLASSB_NET;
	else
		mask = IN_CLASSC_NET;

	return htonl(mask);
}
2658 /*2659 * Default sequence number picking algorithm.2660 * As close as possible to RFC 793, which2661 * suggests using a 250kHz clock.2662 * Further reading shows this assumes 2MB/s networks.2663 * For 10MB/s ethernet, a 1MHz clock is appropriate.2664 * That's funny, Linux has one built in! Use it!2665 */2666
2667 externinlineu32tcp_init_seq(void)
/* */2668 {2669 structtimevaltv;
2670 do_gettimeofday(&tv);
2671 returntv.tv_usec+tv.tv_sec*1000000;
2672 }2673
/*
 *	This routine handles a connection request (incoming SYN on a
 *	listening socket sk).
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 *
 *	sk:	the listening socket
 *	skb:	the received SYN segment (consumed here on every path)
 *	daddr/saddr: our / the peer's address from the IP header
 *	opt/dev: IP options and receiving device
 *	seq:	initial send sequence number to use for the new socket
 */

static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev, u32 seq)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	struct device *ndev=NULL;
	int tmp;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection: reset it. */
	if (!sk->dead)
	{
		sk->data_ready(sk,0);	/* wake anyone blocked in accept() */
	}
	else
	{
		if(sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * Make sure we can accept more.  This will prevent a
	 * flurry of syns from eating up all our memory.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * We need to build a new sock struct.
	 * It is sort of bad to have a socket without an inode attached
	 * to it, but the wake_up's will just wake up the listening socket,
	 * and if the listening socket is destroyed before this is taken
	 * off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn.  It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Clone the listener, then re-initialise every per-connection
	 *	field; the queues and timers must NOT be shared with sk.
	 */
	memcpy(newsk, sk, sizeof(*newsk));
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;	/* slow start: one segment */
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->h.th->seq+1;	/* SYN consumes one sequence number */
	newsk->copied_seq = skb->h.th->seq+1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger=0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function=&retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */

	newsk->daddr = saddr;
	newsk->saddr = daddr;

	put_sock(newsk->num,newsk);	/* enter the new socket in the hash table */
	newsk->dummy_th.res1 = 0;
	newsk->dummy_th.doff = 6;	/* header + 4 bytes of MSS option */
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->socket = NULL;	/* no inode/struct socket until accept() */

	/*
	 *	Grab the ttl and tos values and use them
	 */

	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk
	 */

	rt=ip_rt_route(saddr, NULL,NULL);

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	/*
	 *	MSS selection: explicit user setting, then per-route MSS,
	 *	then 576 for off-net peers / MAX_WINDOW for local ones.
	 */
	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
		newsk->mtu = rt->rt_mss - HEADER_SIZE;
	else
	{
#ifdef CONFIG_INET_SNARL	/* Sub Nets Are Local */
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			newsk->mtu = 576 - HEADER_SIZE;
		else
			newsk->mtu = MAX_WINDOW;
	}

	/*
	 *	But not bigger than device MTU
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	This will min with what arrived in the packet
	 */

	tcp_options(newsk,skb->h.th);

	tcp_cache_zap();	/* header cache may now be stale */

	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		sk->err = ENOMEM;
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		/* And this will destroy it */
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->sk = newsk;
	buff->localroute = newsk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

	/*
	 *	Something went wrong.
	 */

	if (tmp < 0)
	{
		sk->err = tmp;
		buff->free = 1;
		kfree_skb(buff,FREE_WRITE);
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		release_sock(newsk);	/* destroys the half-built socket */
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	/*
	 *	Build the SYN/ACK segment.
	 */
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->write_seq;
	/*
	 *	Swap the send and the receive.
	 */
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->write_seq++);
	t1->ack = 1;
	newsk->window = tcp_select_window(newsk);
	newsk->sent_seq = newsk->write_seq;
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->ack_seq = ntohl(skb->h.th->seq+1);
	t1->doff = sizeof(*t1)/4+1;	/* +1 word for the MSS option */
	/* Append the MSS option: kind 2, length 4, 16-bit MSS big-endian. */
	ptr = skb_put(buff,4);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] =(newsk->mtu) & 0xff;

	tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
	reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
	skb->sk = newsk;

	/*
	 *	Charge the sock_buff to newsk.
	 */

	sk->rmem_alloc -= skb->truesize;
	newsk->rmem_alloc += skb->truesize;

	/* Queue the SYN on the listener so accept() can find the child. */
	skb_queue_tail(&sk->receive_queue,skb);
	sk->ack_backlog++;
	release_sock(newsk);
	tcp_statistics.TcpOutSegs++;
}
2952
/*
 *	Close a TCP socket.
 *
 *	sk:		socket to close (locked here via sk->inuse, released
 *			with release_sock() on every path)
 *	timeout:	non-zero means the caller wants an immediate hard
 *			close (go straight to TCP_CLOSE); zero means a
 *			normal close — flush, move the state machine and
 *			send a FIN if one is required.
 */

static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 * We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */

	sk->inuse = 1;

	/* Invalidate the header cache if it points at this socket. */
	if(th_cache_sk==sk)
		tcp_cache_zap();
	if(sk->state == TCP_LISTEN)
	{
		/* Special case: a listener has no peer, just reject all
		   pending connections and go to CLOSE. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;	/* no more sending or receiving */

	if (!sk->dead)
		sk->state_change(sk);	/* wake anyone sleeping on the socket */

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 *  We need to flush the recv. buffs.  We do this only on the
		 *  descriptor close, not protocol-sourced closes, because the
		 *  reader process may not have drained the data yet!
		 */

		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *	Get rid off any half-completed packets.
		 */

		if (sk->partial)
			tcp_send_partial(sk);
	}


	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */

	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* tcp_close_state() returns 1 when a FIN must go out. */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
3019
/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue. This happens as incoming acks
 *	open up the remote window for us.
 *
 *	sk: socket whose write queue is to be drained. Each eligible
 *	skb gets a fresh ack/window stamped in just before transmit.
 */

static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->h.seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
			/*
			 * put in the ack seq and window at this point rather than earlier,
			 * in order to keep them monotonic.  We really want to avoid taking
			 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
			 * Ack and window will in general have changed since this packet was put
			 * on the write queue.
			 */
			iph = skb->ip_hdr;
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Checksum must be redone after the header edits above. */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3110
/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	sk:	the socket the segment belongs to
 *	th:	the TCP header of the received segment
 *	saddr:	peer address (unused here beyond the signature)
 *	len:	total TCP length of the segment (header + data)
 *
 *	Returns 0 when the ack is ahead of anything we sent (caller should
 *	treat the segment as unacceptable), 1 otherwise.
 */

extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
	u32 ack;
	int flag = 0;

	/*
	 * flag bits:
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if(sk->zapped)
		return(1);	/* Dead, cant ack any more so why bother */

	/*
	 *	Have we discovered a larger window
	 */

	ack = ntohl(th->ack_seq);

	if (ntohs(th->window) > sk->max_window)
	{
		sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(sk->max_window>>1, sk->mtu);
#else
		sk->mss = min(sk->max_window, sk->mtu);
#endif
	}

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */

	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if(sk->debug)
			printk("Ack ignored %u %u\n",ack,sk->sent_seq);

		/*
		 *	Keepalive processing.
		 */

		if (after(ack, sk->sent_seq))
		{
			return(0);	/* acks data we never sent */
		}

		/*
		 *	Restart the keepalive timer.
		 */

		if (sk->keepopen)
		{
			if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	See if our window has been shrunk.
	 */

	if (after(sk->window_seq, ack+ntohs(th->window)))
	{
		/*
		 * We may need to move packets from the send queue
		 * to the write queue, if the window has been shrunk on us.
		 * The RFC says you are not allowed to shrink your window
		 * like this, but if the other end does, you must be able
		 * to deal with it.
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		/*
		 *	This is an artifact of a flawed concept. We want one
		 *	queue and a smarter send routine when we send all.
		 */

		flag |= 4;	/* Window changed */

		sk->window_seq = ack + ntohs(th->window);
		cli();	/* walk/rebuild the retransmit list atomically */
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->h.seq, sk->window_seq))
			{
				/* Fell outside the new window: push it back
				   onto the write queue, preserving order. */
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue,skb);
				else
					skb_append(wskb,skb);
				wskb = skb;
			}
			else
			{
				/* Still in window: keep it on the retransmit list. */
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/*
	 *	Pipe has emptied
	 */

	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out= 0;
	}

	/*
	 *	Update the right hand window edge of the host
	 */

	sk->window_seq = ack + ntohs(th->window);

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 *	Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 *	If this ack opens up a zero window, clear backoff.  It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open ?
		 */

		if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
		    ! before (sk->window_seq, sk->write_queue.next->h.seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt.  this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < 20)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = 20;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	while(sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived the other end.
		 */

		if (before(sk->send_head->h.seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting.  don't count this in RTT est
				 */
				flag |= 2;

				/*
				 * even though we've gotten an ack, we're still
				 * retransmitting as long as we're sending from
				 * the retransmit queue.  Keeping retransmits non-zero
				 * prevents us from getting new data interspersed with
				 * retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 * Note that we only reset backoff and rto in the
			 * rtt recomputation code.  And that doesn't happen
			 * if there were retransmissions in effect.  So the
			 * first new packet after the retransmissions is
			 * sent with the backoff still in effect.  Not until
			 * we get an ack from a non-retransmitted packet do
			 * we reset the backoff and rto.  This allows us to deal
			 * with a situation where the network delay has increased
			 * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out --;
			/*
			 *	Wake up the process, it can probably write more.
			 */
			if (!sk->dead)
				sk->write_space(sk);
			oskb = sk->send_head;

			if (!(flag&2)) 	/* Not retransmitting */
			{
				long m;

				/*
				 *	The following amusing code comes from Jacobson's
				 *	article in SIGCOMM '88.  Note that rtt and mdev
				 *	are scaled versions of rtt and mean deviation.
				 *	This is designed to be as fast as possible
				 *	m stands for "measurement".
				 */

				m = jiffies - oskb->when;  /* RTT */
				if(m<=0)
					m=1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);    /* m is now error in rtt est */
				sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);   /* similar update on mdev */
				sk->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */

				/*
				 *	Now update timeout.  Note that this removes any backoff.
				 */

				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < 20)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = 20;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
					   In this case as we just set it up */
			cli();	/* unlink the acked skb atomically */
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 *	We may need to remove this from the dev send list.
			 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;	/* head not yet acked: stop scanning */
		}
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we complete ignore the timer stuff in the else
	 * clause.  We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over.  The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 * Maybe we can take some stuff off of the write queue,
	 * and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room: start zero-window probing.
			 */
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 * from TCP_CLOSE we don't do anything
		 *
		 * from anything else, if there is write data (or fin) pending,
		 * we use a TIME_WRITE timeout, else if keepalive we reset to
		 * a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we may
		 * force a state where we start to write itsy bitsy tidbits
		 * of data.
		 */

		switch(sk->state) {
		case TCP_TIME_WAIT:
			/*
			 * keep us in TIME_WAIT until we stop getting packets,
			 * reset the timeout.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 * don't touch the timer.
			 */
			break;
		default:
			/*
			 * Must check send_head, write_queue, and ack_backlog
			 * to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send. Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN.  The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if(sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			tcp_set_state(sk,TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 * Move to FIN_WAIT2 to await a FIN from the other end. Set
	 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 * Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */

	if(sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest=th->source;
		sk->copied_seq = sk->acked_seq;
		if(!sk->dead)
			sk->state_change(sk);
		if(sk->max_window==0)
		{
			sk->max_window=32;	/* Sanity check */
			sk->mss=min(sk->max_window,sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 * what conditions "!flag" would be true.  However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 * Clearly if the first packet has expired it should be
	 * retransmitted.  The other alternative, "flag&2 && retransmits", is
	 * harder to explain:  You have to look carefully at how and when the
	 * timer is set and with what timeout.  The most recent transmission always
	 * sets the timer.  So in general if the most recent thing has timed
	 * out, everything before it has as well.  So we want to go ahead and
	 * retransmit some more.  If we didn't explicitly test for this
	 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 * would not be true.  If you look at the pattern of timing, you can
	 * show that rto is increased fast enough that the next packet would
	 * almost never be retransmitted immediately.  Then you'd end up
	 * waiting for a timeout to send each packet on the retransmission
	 * queue.  With my implementation of the Karn sampling algorithm,
	 * the timeout would double each time.  The net result is that it would
	 * take a hideous amount of time to recover from a single dropped packet.
	 * It's possible that there should also be a test for TIME_WRITE, but
	 * I think as long as "send_head != NULL" and "retransmit" is on, we've
	 * got to be in real retransmission mode.
	 * Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 * As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if(sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}
3701
/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 *
 *	Always returns 0; the return value exists only to match the
 *	callers' expectations.
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/*
	 *	Record the sequence number just past the FIN. SYN and FIN
	 *	each occupy one unit of sequence space.
	 */
	sk->fin_seq = th->seq + skb->len + th->syn + th->fin;

	/* Wake anyone sleeping on or polling this socket. */
	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 *	move to CLOSE_WAIT, tcp_data() already handled
			 *	sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			/* FIN+RST together: no more data either way. */
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 *	received a retransmission of the FIN, do
			 *	nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 *	received a retransmission of the FIN,
			 *	restart the TIME_WAIT timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 *	This case occurs when a simultaneous close
			 *	happens, we must ack the received FIN and
			 *	enter the CLOSING state.
			 *
			 *	This causes a WRITE timeout, which will either
			 *	move on to TIME_WAIT when we timeout, or resend
			 *	the FIN properly (maybe we get rid of that annoying
			 *	FIN lost hang). The TIME_WRITE code is already correct
			 *	for handling this timeout.
			 */
			if(sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 *	received a FIN -- send ACK and enter TIME_WAIT
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 *	already in CLOSE
			 */
			break;
		default:
			/* Remaining states collapse into LAST_ACK. */
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}
3797
3798
3799 /*3800 * This routine handles the data. If there is room in the buffer,3801 * it will be have already been moved into it. If there is no3802 * room, then we will just have to discard the packet.3803 */3804
3805 extern__inline__inttcp_data(structsk_buff *skb, structsock *sk,
/* */3806 unsignedlongsaddr, unsignedshortlen)
3807 {3808 structsk_buff *skb1, *skb2;
3809 structtcphdr *th;
3810 intdup_dumped=0;
3811 u32new_seq, shut_seq;
3812
3813 th = skb->h.th;
3814 skb_pull(skb,th->doff*4);
3815 skb_trim(skb,len-(th->doff*4));
3816
3817 /*3818 * The bytes in the receive read/assembly queue has increased. Needed for the3819 * low memory discard algorithm 3820 */3821
3822 sk->bytes_rcv += skb->len;
3823
3824 if (skb->len == 0 && !th->fin)
3825 {3826 /* 3827 * Don't want to keep passing ack's back and forth. 3828 * (someone sent us dataless, boring frame)3829 */3830 if (!th->ack)
3831 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3832 kfree_skb(skb, FREE_READ);
3833 return(0);
3834 }3835
3836 /*3837 * We no longer have anyone receiving data on this connection.3838 */3839
3840 #ifndef TCP_DONT_RST_SHUTDOWN
3841
3842 if(sk->shutdown & RCV_SHUTDOWN)
3843 {3844 /*3845 * FIXME: BSD has some magic to avoid sending resets to3846 * broken 4.2 BSD keepalives. Much to my surprise a few non3847 * BSD stacks still have broken keepalives so we want to3848 * cope with it.3849 */3850
3851 if(skb->len) /* We don't care if it's just an ack or3852 a keepalive/window probe */3853 {3854 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */3855
3856 /* Do this the way 4.4BSD treats it. Not what I'd3857 regard as the meaning of the spec but it's what BSD3858 does and clearly they know everything 8) */3859
3860 /*3861 * This is valid because of two things3862 *3863 * a) The way tcp_data behaves at the bottom.3864 * b) A fin takes effect when read not when received.3865 */3866
3867 shut_seq=sk->acked_seq+1; /* Last byte */3868
3869 if(after(new_seq,shut_seq))
3870 {3871 if(sk->debug)
3872 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
3873 sk, new_seq, shut_seq, sk->blog);
3874 if(sk->dead)
3875 {3876 sk->acked_seq = new_seq + th->fin;
3877 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3878 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3879 tcp_statistics.TcpEstabResets++;
3880 tcp_set_state(sk,TCP_CLOSE);
3881 sk->err = EPIPE;
3882 sk->shutdown = SHUTDOWN_MASK;
3883 kfree_skb(skb, FREE_READ);
3884 return 0;
3885 }3886 }3887 }3888 }3889
3890 #endif3891
3892 /*3893 * Now we have to walk the chain, and figure out where this one3894 * goes into it. This is set up so that the last packet we received3895 * will be the first one we look at, that way if everything comes3896 * in order, there will be no performance loss, and if they come3897 * out of order we will be able to fit things in nicely.3898 *3899 * [AC: This is wrong. We should assume in order first and then walk3900 * forwards from the first hole based upon real traffic patterns.]3901 * 3902 */3903
3904 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */3905 {3906 skb_queue_head(&sk->receive_queue,skb);
3907 skb1= NULL;
3908 }3909 else3910 {3911 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
3912 {3913 if(sk->debug)
3914 {3915 printk("skb1=%p :", skb1);
3916 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
3917 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
3918 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
3919 sk->acked_seq);
3920 }3921
3922 /*3923 * Optimisation: Duplicate frame or extension of previous frame from3924 * same sequence point (lost ack case).3925 * The frame contains duplicate data or replaces a previous frame3926 * discard the previous frame (safe as sk->inuse is set) and put3927 * the new one in its place.3928 */3929
3930 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3931 {3932 skb_append(skb1,skb);
3933 skb_unlink(skb1);
3934 kfree_skb(skb1,FREE_READ);
3935 dup_dumped=1;
3936 skb1=NULL;
3937 break;
3938 }3939
3940 /*3941 * Found where it fits3942 */3943
3944 if (after(th->seq+1, skb1->h.th->seq))
3945 {3946 skb_append(skb1,skb);
3947 break;
3948 }3949
3950 /*3951 * See if we've hit the start. If so insert.3952 */3953 if (skb1 == skb_peek(&sk->receive_queue))
3954 {3955 skb_queue_head(&sk->receive_queue, skb);
3956 break;
3957 }3958 }3959 }3960
3961 /*3962 * Figure out what the ack value for this frame is3963 */3964
3965 th->ack_seq = th->seq + skb->len;
3966 if (th->syn)
3967 th->ack_seq++;
3968 if (th->fin)
3969 th->ack_seq++;
3970
3971 if (before(sk->acked_seq, sk->copied_seq))
3972 {3973 printk("*** tcp.c:tcp_data bug acked < copied\n");
3974 sk->acked_seq = sk->copied_seq;
3975 }3976
3977 /*3978 * Now figure out if we can ack anything. This is very messy because we really want two3979 * receive queues, a completed and an assembly queue. We also want only one transmit3980 * queue.3981 */3982
3983 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
3984 {3985 if (before(th->seq, sk->acked_seq+1))
3986 {3987 intnewwindow;
3988
3989 if (after(th->ack_seq, sk->acked_seq))
3990 {3991 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3992 if (newwindow < 0)
3993 newwindow = 0;
3994 sk->window = newwindow;
3995 sk->acked_seq = th->ack_seq;
3996 }3997 skb->acked = 1;
3998
3999 /*4000 * When we ack the fin, we do the FIN 4001 * processing.4002 */4003
4004 if (skb->h.th->fin)
4005 {4006 tcp_fin(skb,sk,skb->h.th);
4007 }4008
4009 for(skb2 = skb->next;
4010 skb2 != (structsk_buff *)&sk->receive_queue;
4011 skb2 = skb2->next)
4012 {4013 if (before(skb2->h.th->seq, sk->acked_seq+1))
4014 {4015 if (after(skb2->h.th->ack_seq, sk->acked_seq))
4016 {4017 newwindow = sk->window -
4018 (skb2->h.th->ack_seq - sk->acked_seq);
4019 if (newwindow < 0)
4020 newwindow = 0;
4021 sk->window = newwindow;
4022 sk->acked_seq = skb2->h.th->ack_seq;
4023 }4024 skb2->acked = 1;
4025 /*4026 * When we ack the fin, we do4027 * the fin handling.4028 */4029 if (skb2->h.th->fin)
4030 {4031 tcp_fin(skb,sk,skb->h.th);
4032 }4033
4034 /*4035 * Force an immediate ack.4036 */4037
4038 sk->ack_backlog = sk->max_ack_backlog;
4039 }4040 else4041 {4042 break;
4043 }4044 }4045
4046 /*4047 * This also takes care of updating the window.4048 * This if statement needs to be simplified.4049 */4050 if (!sk->delay_acks ||
4051 sk->ack_backlog >= sk->max_ack_backlog ||
4052 sk->bytes_rcv > sk->max_unacked || th->fin) {4053 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */4054 }4055 else4056 {4057 sk->ack_backlog++;
4058 if(sk->debug)
4059 printk("Ack queued.\n");
4060 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4061 }4062 }4063 }4064
4065 /*4066 * If we've missed a packet, send an ack.4067 * Also start a timer to send another.4068 */4069
4070 if (!skb->acked)
4071 {4072
4073 /*4074 * This is important. If we don't have much room left,4075 * we need to throw out a few packets so we have a good4076 * window. Note that mtu is used, not mss, because mss is really4077 * for the send side. He could be sending us stuff as large as mtu.4078 */4079
4080 while (sk->prot->rspace(sk) < sk->mtu)
4081 {4082 skb1 = skb_peek(&sk->receive_queue);
4083 if (skb1 == NULL)
4084 {4085 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4086 break;
4087 }4088
4089 /*4090 * Don't throw out something that has been acked. 4091 */4092
4093 if (skb1->acked)
4094 {4095 break;
4096 }4097
4098 skb_unlink(skb1);
4099 kfree_skb(skb1, FREE_READ);
4100 }4101 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4102 sk->ack_backlog++;
4103 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4104 }4105 else4106 {4107 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4108 }4109
4110 /*4111 * Now tell the user we may have some data. 4112 */4113
4114 if (!sk->dead)
4115 {4116 if(sk->debug)
4117 printk("Data wakeup.\n");
4118 sk->data_ready(sk,0);
4119 }4120 return(0);
4121 }4122
4123
4124 /*4125 * This routine is only called when we have urgent data4126 * signalled. Its the 'slow' part of tcp_urg. It could be4127 * moved inline now as tcp_urg is only called from one4128 * place. We handle URGent data wrong. We have to - as4129 * BSD still doesn't use the correction from RFC961.4130 */4131
4132 staticvoidtcp_check_urg(structsock * sk, structtcphdr * th)
/* */4133 {4134 u32ptr = ntohs(th->urg_ptr);
4135
4136 if (ptr)
4137 ptr--;
4138 ptr += th->seq;
4139
4140 /* ignore urgent data that we've already seen and read */4141 if (after(sk->copied_seq, ptr))
4142 return;
4143
4144 /* do we already have a newer (or duplicate) urgent pointer? */4145 if (sk->urg_data && !after(ptr, sk->urg_seq))
4146 return;
4147
4148 /* tell the world about our new urgent pointer */4149 if (sk->proc != 0) {4150 if (sk->proc > 0) {4151 kill_proc(sk->proc, SIGURG, 1);
4152 }else{4153 kill_pg(-sk->proc, SIGURG, 1);
4154 }4155 }4156 sk->urg_data = URG_NOTYET;
4157 sk->urg_seq = ptr;
4158 }4159
4160 /*4161 * This is the 'fast' part of urgent handling.4162 */4163
4164 extern__inline__inttcp_urg(structsock *sk, structtcphdr *th,
/* */4165 unsignedlongsaddr, unsignedlonglen)
4166 {4167 u32ptr;
4168
4169 /*4170 * Check if we get a new urgent pointer - normally not 4171 */4172
4173 if (th->urg)
4174 tcp_check_urg(sk,th);
4175
4176 /*4177 * Do we wait for any urgent data? - normally not4178 */4179
4180 if (sk->urg_data != URG_NOTYET)
4181 return 0;
4182
4183 /*4184 * Is the urgent pointer pointing into this packet? 4185 */4186
4187 ptr = sk->urg_seq - th->seq + th->doff*4;
4188 if (ptr >= len)
4189 return 0;
4190
4191 /*4192 * Ok, got the correct packet, update info 4193 */4194
4195 sk->urg_data = URG_VALID | *(ptr + (unsignedchar *) th);
4196 if (!sk->dead)
4197 sk->data_ready(sk,0);
4198 return 0;
4199 }4200
/*
 *	This will accept the next outstanding connection.
 *
 *	Blocks (unless O_NONBLOCK) until an established connection is
 *	queued on the listening socket, then returns its struct sock.
 *	On failure returns NULL with sk->err set (EINVAL, EAGAIN or
 *	ERESTARTSYS).
 */

static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 *	We need to make sure that this socket is listening,
	 *	and that it has something pending.
	 */
	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race: disable interrupts while we take the socket. */
	cli();
	sk->inuse = 1;

	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Release, sleep until woken, then re-acquire. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		/* A pending signal aborts the wait. */
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk.
	 */
	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4259
/*
 *	This will initiate an outgoing connection.
 *
 *	Validates the address, picks an initial sequence number, builds
 *	and transmits the SYN (with an MSS option), moves the socket to
 *	SYN_SENT and arms the retransmit timer. Returns 0 or a negative
 *	errno.
 */

static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
	{
		return(-EISCONN);
	}

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */
	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address.
	 */
	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	release_sock(sk);

	/* Released above so the GFP_KERNEL allocation may sleep. */
	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */
	rt=ip_rt_route(sk->daddr, NULL, NULL);

	/*
	 *	We need to build the routing stuff from the things saved in skb.
	 */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	/* SYN consumes one sequence number. */
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;	/* 24 bytes: header + 4-byte MSS option */

	/* Clamp the window if the route demands it. */
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* Use 512 or whatever user asked for; else the route MTU;
	   else guess by whether the destination is on our subnet. */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
		sk->mtu = rt->rt_mss;
	else
	{
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - HEADER_SIZE;
		else
			sk->mtu = MAX_WINDOW;
	}

	/*
	 *	but not bigger than device MTU
	 */
	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	Put in the TCP options to say MTU.
	 *	(kind 2 = MSS, length 4, value in network byte order)
	 */
	ptr = skb_put(buff,4);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will
	 *	get reset.
	 */
	tcp_cache_zap();
	tcp_set_state(sk,TCP_SYN_SENT);
	/* Initial RTO from the route if one was configured. */
	if(rt&&rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	/* Timer for repeating the SYN until an answer */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	/* Now works the right way instead of a hacked initial setting */
	sk->retransmits = 0;

	sk->prot->queue_xmit(sk, dev, buff, 0);
	/* NOTE(review): this second reset_xmit_timer looks redundant with
	   the one just above - confirm before removing. */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
4422
/*
 *	This functions checks to see if the tcp header is actually
 *	acceptable: the segment must overlap the receive window
 *	[acked_seq+1, acked_seq+window].
 *
 *	Returns 1 if the segment should be processed, 0 if it should be
 *	dropped. Unacceptable segments are answered with an ACK (resync)
 *	or, in the half-open states, a RST.
 */

extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
	struct options *opt, unsigned long saddr, struct device *dev)
{
	u32 next_seq;

	/* Payload length; a FIN occupies one unit of sequence space. */
	next_seq = len - 4*th->doff;
	if (th->fin)
		next_seq++;
	/* if we have a zero window, we can't have any data in the packet.. */
	if (next_seq && !sk->window)
		goto ignore_it;
	next_seq += th->seq;	/* now the right edge of the segment */

	/*
	 *	This isn't quite right. sk->acked_seq could be more recent
	 *	than sk->window. This is however close enough. We will accept
	 *	slightly more packets than we should, but it should not cause
	 *	problems unless someone is trying to forge packets.
	 */

	/* have we already seen all of this packet? */
	if (!after(next_seq+1, sk->acked_seq))
		goto ignore_it;
	/* or does it start beyond the window? */
	if (!before(th->seq, sk->acked_seq + sk->window + 1))
		goto ignore_it;

	/* ok, at least part of this packet would seem interesting.. */
	return 1;

ignore_it:
	/* Never ACK a RST (see the history at the top of the file). */
	if (th->rst)
		return 0;

	/*
	 *	Send a reset if we get something not ours and we are
	 *	unsynchronized. Note: We don't do anything to our end. We
	 *	are just killing the bogus remote connection then we will
	 *	connect again and it will work (with luck).
	 */
	if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
	{
		tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
		return 1;
	}

	/* Try to resync things. */
	tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
	return 0;
}
/*
 *	When we get a reset we do this: mark the socket zapped, pick the
 *	errno matching the state the reset arrived in, move to CLOSE
 *	(unless RFC1337 protection keeps TIME_WAIT alive), wake the
 *	owner, and drop the frame. Always returns 0.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Map the pre-reset state to the error the user should see. */
	switch (sk->state)
	{
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 *
 *	Main TCP input routine: looks up the socket (with a one-entry
 *	cache), verifies the checksum, queues to the backlog if the
 *	socket is busy, then runs the RFC793/RFC1122 segment processing
 *	(LISTEN/SYN_SENT special cases, then sequence check, RST, SYN,
 *	ACK, URG and data). 'redo' is set when replaying from the
 *	backlog, which skips checksum/byte-swap work already done.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	unsigned long daddr, unsigned short len,
	unsigned long saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	tcp_statistics.TcpInSegs++;
	/* Only frames addressed to this host are interesting. */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket, using the last hit cache if applicable.
	 */
	if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
		sk=(struct sock *)th_cache_sk;
	else
	{
		sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
		th_cache_saddr=saddr;
		th_cache_daddr=daddr;
		th_cache_dport=th->dest;
		th_cache_sport=th->source;
		th_cache_sk=sk;
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */
	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)
	{
		/*
		 *	Pull up the IP header.
		 */
		skb_pull(skb, skb->h.raw-skb->data);
		/*
		 *	Try to use the device checksum if provided.
		 */
		if (
			(skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
			(!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
			)
		{
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}
		/* Host byte order from here on (only on first pass). */
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset
			 *	(checked in tcp_reset)
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 *	Discard frame
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		cli();
		if (sk->inuse)
		{
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		/* Replay from backlog: socket may have died meanwhile. */
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */
	if (sk->rmem_alloc + skb->truesize >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk=sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	This basically follows the flow suggested by RFC793, with the
	 *	corrections in RFC1122. We don't implement precedence and we
	 *	process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn
	 *	notes in the KA9Q code the RFC793 incoming segment rules don't
	 *	initialise the variables for all paths].
	 */
	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{
		/*
		 *	Now deal with unusual cases.
		 */
		if(sk->state==TCP_LISTEN)
		{
			/* These use the socket TOS.. might want to be the
			   received TOS */
			if(th->ack)
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed
			 *	(old segments). Broadcast/multicast SYN isn't
			 *	allowed. Note - bug if you change the netmask on
			 *	a running connection it can go broadcast. Even
			 *	Sun's have this problem so I'm ignoring it.
			 */
			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */
			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is
			 *	nothing else in the frame. KA9Q has an option to
			 *	send data with the syn, BSD accepts data with the
			 *	syn up to the [to be] advertised window and
			 *	Solaris 2.1 gives you a protocol error. For now
			 *	we just ignore it, that fits the spec precisely
			 *	and avoids incompatibilities. It would be nice in
			 *	future to drop through and process the data.
			 */
			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and
		 *	either reset for bad matches or go to connected.
		 */
		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/* Reset the ack - its an ack from a
					   different connection [ th->rst is
					   checked in tcp_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers
				 *	and move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if(sk->max_window==0)
				{
					sk->max_window = 32;	/* Sanity check */
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking
					   to yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port.
	 *	There is a more complex suggestion for fixing these reuse
	 *	issues in RFC1644 but not yet ready for general use. Also see
	 *	RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			/* Re-look-up: a fresh LISTEN socket may take the SYN. */
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				/* ISN bumped past the old connection's sequence. */
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs.
	 */
	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */
	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 *	Process the ACK
	 */
	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */
		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	Process urgent data
	 */
	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	Process the encapsulated data
	 */
	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */
	release_sock(sk);
	return 0;
}
/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it.
 *
 *	Two cases are handled:
 *	 1. Queued write data exists and the peer's window has opened
 *	    (sender-side SWS avoidance left data unsent): clone up to
 *	    win_size bytes of the queue-head segment into a fresh buffer
 *	    and transmit that as the probe.
 *	 2. Otherwise: send a bare ACK carrying sequence sent_seq-1,
 *	    which the peer must answer, refreshing our window info.
 */

static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff,*skb;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states.  If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	)
	{
		return;
	}

	if ( before(sk->sent_seq, sk->window_seq) &&
	    (skb=skb_peek(&sk->write_queue)))
	{
		/*
		 * We are probing the opening of a window
		 * but the window size is != 0
		 * must have been a result SWS advoidance ( sender )
		 */
		struct iphdr *iph;
		struct tcphdr *th;
		struct tcphdr *nth;
		unsigned long win_size, ow_size;
		void * tcp_data_start;

		/*
		 *	How many bytes can we send ?
		 */

		win_size = sk->window_seq - sk->sent_seq;

		/*
		 *	Recover the buffer pointers from the queued segment.
		 *	The IP header length field (ihl) is in 32-bit words.
		 */

		iph = (struct iphdr *)skb->ip_hdr;
		th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));

		/*
		 *	Grab the data for a temporary frame: room for the
		 *	payload slice, both headers, link headroom and
		 *	alignment slack.  GFP_ATOMIC: may run from a timer.
		 */

		buff = sk->prot->wmalloc(sk, win_size + th->doff * 4 +
				     (iph->ihl << 2) +
				     sk->prot->max_header + 15,
				     1, GFP_ATOMIC);
		if ( buff == NULL )
			return;

		/*
		 *	If we strip the packet on the write queue we must
		 *	be ready to retransmit this one
		 */

		buff->free = /*0*/1;

		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put headers on the new packet
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				     IPPROTO_TCP, sk->opt, buff->truesize,
				     sk->ip_tos,sk->ip_ttl);
		if (tmp < 0)
		{
			sk->prot->wfree(sk, buff);
			return;
		}

		/*
		 *	Move the TCP header over (copied from the queued
		 *	segment, then patched below)
		 */

		buff->dev = dev;

		nth = (struct tcphdr *) skb_put(buff,th->doff*4);

		memcpy(nth, th, th->doff * 4);

		/*
		 *	Correct the new header.
		 *	NOTE(review): ntohl/ntohs are used here where htonl/htons
		 *	are conventional for outbound fields — the swap is its own
		 *	inverse so the result is identical, but confirm intent.
		 */

		nth->ack = 1;
		nth->ack_seq = ntohl(sk->acked_seq);
		nth->window = ntohs(tcp_select_window(sk));
		nth->check = 0;

		/*
		 *	Find the first data byte of the queued segment
		 *	(past link, IP and TCP headers).
		 */

		tcp_data_start = skb->data + skb->dev->hard_header_len +
			(iph->ihl << 2) + th->doff * 4;

		/*
		 *	Add it to our new buffer
		 */

		memcpy(skb_put(buff,win_size), tcp_data_start, win_size);

		/*
		 *	Remember our right edge sequence number.
		 */

		buff->h.seq = sk->sent_seq + win_size;
		sk->sent_seq = buff->h.seq;		/* Hack */
#if 0
		/*
		 *	now: shrink the queue head segment
		 *	(disabled alternative: consume the transmitted bytes
		 *	from the queued skb instead of leaving it intact)
		 */

		th->check = 0;
		ow_size = skb->len - win_size -
			((unsigned long) (tcp_data_start - (void *) skb->data));

		memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
		skb_trim(skb,skb->len-win_size);
		sk->sent_seq += win_size;
		th->seq = htonl(sk->sent_seq);
		if (th->urg)
		{
			unsigned short urg_ptr;

			urg_ptr = ntohs(th->urg_ptr);
			if (urg_ptr <= win_size)
				th->urg = 0;
			else
			{
				urg_ptr -= win_size;
				th->urg_ptr = htons(urg_ptr);
				nth->urg_ptr = htons(win_size);
			}
		}
#else
		/* NOTE(review): this clears URG on the probe when the urgent
		   pointer falls *inside* the transmitted slice — looks inverted
		   relative to the disabled branch above; confirm against RFC 793
		   urgent-pointer semantics before relying on it */
		if(th->urg && ntohs(th->urg_ptr) < win_size)
			nth->urg = 0;
#endif

		/*
		 *	Checksum the split buffer
		 */

		tcp_send_check(nth, sk->saddr, sk->daddr,
			   nth->doff * 4 + win_size , sk);
	}
	else
	{
		/* Nothing sendable queued: probe with a bare ACK instead */
		buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
		if (buff == NULL)
			return;

		buff->free = 1;
		buff->sk = sk;
		buff->localroute = sk->localroute;

		/*
		 *	Put in the IP header and routing stuff.
		 */

		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0)
		{
			sk->prot->wfree(sk, buff);
			return;
		}

		t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
		memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

		/*
		 *	Use a previous sequence.
		 *	This should cause the other end to send an ack.
		 */

		t1->seq = htonl(sk->sent_seq-1);
		t1->ack = 1;
		t1->res1= 0;
		t1->res2= 0;
		t1->rst = 0;
		t1->urg = 0;
		t1->psh = 0;
		t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
		t1->syn = 0;
		t1->ack_seq = ntohl(sk->acked_seq);
		t1->window = ntohs(tcp_select_window(sk));
		t1->doff = sizeof(*t1)/4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	}

	/*
	 *	Send it.
	 */

	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
5134 /*5135 * A window probe timeout has occurred.5136 */5137
5138 voidtcp_send_probe0(structsock *sk)
/* */5139 {5140 if (sk->zapped)
5141 return; /* After a valid reset we can send no more */5142
5143 tcp_write_wakeup(sk);
5144
5145 sk->backoff++;
5146 sk->rto = min(sk->rto << 1, 120*HZ);
5147 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5148 sk->retransmits++;
5149 sk->prot->retransmits ++;
5150 }5151
5152 /*5153 * Socket option code for TCP. 5154 */5155
5156 inttcp_setsockopt(structsock *sk, intlevel, intoptname, char *optval, intoptlen)
/* */5157 {5158 intval,err;
5159
5160 if(level!=SOL_TCP)
5161 returnip_setsockopt(sk,level,optname,optval,optlen);
5162
5163 if (optval == NULL)
5164 return(-EINVAL);
5165
5166 err=verify_area(VERIFY_READ, optval, sizeof(int));
5167 if(err)
5168 returnerr;
5169
5170 val = get_user((int *)optval);
5171
5172 switch(optname)
5173 {5174 caseTCP_MAXSEG:
5175 /*5176 * values greater than interface MTU won't take effect. however at5177 * the point when this call is done we typically don't yet know5178 * which interface is going to be used5179 */5180 if(val<1||val>MAX_WINDOW)
5181 return -EINVAL;
5182 sk->user_mss=val;
5183 return 0;
5184 caseTCP_NODELAY:
5185 sk->nonagle=(val==0)?0:1;
5186 return 0;
5187 default:
5188 return(-ENOPROTOOPT);
5189 }5190 }5191
5192 inttcp_getsockopt(structsock *sk, intlevel, intoptname, char *optval, int *optlen)
/* */5193 {5194 intval,err;
5195
5196 if(level!=SOL_TCP)
5197 returnip_getsockopt(sk,level,optname,optval,optlen);
5198
5199 switch(optname)
5200 {5201 caseTCP_MAXSEG:
5202 val=sk->user_mss;
5203 break;
5204 caseTCP_NODELAY:
5205 val=sk->nonagle;
5206 break;
5207 default:
5208 return(-ENOPROTOOPT);
5209 }5210 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5211 if(err)
5212 returnerr;
5213 put_user(sizeof(int),(int *) optlen);
5214
5215 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5216 if(err)
5217 returnerr;
5218 put_user(val,(int *)optval);
5219
5220 return(0);
5221 }5222
/*
 *	The protocol operations vector plumbing TCP into the generic INET
 *	socket layer.  This uses positional initialization, so the entries
 *	below must stay in the exact order of the fields in struct proto
 *	(declared in the networking headers, outside this file).  The slot
 *	comments name the field each value is presumed to fill — verify
 *	against the struct proto declaration when changing anything here.
 */
struct proto tcp_prot = {
	sock_wmalloc,			/* wmalloc  — generic write-buffer alloc */
	sock_rmalloc,			/* rmalloc  — generic read-buffer alloc */
	sock_wfree,			/* wfree */
	sock_rfree,			/* rfree */
	sock_rspace,			/* rspace   — read buffer space accounting */
	sock_wspace,			/* wspace   — write buffer space accounting */
	tcp_close,
	tcp_read,
	tcp_write,
	tcp_sendto,
	tcp_recvfrom,
	ip_build_header,		/* build_header — IP does the framing */
	tcp_connect,
	tcp_accept,
	ip_queue_xmit,			/* queue_xmit — IP does the sending */
	tcp_retransmit,
	tcp_write_wakeup,
	tcp_read_wakeup,
	tcp_rcv,
	tcp_select,
	tcp_ioctl,
	NULL,				/* init — TCP needs no per-socket init hook */
	tcp_shutdown,
	tcp_setsockopt,
	tcp_getsockopt,
	128,				/* presumed max_header — confirm field name */
	0,				/* presumed retransmits counter, starts at 0 */
	"TCP",				/* protocol name, for /proc and logging */
	0, 0,				/* presumed inuse / highestinuse counters */
	{NULL,}				/* sock_array hash — empty at boot */
};