root/net/inet/tcp.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. min
  2. tcp_set_state
  3. tcp_select_window
  4. tcp_find_established
  5. tcp_dequeue_established
  6. tcp_close_pending
  7. tcp_time_wait
  8. tcp_do_retransmit
  9. reset_xmit_timer
  10. tcp_retransmit_time
  11. tcp_retransmit
  12. tcp_write_timeout
  13. retransmit_timer
  14. tcp_err
  15. tcp_readable
  16. tcp_listen_select
  17. tcp_select
  18. tcp_ioctl
  19. tcp_check
  20. tcp_send_check
  21. tcp_send_skb
  22. tcp_dequeue_partial
  23. tcp_send_partial
  24. tcp_enqueue_partial
  25. tcp_send_ack
  26. tcp_build_header
  27. tcp_write
  28. tcp_sendto
  29. tcp_read_wakeup
  30. cleanup_rbuf
  31. tcp_read_urg
  32. tcp_read
  33. tcp_close_state
  34. tcp_send_fin
  35. tcp_shutdown
  36. tcp_recvfrom
  37. tcp_reset
  38. tcp_options
  39. default_mask
  40. tcp_init_seq
  41. tcp_conn_request
  42. tcp_close
  43. tcp_write_xmit
  44. tcp_ack
  45. tcp_fin
  46. tcp_data
  47. tcp_check_urg
  48. tcp_urg
  49. tcp_accept
  50. tcp_connect
  51. tcp_sequence
  52. tcp_std_reset
  53. tcp_rcv
  54. tcp_write_wakeup
  55. tcp_send_probe0
  56. tcp_setsockopt
  57. tcp_getsockopt

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@no.unit.nvg>
  20  *
  21  * Fixes:       
  22  *              Alan Cox        :       Numerous verify_area() calls
  23  *              Alan Cox        :       Set the ACK bit on a reset
  24  *              Alan Cox        :       Stopped it crashing if it closed while sk->inuse=1
  25  *                                      and was trying to connect (tcp_err()).
  26  *              Alan Cox        :       All icmp error handling was broken
  27  *                                      pointers passed where wrong and the
  28  *                                      socket was looked up backwards. Nobody
  29  *                                      tested any icmp error code obviously.
  30  *              Alan Cox        :       tcp_err() now handled properly. It wakes people
  31  *                                      on errors. select behaves and the icmp error race
  32  *                                      has gone by moving it into sock.c
  33  *              Alan Cox        :       tcp_reset() fixed to work for everything not just
  34  *                                      packets for unknown sockets.
  35  *              Alan Cox        :       tcp option processing.
  36  *              Alan Cox        :       Reset tweaked (still not 100%) [Had syn rule wrong]
  37  *              Herp Rosmanith  :       More reset fixes
  38  *              Alan Cox        :       No longer acks invalid rst frames. Acking
  39  *                                      any kind of RST is right out.
  40  *              Alan Cox        :       Sets an ignore me flag on an rst receive
  41  *                                      otherwise odd bits of prattle escape still
  42  *              Alan Cox        :       Fixed another acking RST frame bug. Should stop
  43  *                                      LAN workplace lockups.
  44  *              Alan Cox        :       Some tidyups using the new skb list facilities
  45  *              Alan Cox        :       sk->keepopen now seems to work
  46  *              Alan Cox        :       Pulls options out correctly on accepts
  47  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  48  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a bit to skb ops.
  49  *              Alan Cox        :       Tidied tcp_data to avoid a potential nasty.
  50  *              Alan Cox        :       Added some better commenting, as the tcp is hard to follow
  51  *              Alan Cox        :       Removed incorrect check for 20 * psh
  52  *      Michael O'Reilly        :       ack < copied bug fix.
  53  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  54  *              Alan Cox        :       FIN with no memory -> CRASH
  55  *              Alan Cox        :       Added socket option proto entries. Also added awareness of them to accept.
  56  *              Alan Cox        :       Added TCP options (SOL_TCP)
  57  *              Alan Cox        :       Switched wakeup calls to callbacks, so the kernel can layer network sockets.
  58  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  59  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  60  *              Alan Cox        :       RST frames sent on unsynchronised state ack error/
  61  *              Alan Cox        :       Put in missing check for SYN bit.
  62  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  63  *                                      window non shrink trick.
  64  *              Alan Cox        :       Added a couple of small NET2E timer fixes
  65  *              Charles Hedrick :       TCP fixes
  66  *              Toomas Tamm     :       TCP window fixes
  67  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  68  *              Charles Hedrick :       Rewrote most of it to actually work
  69  *              Linus           :       Rewrote tcp_read() and URG handling
  70  *                                      completely
  71  *              Gerhard Koerting:       Fixed some missing timer handling
  72  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  73  *              Gerhard Koerting:       PC/TCP workarounds
  74  *              Adam Caldwell   :       Assorted timer/timing errors
  75  *              Matthew Dillon  :       Fixed another RST bug
  76  *              Alan Cox        :       Move to kernel side addressing changes.
  77  *              Alan Cox        :       Beginning work on TCP fastpathing (not yet usable)
  78  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  79  *              Alan Cox        :       TCP fast path debugging
  80  *              Alan Cox        :       Window clamping
  81  *              Michael Riepe   :       Bug in tcp_check()
  82  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  83  *              Matt Dillon     :       Yet more small nasties remove from the TCP code
  84  *                                      (Be very nice to this man if tcp finally works 100%) 8)
  85  *              Alan Cox        :       BSD accept semantics. 
  86  *              Alan Cox        :       Reset on closedown bug.
  87  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
  88  *              Michael Pall    :       Handle select() after URG properly in all cases.
  89  *              Michael Pall    :       Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin).
  90  *              Michael Pall    :       Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now.
  91  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the BSD api.
  92  *              Alan Cox        :       Changed the semantics of sk->socket to 
  93  *                                      fix a race and a signal problem with
  94  *                                      accept() and async I/O.
  95  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
  96  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
  97  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
  98  *                                      clients/servers which listen in on
  99  *                                      fixed ports.
 100  *              Alan Cox        :       Cleaned the above up and shrank it to
 101  *                                      a sensible code size.
 102  *              Alan Cox        :       Self connect lockup fix.
 103  *              Alan Cox        :       No connect to multicast.
 104  *              Ross Biro       :       Close unaccepted children on master
 105  *                                      socket close.
 106  *              Alan Cox        :       Reset tracing code.
 107  *              Alan Cox        :       Spurious resets on shutdown.
 108  *              Alan Cox        :       Giant 15 minute/60 second timer error
 109  *              Alan Cox        :       Small whoops in selecting before an accept.
 110  *              Alan Cox        :       Kept the state trace facility since its
 111  *                                      handy for debugging.
 112  *              Alan Cox        :       More reset handler fixes.
 113  *              Alan Cox        :       Started rewriting the code based on the RFC's
 114  *                                      for other useful protocol references see:  
 115  *                                      Comer, KA9Q NOS, and for a reference on the
 116  *                                      difference between specifications and how BSD
 117  *                                      works see the 4.4lite source.
 118  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 119  *                                      close.
 120  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 121  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 122  *              Alan Cox        :       Reimplemented timers as per the RFC and using multiple
 123  *                                      timers for sanity. 
 124  *              Alan Cox        :       Small bug fixes, and a lot of new
 125  *                                      comments.
 126  *              Alan Cox        :       Fixed dual reader crash by locking
 127  *                                      the buffers (much like datagram.c)
 128  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 129  *                                      now gets fed up of retrying without
 130  *                                      (even a no space) answer.
 131  *              Alan Cox        :       Extracted closing code better
 132  *              Alan Cox        :       Fixed the closing state machine to
 133  *                                      resemble the RFC.
 134  *              Alan Cox        :       More 'per spec' fixes.
 135  *
 136  *
 137  * To Fix:
 138  *              Fast path the code. Two things here - fix the window calculation
 139  *              so it doesn't iterate over the queue, also spot packets with no funny
 140  *              options arriving in order and process directly.
 141  *
 142  *              Implement RFC 1191 [Path MTU discovery]
 143  *              Look at the effect of implementing RFC 1337 suggestions and their impact.
 144  *              Rewrite output state machine to use a single queue and do low window
 145  *              situations as per the spec (RFC 1122)
 146  *              Speed up input assembly algorithm.
 147  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 148  *              could do with it working on IPv4
 149  *              User settable/learned rtt/max window/mtu
 150  *              Cope with MTU/device switches when retransmitting in tcp.
 151  *              Fix the window handling to use PR's new code.
 152  *
 153  *              Change the fundamental structure to a single send queue maintained
 154  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 155  *              active routes too]). Cut the queue off in tcp_retransmit/
 156  *              tcp_transmit.
 157  *              Change the receive queue to assemble as it goes. This lets us
 158  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 159  *              tcp_data/tcp_read as well as the window shrink crud.
 160  *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 161  *              tcp_queue_skb seem obvious routines to extract.
 162  *      
 163  *              This program is free software; you can redistribute it and/or
 164  *              modify it under the terms of the GNU General Public License
 165  *              as published by the Free Software Foundation; either version
 166  *              2 of the License, or(at your option) any later version.
 167  *
 168  * Description of States:
 169  *
 170  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 171  *
 172  *      TCP_SYN_RECV            received a connection request, sent ack,
 173  *                              waiting for final ack in three-way handshake.
 174  *
 175  *      TCP_ESTABLISHED         connection established
 176  *
 177  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 178  *                              transmission of remaining buffered data
 179  *
 180  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 181  *                              to shutdown
 182  *
 183  *      TCP_CLOSING             both sides have shutdown but we still have
 184  *                              data we have to finish sending
 185  *
 186  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 187  *                              closed, can only be entered from FIN_WAIT2
 188  *                              or CLOSING.  Required because the other end
 189  *                              may not have gotten our last ACK causing it
 190  *                              to retransmit the data packet (which we ignore)
 191  *
 192  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 193  *                              us to finish writing our data and to shutdown
 194  *                              (we have to close() to move on to LAST_ACK)
 195  *
  196  *      TCP_LAST_ACK            our side has shutdown after remote has
 197  *                              shutdown.  There may still be data in our
 198  *                              buffer that we have to finish sending
 199  *              
 200  *      TCP_CLOSE               socket is finished
 201  */
 202 
 203 #include <linux/types.h>
 204 #include <linux/sched.h>
 205 #include <linux/mm.h>
 206 #include <linux/time.h>
 207 #include <linux/string.h>
 208 #include <linux/config.h>
 209 #include <linux/socket.h>
 210 #include <linux/sockios.h>
 211 #include <linux/termios.h>
 212 #include <linux/in.h>
 213 #include <linux/fcntl.h>
 214 #include <linux/inet.h>
 215 #include <linux/netdevice.h>
 216 #include "snmp.h"
 217 #include "ip.h"
 218 #include "protocol.h"
 219 #include "icmp.h"
 220 #include "tcp.h"
 221 #include "arp.h"
 222 #include <linux/skbuff.h>
 223 #include "sock.h"
 224 #include "route.h"
 225 #include <linux/errno.h>
 226 #include <linux/timer.h>
 227 #include <asm/system.h>
 228 #include <asm/segment.h>
 229 #include <linux/mm.h>
 230 
 231 /*
 232  *      The MSL timer is the 'normal' timer.
 233  */
 234  
 235 #define reset_msl_timer(x,y,z)  reset_timer(x,y,z)
 236 
 237 #define SEQ_TICK 3
 238 unsigned long seq_offset;
 239 struct tcp_mib  tcp_statistics;
 240 
 241 static void tcp_close(struct sock *sk, int timeout);
 242 
 243 
 244 /*
 245  *      The less said about this the better, but it works and will do for 1.2 
 246  */
 247 
 248 static struct wait_queue *master_select_wakeup;
 249 
 250 static __inline__ int min(unsigned int a, unsigned int b)
     /* [previous][next][first][last][top][bottom][index][help] */
 251 {
 252         if (a < b) 
 253                 return(a);
 254         return(b);
 255 }
 256 
 257 #undef STATE_TRACE
 258 
 259 #ifdef STATE_TRACE
 260 static char *statename[]={
 261         "Unused","Established","Syn Sent","Syn Recv",
 262         "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
 263         "Close Wait","Last ACK","Listen","Closing"
 264 };
 265 #endif
 266 
/*
 *      Move a socket to a new TCP state, keeping the SNMP established-
 *      connection counter (TcpCurrEstab) in step, and waking anyone
 *      sleeping in select() on a listening master socket when a child
 *      completes its three-way handshake (SYN_RECV -> ESTABLISHED).
 */
static __inline__ void tcp_set_state(struct sock *sk, int state)
{
        /* Leaving ESTABLISHED: drop the established count. */
        if(sk->state==TCP_ESTABLISHED)
                tcp_statistics.TcpCurrEstab--;
#ifdef STATE_TRACE
        if(sk->debug)
                printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
#endif  
        /* This is a hack but it doesn't occur often and it's going to
           be a real pain to fix nicely: a select() on the listening
           socket has no other way to learn that some child finished
           its handshake, so we wake one single global wait queue. */
           
        if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
        {
                wake_up_interruptible(&master_select_wakeup);
        }
        sk->state=state;
        /* Entering ESTABLISHED: bump the established count. */
        if(state==TCP_ESTABLISHED)
                tcp_statistics.TcpCurrEstab++;
}
 286 
 287 /*
 288  *      This routine picks a TCP windows for a socket based on
 289  *      the following constraints
 290  *  
 291  *      1. The window can never be shrunk once it is offered (RFC 793)
 292  *      2. We limit memory per socket
 293  *   
 294  *      For now we use NET2E3's heuristic of offering half the memory
 295  *      we have handy. All is not as bad as this seems however because
 296  *      of two things. Firstly we will bin packets even within the window
 297  *      in order to get the data we are waiting for into the memory limit.
 298  *      Secondly we bin common duplicate forms at receive time
 299  *      Better heuristics welcome
 300  */
 301    
 302 int tcp_select_window(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 303 {
 304         int new_window = sk->prot->rspace(sk);
 305         
 306         if(sk->window_clamp)
 307                 new_window=min(sk->window_clamp,new_window);
 308         /*
 309          *      Two things are going on here.  First, we don't ever offer a
 310          *      window less than min(sk->mss, MAX_WINDOW/2).  This is the
 311          *      receiver side of SWS as specified in RFC1122.
 312          *      Second, we always give them at least the window they
 313          *      had before, in order to avoid retracting window.  This
 314          *      is technically allowed, but RFC1122 advises against it and
 315          *      in practice it causes trouble.
 316          *
 317          *      Fixme: This doesn't correctly handle the case where
 318          *      new_window > sk->window but not by enough to allow for the
 319          *      shift in sequence space. 
 320          */
 321         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
 322                 return(sk->window);
 323         return(new_window);
 324 }
 325 
 326 /*
 327  *      Find someone to 'accept'. Must be called with
 328  *      sk->inuse=1 or cli()
 329  */ 
 330 
 331 static struct sk_buff *tcp_find_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 332 {
 333         struct sk_buff *p=skb_peek(&s->receive_queue);
 334         if(p==NULL)
 335                 return NULL;
 336         do
 337         {
 338                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 339                         return p;
 340                 p=p->next;
 341         }
 342         while(p!=(struct sk_buff *)&s->receive_queue);
 343         return NULL;
 344 }
 345 
 346 /*
 347  *      Remove a completed connection and return it. This is used by
 348  *      tcp_accept() to get connections from the queue.
 349  */
 350 
 351 static struct sk_buff *tcp_dequeue_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 352 {
 353         struct sk_buff *skb;
 354         unsigned long flags;
 355         save_flags(flags);
 356         cli(); 
 357         skb=tcp_find_established(s);
 358         if(skb!=NULL)
 359                 skb_unlink(skb);        /* Take it off the queue */
 360         restore_flags(flags);
 361         return skb;
 362 }
 363 
 364 /* 
 365  *      This routine closes sockets which have been at least partially
 366  *      opened, but not yet accepted. Currently it is only called by
 367  *      tcp_close, and timeout mirrors the value there. 
 368  */
 369 
 370 static void tcp_close_pending (struct sock *sk) 
     /* [previous][next][first][last][top][bottom][index][help] */
 371 {
 372         struct sk_buff *skb;
 373 
 374         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
 375                 tcp_close(skb->sk, 0);
 376                 kfree_skb(skb, FREE_READ);
 377         }
 378         return;
 379 }
 380 
 381 /*
 382  *      Enter the time wait state. 
 383  */
 384 
/*
 *      Enter the TIME_WAIT state: mark both directions shut down,
 *      notify any sleeper of the state change, and arm the MSL timer
 *      so the socket is finally torn down after TCP_TIMEWAIT_LEN.
 */
static void tcp_time_wait(struct sock *sk)
{
        tcp_set_state(sk,TCP_TIME_WAIT);
        sk->shutdown = SHUTDOWN_MASK;
        /* Only sockets still attached to a user need a wakeup. */
        if (!sk->dead)
                sk->state_change(sk);
        reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}
 393 
 394 /*
 395  *      A socket has timed out on its send queue and wants to do a
 396  *      little retransmitting. Currently this means TCP.
 397  */
 398 
/*
 *      Retransmit frames from the socket's send queue.
 *
 *      @sk:  socket whose send_head chain is to be resent
 *      @all: if zero, retransmit only the first queued frame;
 *            otherwise resend up to sk->cong_window frames.
 *
 *      Each queued packet already contains a fully built IP+TCP frame,
 *      so we patch it in place (fresh IP id, current ack and window)
 *      and hand it straight back to the device.
 */
void tcp_do_retransmit(struct sock *sk, int all)
{
        struct sk_buff * skb;
        struct proto *prot;     /* NOTE(review): assigned but never read below */
        struct device *dev;
        int ct=0;               /* frames resent so far this call */

        prot = sk->prot;
        skb = sk->send_head;

        while (skb != NULL)
        {
                struct tcphdr *th;
                struct iphdr *iph;
                int size;

                dev = skb->dev;
                IS_SKB(skb);
                skb->when = jiffies;    /* restamp for RTT bookkeeping */

                /*
                 * In general it's OK just to use the old packet.  However we
                 * need to use the current ack and window fields.  Urg and
                 * urg_ptr could possibly stand to be updated as well, but we
                 * don't keep the necessary data.  That shouldn't be a problem,
                 * if the other end is doing the right thing.  Since we're
                 * changing the packet, we have to issue a new IP identifier.
                 */

                iph = (struct iphdr *)(skb->data + dev->hard_header_len);
                th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
                /* TCP segment length = frame length minus link+IP headers. */
                size = skb->len - (((unsigned char *) th) - skb->data);
                
                /*
                 *      Note: We ought to check for window limits here but
                 *      currently this is done (less efficiently) elsewhere.
                 *      We do need to check for a route change but can't handle
                 *      that until we have the new 1.3.x buffers in.
                 *
                 */

                iph->id = htons(ip_id_count++);
                ip_send_check(iph);     /* IP header checksum must be redone */

                /*
                 *      This is not the right way to handle this. We have to
                 *      issue an up to date window and ack report with this 
                 *      retransmit to keep the odd buggy tcp that relies on 
                 *      the fact BSD does this happy. 
                 *      We don't however need to recalculate the entire 
                 *      checksum, so someone wanting a small problem to play
                 *      with might like to implement RFC1141/RFC1624 and speed
                 *      this up by avoiding a full checksum.
                 *
                 *      NOTE(review): ntohl() where htonl() is meant — the two
                 *      are the same operation on every supported byte order,
                 *      but htonl() would state the intent correctly.
                 */
                 
                th->ack_seq = ntohl(sk->acked_seq);
                th->window = ntohs(tcp_select_window(sk));
                tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
                
                /*
                 *      If the interface is (still) up and running, kick it.
                 */

                if (dev->flags & IFF_UP)
                {
                        /*
                         *      If the packet is still being sent by the device/protocol
                         *      below then don't retransmit. This is both needed, and good -
                         *      especially with connected mode AX.25 where it stops resends
                         *      occurring of an as yet unsent anyway frame!
                         *      We still add up the counts as the round trip time wants
                         *      adjusting.
                         */
                        if (sk && !skb_device_locked(skb))
                        {
                                /* Remove it from any existing driver queue first! */
                                skb_unlink(skb);
                                /* Now queue it */
                                ip_statistics.IpOutRequests++;
                                dev_queue_xmit(skb, dev, sk->priority);
                        }
                }

                /*
                 *      Count retransmissions
                 */
                 
                ct++;
                sk->prot->retransmits ++;

                /*
                 *      Only one retransmit requested.
                 */
        
                if (!all)
                        break;

                /*
                 *      This should cut it off before we send too many packets.
                 */

                if (ct >= sk->cong_window)
                        break;
                skb = skb->link3;       /* next frame on the send queue */
        }
}
 505 
 506 /*
 507  *      Reset the retransmission timer
 508  */
 509  
 510 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
     /* [previous][next][first][last][top][bottom][index][help] */
 511 {
 512         del_timer(&sk->retransmit_timer);
 513         sk->ip_xmit_timeout = why;
 514         if((int)when < 0)
 515         {
 516                 when=3;
 517                 printk("Error: Negative timer in xmit_timer\n");
 518         }
 519         sk->retransmit_timer.expires=when;
 520         add_timer(&sk->retransmit_timer);
 521 }
 522 
 523 /*
 524  *      This is the normal code called for timeouts.  It does the retransmission
 525  *      and then does backoff.  tcp_do_retransmit is separated out because
 526  *      tcp_ack needs to send stuff from the retransmit queue without
 527  *      initiating a backoff.
 528  */
 529 
 530 
/*
 *      Retransmit and then back off.  This is the normal path for a
 *      retransmission timeout; tcp_do_retransmit() is separate so
 *      tcp_ack() can resend from the queue without backing off.
 */
void tcp_retransmit_time(struct sock *sk, int all)
{
        tcp_do_retransmit(sk, all);

        /*
         * Increase the timeout each time we retransmit.  Note that
         * we do not increase the rtt estimate.  rto is initialized
         * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
         * that doubling rto each time is the least we can get away with.
         * In KA9Q, Karn uses this for the first few times, and then
         * goes to quadratic.  netBSD doubles, but only goes up to *64,
         * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
         * defined in the protocol as the maximum possible RTT.  I guess
         * we'll have to use something other than TCP to talk to the
         * University of Mars.
         *
         * PAWS allows us longer timeouts and large windows, so once
         * implemented ftp to mars will work nicely. We will have to fix
         * the 120 second clamps though!
         */

        sk->retransmits++;
        sk->backoff++;
        /* Exponential backoff, clamped at the protocol maximum RTT. */
        sk->rto = min(sk->rto << 1, 120*HZ);
        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
}
 557 
 558 
 559 /*
 560  *      A timer event has trigger a tcp retransmit timeout. The
 561  *      socket xmit queue is ready and set up to send. Because
 562  *      the ack receive code keeps the queue straight we do
 563  *      nothing clever here.
 564  */
 565 
 566 static void tcp_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 567 {
 568         if (all) 
 569         {
 570                 tcp_retransmit_time(sk, all);
 571                 return;
 572         }
 573 
 574         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 575         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 576         sk->cong_count = 0;
 577 
 578         sk->cong_window = 1;
 579 
 580         /* Do the actual retransmit. */
 581         tcp_retransmit_time(sk, all);
 582 }
 583 
 584 /*
 585  *      A write timeout has occurred. Process the after effects.
 586  */
 587 
/*
 *      A write (retransmission) timeout has occurred; process the
 *      after effects.
 *
 *      Returns 1 if the socket should keep trying, 0 once it has
 *      been moved to TCP_CLOSE and is effectively dead.
 */
static int tcp_write_timeout(struct sock *sk)
{
        /*
         *      Look for a 'soft' timeout: every 8th retransmit on an
         *      established connection, or past TCP_RETR1 otherwise.
         */
        if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
                || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
        {
                /*
                 *      Attempt to recover if arp has changed (unlikely!) or
                 *      a route has shifted (not supported prior to 1.3).
                 */
                arp_destroy (sk->daddr, 0);
                ip_route_check (sk->daddr);
        }
        /*
         *      Has it gone just too far ?
         */
        if (sk->retransmits > TCP_RETR2) 
        {
                sk->err = ETIMEDOUT;
                sk->error_report(sk);
                /*
                 *      Time wait the socket: a closing socket that times
                 *      out still lingers in TIME_WAIT to absorb strays.
                 */
                if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING) 
                {
                        tcp_set_state(sk,TCP_TIME_WAIT);
                        reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                }
                else
                {
                        /*
                         *      Clean up time.
                         */
                        tcp_set_state(sk, TCP_CLOSE);
                        return 0;
                }
        }
        return 1;
}
 629 
 630 /*
 631  *      The TCP retransmit timer. This lacks a few small details.
 632  *
 633  *      1.      An initial rtt timeout on the probe0 should cause what we can
 634  *              of the first write queue buffer to be split and sent.
 635  *      2.      On a 'major timeout' as defined by RFC1122 we shouldn't report
 636  *              ETIMEDOUT if we know an additional 'soft' error caused this.
 637  *              tcp_err should save a 'soft error' for us.
 638  */
 639 
static void retransmit_timer(unsigned long data)
/* [previous][next][first][last][top][bottom][index][help] */
{
        /* Timer callback: 'data' is the socket this timer belongs to. */
        struct sock *sk = (struct sock*)data;
        int why = sk->ip_xmit_timeout;          /* reason the timer was armed */

        /* 
         * only process if socket is not in use
         */

        cli();
        if (sk->inuse || in_bh) 
        {
                /* Try again in 1 second */
                /* NOTE(review): a bare HZ here assumes add_timer() treats
                 * 'expires' as an offset relative to now in this kernel —
                 * confirm against the timer implementation. */
                sk->retransmit_timer.expires = HZ;
                add_timer(&sk->retransmit_timer);
                sti();
                return;
        }

        /* Claim the socket; interrupts may be re-enabled now. */
        sk->inuse = 1;
        sti();

        /* Always see if we need to send an ack. */

        if (sk->ack_backlog && !sk->zapped) 
        {
                sk->prot->read_wakeup (sk);
                if (! sk->dead)
                        sk->data_ready(sk,0);
        }

        /* Now we need to figure out why the socket was on the timer. */

        switch (why) 
        {
                /* Window probing */
                case TIME_PROBE0:
                        tcp_send_probe0(sk);
                        tcp_write_timeout(sk);
                        break;
                /* Retransmitting */
                case TIME_WRITE:
                        /* It could be we got here because we needed to send an ack.
                         * So we need to check for that.
                         */
                {
                        struct sk_buff *skb;
                        unsigned long flags;

                        /* Peek at the head of the retransmit queue with
                         * interrupts off so it cannot change under us. */
                        save_flags(flags);
                        cli();
                        skb = sk->send_head;
                        if (!skb) 
                        {
                                /* Nothing awaiting an ACK: this was just the
                                 * ack-backlog kick handled above. */
                                restore_flags(flags);
                        } 
                        else 
                        {
                                /*
                                 *      Kicked by a delayed ack. Reset timer
                                 *      correctly now
                                 */
                                if (jiffies < skb->when + sk->rto) 
                                {
                                        reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
                                        restore_flags(flags);
                                        break;
                                }
                                restore_flags(flags);
                                /*
                                 *      Retransmission
                                 */
                                sk->prot->retransmit (sk, 0);
                                tcp_write_timeout(sk);
                        }
                        break;
                }
                /* Sending Keepalives */
                case TIME_KEEPOPEN:
                        /* 
                         * this reset_timer() call is a hack, this is not
                         * how KEEPOPEN is supposed to work.
                         */
                        reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

                        /* Send something to keep the connection open. */
                        if (sk->prot->write_wakeup)
                                  sk->prot->write_wakeup (sk);
                        sk->retransmits++;
                        tcp_write_timeout(sk);
                        break;
                default:
                        printk ("rexmit_timer: timer expired - reason unknown\n");
                        break;
        }
        release_sock(sk);
}
 737 
 738 /*
 739  * This routine is called by the ICMP module when it gets some
 740  * sort of error condition.  If err < 0 then the socket should
 741  * be closed and the error returned to the user.  If err > 0
 742  * it's just the icmp type << 8 | icmp code.  After adjustment
 743  * header points to the first 8 bytes of the tcp header.  We need
 744  * to find the appropriate port.
 745  */
 746 
 747 void tcp_err(int err, unsigned char *header, unsigned long daddr,
     /* [previous][next][first][last][top][bottom][index][help] */
 748         unsigned long saddr, struct inet_protocol *protocol)
 749 {
 750         struct tcphdr *th;
 751         struct sock *sk;
 752         struct iphdr *iph=(struct iphdr *)header;
 753   
 754         header+=4*iph->ihl;
 755    
 756 
 757         th =(struct tcphdr *)header;
 758         sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
 759 
 760         if (sk == NULL) 
 761                 return;
 762   
 763         if(err<0)
 764         {
 765                 sk->err = -err;
 766                 sk->error_report(sk);
 767                 return;
 768         }
 769 
 770         if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8)) 
 771         {
 772                 /*
 773                  * FIXME:
 774                  * For now we will just trigger a linear backoff.
 775                  * The slow start code should cause a real backoff here.
 776                  */
 777                 if (sk->cong_window > 4)
 778                         sk->cong_window--;
 779                 return;
 780         }
 781 
 782 /*      sk->err = icmp_err_convert[err & 0xff].errno;  -- moved as TCP should hide non fatals internally (and does) */
 783 
 784         /*
 785          * If we've already connected we will keep trying
 786          * until we time out, or the user gives up.
 787          */
 788 
 789         if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT) 
 790         {
 791                 if (sk->state == TCP_SYN_SENT) 
 792                 {
 793                         tcp_statistics.TcpAttemptFails++;
 794                         tcp_set_state(sk,TCP_CLOSE);
 795                         sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
 796                 }
 797                 sk->err = icmp_err_convert[err & 0xff].errno;           
 798         }
 799         return;
 800 }
 801 
 802 
 803 /*
 804  *      Walk down the receive queue counting readable data until we hit the end or we find a gap
 805  *      in the received data queue (ie a frame missing that needs sending to us). Not
 806  *      sorting using two queues as data arrives makes life so much harder.
 807  */
 808 
static int tcp_readable(struct sock *sk)
/* [previous][next][first][last][top][bottom][index][help] */
{
        unsigned long counted;          /* sequence number counted up to so far */
        unsigned long amount;           /* readable bytes found so far */
        struct sk_buff *skb;
        int sum;
        unsigned long flags;

        if(sk && sk->debug)
                printk("tcp_readable: %p - ",sk);

        /* Walk the receive queue with interrupts off so it cannot be
         * modified underneath us. */
        save_flags(flags);
        cli();
        if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
        {
                restore_flags(flags);
                if(sk && sk->debug) 
                        printk("empty\n");
                return(0);
        }
  
        counted = sk->copied_seq;       /* Where we are at the moment */
        amount = 0;
  
        /* 
         *      Do until a push or until we are out of data. 
         */
         
        do 
        {
                if (before(counted, skb->h.th->seq))    /* Found a hole so stops here */
                        break;
                sum = skb->len -(counted - skb->h.th->seq);     /* Length - header but start from where we are up to (avoid overlaps) */
                if (skb->h.th->syn)
                        sum++;          /* SYN takes a sequence number but carries no data byte */
                if (sum > 0) 
                {                                       /* Add it up, move on */
                        amount += sum;
                        if (skb->h.th->syn) 
                                amount--;       /* ...the SYN itself is not readable data */
                        counted += sum;
                }
                /*
                 * Don't count urg data ... but do it in the right place!
                 * Consider: "old_data (ptr is here) URG PUSH data"
                 * The old code would stop at the first push because
                 * it counted the urg (amount==1) and then does amount--
                 * *after* the loop.  This means tcp_readable() always
                 * returned zero if any URG PUSH was in the queue, even
                 * though there was normal data available. If we subtract
                 * the urg data right here, we even get it to work for more
                 * than one URG PUSH skb without normal data.
                 * This means that select() finally works now with urg data
                 * in the queue.  Note that rlogin was never affected
                 * because it doesn't use select(); it uses two processes
                 * and a blocking read().  And the queue scan in tcp_read()
                 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
                 */
                if (skb->h.th->urg)
                        amount--;       /* don't count urg data */
                if (amount && skb->h.th->psh) break;    /* PSH with data pending: report what we have */
                skb = skb->next;
        }
        while(skb != (struct sk_buff *)&sk->receive_queue);

        restore_flags(flags);
        if(sk->debug)
                printk("got %lu bytes.\n",amount);
        return(amount);
}
 879 
 880 /*
 881  * LISTEN is a special case for select..
 882  */
 883 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
 884 {
 885         if (sel_type == SEL_IN) {
 886                 int retval;
 887 
 888                 sk->inuse = 1;
 889                 retval = (tcp_find_established(sk) != NULL);
 890                 release_sock(sk);
 891                 if (!retval)
 892                         select_wait(&master_select_wakeup,wait);
 893                 return retval;
 894         }
 895         return 0;
 896 }
 897 
 898 
 899 /*
 900  *      Wait for a TCP event.
 901  *
 902  *      Note that we don't need to set "sk->inuse", as the upper select layers
 903  *      take care of normal races (between the test and the event) and we don't
 904  *      go look at any of the socket buffers directly.
 905  */
 906 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
 907 {
 908         if (sk->state == TCP_LISTEN)
 909                 return tcp_listen_select(sk, sel_type, wait);
 910 
 911         switch(sel_type) {
 912         case SEL_IN:
 913                 if (sk->err)
 914                         return 1;
 915                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
 916                         break;
 917 
 918                 if (sk->shutdown & RCV_SHUTDOWN)
 919                         return 1;
 920                         
 921                 if (sk->acked_seq == sk->copied_seq)
 922                         break;
 923 
 924                 if (sk->urg_seq != sk->copied_seq ||
 925                     sk->acked_seq != sk->copied_seq+1 ||
 926                     sk->urginline || !sk->urg_data)
 927                         return 1;
 928                 break;
 929 
 930         case SEL_OUT:
 931                 if (sk->shutdown & SEND_SHUTDOWN) 
 932                         return 0;
 933                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
 934                         break;
 935                 /*
 936                  * This is now right thanks to a small fix
 937                  * by Matt Dillon.
 938                  */
 939 
 940                 if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
 941                         break;
 942                 return 1;
 943 
 944         case SEL_EX:
 945                 if (sk->err || sk->urg_data)
 946                         return 1;
 947                 break;
 948         }
 949         select_wait(sk->sleep, wait);
 950         return 0;
 951 }
 952 
 953 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
     /* [previous][next][first][last][top][bottom][index][help] */
 954 {
 955         int err;
 956         switch(cmd) 
 957         {
 958 
 959                 case TIOCINQ:
 960 #ifdef FIXME    /* FIXME: */
 961                 case FIONREAD:
 962 #endif
 963                 {
 964                         unsigned long amount;
 965 
 966                         if (sk->state == TCP_LISTEN) 
 967                                 return(-EINVAL);
 968 
 969                         sk->inuse = 1;
 970                         amount = tcp_readable(sk);
 971                         release_sock(sk);
 972                         err=verify_area(VERIFY_WRITE,(void *)arg,
 973                                                    sizeof(unsigned long));
 974                         if(err)
 975                                 return err;
 976                         put_fs_long(amount,(unsigned long *)arg);
 977                         return(0);
 978                 }
 979                 case SIOCATMARK:
 980                 {
 981                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
 982 
 983                         err = verify_area(VERIFY_WRITE,(void *) arg,
 984                                                   sizeof(unsigned long));
 985                         if (err)
 986                                 return err;
 987                         put_fs_long(answ,(int *) arg);
 988                         return(0);
 989                 }
 990                 case TIOCOUTQ:
 991                 {
 992                         unsigned long amount;
 993 
 994                         if (sk->state == TCP_LISTEN) return(-EINVAL);
 995                         amount = sk->prot->wspace(sk);
 996                         err=verify_area(VERIFY_WRITE,(void *)arg,
 997                                                    sizeof(unsigned long));
 998                         if(err)
 999                                 return err;
1000                         put_fs_long(amount,(unsigned long *)arg);
1001                         return(0);
1002                 }
1003                 default:
1004                         return(-EINVAL);
1005         }
1006 }
1007 
1008 
1009 /*
1010  *      This routine computes a TCP checksum. 
1011  */
1012  
unsigned short tcp_check(struct tcphdr *th, int len,
/* [previous][next][first][last][top][bottom][index][help] */
          unsigned long saddr, unsigned long daddr)
{     
        unsigned long sum;
   
        if (saddr == 0) saddr = ip_my_addr();

/*
 * stupid, gcc complains when I use just one __asm__ block,
 * something about too many reloads, but this is just two
 * instructions longer than what I want
 */
        /*
         *      First fold the pseudo header - source address, destination
         *      address and protocol/length word - into the running sum.
         *      NOTE(review): the multi-line string literals below are
         *      pre-ANSI GCC inline-asm syntax and this code is i386 only.
         */
        __asm__("
            addl %%ecx, %%ebx
            adcl %%edx, %%ebx
            adcl $0, %%ebx
            "
        : "=b"(sum)
        : "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
        : "bx", "cx", "dx" );
        /*
         *      Then sum the TCP header and data pointed to by 'th':
         *      32 bytes per loop iteration, then the remaining dwords,
         *      then a trailing word and/or byte, finally folding the
         *      carries down into the low 16 bits of the sum.
         */
        __asm__("
            movl %%ecx, %%edx
            cld
            cmpl $32, %%ecx
            jb 2f
            shrl $5, %%ecx
            clc
1:          lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            loop 1b
            adcl $0, %%ebx
            movl %%edx, %%ecx
2:          andl $28, %%ecx
            je 4f
            shrl $2, %%ecx
            clc
3:          lodsl
            adcl %%eax, %%ebx
            loop 3b
            adcl $0, %%ebx
4:          movl $0, %%eax
            testw $2, %%dx
            je 5f
            lodsw
            addl %%eax, %%ebx
            adcl $0, %%ebx
            movw $0, %%ax
5:          test $1, %%edx
            je 6f
            lodsb
            addl %%eax, %%ebx
            adcl $0, %%ebx
6:          movl %%ebx, %%eax
            shrl $16, %%eax
            addw %%ax, %%bx
            adcw $0, %%bx
            "
        : "=b"(sum)
        : "0"(sum), "c"(len), "S"(th)
        : "ax", "bx", "cx", "dx", "si" );

        /* We only want the bottom 16 bits, but we never cleared the top 16. */
  
        return((~sum) & 0xffff);
}
1092 
1093 
1094 
1095 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
     /* [previous][next][first][last][top][bottom][index][help] */
1096                 unsigned long daddr, int len, struct sock *sk)
1097 {
1098         th->check = 0;
1099         th->check = tcp_check(th, len, saddr, daddr);
1100         return;
1101 }
1102 
1103 /*
1104  *      This is the main buffer sending routine. We queue the buffer
1105  *      having checked it is sane seeming.
1106  */
1107  
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
/* [previous][next][first][last][top][bottom][index][help] */
{
        int size;
        struct tcphdr * th = skb->h.th;

        /*
         *      length of packet (not counting length of pre-tcp headers) 
         */
         
        size = skb->len - ((unsigned char *) th - skb->data);

        /*
         *      Sanity check it.. 
         */
         
        if (size < sizeof(struct tcphdr) || size > skb->len) 
        {
                printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
                        skb, skb->data, th, skb->len);
                kfree_skb(skb, FREE_WRITE);
                return;
        }

        /*
         *      If we have queued a header size packet.. (these crash a few
         *      tcp stacks if ack is not set)
         */
         
        if (size == sizeof(struct tcphdr)) 
        {
                /* If its got a syn or fin its notionally included in the size..*/
                if(!th->syn && !th->fin) 
                {
                        printk("tcp_send_skb: attempt to queue a bogon.\n");
                        kfree_skb(skb,FREE_WRITE);
                        return;
                }
        }

        /*
         *      Actual processing.
         */
         
        tcp_statistics.TcpOutSegs++;  
        /* Sequence number of the byte after this frame: start plus
         * payload length (doff is the header length in 32-bit words). */
        skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
        
        /*
         *      We must queue if
         *
         *      a) The right edge of this frame exceeds the window
         *      b) We are retransmitting (Nagle's rule)
         *      c) We have too many packets 'in flight'
         */
         
        if (after(skb->h.seq, sk->window_seq) ||
            (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
             sk->packets_out >= sk->cong_window) 
        {
                /* checksum will be supplied by tcp_write_xmit.  So
                 * we shouldn't need to set it at all.  I'm being paranoid */
                th->check = 0;
                if (skb->next != NULL) 
                {
                        printk("tcp_send_partial: next != NULL\n");
                        skb_unlink(skb);
                }
                skb_queue_tail(&sk->write_queue, skb);
                
                /*
                 *      If we don't fit we have to start the zero window
                 *      probes. This is broken - we really need to do a partial
                 *      send _first_ (This is what causes the Cisco and PC/TCP
                 *      grief).
                 */
                 
                if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
                    sk->send_head == NULL && sk->ack_backlog == 0)
                        reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
        } 
        else 
        {
                /*
                 *      This is going straight out: stamp in the latest ack
                 *      and window, checksum and hand to the IP layer.
                 */
                 
                th->ack_seq = ntohl(sk->acked_seq);
                th->window = ntohs(tcp_select_window(sk));

                tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

                sk->sent_seq = sk->write_seq;
                
                /*
                 *      This is mad. The tcp retransmit queue is put together
                 *      by the ip layer. This causes half the problems with
                 *      unroutable FIN's and other things.
                 */
                 
                sk->prot->queue_xmit(sk, skb->dev, skb, 0);
                
                /*
                 *      Set for next retransmit based on expected ACK time.
                 *      FIXME: We set this every time which means our 
                 *      retransmits are really about a window behind.
                 */

                reset_xmit_timer(sk, TIME_WRITE, sk->rto);
        }
}
1217 
1218 /*
1219  *      Locking problems lead us to a messy situation where we can have
1220  *      multiple partially complete buffers queued up. This is really bad
1221  *      as we don't want to be sending partial buffers. Fix this with
1222  *      a semaphore or similar to lock tcp_write per socket.
1223  *
1224  *      These routines are pretty self descriptive.
1225  */
1226  
1227 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1228 {
1229         struct sk_buff * skb;
1230         unsigned long flags;
1231 
1232         save_flags(flags);
1233         cli();
1234         skb = sk->partial;
1235         if (skb) {
1236                 sk->partial = NULL;
1237                 del_timer(&sk->partial_timer);
1238         }
1239         restore_flags(flags);
1240         return skb;
1241 }
1242 
1243 /*
1244  *      Empty the partial queue
1245  */
1246  
1247 static void tcp_send_partial(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1248 {
1249         struct sk_buff *skb;
1250 
1251         if (sk == NULL)
1252                 return;
1253         while ((skb = tcp_dequeue_partial(sk)) != NULL)
1254                 tcp_send_skb(sk, skb);
1255 }
1256 
1257 /*
1258  *      Queue a partial frame
1259  */
1260  
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
/* [previous][next][first][last][top][bottom][index][help] */
{
        struct sk_buff * tmp;
        unsigned long flags;

        /* Swap the new frame in atomically; at most one partial frame
         * may be queued per socket. */
        save_flags(flags);
        cli();
        tmp = sk->partial;      /* previously queued partial frame, if any */
        if (tmp)
                del_timer(&sk->partial_timer);
        sk->partial = skb;
        init_timer(&sk->partial_timer);
        /*
         *      Wait up to 1 second for the buffer to fill.
         */
        /* NOTE(review): a bare HZ here assumes add_timer() treats
         * 'expires' as a relative offset in this kernel — confirm. */
        sk->partial_timer.expires = HZ;
        sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
        sk->partial_timer.data = (unsigned long) sk;
        add_timer(&sk->partial_timer);
        restore_flags(flags);
        /* If we displaced an older partial frame, send it now, outside
         * the interrupts-off region. */
        if (tmp)
                tcp_send_skb(sk, tmp);
}
1284 
1285 
1286 /*
1287  *      This routine sends an ack and also updates the window. 
1288  */
1289  
static void tcp_send_ack(unsigned long sequence, unsigned long ack,
/* [previous][next][first][last][top][bottom][index][help] */
             struct sock *sk,
             struct tcphdr *th, unsigned long daddr)
{
        struct sk_buff *buff;
        struct tcphdr *t1;
        struct device *dev = NULL;
        int tmp;

        if(sk->zapped)
                return;         /* We have been reset, we may not send again */
                
        /*
         * We need to grab some memory, and put together an ack,
         * and then put it into the queue to be sent.
         */

        buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
        if (buff == NULL) 
        {
                /* 
                 *      Force it to send an ack. We don't have to do this
                 *      (ACK is unreliable) but its much better use of 
                 *      bandwidth on slow links to send a spare ack than
                 *      resend packets. 
                 */
                 
                sk->ack_backlog++;
                if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) 
                {
                        /* Retry via the retransmit timer in one second. */
                        reset_xmit_timer(sk, TIME_WRITE, HZ);
                }
                return;
        }

        /*
         *      Assemble a suitable TCP frame
         */
         
        buff->len = sizeof(struct tcphdr);
        buff->sk = sk;
        buff->localroute = sk->localroute;
        t1 =(struct tcphdr *) buff->data;

        /* 
         *      Put in the IP header and routing stuff. 
         */
         
        tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                /* Could not build the lower layer headers: give the buffer back. */
                buff->free = 1;
                sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
                return;
        }
        buff->len += tmp;
        /* TCP header starts right after the bytes build_header consumed. */
        t1 =(struct tcphdr *)((char *)t1 +tmp);

        memcpy(t1, th, sizeof(*t1));

        /*
         *      Swap the send and the receive. 
         */
         
        t1->dest = th->source;
        t1->source = th->dest;
        /* NOTE(review): ntohl() on a host-order value — on i386 this is
         * the same 32-bit byte swap as htonl(), which is what is meant. */
        t1->seq = ntohl(sequence);
        t1->ack = 1;
        sk->window = tcp_select_window(sk);
        t1->window = ntohs(sk->window);
        t1->res1 = 0;
        t1->res2 = 0;
        t1->rst = 0;
        t1->urg = 0;
        t1->syn = 0;
        t1->psh = 0;
        t1->fin = 0;
        
        /*
         *      If we have nothing queued for transmit and the transmit timer
         *      is on we are just doing an ACK timeout and need to switch
         *      to a keepalive.
         */
         
        if (ack == sk->acked_seq) 
        {
                sk->ack_backlog = 0;
                sk->bytes_rcv = 0;
                sk->ack_timed = 0;
                if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
                                  && sk->ip_xmit_timeout == TIME_WRITE) 
                {
                        if(sk->keepopen) {
                                reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
                        } else {
                                delete_timer(sk);
                        }
                }
        }
        
        /*
         *      Fill in the packet and send it
         */
         
        t1->ack_seq = ntohl(ack);
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
        if (sk->debug)
                 printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
        tcp_statistics.TcpOutSegs++;
        sk->prot->queue_xmit(sk, dev, buff, 1);
}
1403 
1404 
1405 /* 
1406  *      This routine builds a generic TCP header. 
1407  */
1408  
1409 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
     /* [previous][next][first][last][top][bottom][index][help] */
1410 {
1411 
1412         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1413         th->seq = htonl(sk->write_seq);
1414         th->psh =(push == 0) ? 1 : 0;
1415         th->doff = sizeof(*th)/4;
1416         th->ack = 1;
1417         th->fin = 0;
1418         sk->ack_backlog = 0;
1419         sk->bytes_rcv = 0;
1420         sk->ack_timed = 0;
1421         th->ack_seq = htonl(sk->acked_seq);
1422         sk->window = tcp_select_window(sk);
1423         th->window = htons(sk->window);
1424 
1425         return(sizeof(*th));
1426 }
1427 
1428 /*
1429  *      This routine copies from a user buffer into a socket,
1430  *      and starts the transmit system.
1431  */
1432 
static int tcp_write(struct sock *sk, unsigned char *from,
          int len, int nonblock, unsigned flags)
{
        int copied = 0;                 /* Bytes copied from user space so far */
        int copy;                       /* Bytes to take on this pass */
        int tmp;
        struct sk_buff *skb;
        struct sk_buff *send_tmp;       /* Newly built, possibly partial, packet */
        unsigned char *buff;
        struct proto *prot;
        struct device *dev = NULL;

        sk->inuse=1;
        prot = sk->prot;
        while(len > 0) 
        {
                if (sk->err) 
                {                       /* Stop on an error: report the partial
                                           count if we copied anything, else
                                           the error itself */
                        release_sock(sk);
                        if (copied) 
                                return(copied);
                        tmp = -sk->err;
                        sk->err = 0;
                        return(tmp);
                }

                /*
                 *      First thing we do is make sure that we are established. 
                 */
        
                if (sk->shutdown & SEND_SHUTDOWN) 
                {
                        release_sock(sk);
                        sk->err = EPIPE;
                        if (copied) 
                                return(copied);
                        sk->err = 0;
                        return(-EPIPE);
                }

                /* 
                 *      Wait for a connection to finish.
                 */
        
                while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
                {
                        if (sk->err) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);
                                tmp = -sk->err;
                                sk->err = 0;
                                return(tmp);
                        }

                        /*
                         *      Anything other than SYN_SENT/SYN_RECV here means
                         *      the connection is going (or has gone) away.
                         */
                        if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);

                                if (sk->err) 
                                {
                                        tmp = -sk->err;
                                        sk->err = 0;
                                        return(tmp);
                                }

                                if (sk->keepopen) 
                                {
                                        send_sig(SIGPIPE, current, 0);
                                }
                                return(-EPIPE);
                        }

                        if (nonblock || copied) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);
                                return(-EAGAIN);
                        }

                        release_sock(sk);
                        cli();
                
                        /*
                         *      Re-test under cli() so that a state change
                         *      between the test and the sleep cannot be lost.
                         */
                        if (sk->state != TCP_ESTABLISHED &&
                                sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
                        {
                                interruptible_sleep_on(sk->sleep);
                                if (current->signal & ~current->blocked) 
                                {
                                        sti();
                                        if (copied) 
                                                return(copied);
                                        return(-ERESTARTSYS);
                                }
                        }
                        sk->inuse = 1;
                        sti();
                }

        /*
         * The following code can result in copy <= 0 if sk->mss is ever
         * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
         * sk->mtu is constant once SYN processing is finished.  I.e. we
         * had better not get here until we've seen his SYN and at least one
         * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
         * But ESTABLISHED should guarantee that.  sk->max_window is by definition
         * non-decreasing.  Note that any ioctl to set user_mss must be done
         * before the exchange of SYN's.  If the initial ack from the other
         * end has a window of 0, max_window and thus mss will both be 0.
         */

        /* 
         *      Now we need to check if we have a half built packet. 
         */

                if ((skb = tcp_dequeue_partial(sk)) != NULL) 
                {
                        int hdrlen;

                         /* IP header + TCP header */
                        hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
                                 + sizeof(struct tcphdr);
        
                        /* Add more stuff to the end of skb->len */
                        if (!(flags & MSG_OOB)) 
                        {
                                copy = min(sk->mss - (skb->len - hdrlen), len);
                                /* FIXME: this is really a bug. */
                                if (copy <= 0) 
                                {
                                        printk("TCP: **bug**: \"copy\" <= 0!!\n");
                                        copy = 0;
                                }
          
                                memcpy_fromfs(skb->data + skb->len, from, copy);
                                skb->len += copy;
                                from += copy;
                                copied += copy;
                                len -= copy;
                                sk->write_seq += copy;
                        }
                        /*
                         *      Send it now if the segment is full, urgent, or
                         *      nothing is in flight; otherwise re-queue it as
                         *      a partial packet to be filled further.
                         */
                        if ((skb->len - hdrlen) >= sk->mss ||
                                (flags & MSG_OOB) || !sk->packets_out)
                                tcp_send_skb(sk, skb);
                        else
                                tcp_enqueue_partial(skb, sk);
                        continue;
                }

        /*
         * We also need to worry about the window.
         * If window < 1/2 the maximum window we've seen from this
         *   host, don't use it.  This is sender side
         *   silly window prevention, as specified in RFC1122.
         *   (Note that this is different than earlier versions of
         *   SWS prevention, e.g. RFC813.).  What we actually do is 
         *   use the whole MSS.  Since this results in the right
         *   edge of the packet being outside the window, it will
         *   be queued for later rather than sent.
         */

                copy = sk->window_seq - sk->write_seq;
                if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
                        copy = sk->mss;
                if (copy > len)
                        copy = len;

        /*
         *      We should really check the window here also. 
         */
         
                send_tmp = NULL;
                if (copy < sk->mss && !(flags & MSG_OOB)) 
                {
                        /*
                         *      We will release the socket in case we sleep here. 
                         */
                        release_sock(sk);
                        /*
                         *      NB: following must be mtu, because mss can be increased.
                         *      mss is always <= mtu 
                         */
                        skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
                        sk->inuse = 1;
                        send_tmp = skb;
                } 
                else 
                {
                        /*
                         *      We will release the socket in case we sleep here. 
                         */
                        release_sock(sk);
                        skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
                        sk->inuse = 1;
                }

                /*
                 *      If we didn't get any memory, we need to sleep. 
                 */

                if (skb == NULL) 
                {
                        sk->socket->flags |= SO_NOSPACE;
                        if (nonblock) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);
                                return(-EAGAIN);
                        }

                        /*
                         *      FIXME: here is another race condition. 
                         */

                        tmp = sk->wmem_alloc;
                        release_sock(sk);
                        cli();
                        /*
                         *      Again we will try to avoid it: only sleep if no
                         *      write memory has been freed since we sampled
                         *      wmem_alloc above and the connection is still up.
                         */
                        if (tmp <= sk->wmem_alloc &&
                                  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
                                && sk->err == 0) 
                        {
                                sk->socket->flags &= ~SO_NOSPACE;
                                interruptible_sleep_on(sk->sleep);
                                if (current->signal & ~current->blocked) 
                                {
                                        sti();
                                        if (copied) 
                                                return(copied);
                                        return(-ERESTARTSYS);
                                }
                        }
                        sk->inuse = 1;
                        sti();
                        continue;
                }

                skb->len = 0;
                skb->sk = sk;
                skb->free = 0;
                skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
        
                buff = skb->data;
        
                /*
                 * FIXME: we need to optimize this.
                 * Perhaps some hints here would be good.
                 */
                
                tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
                                 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
                if (tmp < 0 ) 
                {
                        prot->wfree(sk, skb->mem_addr, skb->mem_len);
                        release_sock(sk);
                        if (copied) 
                                return(copied);
                        return(tmp);
                }
                skb->len += tmp;
                skb->dev = dev;
                buff += tmp;
                skb->h.th =(struct tcphdr *) buff;
                tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
                if (tmp < 0) 
                {
                        prot->wfree(sk, skb->mem_addr, skb->mem_len);
                        release_sock(sk);
                        if (copied) 
                                return(copied);
                        return(tmp);
                }

                /* Urgent data: the urgent pointer covers this copy. */
                if (flags & MSG_OOB) 
                {
                        ((struct tcphdr *)buff)->urg = 1;
                        ((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
                }
                skb->len += tmp;
                memcpy_fromfs(buff+tmp, from, copy);

                from += copy;
                copied += copy;
                len -= copy;
                skb->len += copy;
                skb->free = 0;
                sk->write_seq += copy;
        
                /*
                 *      If this segment came out smaller than an MSS and data
                 *      is still unacknowledged, hold it back as a partial
                 *      packet for further filling (Nagle behaviour).
                 */
                if (send_tmp != NULL && sk->packets_out) 
                {
                        tcp_enqueue_partial(send_tmp, sk);
                        continue;
                }
                tcp_send_skb(sk, skb);
        }
        sk->err = 0;

/*
 *      Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *      interactive fast network servers. It's meant to be on and
 *      it really improves the throughput though not the echo time
 *      on my slow slip link - Alan
 */

/*
 *      Avoid possible race on send_tmp - c/o Johannes Stille 
 */
 
        if(sk->partial && ((!sk->packets_out) 
     /* If not nagling we can send on the before case too.. */
              || (sk->nonagle && before(sk->write_seq , sk->window_seq))
        ))
                tcp_send_partial(sk);

        release_sock(sk);
        return(copied);
}
1757 
1758 /*
1759  *      This is just a wrapper. 
1760  */
1761 
1762 static int tcp_sendto(struct sock *sk, unsigned char *from,
     /* [previous][next][first][last][top][bottom][index][help] */
1763            int len, int nonblock, unsigned flags,
1764            struct sockaddr_in *addr, int addr_len)
1765 {
1766         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1767                 return -EINVAL;
1768         if (sk->state == TCP_CLOSE)
1769                 return -ENOTCONN;
1770         if (addr_len < sizeof(*addr))
1771                 return -EINVAL;
1772         if (addr->sin_family && addr->sin_family != AF_INET) 
1773                 return -EINVAL;
1774         if (addr->sin_port != sk->dummy_th.dest) 
1775                 return -EISCONN;
1776         if (addr->sin_addr.s_addr != sk->daddr) 
1777                 return -EISCONN;
1778         return tcp_write(sk, from, len, nonblock, flags);
1779 }
1780 
1781 
1782 /*
1783  *      Send an ack if one is backlogged at this point. Ought to merge
1784  *      this with tcp_send_ack().
1785  */
1786  
1787 static void tcp_read_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1788 {
1789         int tmp;
1790         struct device *dev = NULL;
1791         struct tcphdr *t1;
1792         struct sk_buff *buff;
1793 
1794         if (!sk->ack_backlog) 
1795                 return;
1796 
1797         /*
1798          * FIXME: we need to put code here to prevent this routine from
1799          * being called.  Being called once in a while is ok, so only check
1800          * if this is the second time in a row.
1801          */
1802 
1803         /*
1804          * We need to grab some memory, and put together an ack,
1805          * and then put it into the queue to be sent.
1806          */
1807 
1808         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
1809         if (buff == NULL) 
1810         {
1811                 /* Try again real soon. */
1812                 reset_xmit_timer(sk, TIME_WRITE, HZ);
1813                 return;
1814         }
1815 
1816         buff->len = sizeof(struct tcphdr);
1817         buff->sk = sk;
1818         buff->localroute = sk->localroute;
1819         
1820         /*
1821          *      Put in the IP header and routing stuff. 
1822          */
1823 
1824         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1825                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1826         if (tmp < 0) 
1827         {
1828                 buff->free = 1;
1829                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1830                 return;
1831         }
1832 
1833         buff->len += tmp;
1834         t1 =(struct tcphdr *)(buff->data +tmp);
1835 
1836         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
1837         t1->seq = htonl(sk->sent_seq);
1838         t1->ack = 1;
1839         t1->res1 = 0;
1840         t1->res2 = 0;
1841         t1->rst = 0;
1842         t1->urg = 0;
1843         t1->syn = 0;
1844         t1->psh = 0;
1845         sk->ack_backlog = 0;
1846         sk->bytes_rcv = 0;
1847         sk->window = tcp_select_window(sk);
1848         t1->window = ntohs(sk->window);
1849         t1->ack_seq = ntohl(sk->acked_seq);
1850         t1->doff = sizeof(*t1)/4;
1851         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1852         sk->prot->queue_xmit(sk, dev, buff, 1);
1853         tcp_statistics.TcpOutSegs++;
1854 }
1855 
1856 
1857 /*
1858  *      FIXME:
1859  *      This routine frees used buffers.
1860  *      It should consider sending an ACK to let the
1861  *      other end know we now have a bigger window.
1862  */
1863 
static void cleanup_rbuf(struct sock *sk)
{
        unsigned long flags;
        unsigned long left;             /* Receive space before freeing */
        struct sk_buff *skb;
        unsigned long rspace;           /* Receive space after freeing */

        if(sk->debug)
                printk("cleaning rbuf for sk=%p\n", sk);
  
        save_flags(flags);
        cli();
  
        left = sk->prot->rspace(sk);
 
        /*
         *      We have to loop through all the buffer headers,
         *      and try to free up all the space we can.  Stop at the first
         *      buffer that is still unread or held by a reader (skb->users).
         */

        while((skb=skb_peek(&sk->receive_queue)) != NULL) 
        {
                if (!skb->used || skb->users) 
                        break;
                skb_unlink(skb);
                skb->sk = sk;
                kfree_skb(skb, FREE_READ);
        }

        restore_flags(flags);

        /*
         *      FIXME:
         *      At this point we should send an ack if the difference
         *      in the window, and the amount of space is bigger than
         *      TCP_WINDOW_DIFF.
         */

        if(sk->debug)
                printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
                                            left);
        if ((rspace=sk->prot->rspace(sk)) != left) 
        {
                /*
                 * This area has caused the most trouble.  The current strategy
                 * is to simply do nothing if the other end has room to send at
                 * least 3 full packets, because the ack from those will auto-
                 * matically update the window.  If the other end doesn't think
                 * we have much space left, but we have room for at least 1 more
                 * complete packet than it thinks we do, we will send an ack
                 * immediately.  Otherwise we will wait up to .5 seconds in case
                 * the user reads some more.
                 */
                sk->ack_backlog++;
        /*
         * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
         * if the other end is offering a window smaller than the agreed on MSS
         * (called sk->mtu here).  In theory there's no connection between send
         * and receive, and so no reason to think that they're going to send
         * small packets.  For the moment I'm using the hack of reducing the mss
         * only on the send side, so I'm putting mtu here.
         */

                if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
                {
                        /* Send an ack right now. */
                        tcp_read_wakeup(sk);
                } 
                else 
                {
                        /* Force it to send an ack soon.  Re-arm the retransmit
                           timer only if doing so would not delay a timeout
                           that is already due sooner than TCP_ACK_TIME. */
                        int was_active = del_timer(&sk->retransmit_timer);
                        if (!was_active || TCP_ACK_TIME < sk->timer.expires) 
                        {
                                reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
                        } 
                        else
                                add_timer(&sk->retransmit_timer);
                }
        }
} 
1945 
1946 
1947 /*
1948  *      Handle reading urgent data. BSD has very simple semantics for
1949  *      this, no blocking and very strange errors 8)
1950  */
1951  
static int tcp_read_urg(struct sock * sk, int nonblock,
             unsigned char *to, int len, unsigned flags)
{
        /*
         *      No URG data to read: either none is pending, it was already
         *      read, or it is being delivered inline in the normal stream
         *      (sk->urginline), in which case MSG_OOB reads are invalid.
         */
        if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
                return -EINVAL; /* Yes this is right ! */
                
        /* A pending socket error takes precedence. */
        if (sk->err) 
        {
                int tmp = -sk->err;
                sk->err = 0;
                return tmp;
        }

        /* First read on a closed connection returns 0 (EOF); later reads
           return -ENOTCONN. */
        if (sk->state == TCP_CLOSE || sk->done) 
        {
                if (!sk->done) {
                        sk->done = 1;
                        return 0;
                }
                return -ENOTCONN;
        }

        if (sk->shutdown & RCV_SHUTDOWN) 
        {
                sk->done = 1;
                return 0;
        }
        sk->inuse = 1;
        /* The single urgent byte lives in the low bits of urg_data,
           flagged by URG_VALID. */
        if (sk->urg_data & URG_VALID) 
        {
                char c = sk->urg_data;
                if (!(flags & MSG_PEEK))
                        sk->urg_data = URG_READ;        /* consume it */
                put_fs_byte(c, to);
                release_sock(sk);
                return 1;
        }
        release_sock(sk);
        
        /*
         * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
         * the available implementations agree in this case:
         * this call should never block, independent of the
         * blocking state of the socket.
         * Mike <pall@rz.uni-karlsruhe.de>
         */
        return -EAGAIN;
}
2003 
2004 
2005 /*
2006  *      This routine copies from a sock struct into the user buffer. 
2007  */
2008  
static int tcp_read(struct sock *sk, unsigned char *to,
        int len, int nonblock, unsigned flags)
{
        struct wait_queue wait = { current, NULL };
        int copied = 0;
        unsigned long peek_seq;         /* Private cursor for MSG_PEEK */
        volatile unsigned long *seq;    /* So gcc doesn't overoptimise */
        unsigned long used;

        /* 
         *      This error should be checked. 
         */
         
        if (sk->state == TCP_LISTEN)
                return -ENOTCONN;

        /*
         *      Urgent data needs to be handled specially. 
         */
         
        if (flags & MSG_OOB)
                return tcp_read_urg(sk, nonblock, to, len, flags);

        /*
         *      Copying sequence to update. This is volatile to handle
         *      the multi-reader case neatly (memcpy_to/fromfs might be 
         *      inline and thus not flush cached variables otherwise).
         *      A PEEK advances a local copy so copied_seq is untouched.
         */
         
        peek_seq = sk->copied_seq;
        seq = &sk->copied_seq;
        if (flags & MSG_PEEK)
                seq = &peek_seq;

        add_wait_queue(sk->sleep, &wait);
        sk->inuse = 1;
        while (len > 0) 
        {
                struct sk_buff * skb;
                unsigned long offset;
        
                /*
                 * Are we at urgent data? Stop if we have read anything.
                 */
                 
                if (copied && sk->urg_data && sk->urg_seq == *seq)
                        break;

                /*
                 *      Next get a buffer.  Walk the receive queue for the
                 *      skb containing sequence *seq.
                 */
                 
                current->state = TASK_INTERRUPTIBLE;

                skb = skb_peek(&sk->receive_queue);
                do 
                {
                        if (!skb)
                                break;
                        /* A hole before this skb: nothing readable yet. */
                        if (before(*seq, skb->h.th->seq))
                                break;
                        offset = *seq - skb->h.th->seq;
                        /* A SYN occupies one sequence number but no data. */
                        if (skb->h.th->syn)
                                offset--;
                        if (offset < skb->len)
                                goto found_ok_skb;
                        if (skb->h.th->fin)
                                goto found_fin_ok;
                        if (!(flags & MSG_PEEK))
                                skb->used = 1;
                        skb = skb->next;
                }
                while (skb != (struct sk_buff *)&sk->receive_queue);

                /* No more data right now: return what we have, or work out
                   why there is nothing and whether to sleep. */
                if (copied)
                        break;

                if (sk->err) 
                {
                        copied = -sk->err;
                        sk->err = 0;
                        break;
                }

                if (sk->state == TCP_CLOSE) 
                {
                        if (!sk->done) 
                        {
                                sk->done = 1;
                                break;
                        }
                        copied = -ENOTCONN;
                        break;
                }

                if (sk->shutdown & RCV_SHUTDOWN) 
                {
                        sk->done = 1;
                        break;
                }
                        
                if (nonblock) 
                {
                        copied = -EAGAIN;
                        break;
                }

                /* Free read buffers (may trigger an ACK) before sleeping
                   for more data. */
                cleanup_rbuf(sk);
                release_sock(sk);
                sk->socket->flags |= SO_WAITDATA;
                schedule();
                sk->socket->flags &= ~SO_WAITDATA;
                sk->inuse = 1;

                if (current->signal & ~current->blocked) 
                {
                        copied = -ERESTARTSYS;
                        break;
                }
                continue;

        found_ok_skb:
                /*
                 *      Lock the buffer. We can be fairly relaxed as
                 *      an interrupt will never steal a buffer we are 
                 *      using unless I've missed something serious in
                 *      tcp_data.
                 */
                
                skb->users++;
                
                /*
                 *      Ok so how much can we use ? 
                 */
                 
                used = skb->len - offset;
                if (len < used)
                        used = len;
                /*
                 *      Do we have urgent data here?  If so, stop short of
                 *      it, or step over the urgent byte when not inline.
                 */
                
                if (sk->urg_data) 
                {
                        unsigned long urg_offset = sk->urg_seq - *seq;
                        if (urg_offset < used) 
                        {
                                if (!urg_offset) 
                                {
                                        if (!sk->urginline) 
                                        {
                                                ++*seq;
                                                offset++;
                                                used--;
                                        }
                                }
                                else
                                        used = urg_offset;
                        }
                }
                
                /*
                 *      Copy it - We _MUST_ update *seq first so that we
                 *      don't ever double read when we have dual readers
                 */
                 
                *seq += used;

                /*
                 *      This memcpy_tofs can sleep. If it sleeps and we
                 *      do a second read it relies on the skb->users to avoid
                 *      a crash when cleanup_rbuf() gets called.
                 */
                 
                memcpy_tofs(to,((unsigned char *)skb->h.th) +
                        skb->h.th->doff*4 + offset, used);
                copied += used;
                len -= used;
                to += used;
                
                /*
                 *      We now will not sleep again until we are finished
                 *      with skb. Sorry if you are doing the SMP port
                 *      but you'll just have to fix it neatly ;)
                 */
                 
                skb->users --;
                
                if (after(sk->copied_seq,sk->urg_seq))
                        sk->urg_data = 0;
                /* More unread data left in this skb: go round again. */
                if (used + offset < skb->len)
                        continue;
                
                /*
                 *      Process the FIN.
                 */

                if (skb->h.th->fin)
                        goto found_fin_ok;
                if (flags & MSG_PEEK)
                        continue;
                skb->used = 1;          /* Fully consumed: cleanup_rbuf may free it */
                continue;

        found_fin_ok:
                ++*seq;                 /* FIN consumes one sequence number */
                if (flags & MSG_PEEK)
                        break;
                        
                /*
                 *      All is done
                 */
                 
                skb->used = 1;
                sk->shutdown |= RCV_SHUTDOWN;
                break;

        }
        remove_wait_queue(sk->sleep, &wait);
        current->state = TASK_RUNNING;

        /* Clean up data we have read: This will do ACK frames */
        cleanup_rbuf(sk);
        release_sock(sk);
        return copied;
}
2235 
2236 /*
2237  *      State processing on a close. This implements the state shift for
2238  *      sending our FIN frame. Note that we only send a FIN for some 
2239  *      states. A shutdown() may have already sent the FIN, or we may be
2240  *      closed.
2241  */
2242  
/*
 *      Work out the next TCP state for a close/shutdown and return nonzero
 *      when the caller must transmit a FIN for this transition.
 */
static int tcp_close_state(struct sock *sk, int dead)
/* [previous][next][first][last][top][bottom][index][help] */
{
        int ns=TCP_CLOSE;       /* next state; default is to drop to CLOSE */
        int send_fin=0;         /* set when this transition emits a FIN */
        switch(sk->state)
        {
                case TCP_SYN_SENT:      /* No SYN back, no FIN needed */
                        break;
                case TCP_SYN_RECV:
                case TCP_ESTABLISHED:   /* Closedown begin */
                        ns=TCP_FIN_WAIT1;
                        send_fin=1;
                        break;
                case TCP_FIN_WAIT1:     /* Already closing, or FIN sent: no change */
                case TCP_FIN_WAIT2:
                case TCP_CLOSING:
                        ns=sk->state;
                        break;
                case TCP_CLOSE:
                case TCP_LISTEN:
                        break;
                case TCP_CLOSE_WAIT:    /* They have FIN'd us. We send our FIN and
                                           wait only for the ACK */
                        ns=TCP_LAST_ACK;
                        send_fin=1;
                /* NOTE(review): TCP_LAST_ACK and TCP_TIME_WAIT are not listed,
                   so they take the default ns=TCP_CLOSE - confirm intended */
        }
        
        tcp_set_state(sk,ns);
                
        /*
         *      This is a (useful) BSD violation of the RFC. There is a
         *      problem with TCP as specified in that the other end could
         *      keep a socket open forever with no application left this end.
         *      We use a 3 minute timeout (about the same as BSD) then kill
         *      our end. If they send after that then tough - BUT: long enough
         *      that we won't make the old 4*rto = almost no time - whoops
         *      reset mistake.
         */
        if(dead && ns==TCP_FIN_WAIT2)
        {
                /* If the socket timer is already pending leave it alone,
                   otherwise arm the FIN_WAIT2 kill timer. */
                int timer_active=del_timer(&sk->timer);
                if(timer_active)
                        add_timer(&sk->timer);
                else
                        reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
        }
        
        return send_fin;
}
2292 
2293 /*
2294  *      Send a fin.
2295  */
2296 
/*
 *      Build a FIN segment and either transmit it at once or append it to
 *      the write queue behind any pending data.
 */
static void tcp_send_fin(struct sock *sk)
/* [previous][next][first][last][top][bottom][index][help] */
{
        struct proto *prot =(struct proto *)sk->prot;
        struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
        struct tcphdr *t1;
        struct sk_buff *buff;
        struct device *dev=NULL;
        int tmp;
                
        release_sock(sk); /* in case the malloc sleeps. */
        
        /* GFP_KERNEL allocation may sleep; the socket is re-locked below */
        buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
        sk->inuse = 1;

        if (buff == NULL)
        {
                /* This is a disaster if it occurs */
                printk("tcp_send_fin: Impossible malloc failure");
                return;
        }

        /*
         *      Administrivia
         */
         
        buff->sk = sk;
        buff->len = sizeof(*t1);
        buff->localroute = sk->localroute;
        t1 =(struct tcphdr *) buff->data;

        /*
         *      Put in the IP header and routing stuff. 
         */

        tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
                           IPPROTO_TCP, sk->opt,
                           sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                int t;
                /*
                 *      Finish anyway, treat this as a send that got lost. 
                 *      (Not good).
                 */
                 
                buff->free = 1;
                prot->wfree(sk,buff->mem_addr, buff->mem_len);
                /* Consume the sequence number for the FIN we failed to send
                   so the state machine still advances. */
                sk->write_seq++;
                t=del_timer(&sk->timer);
                if(t)
                        add_timer(&sk->timer);
                else
                        reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                return;
        }
        
        /*
         *      We ought to check if the end of the queue is a buffer and
         *      if so simply add the fin to that buffer, not send it ahead.
         */

        t1 =(struct tcphdr *)((char *)t1 +tmp);
        buff->len += tmp;
        buff->dev = dev;
        /* Start from the template header, then fill in FIN specifics */
        memcpy(t1, th, sizeof(*t1));
        t1->seq = ntohl(sk->write_seq);
        sk->write_seq++;        /* the FIN occupies one sequence number */
        buff->h.seq = sk->write_seq;
        t1->ack = 1;
        t1->ack_seq = ntohl(sk->acked_seq);
        t1->window = ntohs(sk->window=tcp_select_window(sk));
        t1->fin = 1;
        t1->rst = 0;
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

        /*
         * If there is data in the write queue, the fin must be appended to
         * the write queue.
         */
        
        if (skb_peek(&sk->write_queue) != NULL) 
        {
                buff->free = 0;
                if (buff->next != NULL) 
                {
                        printk("tcp_send_fin: next != NULL\n");
                        skb_unlink(buff);
                }
                skb_queue_tail(&sk->write_queue, buff);
        } 
        else 
        {
                /* Queue is empty: transmit now and start the retransmit timer */
                sk->sent_seq = sk->write_seq;
                sk->prot->queue_xmit(sk, dev, buff, 0);
                reset_xmit_timer(sk, TIME_WRITE, sk->rto);
        }
}
2395 
2396 /*
2397  *      Shutdown the sending side of a connection. Much like close except
2398  *      that we don't receive shut down or set sk->dead=1.
2399  */
2400 
2401 void tcp_shutdown(struct sock *sk, int how)
     /* [previous][next][first][last][top][bottom][index][help] */
2402 {
2403         /*
2404          *      We need to grab some memory, and put together a FIN,
2405          *      and then put it into the queue to be sent.
2406          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2407          */
2408 
2409         if (!(how & SEND_SHUTDOWN)) 
2410                 return;
2411          
2412         /*
2413          *      If we've already sent a FIN, or its a closed state
2414          */
2415          
2416         if (sk->state == TCP_FIN_WAIT1 ||
2417             sk->state == TCP_FIN_WAIT2 ||
2418             sk->state == TCP_CLOSING ||
2419             sk->state == TCP_LAST_ACK ||
2420             sk->state == TCP_TIME_WAIT || 
2421             sk->state == TCP_CLOSE ||
2422             sk->state == TCP_LISTEN
2423           )
2424         {
2425                 return;
2426         }
2427         sk->inuse = 1;
2428 
2429         /*
2430          * flag that the sender has shutdown
2431          */
2432 
2433         sk->shutdown |= SEND_SHUTDOWN;
2434 
2435         /*
2436          *  Clear out any half completed packets. 
2437          */
2438 
2439         if (sk->partial)
2440                 tcp_send_partial(sk);
2441                 
2442         /*
2443          *      FIN if needed
2444          */
2445          
2446         if(tcp_close_state(sk,0))
2447                 tcp_send_fin(sk);
2448                 
2449         release_sock(sk);
2450 }
2451 
2452 
2453 static int
2454 tcp_recvfrom(struct sock *sk, unsigned char *to,
     /* [previous][next][first][last][top][bottom][index][help] */
2455              int to_len, int nonblock, unsigned flags,
2456              struct sockaddr_in *addr, int *addr_len)
2457 {
2458         int result;
2459   
2460         /* 
2461          *      Have to check these first unlike the old code. If 
2462          *      we check them after we lose data on an error
2463          *      which is wrong 
2464          */
2465 
2466         if(addr_len)
2467                 *addr_len = sizeof(*addr);
2468         result=tcp_read(sk, to, to_len, nonblock, flags);
2469 
2470         if (result < 0) 
2471                 return(result);
2472   
2473         if(addr)
2474         {
2475                 addr->sin_family = AF_INET;
2476                 addr->sin_port = sk->dummy_th.dest;
2477                 addr->sin_addr.s_addr = sk->daddr;
2478         }
2479         return(result);
2480 }
2481 
2482 
2483 /*
2484  *      This routine will send an RST to the other tcp. 
2485  */
2486  
static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
/* [previous][next][first][last][top][bottom][index][help] */
          struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
        struct sk_buff *buff;
        struct tcphdr *t1;
        int tmp;
        struct device *ndev=NULL;

        /*
         *      Cannot reset a reset (Think about it).
         */
         
        if(th->rst)
                return;
  
        /*
         * We need to grab some memory, and put together an RST,
         * and then put it into the queue to be sent.
         */

        /* No socket to charge: this is an unowned control packet */
        buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
        if (buff == NULL) 
                return;

        buff->len = sizeof(*t1);
        buff->sk = NULL;
        buff->dev = dev;
        buff->localroute = 0;

        t1 =(struct tcphdr *) buff->data;

        /*
         *      Put in the IP header and routing stuff. 
         */

        tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
                           sizeof(struct tcphdr),tos,ttl);
        if (tmp < 0) 
        {
                buff->free = 1;
                prot->wfree(NULL, buff->mem_addr, buff->mem_len);
                return;
        }

        t1 =(struct tcphdr *)((char *)t1 +tmp);
        buff->len += tmp;
        /* Clone the offending header, then rewrite the fields we need */
        memcpy(t1, th, sizeof(*t1));

        /*
         *      Swap the send and the receive. 
         */

        t1->dest = th->source;
        t1->source = th->dest;
        t1->rst = 1;  
        t1->window = 0;
  
        /* Sequence numbers per RFC 793 reset generation rules */
        if(th->ack)
        {
                t1->ack = 0;
                t1->seq = th->ack_seq;
                t1->ack_seq = 0;
        }
        else
        {
                t1->ack = 1;
                if(!th->syn)
                        t1->ack_seq=htonl(th->seq);
                else
                        t1->ack_seq=htonl(th->seq+1);   /* SYN takes a sequence number */
                t1->seq=0;
        }

        t1->syn = 0;
        t1->urg = 0;
        t1->fin = 0;
        t1->psh = 0;
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
        prot->queue_xmit(NULL, ndev, buff, 1);
        tcp_statistics.TcpOutSegs++;
}
2569 
2570 
2571 /*
2572  *      Look for tcp options. Parses everything but only knows about MSS.
2573  *      This routine is always called with the packet containing the SYN.
2574  *      However it may also be called with the ack to the SYN.  So you
2575  *      can't assume this is always the SYN.  It's always called after
2576  *      we have set up sk->mtu to our own MTU.
2577  *
2578  *      We need at minimum to add PAWS support here. Possibly large windows
2579  *      as Linux gets deployed on 100Mb/sec networks.
2580  */
2581  
2582 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
2583 {
2584         unsigned char *ptr;
2585         int length=(th->doff*4)-sizeof(struct tcphdr);
2586         int mss_seen = 0;
2587     
2588         ptr = (unsigned char *)(th + 1);
2589   
2590         while(length>0)
2591         {
2592                 int opcode=*ptr++;
2593                 int opsize=*ptr++;
2594                 switch(opcode)
2595                 {
2596                         case TCPOPT_EOL:
2597                                 return;
2598                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
2599                                 length--;
2600                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
2601                                 continue;
2602                         
2603                         default:
2604                                 if(opsize<=2)   /* Avoid silly options looping forever */
2605                                         return;
2606                                 switch(opcode)
2607                                 {
2608                                         case TCPOPT_MSS:
2609                                                 if(opsize==4 && th->syn)
2610                                                 {
2611                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2612                                                         mss_seen = 1;
2613                                                 }
2614                                                 break;
2615                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2616                                 }
2617                                 ptr+=opsize-2;
2618                                 length-=opsize;
2619                 }
2620         }
2621         if (th->syn) 
2622         {
2623                 if (! mss_seen)
2624                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2625         }
2626 #ifdef CONFIG_INET_PCTCP
2627         sk->mss = min(sk->max_window >> 1, sk->mtu);
2628 #else    
2629         sk->mss = min(sk->max_window, sk->mtu);
2630 #endif  
2631 }
2632 
2633 static inline unsigned long default_mask(unsigned long dst)
     /* [previous][next][first][last][top][bottom][index][help] */
2634 {
2635         dst = ntohl(dst);
2636         if (IN_CLASSA(dst))
2637                 return htonl(IN_CLASSA_NET);
2638         if (IN_CLASSB(dst))
2639                 return htonl(IN_CLASSB_NET);
2640         return htonl(IN_CLASSC_NET);
2641 }
2642 
2643 /*
2644  *      Default sequence number picking algorithm.
2645  */
2646 
extern inline long tcp_init_seq(void)
/* [previous][next][first][last][top][bottom][index][help] */
{
        /* Clock driven initial sequence number: advances with jiffies so
           successive incarnations of a connection do not immediately
           reuse sequence space. (Predictable - no randomisation here.) */
        return jiffies * SEQ_TICK - seq_offset; 
}
2651 
2652 /*
2653  *      This routine handles a connection request.
2654  *      It should make sure we haven't already responded.
2655  *      Because of the way BSD works, we have to send a syn/ack now.
2656  *      This also means it will be harder to close a socket which is
2657  *      listening.
2658  */
2659  
static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
/* [previous][next][first][last][top][bottom][index][help] */
                 unsigned long daddr, unsigned long saddr,
                 struct options *opt, struct device *dev, unsigned long seq)
{
        struct sk_buff *buff;
        struct tcphdr *t1;
        unsigned char *ptr;
        struct sock *newsk;
        struct tcphdr *th;
        struct device *ndev=NULL;
        int tmp;
        struct rtable *rt;
  
        th = skb->h.th;

        /* If the socket is dead, don't accept the connection. */
        if (!sk->dead) 
        {
                sk->data_ready(sk,0);
        }
        else 
        {
                if(sk->debug)
                        printk("Reset on %p: Connect on dead socket.\n",sk);
                tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
                tcp_statistics.TcpAttemptFails++;
                kfree_skb(skb, FREE_READ);
                return;
        }

        /*
         * Make sure we can accept more.  This will prevent a
         * flurry of syns from eating up all our memory.
         */

        if (sk->ack_backlog >= sk->max_ack_backlog) 
        {
                tcp_statistics.TcpAttemptFails++;
                kfree_skb(skb, FREE_READ);
                return;
        }

        /*
         * We need to build a new sock struct.
         * It is sort of bad to have a socket without an inode attached
         * to it, but the wake_up's will just wake up the listening socket,
         * and if the listening socket is destroyed before this is taken
         * off of the queue, this will take care of it.
         */

        newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
        if (newsk == NULL) 
        {
                /* just ignore the syn.  It will get retransmitted. */
                tcp_statistics.TcpAttemptFails++;
                kfree_skb(skb, FREE_READ);
                return;
        }

        /* Clone the listener, then re-initialise everything that must not
           be shared with the parent socket. */
        memcpy(newsk, sk, sizeof(*newsk));
        skb_queue_head_init(&newsk->write_queue);
        skb_queue_head_init(&newsk->receive_queue);
        newsk->send_head = NULL;
        newsk->send_tail = NULL;
        skb_queue_head_init(&newsk->back_log);
        newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
        newsk->rto = TCP_TIMEOUT_INIT;
        newsk->mdev = 0;
        newsk->max_window = 0;
        newsk->cong_window = 1; /* slow start from one segment */
        newsk->cong_count = 0;
        newsk->ssthresh = 0;
        newsk->backoff = 0;
        newsk->blog = 0;
        newsk->intr = 0;
        newsk->proc = 0;
        newsk->done = 0;
        newsk->partial = NULL;
        newsk->pair = NULL;
        newsk->wmem_alloc = 0;
        newsk->rmem_alloc = 0;
        newsk->localroute = sk->localroute;

        newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

        newsk->err = 0;
        newsk->shutdown = 0;
        newsk->ack_backlog = 0;
        /* We have consumed their SYN: ack/copy from one past its sequence */
        newsk->acked_seq = skb->h.th->seq+1;
        newsk->copied_seq = skb->h.th->seq+1;
        newsk->fin_seq = skb->h.th->seq;
        newsk->state = TCP_SYN_RECV;
        newsk->timeout = 0;
        newsk->ip_xmit_timeout = 0;
        newsk->write_seq = seq; 
        newsk->window_seq = newsk->write_seq;
        newsk->rcv_ack_seq = newsk->write_seq;
        newsk->urg_data = 0;
        newsk->retransmits = 0;
        newsk->linger=0;
        newsk->destroy = 0;
        /* Fresh timers: the memcpy above copied the parent's timer state */
        init_timer(&newsk->timer);
        init_timer(&newsk->retransmit_timer);
        newsk->timer.data = (unsigned long)newsk;
        newsk->timer.function = &net_timer;
        newsk->retransmit_timer.data = (unsigned long)newsk;
        newsk->retransmit_timer.function=&retransmit_timer;
        newsk->dummy_th.source = skb->h.th->dest;
        newsk->dummy_th.dest = skb->h.th->source;
        
        /*
         *      Swap these two, they are from our point of view. 
         */
         
        newsk->daddr = saddr;
        newsk->saddr = daddr;

        put_sock(newsk->num,newsk);
        newsk->dummy_th.res1 = 0;
        newsk->dummy_th.doff = 6;
        newsk->dummy_th.fin = 0;
        newsk->dummy_th.syn = 0;
        newsk->dummy_th.rst = 0;        
        newsk->dummy_th.psh = 0;
        newsk->dummy_th.ack = 0;
        newsk->dummy_th.urg = 0;
        newsk->dummy_th.res2 = 0;
        /* NOTE(review): acked_seq/copied_seq were already set identically
           above - this pair of assignments looks redundant */
        newsk->acked_seq = skb->h.th->seq + 1;
        newsk->copied_seq = skb->h.th->seq + 1;
        newsk->socket = NULL;

        /*
         *      Grab the ttl and tos values and use them 
         */

        newsk->ip_ttl=sk->ip_ttl;
        newsk->ip_tos=skb->ip_hdr->tos;

        /*
         *      Use 512 or whatever user asked for 
         */

        /*
         *      Note use of sk->user_mss, since user has no direct access to newsk 
         */

        rt=ip_rt_route(saddr, NULL,NULL);
        
        /* Per-route window clamp, if the route specifies one */
        if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
                newsk->window_clamp = rt->rt_window;
        else
                newsk->window_clamp = 0;
                
        if (sk->user_mss)
                newsk->mtu = sk->user_mss;
        else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
                newsk->mtu = rt->rt_mss - HEADER_SIZE;
        else 
        {
#ifdef CONFIG_INET_SNARL        /* Sub Nets Are Local */
                if ((saddr ^ daddr) & default_mask(saddr))
#else
                if ((saddr ^ daddr) & dev->pa_mask)
#endif
                        newsk->mtu = 576 - HEADER_SIZE; /* off-net: conservative MSS */
                else
                        newsk->mtu = MAX_WINDOW;
        }

        /*
         *      But not bigger than device MTU 
         */

        newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

        /*
         *      This will min with what arrived in the packet 
         */

        tcp_options(newsk,skb->h.th);

        /* Build the SYN/ACK reply */
        buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
        if (buff == NULL) 
        {
                sk->err = -ENOMEM;
                newsk->dead = 1;
                release_sock(newsk);
                kfree_skb(skb, FREE_READ);
                tcp_statistics.TcpAttemptFails++;
                return;
        }
  
        buff->len = sizeof(struct tcphdr)+4;    /* header plus MSS option */
        buff->sk = newsk;
        buff->localroute = newsk->localroute;

        t1 =(struct tcphdr *) buff->data;

        /*
         *      Put in the IP header and routing stuff. 
         */

        tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
                               IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

        /*
         *      Something went wrong. 
         */

        if (tmp < 0) 
        {
                sk->err = tmp;
                buff->free = 1;
                kfree_skb(buff,FREE_WRITE);
                newsk->dead = 1;
                release_sock(newsk);
                skb->sk = sk;
                kfree_skb(skb, FREE_READ);
                tcp_statistics.TcpAttemptFails++;
                return;
        }

        buff->len += tmp;
        t1 =(struct tcphdr *)((char *)t1 +tmp);
  
        memcpy(t1, skb->h.th, sizeof(*t1));
        buff->h.seq = newsk->write_seq;
        /*
         *      Swap the send and the receive. 
         */
        t1->dest = skb->h.th->source;
        t1->source = newsk->dummy_th.source;
        t1->seq = ntohl(newsk->write_seq++);    /* SYN consumes a sequence number */
        t1->ack = 1;
        newsk->window = tcp_select_window(newsk);
        newsk->sent_seq = newsk->write_seq;
        t1->window = ntohs(newsk->window);
        t1->res1 = 0;
        t1->res2 = 0;
        t1->rst = 0;
        t1->urg = 0;
        t1->psh = 0;
        t1->syn = 1;
        t1->ack_seq = ntohl(skb->h.th->seq+1);
        t1->doff = sizeof(*t1)/4+1;     /* +1 word for the MSS option */
        /* MSS option: kind 2, length 4, 16-bit value */
        ptr =(unsigned char *)(t1+1);
        ptr[0] = 2;
        ptr[1] = 4;
        ptr[2] = ((newsk->mtu) >> 8) & 0xff;
        ptr[3] =(newsk->mtu) & 0xff;

        tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
        newsk->prot->queue_xmit(newsk, ndev, buff, 0);
        reset_xmit_timer(newsk, TIME_WRITE, newsk->rto);

        /* NOTE(review): this second reset_xmit_timer() immediately replaces
           the one above (newsk->rto == TCP_TIMEOUT_INIT here) - looks redundant */
        reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
        skb->sk = newsk;

        /*
         *      Charge the sock_buff to newsk. 
         */
         
        sk->rmem_alloc -= skb->mem_len;
        newsk->rmem_alloc += skb->mem_len;
        
        /* The SYN stays on the listener's queue until accept() time */
        skb_queue_tail(&sk->receive_queue,skb);
        sk->ack_backlog++;
        release_sock(newsk);
        tcp_statistics.TcpOutSegs++;
}
2930 
2931 
2932 static void tcp_close(struct sock *sk, int timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
2933 {
2934         /*
2935          * We need to grab some memory, and put together a FIN, 
2936          * and then put it into the queue to be sent.
2937          */
2938         
2939         sk->inuse = 1;
2940         
2941         if(sk->state == TCP_LISTEN)
2942         {
2943                 /* Special case */
2944                 tcp_set_state(sk, TCP_CLOSE);
2945                 tcp_close_pending(sk);
2946                 release_sock(sk);
2947                 return;
2948         }
2949         
2950         sk->keepopen = 1;
2951         sk->shutdown = SHUTDOWN_MASK;
2952 
2953         if (!sk->dead) 
2954                 sk->state_change(sk);
2955 
2956         if (timeout == 0) 
2957         {
2958                 struct sk_buff *skb;
2959                 
2960                 /*
2961                  *  We need to flush the recv. buffs.  We do this only on the
2962                  *  descriptor close, not protocol-sourced closes, because the
2963                  *  reader process may not have drained the data yet!
2964                  */
2965                  
2966                 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
2967                         kfree_skb(skb, FREE_READ);
2968                 /*
2969                  *      Get rid off any half-completed packets. 
2970                  */
2971 
2972                 if (sk->partial) 
2973                         tcp_send_partial(sk);
2974         }
2975 
2976                 
2977         /*
2978          *      Timeout is not the same thing - however the code likes
2979          *      to send both the same way (sigh).
2980          */
2981          
2982         if(timeout)
2983         {
2984                 tcp_set_state(sk, TCP_CLOSE);   /* Dead */
2985         }
2986         else
2987         {
2988                 if(tcp_close_state(sk,1)==1)
2989                 {
2990                         tcp_send_fin(sk);
2991                 }
2992         }
2993         release_sock(sk);
2994 }
2995 
2996 
2997 /*
2998  *      This routine takes stuff off of the write queue,
2999  *      and puts it in the xmit queue. This happens as incoming acks
3000  *      open up the remote window for us.
3001  */
3002  
static void tcp_write_xmit(struct sock *sk)
/* [previous][next][first][last][top][bottom][index][help] */
{
        struct sk_buff *skb;

        /*
         *      The bytes will have to remain here. In time closedown will
         *      empty the write queue and all will be happy 
         */

        if(sk->zapped)
                return;

        /*
         *      Anything on the transmit queue that fits the window can
         *      be added providing we are not
         *
         *      a) retransmitting (Nagle's rule)
         *      b) exceeding our congestion window.
         */
         
        while((skb = skb_peek(&sk->write_queue)) != NULL &&
                before(skb->h.seq, sk->window_seq + 1) &&
                (sk->retransmits == 0 ||
                 sk->ip_xmit_timeout != TIME_WRITE ||
                 before(skb->h.seq, sk->rcv_ack_seq + 1))
                && sk->packets_out < sk->cong_window) 
        {
                IS_SKB(skb);
                skb_unlink(skb);
                
                /*
                 *      See if we really need to send the packet. 
                 */
                 
                if (before(skb->h.seq, sk->rcv_ack_seq +1)) 
                {
                        /*
                         *      This is acked data. We can discard it. This 
                         *      cannot currently occur.
                         */
                         
                        sk->retransmits = 0;
                        kfree_skb(skb, FREE_WRITE);
                        if (!sk->dead) 
                                sk->write_space(sk);
                } 
                else
                {
                        struct tcphdr *th;
                        struct iphdr *iph;
                        int size;
/*
 * put in the ack seq and window at this point rather than earlier,
 * in order to keep them monotonic.  We really want to avoid taking
 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
 * Ack and window will in general have changed since this packet was put
 * on the write queue.
 */
                        /* Locate the TCP header inside the prebuilt frame */
                        iph = (struct iphdr *)(skb->data +
                                               skb->dev->hard_header_len);
                        th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
                        size = skb->len - (((unsigned char *) th) - skb->data);
                        
                        th->ack_seq = ntohl(sk->acked_seq);
                        th->window = ntohs(tcp_select_window(sk));

                        /* Header changed, so the checksum must be redone */
                        tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

                        sk->sent_seq = skb->h.seq;
                        
                        /*
                         *      IP manages our queue for some crazy reason
                         */
                         
                        sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
                        
                        /*
                         *      Again we slide the timer wrongly
                         */
                         
                        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
                }
        }
}
3087 
3088 
3089 /*
3090  *      This routine deals with incoming acks, but not outgoing ones.
3091  */
3092 
/*
 *      Process an incoming ACK on an established (or closing) connection.
 *      Handles window updates, slow start / congestion avoidance, RTT
 *      estimation, trimming the retransmit queue, moving data from the
 *      write queue onto the wire, timer management, and the ACK-driven
 *      state transitions (LAST_ACK, FIN_WAIT1, CLOSING, SYN_RECV).
 *
 *      Returns 1 if the ack was acceptable (or the socket is dead),
 *      0 if the ack is ahead of anything we have sent (keepalive probe
 *      handling in the caller relies on this).
 */
extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
        unsigned long ack;
        int flag = 0;

        /* 
         * flag is local bookkeeping only:
         * 1 - there was data in packet as well as ack or new data is sent or 
         *     in shutdown state
         * 2 - data from retransmit queue was acked and removed
         * 4 - window shrunk or data from retransmit queue was acked and removed
         */

        if(sk->zapped)
                return(1);      /* Dead, can't ack any more so why bother */

        /*
         *      Have we discovered a larger window
         */
         
        ack = ntohl(th->ack_seq);

        if (ntohs(th->window) > sk->max_window) 
        {
                sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
                /* Hack because we don't send partial packets to non SWS
                   handling hosts */
                sk->mss = min(sk->max_window>>1, sk->mtu);
#else
                sk->mss = min(sk->max_window, sk->mtu);
#endif  
        }

        /*
         *      We have dropped back to keepalive timeouts. Thus we have
         *      no retransmits pending.
         */
         
        if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
                sk->retransmits = 0;

        /*
         *      If the ack is newer than sent or older than previous acks
         *      then we can probably ignore it.
         */
         
        if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
        {
                if(sk->debug)
                        printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);
                        
                /*
                 *      Keepalive processing.  An ack for data we never sent
                 *      is simply rejected.
                 */
                 
                if (after(ack, sk->sent_seq)) 
                {
                        return(0);
                }
                
                /*
                 *      An old (duplicate) ack still proves the peer is alive:
                 *      restart the keepalive timer.
                 */
                 
                if (sk->keepopen) 
                {
                        if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
                                reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
                }
                return(1);
        }

        /*
         *      If there is data set flag 1
         */
         
        if (len != th->doff*4) 
                flag |= 1;

        /*
         *      See if our window has been shrunk. 
         */

        if (after(sk->window_seq, ack+ntohs(th->window))) 
        {
                /*
                 * We may need to move packets from the send queue
                 * to the write queue, if the window has been shrunk on us.
                 * The RFC says you are not allowed to shrink your window
                 * like this, but if the other end does, you must be able
                 * to deal with it.
                 */
                struct sk_buff *skb;
                struct sk_buff *skb2;
                struct sk_buff *wskb = NULL;
        
                skb2 = sk->send_head;
                sk->send_head = NULL;
                sk->send_tail = NULL;
        
                /*
                 *      This is an artifact of a flawed concept. We want one
                 *      queue and a smarter send routine when we send all.
                 */
        
                flag |= 4;      /* Window changed */
        
                sk->window_seq = ack + ntohs(th->window);
                cli();
                /*
                 *      Walk the old retransmit list: anything now beyond the
                 *      right edge of the shrunken window goes back (in order)
                 *      onto the write queue; the rest is re-chained onto a
                 *      fresh send list.  Interrupts are off because the
                 *      device layer also touches these queues.
                 */
                while (skb2 != NULL) 
                {
                        skb = skb2;
                        skb2 = skb->link3;
                        skb->link3 = NULL;
                        if (after(skb->h.seq, sk->window_seq)) 
                        {
                                if (sk->packets_out > 0) 
                                        sk->packets_out--;
                                /* We may need to remove this from the dev send list. */
                                if (skb->next != NULL) 
                                {
                                        skb_unlink(skb);                                
                                }
                                /* Now add it to the write_queue. */
                                if (wskb == NULL)
                                        skb_queue_head(&sk->write_queue,skb);
                                else
                                        skb_append(wskb,skb);
                                wskb = skb;
                        } 
                        else 
                        {
                                if (sk->send_head == NULL) 
                                {
                                        sk->send_head = skb;
                                        sk->send_tail = skb;
                                }
                                else
                                {
                                        sk->send_tail->link3 = skb;
                                        sk->send_tail = skb;
                                }
                                skb->link3 = NULL;
                        }
                }
                sti();
        }

        /*
         *      Pipe has emptied
         */
         
        if (sk->send_tail == NULL || sk->send_head == NULL) 
        {
                sk->send_head = NULL;
                sk->send_tail = NULL;
                sk->packets_out= 0;
        }

        /*
         *      Update the right hand window edge of the host
         */
         
        sk->window_seq = ack + ntohs(th->window);

        /*
         *      We don't want too many packets out there. 
         */
         
        if (sk->ip_xmit_timeout == TIME_WRITE && 
                sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
        {
                /* 
                 * This is Jacobson's slow start and congestion avoidance. 
                 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
                 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
                 * counter and increment it once every cwnd times.  It's possible
                 * that this should be done only if sk->retransmits == 0.  I'm
                 * interpreting "new data is acked" as including data that has
                 * been retransmitted but is just now being acked.
                 */
                if (sk->cong_window < sk->ssthresh)  
                        /* 
                         *      In "safe" area, increase
                         */
                        sk->cong_window++;
                else 
                {
                        /*
                         *      In dangerous area, increase slowly.  In theory this is
                         *      sk->cong_window += 1 / sk->cong_window
                         */
                        if (sk->cong_count >= sk->cong_window) 
                        {
                                sk->cong_window++;
                                sk->cong_count = 0;
                        }
                        else 
                                sk->cong_count++;
                }
        }

        /*
         *      Remember the highest ack received.
         */
         
        sk->rcv_ack_seq = ack;

        /*
         *      If this ack opens up a zero window, clear backoff.  It was
         *      being used to time the probes, and is probably far higher than
         *      it needs to be for normal retransmission.
         */

        if (sk->ip_xmit_timeout == TIME_PROBE0) 
        {
                sk->retransmits = 0;    /* Our probe was answered */
                
                /*
                 *      Was it a usable window open ?
                 */
                 
                if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
                    ! before (sk->window_seq, sk->write_queue.next->h.seq)) 
                {
                        sk->backoff = 0;
                        
                        /*
                         *      Recompute rto from rtt.  this eliminates any backoff.
                         */

                        sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
                        if (sk->rto > 120*HZ)
                                sk->rto = 120*HZ;
                        /* NOTE(review): the assignment below is the (sole) body of
                           this if despite the flat indentation. */
                        if (sk->rto < 20)       /* Was 1*HZ, then 1 - turns out we must allow about
                                                   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
                                                   .2 of a second is going to need huge windows (SIGH) */
                        sk->rto = 20;
                }
        }

        /* 
         *      See if we can take anything off of the retransmit queue.
         */
   
        while(sk->send_head != NULL) 
        {
                /* Check for a bug. */
                if (sk->send_head->link3 &&
                    after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) 
                        printk("INET: tcp.c: *** bug send_list out of order.\n");
                        
                /*
                 *      If our packet is before the ack sequence we can
                 *      discard it as it's confirmed to have arrived at the other end.
                 */
                 
                if (before(sk->send_head->h.seq, ack+1)) 
                {
                        struct sk_buff *oskb;   
                        if (sk->retransmits) 
                        {       
                                /*
                                 *      We were retransmitting.  don't count this in RTT est 
                                 */
                                flag |= 2;

                                /*
                                 * even though we've gotten an ack, we're still
                                 * retransmitting as long as we're sending from
                                 * the retransmit queue.  Keeping retransmits non-zero
                                 * prevents us from getting new data interspersed with
                                 * retransmissions.
                                 */

                                if (sk->send_head->link3)       /* Any more queued retransmits? */
                                        sk->retransmits = 1;
                                else
                                        sk->retransmits = 0;
                        }
                        /*
                         * Note that we only reset backoff and rto in the
                         * rtt recomputation code.  And that doesn't happen
                         * if there were retransmissions in effect.  So the
                         * first new packet after the retransmissions is
                         * sent with the backoff still in effect.  Not until
                         * we get an ack from a non-retransmitted packet do
                         * we reset the backoff and rto.  This allows us to deal
                         * with a situation where the network delay has increased
                         * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
                         */

                        /*
                         *      We have one less packet out there. 
                         */
                         
                        if (sk->packets_out > 0) 
                                sk->packets_out --;
                        /* 
                         *      Wake up the process, it can probably write more. 
                         */
                        if (!sk->dead) 
                                sk->write_space(sk);
                        oskb = sk->send_head;

                        if (!(flag&2))  /* Not retransmitting */
                        {
                                long m;
        
                                /*
                                 *      The following amusing code comes from Jacobson's
                                 *      article in SIGCOMM '88.  Note that rtt and mdev
                                 *      are scaled versions of rtt and mean deviation.
                                 *      This is designed to be as fast as possible 
                                 *      m stands for "measurement".
                                 */
        
                                m = jiffies - oskb->when;  /* RTT */
                                if(m<=0)
                                        m=1;            /* IS THIS RIGHT FOR <0 ??? */
                                m -= (sk->rtt >> 3);    /* m is now error in rtt est */
                                sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
                                if (m < 0)
                                        m = -m;         /* m is now abs(error) */
                                m -= (sk->mdev >> 2);   /* similar update on mdev */
                                sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
        
                                /*
                                 *      Now update timeout.  Note that this removes any backoff.
                                 */
                         
                                sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
                                if (sk->rto > 120*HZ)
                                        sk->rto = 120*HZ;
                                if (sk->rto < 20)       /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
                                        sk->rto = 20;
                                sk->backoff = 0;
                        }
                        flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
                                           In this case as we just set it up */
                        /*
                         *      Unlink the acked skb from the retransmit queue
                         *      (and possibly the device send queue) and free it.
                         *      Interrupts off: the device layer walks these lists.
                         */
                        cli();
                        oskb = sk->send_head;
                        IS_SKB(oskb);
                        sk->send_head = oskb->link3;
                        if (sk->send_head == NULL) 
                        {
                                sk->send_tail = NULL;
                        }

                /*
                 *      We may need to remove this from the dev send list. 
                 */

                        if (oskb->next)
                                skb_unlink(oskb);
                        sti();
                        kfree_skb(oskb, FREE_WRITE); /* write. */
                        if (!sk->dead) 
                                sk->write_space(sk);
                }
                else
                {
                        break;
                }
        }

        /*
         * XXX someone ought to look at this too.. at the moment, if skb_peek()
         * returns non-NULL, we completely ignore the timer stuff in the else
         * clause.  We ought to organize the code so that else clause can
         * (should) be executed regardless, possibly moving the PROBE timer
         * reset over.  The skb_peek() thing should only move stuff to the
         * write queue, NOT also manage the timer functions.
         */

        /*
         * Maybe we can take some stuff off of the write queue,
         * and put it onto the xmit queue.
         */
        if (skb_peek(&sk->write_queue) != NULL) 
        {
                if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
                        (sk->retransmits == 0 || 
                         sk->ip_xmit_timeout != TIME_WRITE ||
                         before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
                        && sk->packets_out < sk->cong_window) 
                {
                        /*
                         *      Add more data to the send queue.
                         */
                        flag |= 1;
                        tcp_write_xmit(sk);
                }
                else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
                        sk->send_head == NULL &&
                        sk->ack_backlog == 0 &&
                        sk->state != TCP_TIME_WAIT) 
                {
                        /*
                         *      Data to queue but no room.
                         */
                        reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
                }               
        }
        else
        {
                /*
                 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
                 * from TCP_CLOSE we don't do anything
                 *
                 * from anything else, if there is write data (or fin) pending,
                 * we use a TIME_WRITE timeout, else if keepalive we reset to
                 * a KEEPALIVE timeout, else we delete the timer.
                 *
                 * We do not set flag for nominal write data, otherwise we may
                 * force a state where we start to write itsy bitsy tidbits
                 * of data.
                 */

                switch(sk->state) {
                case TCP_TIME_WAIT:
                        /*
                         * keep us in TIME_WAIT until we stop getting packets,
                         * reset the timeout.
                         */
                        reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                        break;
                case TCP_CLOSE:
                        /*
                         * don't touch the timer.
                         */
                        break;
                default:
                        /*
                         *      Must check send_head, write_queue, and ack_backlog
                         *      to determine which timeout to use.
                         */
                        if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
                                reset_xmit_timer(sk, TIME_WRITE, sk->rto);
                        } else if (sk->keepopen) {
                                reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
                        } else {
                                del_timer(&sk->retransmit_timer);
                                sk->ip_xmit_timeout = 0;
                        }
                        break;
                }
        }

        /*
         *      We have nothing queued but space to send. Send any partial
         *      packets immediately (end of Nagle rule application).
         */
         
        if (sk->packets_out == 0 && sk->partial != NULL &&
                skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
        {
                flag |= 1;
                tcp_send_partial(sk);
        }

        /*
         * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
         * we are now waiting for an acknowledge to our FIN.  The other end is
         * already in TIME_WAIT.
         *
         * Move to TCP_CLOSE on success.
         */

        if (sk->state == TCP_LAST_ACK) 
        {
                if (!sk->dead)
                        sk->state_change(sk);
                if(sk->debug)
                        printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
                                sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
                if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
                {
                        flag |= 1;
                        tcp_set_state(sk,TCP_CLOSE);
                        sk->shutdown = SHUTDOWN_MASK;
                }
        }

        /*
         *      Incoming ACK to a FIN we sent in the case of our initiating the close.
         *
         *      Move to FIN_WAIT2 to await a FIN from the other end. Set
         *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
         */

        if (sk->state == TCP_FIN_WAIT1) 
        {

                if (!sk->dead) 
                        sk->state_change(sk);
                if (sk->rcv_ack_seq == sk->write_seq) 
                {
                        flag |= 1;
                        sk->shutdown |= SEND_SHUTDOWN;
                        tcp_set_state(sk, TCP_FIN_WAIT2);
                }
        }

        /*
         *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
         *
         *      Move to TIME_WAIT
         */

        if (sk->state == TCP_CLOSING) 
        {

                if (!sk->dead) 
                        sk->state_change(sk);
                if (sk->rcv_ack_seq == sk->write_seq) 
                {
                        flag |= 1;
                        tcp_time_wait(sk);
                }
        }
        
        /*
         *      Final ack of a three way shake 
         */
         
        if(sk->state==TCP_SYN_RECV)
        {
                tcp_set_state(sk, TCP_ESTABLISHED);
                tcp_options(sk,th);
                sk->dummy_th.dest=th->source;
                sk->copied_seq = sk->acked_seq;
                if(!sk->dead)
                        sk->state_change(sk);
                if(sk->max_window==0)
                {
                        sk->max_window=32;      /* Sanity check */
                        sk->mss=min(sk->max_window,sk->mtu);
                }
        }
        
        /*
         * I make no guarantees about the first clause in the following
         * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
         * what conditions "!flag" would be true.  However I think the rest
         * of the conditions would prevent that from causing any
         * unnecessary retransmission. 
         *   Clearly if the first packet has expired it should be 
         * retransmitted.  The other alternative, "flag&2 && retransmits", is
         * harder to explain:  You have to look carefully at how and when the
         * timer is set and with what timeout.  The most recent transmission always
         * sets the timer.  So in general if the most recent thing has timed
         * out, everything before it has as well.  So we want to go ahead and
         * retransmit some more.  If we didn't explicitly test for this
         * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
         * would not be true.  If you look at the pattern of timing, you can
         * show that rto is increased fast enough that the next packet would
         * almost never be retransmitted immediately.  Then you'd end up
         * waiting for a timeout to send each packet on the retransmission
         * queue.  With my implementation of the Karn sampling algorithm,
         * the timeout would double each time.  The net result is that it would
         * take a hideous amount of time to recover from a single dropped packet.
         * It's possible that there should also be a test for TIME_WRITE, but
         * I think as long as "send_head != NULL" and "retransmit" is on, we've
         * got to be in real retransmission mode.
         *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
         * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
         * As long as no further losses occur, this seems reasonable.
         */
        
        if (((!flag) || (flag&4)) && sk->send_head != NULL &&
               (((flag&2) && sk->retransmits) ||
               (sk->send_head->when + sk->rto < jiffies))) 
        {
                if(sk->send_head->when + sk->rto < jiffies)
                        tcp_retransmit(sk,0);   
                else
                {
                        tcp_do_retransmit(sk, 1);
                        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
                }
        }

        return(1);
}
3678 
3679 
3680 /*
3681  *      Process the FIN bit. This now behaves as it is supposed to work
3682  *      and the FIN takes effect when it is validly part of sequence
3683  *      space. Not before when we get holes.
3684  *
3685  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3686  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
3687  *      TIME-WAIT)
3688  *
3689  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
3690  *      close and we go into CLOSING (and later onto TIME-WAIT)
3691  *
3692  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3693  *
3694  */
3695  
3696 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
3697 {
3698         sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3699 
3700         if (!sk->dead) 
3701         {
3702                 sk->state_change(sk);
3703                 sock_wake_async(sk->socket, 1);
3704         }
3705 
3706         switch(sk->state) 
3707         {
3708                 case TCP_SYN_RECV:
3709                 case TCP_SYN_SENT:
3710                 case TCP_ESTABLISHED:
3711                         /*
3712                          * move to CLOSE_WAIT, tcp_data() already handled
3713                          * sending the ack.
3714                          */
3715                         tcp_set_state(sk,TCP_CLOSE_WAIT);
3716                         if (th->rst)
3717                                 sk->shutdown = SHUTDOWN_MASK;
3718                         break;
3719 
3720                 case TCP_CLOSE_WAIT:
3721                 case TCP_CLOSING:
3722                         /*
3723                          * received a retransmission of the FIN, do
3724                          * nothing.
3725                          */
3726                         break;
3727                 case TCP_TIME_WAIT:
3728                         /*
3729                          * received a retransmission of the FIN,
3730                          * restart the TIME_WAIT timer.
3731                          */
3732                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3733                         return(0);
3734                 case TCP_FIN_WAIT1:
3735                         /*
3736                          * This case occurs when a simultaneous close
3737                          * happens, we must ack the received FIN and
3738                          * enter the CLOSING state.
3739                          *
3740                          * This causes a WRITE timeout, which will either
3741                          * move on to TIME_WAIT when we timeout, or resend
3742                          * the FIN properly (maybe we get rid of that annoying
3743                          * FIN lost hang). The TIME_WRITE code is already correct
3744                          * for handling this timeout.
3745                          */
3746 
3747                         if(sk->ip_xmit_timeout != TIME_WRITE)
3748                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3749                         tcp_set_state(sk,TCP_CLOSING);
3750                         break;
3751                 case TCP_FIN_WAIT2:
3752                         /*
3753                          * received a FIN -- send ACK and enter TIME_WAIT
3754                          */
3755                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3756                         sk->shutdown|=SHUTDOWN_MASK;
3757                         tcp_set_state(sk,TCP_TIME_WAIT);
3758                         break;
3759                 case TCP_CLOSE:
3760                         /*
3761                          * already in CLOSE
3762                          */
3763                         break;
3764                 default:
3765                         tcp_set_state(sk,TCP_LAST_ACK);
3766         
3767                         /* Start the timers. */
3768                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3769                         return(0);
3770         }
3771 
3772         return(0);
3773 }
3774 
3775 
3776 
/*
 *      This routine handles the data.  If there is room in the buffer,
 *      it will have already been moved into it.  If there is no
 *      room, then we will just have to discard the packet.
 */
3782 
3783 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /* [previous][next][first][last][top][bottom][index][help] */
3784          unsigned long saddr, unsigned short len)
3785 {
3786         struct sk_buff *skb1, *skb2;
3787         struct tcphdr *th;
3788         int dup_dumped=0;
3789         unsigned long new_seq;
3790         unsigned long shut_seq;
3791 
3792         th = skb->h.th;
3793         skb->len = len -(th->doff*4);
3794 
3795         /*
3796          *      The bytes in the receive read/assembly queue has increased. Needed for the
3797          *      low memory discard algorithm 
3798          */
3799            
3800         sk->bytes_rcv += skb->len;
3801         
3802         if (skb->len == 0 && !th->fin && !th->urg && !th->psh) 
3803         {
3804                 /* 
3805                  *      Don't want to keep passing ack's back and forth. 
3806                  *      (someone sent us dataless, boring frame)
3807                  */
3808                 if (!th->ack)
3809                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3810                 kfree_skb(skb, FREE_READ);
3811                 return(0);
3812         }
3813         
3814         /*
3815          *      We no longer have anyone receiving data on this connection.
3816          */
3817 
3818 #ifndef TCP_DONT_RST_SHUTDOWN            
3819 
3820         if(sk->shutdown & RCV_SHUTDOWN)
3821         {
3822                 /*
3823                  *      FIXME: BSD has some magic to avoid sending resets to
3824                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
3825                  *      BSD stacks still have broken keepalives so we want to
3826                  *      cope with it.
3827                  */
3828 
3829                 if(skb->len)    /* We don't care if its just an ack or
3830                                    a keepalive/window probe */
3831                 {
3832                         new_seq= th->seq + skb->len + th->syn;  /* Right edge of _data_ part of frame */
3833                         
3834                         /* Do this the way 4.4BSD treats it. Not what I'd
3835                            regard as the meaning of the spec but its what BSD
3836                            does and clearly they know everything 8) */
3837 
3838                         /*
3839                          *      This is valid because of two things
3840                          *
3841                          *      a) The way tcp_data behaves at the bottom.
3842                          *      b) A fin takes effect when read not when received.
3843                          */
3844                          
3845                         shut_seq=sk->acked_seq+1;       /* Last byte */
3846                         
3847                         if(after(new_seq,shut_seq))
3848                         {
3849                                 if(sk->debug)
3850                                         printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
3851                                                 sk, new_seq, shut_seq, sk->blog);
3852                                 if(sk->dead)
3853                                 {
3854                                         sk->acked_seq = new_seq + th->fin;
3855                                         tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3856                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3857                                         tcp_statistics.TcpEstabResets++;
3858                                         tcp_set_state(sk,TCP_CLOSE);
3859                                         sk->err = EPIPE;
3860                                         sk->shutdown = SHUTDOWN_MASK;
3861                                         kfree_skb(skb, FREE_READ);
3862                                         return 0;
3863                                 }
3864                         }
3865                 }
3866         }
3867 
3868 #endif
3869 
3870         /*
3871          *      Now we have to walk the chain, and figure out where this one
3872          *      goes into it.  This is set up so that the last packet we received
3873          *      will be the first one we look at, that way if everything comes
3874          *      in order, there will be no performance loss, and if they come
3875          *      out of order we will be able to fit things in nicely.
3876          *
3877          *      [AC: This is wrong. We should assume in order first and then walk
3878          *       forwards from the first hole based upon real traffic patterns.]
3879          *      
3880          */
3881 
3882         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
3883         {
3884                 skb_queue_head(&sk->receive_queue,skb);
3885                 skb1= NULL;
3886         } 
3887         else
3888         {
3889                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
3890                 {
3891                         if(sk->debug)
3892                         {
3893                                 printk("skb1=%p :", skb1);
3894                                 printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
3895                                 printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
3896                                 printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
3897                                                 sk->acked_seq);
3898                         }
3899                         
3900                         /*
3901                          *      Optimisation: Duplicate frame or extension of previous frame from
3902                          *      same sequence point (lost ack case).
3903                          *      The frame contains duplicate data or replaces a previous frame
3904                          *      discard the previous frame (safe as sk->inuse is set) and put
3905                          *      the new one in its place.
3906                          */
3907                          
3908                         if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3909                         {
3910                                 skb_append(skb1,skb);
3911                                 skb_unlink(skb1);
3912                                 kfree_skb(skb1,FREE_READ);
3913                                 dup_dumped=1;
3914                                 skb1=NULL;
3915                                 break;
3916                         }
3917                         
3918                         /*
3919                          *      Found where it fits
3920                          */
3921                          
3922                         if (after(th->seq+1, skb1->h.th->seq))
3923                         {
3924                                 skb_append(skb1,skb);
3925                                 break;
3926                         }
3927                         
3928                         /*
3929                          *      See if we've hit the start. If so insert.
3930                          */
3931                         if (skb1 == skb_peek(&sk->receive_queue))
3932                         {
3933                                 skb_queue_head(&sk->receive_queue, skb);
3934                                 break;
3935                         }
3936                 }
3937         }
3938 
3939         /*
3940          *      Figure out what the ack value for this frame is
3941          */
3942          
3943         th->ack_seq = th->seq + skb->len;
3944         if (th->syn) 
3945                 th->ack_seq++;
3946         if (th->fin)
3947                 th->ack_seq++;
3948 
3949         if (before(sk->acked_seq, sk->copied_seq)) 
3950         {
3951                 printk("*** tcp.c:tcp_data bug acked < copied\n");
3952                 sk->acked_seq = sk->copied_seq;
3953         }
3954 
3955         /*
3956          *      Now figure out if we can ack anything. This is very messy because we really want two
3957          *      receive queues, a completed and an assembly queue. We also want only one transmit
3958          *      queue.
3959          */
3960 
3961         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) 
3962         {
3963                 if (before(th->seq, sk->acked_seq+1)) 
3964                 {
3965                         int newwindow;
3966 
3967                         if (after(th->ack_seq, sk->acked_seq)) 
3968                         {
3969                                 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3970                                 if (newwindow < 0)
3971                                         newwindow = 0;  
3972                                 sk->window = newwindow;
3973                                 sk->acked_seq = th->ack_seq;
3974                         }
3975                         skb->acked = 1;
3976 
3977                         /*
3978                          *      When we ack the fin, we do the FIN 
3979                          *      processing.
3980                          */
3981 
3982                         if (skb->h.th->fin) 
3983                         {
3984                                 tcp_fin(skb,sk,skb->h.th);
3985                         }
3986           
3987                         for(skb2 = skb->next;
3988                             skb2 != (struct sk_buff *)&sk->receive_queue;
3989                             skb2 = skb2->next) 
3990                         {
3991                                 if (before(skb2->h.th->seq, sk->acked_seq+1)) 
3992                                 {
3993                                         if (after(skb2->h.th->ack_seq, sk->acked_seq))
3994                                         {
3995                                                 newwindow = sk->window -
3996                                                  (skb2->h.th->ack_seq - sk->acked_seq);
3997                                                 if (newwindow < 0)
3998                                                         newwindow = 0;  
3999                                                 sk->window = newwindow;
4000                                                 sk->acked_seq = skb2->h.th->ack_seq;
4001                                         }
4002                                         skb2->acked = 1;
4003                                         /*
4004                                          *      When we ack the fin, we do
4005                                          *      the fin handling.
4006                                          */
4007                                         if (skb2->h.th->fin) 
4008                                         {
4009                                                 tcp_fin(skb,sk,skb->h.th);
4010                                         }
4011 
4012                                         /*
4013                                          *      Force an immediate ack.
4014                                          */
4015                                          
4016                                         sk->ack_backlog = sk->max_ack_backlog;
4017                                 }
4018                                 else
4019                                 {
4020                                         break;
4021                                 }
4022                         }
4023 
4024                         /*
4025                          *      This also takes care of updating the window.
4026                          *      This if statement needs to be simplified.
4027                          */
4028                         if (!sk->delay_acks ||
4029                             sk->ack_backlog >= sk->max_ack_backlog || 
4030                             sk->bytes_rcv > sk->max_unacked || th->fin) {
4031         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4032                         }
4033                         else 
4034                         {
4035                                 sk->ack_backlog++;
4036                                 if(sk->debug)
4037                                         printk("Ack queued.\n");
4038                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4039                         }
4040                 }
4041         }
4042 
4043         /*
4044          *      If we've missed a packet, send an ack.
4045          *      Also start a timer to send another.
4046          */
4047          
4048         if (!skb->acked) 
4049         {
4050         
4051         /*
4052          *      This is important.  If we don't have much room left,
4053          *      we need to throw out a few packets so we have a good
4054          *      window.  Note that mtu is used, not mss, because mss is really
4055          *      for the send side.  He could be sending us stuff as large as mtu.
4056          */
4057                  
4058                 while (sk->prot->rspace(sk) < sk->mtu) 
4059                 {
4060                         skb1 = skb_peek(&sk->receive_queue);
4061                         if (skb1 == NULL) 
4062                         {
4063                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4064                                 break;
4065                         }
4066 
4067                         /*
4068                          *      Don't throw out something that has been acked. 
4069                          */
4070                  
4071                         if (skb1->acked) 
4072                         {
4073                                 break;
4074                         }
4075                 
4076                         skb_unlink(skb1);
4077                         kfree_skb(skb1, FREE_READ);
4078                 }
4079                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4080                 sk->ack_backlog++;
4081                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4082         }
4083         else
4084         {
4085                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4086         }
4087 
4088         /*
4089          *      Now tell the user we may have some data. 
4090          */
4091          
4092         if (!sk->dead) 
4093         {
4094                 if(sk->debug)
4095                         printk("Data wakeup.\n");
4096                 sk->data_ready(sk,0);
4097         } 
4098         return(0);
4099 }
4100 
4101 
4102 /*
4103  *      This routine is only called when we have urgent data
4104  *      signalled. Its the 'slow' part of tcp_urg. It could be
4105  *      moved inline now as tcp_urg is only called from one
4106  *      place. We handle URGent data wrong. We have to - as
4107  *      BSD still doesn't use the correction from RFC961.
4108  */
4109  
4110 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* [previous][next][first][last][top][bottom][index][help] */
4111 {
4112         unsigned long ptr = ntohs(th->urg_ptr);
4113 
4114         if (ptr)
4115                 ptr--;
4116         ptr += th->seq;
4117 
4118         /* ignore urgent data that we've already seen and read */
4119         if (after(sk->copied_seq, ptr))
4120                 return;
4121 
4122         /* do we already have a newer (or duplicate) urgent pointer? */
4123         if (sk->urg_data && !after(ptr, sk->urg_seq))
4124                 return;
4125 
4126         /* tell the world about our new urgent pointer */
4127         if (sk->proc != 0) {
4128                 if (sk->proc > 0) {
4129                         kill_proc(sk->proc, SIGURG, 1);
4130                 } else {
4131                         kill_pg(-sk->proc, SIGURG, 1);
4132                 }
4133         }
4134         sk->urg_data = URG_NOTYET;
4135         sk->urg_seq = ptr;
4136 }
4137 
4138 /*
4139  *      This is the 'fast' part of urgent handling.
4140  */
4141  
4142 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
     /* [previous][next][first][last][top][bottom][index][help] */
4143         unsigned long saddr, unsigned long len)
4144 {
4145         unsigned long ptr;
4146 
4147         /*
4148          *      Check if we get a new urgent pointer - normally not 
4149          */
4150          
4151         if (th->urg)
4152                 tcp_check_urg(sk,th);
4153 
4154         /*
4155          *      Do we wait for any urgent data? - normally not
4156          */
4157          
4158         if (sk->urg_data != URG_NOTYET)
4159                 return 0;
4160 
4161         /*
4162          *      Is the urgent pointer pointing into this packet? 
4163          */
4164          
4165         ptr = sk->urg_seq - th->seq + th->doff*4;
4166         if (ptr >= len)
4167                 return 0;
4168 
4169         /*
4170          *      Ok, got the correct packet, update info 
4171          */
4172          
4173         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4174         if (!sk->dead)
4175                 sk->data_ready(sk,0);
4176         return 0;
4177 }
4178 
4179 /*
4180  *      This will accept the next outstanding connection. 
4181  */
4182  
4183 static struct sock *tcp_accept(struct sock *sk, int flags)
     /* [previous][next][first][last][top][bottom][index][help] */
4184 {
4185         struct sock *newsk;
4186         struct sk_buff *skb;
4187   
4188   /*
4189    * We need to make sure that this socket is listening,
4190    * and that it has something pending.
4191    */
4192 
4193         if (sk->state != TCP_LISTEN) 
4194         {
4195                 sk->err = EINVAL;
4196                 return(NULL); 
4197         }
4198 
4199         /* Avoid the race. */
4200         cli();
4201         sk->inuse = 1;
4202 
4203         while((skb = tcp_dequeue_established(sk)) == NULL) 
4204         {
4205                 if (flags & O_NONBLOCK) 
4206                 {
4207                         sti();
4208                         release_sock(sk);
4209                         sk->err = EAGAIN;
4210                         return(NULL);
4211                 }
4212 
4213                 release_sock(sk);
4214                 interruptible_sleep_on(sk->sleep);
4215                 if (current->signal & ~current->blocked) 
4216                 {
4217                         sti();
4218                         sk->err = ERESTARTSYS;
4219                         return(NULL);
4220                 }
4221                 sk->inuse = 1;
4222         }
4223         sti();
4224 
4225         /*
4226          *      Now all we need to do is return skb->sk. 
4227          */
4228 
4229         newsk = skb->sk;
4230 
4231         kfree_skb(skb, FREE_READ);
4232         sk->ack_backlog--;
4233         release_sock(sk);
4234         return(newsk);
4235 }
4236 
4237 
4238 /*
4239  *      This will initiate an outgoing connection. 
4240  */
4241  
4242 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
     /* [previous][next][first][last][top][bottom][index][help] */
4243 {
4244         struct sk_buff *buff;
4245         struct device *dev=NULL;
4246         unsigned char *ptr;
4247         int tmp;
4248         int atype;
4249         struct tcphdr *t1;
4250         struct rtable *rt;
4251 
4252         if (sk->state != TCP_CLOSE) 
4253         {
4254                 return(-EISCONN);
4255         }
4256         
4257         if (addr_len < 8) 
4258                 return(-EINVAL);
4259 
4260         if (usin->sin_family && usin->sin_family != AF_INET) 
4261                 return(-EAFNOSUPPORT);
4262 
4263         /*
4264          *      connect() to INADDR_ANY means loopback (BSD'ism).
4265          */
4266         
4267         if(usin->sin_addr.s_addr==INADDR_ANY)
4268                 usin->sin_addr.s_addr=ip_my_addr();
4269                   
4270         /*
4271          *      Don't want a TCP connection going to a broadcast address 
4272          */
4273 
4274         if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
4275                 return -ENETUNREACH;
4276   
4277         sk->inuse = 1;
4278         sk->daddr = usin->sin_addr.s_addr;
4279         sk->write_seq = jiffies * SEQ_TICK - seq_offset;
4280         sk->window_seq = sk->write_seq;
4281         sk->rcv_ack_seq = sk->write_seq -1;
4282         sk->err = 0;
4283         sk->dummy_th.dest = usin->sin_port;
4284         release_sock(sk);
4285 
4286         buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4287         if (buff == NULL) 
4288         {
4289                 return(-ENOMEM);
4290         }
4291         sk->inuse = 1;
4292         buff->len = 24;
4293         buff->sk = sk;
4294         buff->free = 0;
4295         buff->localroute = sk->localroute;
4296         
4297         t1 = (struct tcphdr *) buff->data;
4298 
4299         /*
4300          *      Put in the IP header and routing stuff. 
4301          */
4302          
4303         rt=ip_rt_route(sk->daddr, NULL, NULL);
4304         
4305 
4306         /*
4307          *      We need to build the routing stuff from the things saved in skb. 
4308          */
4309 
4310         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4311                                         IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
4312         if (tmp < 0) 
4313         {
4314                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4315                 release_sock(sk);
4316                 return(-ENETUNREACH);
4317         }
4318 
4319         buff->len += tmp;
4320         t1 = (struct tcphdr *)((char *)t1 +tmp);
4321 
4322         memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4323         t1->seq = ntohl(sk->write_seq++);
4324         sk->sent_seq = sk->write_seq;
4325         buff->h.seq = sk->write_seq;
4326         t1->ack = 0;
4327         t1->window = 2;
4328         t1->res1=0;
4329         t1->res2=0;
4330         t1->rst = 0;
4331         t1->urg = 0;
4332         t1->psh = 0;
4333         t1->syn = 1;
4334         t1->urg_ptr = 0;
4335         t1->doff = 6;
4336         /* use 512 or whatever user asked for */
4337         
4338         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4339                 sk->window_clamp=rt->rt_window;
4340         else
4341                 sk->window_clamp=0;
4342 
4343         if (sk->user_mss)
4344                 sk->mtu = sk->user_mss;
4345         else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
4346                 sk->mtu = rt->rt_mss;
4347         else 
4348         {
4349 #ifdef CONFIG_INET_SNARL
4350                 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
4351 #else
4352                 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
4353 #endif
4354                         sk->mtu = 576 - HEADER_SIZE;
4355                 else
4356                         sk->mtu = MAX_WINDOW;
4357         }
4358         /*
4359          *      but not bigger than device MTU 
4360          */
4361 
4362         if(sk->mtu <32)
4363                 sk->mtu = 32;   /* Sanity limit */
4364                 
4365         sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
4366         
4367         /*
4368          *      Put in the TCP options to say MTU. 
4369          */
4370 
4371         ptr = (unsigned char *)(t1+1);
4372         ptr[0] = 2;
4373         ptr[1] = 4;
4374         ptr[2] = (sk->mtu) >> 8;
4375         ptr[3] = (sk->mtu) & 0xff;
4376         tcp_send_check(t1, sk->saddr, sk->daddr,
4377                   sizeof(struct tcphdr) + 4, sk);
4378 
4379         /*
4380          *      This must go first otherwise a really quick response will get reset. 
4381          */
4382 
4383         tcp_set_state(sk,TCP_SYN_SENT);
4384         sk->rto = TCP_TIMEOUT_INIT;
4385         init_timer(&sk->retransmit_timer);
4386         sk->retransmit_timer.function=&retransmit_timer;
4387         sk->retransmit_timer.data = (unsigned long)sk;
4388         reset_xmit_timer(sk, TIME_WRITE, sk->rto);      /* Timer for repeating the SYN until an answer */
4389         sk->retransmits = TCP_SYN_RETRIES;
4390 
4391         sk->prot->queue_xmit(sk, dev, buff, 0);  
4392         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4393         tcp_statistics.TcpActiveOpens++;
4394         tcp_statistics.TcpOutSegs++;
4395   
4396         release_sock(sk);
4397         return(0);
4398 }
4399 
4400 
/*
 *      This function checks to see if the tcp header is actually
 *      acceptable: does the segment overlap our receive window?
 *      Returns 1 if at least part of it is interesting, 0 to drop.
 *      May transmit a reset or a resynchronising ACK as a side effect.
 */
extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
/* [previous][next][first][last][top][bottom][index][help] */
             struct options *opt, unsigned long saddr, struct device *dev)
{
        unsigned long next_seq;

        /* Payload length: total length minus the TCP header. */
        next_seq = len - 4*th->doff;
        if (th->fin)
                next_seq++;     /* FIN occupies one sequence number */
        /* if we have a zero window, we can't have any data in the packet.. */
        if (next_seq && !sk->window)
                goto ignore_it;
        next_seq += th->seq;    /* sequence number just past this segment */

        /*
         * This isn't quite right.  sk->acked_seq could be more recent
         * than sk->window.  This is however close enough.  We will accept
         * slightly more packets than we should, but it should not cause
         * problems unless someone is trying to forge packets.
         */

        /* have we already seen all of this packet? */
        if (!after(next_seq+1, sk->acked_seq))
                goto ignore_it;
        /* or does it start beyond the window? */
        if (!before(th->seq, sk->acked_seq + sk->window + 1))
                goto ignore_it;

        /* ok, at least part of this packet would seem interesting.. */
        return 1;

ignore_it:
        /* Never answer a reset - avoids reset wars. */
        if (th->rst)
                return 0;

        /*
         *      Send a reset if we get something not ours and we are
         *      unsynchronized. Note: We don't do anything to our end. We
         *      are just killing the bogus remote connection then we will
         *      connect again and it will work (with luck).
         */
         
        if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
        {
                tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
                return 1;
        }

        /* Try to resync things. */
        tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
        return 0;
}
4453 
4454 /*
4455  *      When we get a reset we do this.
4456  */
4457 
4458 static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
     /* [previous][next][first][last][top][bottom][index][help] */
4459 {
4460         sk->zapped = 1;
4461         sk->err = ECONNRESET;
4462         if (sk->state == TCP_SYN_SENT)
4463                 sk->err = ECONNREFUSED;
4464         if (sk->state == TCP_CLOSE_WAIT)
4465                 sk->err = EPIPE;
4466 #ifdef TCP_DO_RFC1337           
4467         /*
4468          *      Time wait assassination protection [RFC1337]
4469          */
4470         if(sk->state!=TCP_TIME_WAIT)
4471         {       
4472                 tcp_set_state(sk,TCP_CLOSE);
4473                 sk->shutdown = SHUTDOWN_MASK;
4474         }
4475 #else   
4476         tcp_set_state(sk,TCP_CLOSE);
4477         sk->shutdown = SHUTDOWN_MASK;
4478 #endif  
4479         if (!sk->dead) 
4480                 sk->state_change(sk);
4481         kfree_skb(skb, FREE_READ);
4482         release_sock(sk);
4483         return(0);
4484 }
4485 
4486 /*
4487  *      A TCP packet has arrived.
4488  */
4489  
4490 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /* [previous][next][first][last][top][bottom][index][help] */
4491         unsigned long daddr, unsigned short len,
4492         unsigned long saddr, int redo, struct inet_protocol * protocol)
4493 {
4494         struct tcphdr *th;
4495         struct sock *sk;
4496         int syn_ok=0;
4497         
4498         if (!skb) 
4499         {
4500                 printk("IMPOSSIBLE 1\n");
4501                 return(0);
4502         }
4503 
4504         if (!dev) 
4505         {
4506                 printk("IMPOSSIBLE 2\n");
4507                 return(0);
4508         }
4509   
4510         tcp_statistics.TcpInSegs++;
4511   
4512         if(skb->pkt_type!=PACKET_HOST)
4513         {
4514                 kfree_skb(skb,FREE_READ);
4515                 return(0);
4516         }
4517   
4518         th = skb->h.th;
4519 
4520         /*
4521          *      Find the socket.
4522          */
4523 
4524         sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4525 
4526         /*
4527          *      If this socket has got a reset its to all intents and purposes 
4528          *      really dead. Count closed sockets as dead.
4529          *
4530          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4531          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
4532          *      exist so should cause resets as if the port was unreachable.
4533          */
4534          
4535         if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4536                 sk=NULL;
4537 
4538         if (!redo) 
4539         {
4540                 if (tcp_check(th, len, saddr, daddr )) 
4541                 {
4542                         skb->sk = NULL;
4543                         kfree_skb(skb,FREE_READ);
4544                         /*
4545                          *      We don't release the socket because it was
4546                          *      never marked in use.
4547                          */
4548                         return(0);
4549                 }
4550                 th->seq = ntohl(th->seq);
4551 
4552                 /* See if we know about the socket. */
4553                 if (sk == NULL) 
4554                 {
4555                         /*
4556                          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4557                          */
4558                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4559                         skb->sk = NULL;
4560                         /*
4561                          *      Discard frame
4562                          */
4563                         kfree_skb(skb, FREE_READ);
4564                         return(0);
4565                 }
4566 
4567                 skb->len = len;
4568                 skb->acked = 0;
4569                 skb->used = 0;
4570                 skb->free = 0;
4571                 skb->saddr = daddr;
4572                 skb->daddr = saddr;
4573         
4574                 /* We may need to add it to the backlog here. */
4575                 cli();
4576                 if (sk->inuse) 
4577                 {
4578                         skb_queue_tail(&sk->back_log, skb);
4579                         sti();
4580                         return(0);
4581                 }
4582                 sk->inuse = 1;
4583                 sti();
4584         }
4585         else
4586         {
4587                 if (sk==NULL) 
4588                 {
4589                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4590                         skb->sk = NULL;
4591                         kfree_skb(skb, FREE_READ);
4592                         return(0);
4593                 }
4594         }
4595 
4596 
4597         if (!sk->prot) 
4598         {
4599                 printk("IMPOSSIBLE 3\n");
4600                 return(0);
4601         }
4602 
4603 
4604         /*
4605          *      Charge the memory to the socket. 
4606          */
4607          
4608         if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf) 
4609         {
4610                 kfree_skb(skb, FREE_READ);
4611                 release_sock(sk);
4612                 return(0);
4613         }
4614 
4615         skb->sk=sk;
4616         sk->rmem_alloc += skb->mem_len;
4617 
4618         /*
4619          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4620          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4621          *      compatibility. We also set up variables more thoroughly [Karn notes in the
4622          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4623          */
4624 
4625         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
4626         {
4627         
4628                 /*
4629                  *      Now deal with unusual cases.
4630                  */
4631          
4632                 if(sk->state==TCP_LISTEN)
4633                 {
4634                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
4635                                 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4636 
4637                         /*
4638                          *      We don't care for RST, and non SYN are absorbed (old segments)
4639                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
4640                          *      netmask on a running connection it can go broadcast. Even Sun's have
4641                          *      this problem so I'm ignoring it 
4642                          */
4643                            
4644                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4645                         {
4646                                 kfree_skb(skb, FREE_READ);
4647                                 release_sock(sk);
4648                                 return 0;
4649                         }
4650                 
4651                         /*      
4652                          *      Guess we need to make a new socket up 
4653                          */
4654                 
4655                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4656                 
4657                         /*
4658                          *      Now we have several options: In theory there is nothing else
4659                          *      in the frame. KA9Q has an option to send data with the syn,
4660                          *      BSD accepts data with the syn up to the [to be] advertised window
4661                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
4662                          *      it, that fits the spec precisely and avoids incompatibilities. It
4663                          *      would be nice in future to drop through and process the data.
4664                          */
4665                          
4666                         release_sock(sk);
4667                         return 0;
4668                 }
4669         
4670                 /* retransmitted SYN? */
4671                 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4672                 {
4673                         kfree_skb(skb, FREE_READ);
4674                         release_sock(sk);
4675                         return 0;
4676                 }
4677                 
4678                 /*
4679                  *      SYN sent means we have to look for a suitable ack and either reset
4680                  *      for bad matches or go to connected 
4681                  */
4682            
4683                 if(sk->state==TCP_SYN_SENT)
4684                 {
4685                         /* Crossed SYN or previous junk segment */
4686                         if(th->ack)
4687                         {
4688                                 /* We got an ack, but its not a good ack */
4689                                 if(!tcp_ack(sk,th,saddr,len))
4690                                 {
4691                                         /* Reset the ack - its an ack from a 
4692                                            different connection  [ th->rst is checked in tcp_reset()] */
4693                                         tcp_statistics.TcpAttemptFails++;
4694                                         tcp_reset(daddr, saddr, th,
4695                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4696                                         kfree_skb(skb, FREE_READ);
4697                                         release_sock(sk);
4698                                         return(0);
4699                                 }
4700                                 if(th->rst)
4701                                         return tcp_std_reset(sk,skb);
4702                                 if(!th->syn)
4703                                 {
4704                                         /* A valid ack from a different connection
4705                                            start. Shouldn't happen but cover it */
4706                                         kfree_skb(skb, FREE_READ);
4707                                         release_sock(sk);
4708                                         return 0;
4709                                 }
4710                                 /*
4711                                  *      Ok.. its good. Set up sequence numbers and
4712                                  *      move to established.
4713                                  */
4714                                 syn_ok=1;       /* Don't reset this connection for the syn */
4715                                 sk->acked_seq=th->seq+1;
4716                                 sk->fin_seq=th->seq;
4717                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4718                                 tcp_set_state(sk, TCP_ESTABLISHED);
4719                                 tcp_options(sk,th);
4720                                 sk->dummy_th.dest=th->source;
4721                                 sk->copied_seq = sk->acked_seq;
4722                                 if(!sk->dead)
4723                                 {
4724                                         sk->state_change(sk);
4725                                         sock_wake_async(sk->socket, 0);
4726                                 }
4727                                 if(sk->max_window==0)
4728                                 {
4729                                         sk->max_window = 32;
4730                                         sk->mss = min(sk->max_window, sk->mtu);
4731                                 }
4732                         }
4733                         else
4734                         {
4735                                 /* See if SYN's cross. Drop if boring */
4736                                 if(th->syn && !th->rst)
4737                                 {
4738                                         /* Crossed SYN's are fine - but talking to
4739                                            yourself is right out... */
4740                                         if(sk->saddr==saddr && sk->daddr==daddr &&
4741                                                 sk->dummy_th.source==th->source &&
4742                                                 sk->dummy_th.dest==th->dest)
4743                                         {
4744                                                 tcp_statistics.TcpAttemptFails++;
4745                                                 return tcp_std_reset(sk,skb);
4746                                         }
4747                                         tcp_set_state(sk,TCP_SYN_RECV);
4748                                         
4749                                         /*
4750                                          *      FIXME:
4751                                          *      Must send SYN|ACK here
4752                                          */
4753                                 }               
4754                                 /* Discard junk segment */
4755                                 kfree_skb(skb, FREE_READ);
4756                                 release_sock(sk);
4757                                 return 0;
4758                         }
4759                         /*
4760                          *      SYN_RECV with data maybe.. drop through
4761                          */
4762                         goto rfc_step6;
4763                 }
4764 
4765         /*
4766          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
4767          *      a more complex suggestion for fixing these reuse issues in RFC1644
4768          *      but not yet ready for general use. Also see RFC1379.
4769          */
4770         
4771 #define BSD_TIME_WAIT
4772 #ifdef BSD_TIME_WAIT
4773                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
4774                         after(th->seq, sk->acked_seq) && !th->rst)
4775                 {
4776                         long seq=sk->write_seq;
4777                         if(sk->debug)
4778                                 printk("Doing a BSD time wait\n");
4779                         tcp_statistics.TcpEstabResets++;           
4780                         sk->rmem_alloc -= skb->mem_len;
4781                         skb->sk = NULL;
4782                         sk->err=ECONNRESET;
4783                         tcp_set_state(sk, TCP_CLOSE);
4784                         sk->shutdown = SHUTDOWN_MASK;
4785                         release_sock(sk);
4786                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4787                         if (sk && sk->state==TCP_LISTEN)
4788                         {
4789                                 sk->inuse=1;
4790                                 skb->sk = sk;
4791                                 sk->rmem_alloc += skb->mem_len;
4792                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
4793                                 release_sock(sk);
4794                                 return 0;
4795                         }
4796                         kfree_skb(skb, FREE_READ);
4797                         return 0;
4798                 }
4799 #endif  
4800         }
4801 
4802         /*
4803          *      We are now in normal data flow (see the step list in the RFC)
4804          *      Note most of these are inline now. I'll inline the lot when
4805          *      I have time to test it hard and look at what gcc outputs 
4806          */
4807         
4808         if(!tcp_sequence(sk,th,len,opt,saddr,dev))
4809         {
4810                 kfree_skb(skb, FREE_READ);
4811                 release_sock(sk);
4812                 return 0;
4813         }
4814 
4815         if(th->rst)
4816                 return tcp_std_reset(sk,skb);
4817         
4818         /*
4819          *      !syn_ok is effectively the state test in RFC793.
4820          */
4821          
4822         if(th->syn && !syn_ok)
4823         {
4824                 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
4825                 return tcp_std_reset(sk,skb);   
4826         }
4827 
4828         /*
4829          *      Process the ACK
4830          */
4831          
4832 
4833         if(th->ack && !tcp_ack(sk,th,saddr,len))
4834         {
4835                 /*
4836                  *      Our three way handshake failed.
4837                  */
4838                  
4839                 if(sk->state==TCP_SYN_RECV)
4840                 {
4841                         tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
4842                 }
4843                 kfree_skb(skb, FREE_READ);
4844                 release_sock(sk);
4845                 return 0;
4846         }
4847         
4848 rfc_step6:              /* I'll clean this up later */
4849 
4850         /*
4851          *      Process urgent data
4852          */
4853                 
4854         if(tcp_urg(sk, th, saddr, len))
4855         {
4856                 kfree_skb(skb, FREE_READ);
4857                 release_sock(sk);
4858                 return 0;
4859         }
4860         
4861         
4862         /*
4863          *      Process the encapsulated data
4864          */
4865         
4866         if(tcp_data(skb,sk, saddr, len))
4867         {
4868                 kfree_skb(skb, FREE_READ);
4869                 release_sock(sk);
4870                 return 0;
4871         }
4872 
4873         /*
4874          *      And done
4875          */     
4876         
4877         release_sock(sk);
4878         return 0;
4879 }
4880 
4881 /*
4882  *      This routine sends a packet with an out of date sequence
4883  *      number. It assumes the other end will try to ack it.
4884  */
4885 
4886 static void tcp_write_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
4887 {
4888         struct sk_buff *buff;
4889         struct tcphdr *t1;
4890         struct device *dev=NULL;
4891         int tmp;
4892 
4893         if (sk->zapped)
4894                 return; /* After a valid reset we can send no more */
4895 
4896         /*
4897          *      Write data can still be transmitted/retransmitted in the
4898          *      following states.  If any other state is encountered, return.
4899          *      [listen/close will never occur here anyway]
4900          */
4901 
4902         if (sk->state != TCP_ESTABLISHED && 
4903             sk->state != TCP_CLOSE_WAIT &&
4904             sk->state != TCP_FIN_WAIT1 && 
4905             sk->state != TCP_LAST_ACK &&
4906             sk->state != TCP_CLOSING
4907         ) 
4908         {
4909                 return;
4910         }
4911 
             /* The probe is best effort: on atomic-allocation failure we
                simply give up; the probe timer path (tcp_send_probe0)
                re-arms itself and will try again later. */
4912         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
4913         if (buff == NULL) 
4914                 return;
4915 
4916         buff->len = sizeof(struct tcphdr);
4917         buff->free = 1;
4918         buff->sk = sk;
4919         buff->localroute = sk->localroute;
4920 
4921         t1 = (struct tcphdr *) buff->data;
4922 
4923         /* Put in the IP header and routing stuff. */
4924         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4925                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
4926         if (tmp < 0) 
4927         {
                     /* Header/route build failed: release the buffer and bail. */
4928                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4929                 return;
4930         }
4931 
4932         buff->len += tmp;
4933         t1 = (struct tcphdr *)((char *)t1 +tmp);
4934 
             /* Start from the per-socket template TCP header, then patch
                the fields that matter for a pure window probe. */
4935         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
4936 
4937         /*
4938          *      Use a previous sequence.
4939          *      This should cause the other end to send an ack.
4940          */
4941          
4942         t1->seq = htonl(sk->sent_seq-1);
4943         t1->ack = 1; 
4944         t1->res1= 0;
4945         t1->res2= 0;
4946         t1->rst = 0;
4947         t1->urg = 0;
4948         t1->psh = 0;
4949         t1->fin = 0;    /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
4950         t1->syn = 0;
             /* NOTE(review): ack_seq/window are converted with ntohl/ntohs
                where htonl/htons would be conventional; the pairs are
                identical transforms on every architecture, so this is
                cosmetic only — but worth normalising some day. */
4951         t1->ack_seq = ntohl(sk->acked_seq);
4952         t1->window = ntohs(tcp_select_window(sk));
4953         t1->doff = sizeof(*t1)/4;
4954         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
4955          /*
4956           *     Send it and free it.
4957           *     This will prevent the timer from automatically being restarted.
4958           */
4959         sk->prot->queue_xmit(sk, dev, buff, 1);
4960         tcp_statistics.TcpOutSegs++;
4961 }
4962 
4963 /*
4964  *      A window probe timeout has occurred.
4965  */
4966 
4967 void tcp_send_probe0(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
4968 {
4969         if (sk->zapped)
4970                 return;         /* After a valid reset we can send no more */
4971 
4972         tcp_write_wakeup(sk);
4973 
4974         sk->backoff++;
4975         sk->rto = min(sk->rto << 1, 120*HZ);
4976         reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
4977         sk->retransmits++;
4978         sk->prot->retransmits ++;
4979 }
4980 
4981 /*
4982  *      Socket option code for TCP. 
4983  */
4984   
4985 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
4986 {
4987         int val,err;
4988 
4989         if(level!=SOL_TCP)
4990                 return ip_setsockopt(sk,level,optname,optval,optlen);
4991 
4992         if (optval == NULL) 
4993                 return(-EINVAL);
4994 
4995         err=verify_area(VERIFY_READ, optval, sizeof(int));
4996         if(err)
4997                 return err;
4998         
4999         val = get_fs_long((unsigned long *)optval);
5000 
5001         switch(optname)
5002         {
5003                 case TCP_MAXSEG:
5004 /*
5005  * values greater than interface MTU won't take effect.  however at
5006  * the point when this call is done we typically don't yet know
5007  * which interface is going to be used
5008  */
5009                         if(val<1||val>MAX_WINDOW)
5010                                 return -EINVAL;
5011                         sk->user_mss=val;
5012                         return 0;
5013                 case TCP_NODELAY:
5014                         sk->nonagle=(val==0)?0:1;
5015                         return 0;
5016                 default:
5017                         return(-ENOPROTOOPT);
5018         }
5019 }
5020 
5021 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5022 {
5023         int val,err;
5024 
5025         if(level!=SOL_TCP)
5026                 return ip_getsockopt(sk,level,optname,optval,optlen);
5027                         
5028         switch(optname)
5029         {
5030                 case TCP_MAXSEG:
5031                         val=sk->user_mss;
5032                         break;
5033                 case TCP_NODELAY:
5034                         val=sk->nonagle;
5035                         break;
5036                 default:
5037                         return(-ENOPROTOOPT);
5038         }
5039         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5040         if(err)
5041                 return err;
5042         put_fs_long(sizeof(int),(unsigned long *) optlen);
5043 
5044         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5045         if(err)
5046                 return err;
5047         put_fs_long(val,(unsigned long *)optval);
5048 
5049         return(0);
5050 }       
5051 
5052 
/*
 *      Protocol operations table for TCP, registered with the INET layer.
 *      Slot comments below are inferred from the handler names; the
 *      authoritative slot order is the struct proto declaration.
 */
5053 struct proto tcp_prot = {
5054         sock_wmalloc,           /* write-side buffer allocation */
5055         sock_rmalloc,           /* read-side buffer allocation */
5056         sock_wfree,             /* write-side buffer free */
5057         sock_rfree,             /* read-side buffer free */
5058         sock_rspace,            /* free receive-buffer space */
5059         sock_wspace,            /* free send-buffer space */
5060         tcp_close,              /* close */
5061         tcp_read,               /* read */
5062         tcp_write,              /* write */
5063         tcp_sendto,             /* sendto */
5064         tcp_recvfrom,           /* recvfrom */
5065         ip_build_header,        /* build_header (IP layer does the work) */
5066         tcp_connect,            /* connect */
5067         tcp_accept,             /* accept */
5068         ip_queue_xmit,          /* queue_xmit (IP layer does the work) */
5069         tcp_retransmit,         /* retransmit */
5070         tcp_write_wakeup,       /* write_wakeup (window probe) */
5071         tcp_read_wakeup,        /* read_wakeup */
5072         tcp_rcv,                /* rcv: input demuxed up from IP */
5073         tcp_select,             /* select */
5074         tcp_ioctl,              /* ioctl */
5075         NULL,                   /* NOTE(review): presumably the init hook - confirm against struct proto */
5076         tcp_shutdown,           /* shutdown */
5077         tcp_setsockopt,         /* setsockopt */
5078         tcp_getsockopt,         /* getsockopt */
5079         128,                    /* NOTE(review): presumably max header space - confirm */
5080         0,                      /* retransmit count, starts at zero */
5081         {NULL,},                /* socket hash array, initially empty */
5082         "TCP",                  /* protocol name */
5083         0, 0                    /* usage counters, initially zero */
5084 };

/* [previous][next][first][last][top][bottom][index][help] */