root/net/inet/tcp.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. min
  2. tcp_set_state
  3. tcp_select_window
  4. tcp_find_established
  5. tcp_dequeue_established
  6. tcp_close_pending
  7. tcp_time_wait
  8. tcp_do_retransmit
  9. reset_xmit_timer
  10. tcp_retransmit_time
  11. tcp_retransmit
  12. tcp_write_timeout
  13. retransmit_timer
  14. tcp_err
  15. tcp_readable
  16. tcp_listen_select
  17. tcp_select
  18. tcp_ioctl
  19. tcp_check
  20. tcp_send_check
  21. tcp_send_skb
  22. tcp_dequeue_partial
  23. tcp_send_partial
  24. tcp_enqueue_partial
  25. tcp_send_ack
  26. tcp_build_header
  27. tcp_write
  28. tcp_sendto
  29. tcp_read_wakeup
  30. cleanup_rbuf
  31. tcp_read_urg
  32. tcp_read
  33. tcp_close_state
  34. tcp_send_fin
  35. tcp_shutdown
  36. tcp_recvfrom
  37. tcp_reset
  38. tcp_options
  39. default_mask
  40. tcp_init_seq
  41. tcp_conn_request
  42. tcp_close
  43. tcp_write_xmit
  44. tcp_ack
  45. tcp_fin
  46. tcp_data
  47. tcp_check_urg
  48. tcp_urg
  49. tcp_accept
  50. tcp_connect
  51. tcp_sequence
  52. tcp_std_reset
  53. tcp_rcv
  54. tcp_write_wakeup
  55. tcp_send_probe0
  56. tcp_setsockopt
  57. tcp_getsockopt

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@no.unit.nvg>
  20  *
  21  * Fixes:       
  22  *              Alan Cox        :       Numerous verify_area() calls
  23  *              Alan Cox        :       Set the ACK bit on a reset
  24  *              Alan Cox        :       Stopped it crashing if it closed while sk->inuse=1
  25  *                                      and was trying to connect (tcp_err()).
  26  *              Alan Cox        :       All icmp error handling was broken
  27  *                                      pointers passed where wrong and the
  28  *                                      socket was looked up backwards. Nobody
  29  *                                      tested any icmp error code obviously.
  30  *              Alan Cox        :       tcp_err() now handled properly. It wakes people
  31  *                                      on errors. select behaves and the icmp error race
  32  *                                      has gone by moving it into sock.c
  33  *              Alan Cox        :       tcp_reset() fixed to work for everything not just
  34  *                                      packets for unknown sockets.
  35  *              Alan Cox        :       tcp option processing.
  36  *              Alan Cox        :       Reset tweaked (still not 100%) [Had syn rule wrong]
  37  *              Herp Rosmanith  :       More reset fixes
  38  *              Alan Cox        :       No longer acks invalid rst frames. Acking
  39  *                                      any kind of RST is right out.
  40  *              Alan Cox        :       Sets an ignore me flag on an rst receive
  41  *                                      otherwise odd bits of prattle escape still
  42  *              Alan Cox        :       Fixed another acking RST frame bug. Should stop
  43  *                                      LAN workplace lockups.
  44  *              Alan Cox        :       Some tidyups using the new skb list facilities
  45  *              Alan Cox        :       sk->keepopen now seems to work
  46  *              Alan Cox        :       Pulls options out correctly on accepts
  47  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  48  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a bit to skb ops.
  49  *              Alan Cox        :       Tidied tcp_data to avoid a potential nasty.
  50  *              Alan Cox        :       Added some better commenting, as the tcp is hard to follow
  51  *              Alan Cox        :       Removed incorrect check for 20 * psh
  52  *      Michael O'Reilly        :       ack < copied bug fix.
  53  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  54  *              Alan Cox        :       FIN with no memory -> CRASH
  55  *              Alan Cox        :       Added socket option proto entries. Also added awareness of them to accept.
  56  *              Alan Cox        :       Added TCP options (SOL_TCP)
  57  *              Alan Cox        :       Switched wakeup calls to callbacks, so the kernel can layer network sockets.
  58  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  59  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  60  *              Alan Cox        :       RST frames sent on unsynchronised state ack error/
  61  *              Alan Cox        :       Put in missing check for SYN bit.
  62  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  63  *                                      window non shrink trick.
  64  *              Alan Cox        :       Added a couple of small NET2E timer fixes
  65  *              Charles Hedrick :       TCP fixes
  66  *              Toomas Tamm     :       TCP window fixes
  67  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  68  *              Charles Hedrick :       Rewrote most of it to actually work
  69  *              Linus           :       Rewrote tcp_read() and URG handling
  70  *                                      completely
  71  *              Gerhard Koerting:       Fixed some missing timer handling
  72  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  73  *              Gerhard Koerting:       PC/TCP workarounds
  74  *              Adam Caldwell   :       Assorted timer/timing errors
  75  *              Matthew Dillon  :       Fixed another RST bug
  76  *              Alan Cox        :       Move to kernel side addressing changes.
  77  *              Alan Cox        :       Beginning work on TCP fastpathing (not yet usable)
  78  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  79  *              Alan Cox        :       TCP fast path debugging
  80  *              Alan Cox        :       Window clamping
  81  *              Michael Riepe   :       Bug in tcp_check()
  82  *              Matt Dillon     :       More TCP improvements and RST bug fixes
   83  *              Matt Dillon     :       Yet more small nasties removed from the TCP code
  84  *                                      (Be very nice to this man if tcp finally works 100%) 8)
  85  *              Alan Cox        :       BSD accept semantics. 
  86  *              Alan Cox        :       Reset on closedown bug.
  87  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
  88  *              Michael Pall    :       Handle select() after URG properly in all cases.
  89  *              Michael Pall    :       Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin).
  90  *              Michael Pall    :       Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now.
  91  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the BSD api.
  92  *              Alan Cox        :       Changed the semantics of sk->socket to 
  93  *                                      fix a race and a signal problem with
  94  *                                      accept() and async I/O.
  95  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
  96  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
  97  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
  98  *                                      clients/servers which listen in on
  99  *                                      fixed ports.
 100  *              Alan Cox        :       Cleaned the above up and shrank it to
 101  *                                      a sensible code size.
 102  *              Alan Cox        :       Self connect lockup fix.
 103  *              Alan Cox        :       No connect to multicast.
 104  *              Ross Biro       :       Close unaccepted children on master
 105  *                                      socket close.
 106  *              Alan Cox        :       Reset tracing code.
 107  *              Alan Cox        :       Spurious resets on shutdown.
 108  *              Alan Cox        :       Giant 15 minute/60 second timer error
 109  *              Alan Cox        :       Small whoops in selecting before an accept.
 110  *              Alan Cox        :       Kept the state trace facility since its
 111  *                                      handy for debugging.
 112  *              Alan Cox        :       More reset handler fixes.
 113  *              Alan Cox        :       Started rewriting the code based on the RFC's
 114  *                                      for other useful protocol references see:  
 115  *                                      Comer, KA9Q NOS, and for a reference on the
 116  *                                      difference between specifications and how BSD
 117  *                                      works see the 4.4lite source.
 118  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 119  *                                      close.
 120  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 121  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 122  *              Alan Cox        :       Reimplemented timers as per the RFC and using multiple
 123  *                                      timers for sanity. 
 124  *              Alan Cox        :       Small bug fixes, and a lot of new
 125  *                                      comments.
 126  *              Alan Cox        :       Fixed dual reader crash by locking
 127  *                                      the buffers (much like datagram.c)
 128  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 129  *                                      now gets fed up of retrying without
 130  *                                      (even a no space) answer.
 131  *              Alan Cox        :       Extracted closing code better
 132  *              Alan Cox        :       Fixed the closing state machine to
 133  *                                      resemble the RFC.
 134  *              Alan Cox        :       More 'per spec' fixes.
 135  *
 136  *
 137  * To Fix:
 138  *              Fast path the code. Two things here - fix the window calculation
 139  *              so it doesn't iterate over the queue, also spot packets with no funny
 140  *              options arriving in order and process directly.
 141  *
 142  *              Implement RFC 1191 [Path MTU discovery]
 143  *              Look at the effect of implementing RFC 1337 suggestions and their impact.
 144  *              Rewrite output state machine to use a single queue and do low window
 145  *              situations as per the spec (RFC 1122)
 146  *              Speed up input assembly algorithm.
 147  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 148  *              could do with it working on IPv4
 149  *              User settable/learned rtt/max window/mtu
 150  *              Cope with MTU/device switches when retransmitting in tcp.
 151  *              Fix the window handling to use PR's new code.
 152  *
 153  *              Change the fundamental structure to a single send queue maintained
 154  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 155  *              active routes too]). Cut the queue off in tcp_retransmit/
 156  *              tcp_transmit.
 157  *              Change the receive queue to assemble as it goes. This lets us
 158  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 159  *              tcp_data/tcp_read as well as the window shrink crud.
 160  *              Seperate out duplicated code - tcp_alloc_skb, tcp_build_ack
 161  *              tcp_queue_skb seem obvious routines to extract.
 162  *      
 163  *              This program is free software; you can redistribute it and/or
 164  *              modify it under the terms of the GNU General Public License
 165  *              as published by the Free Software Foundation; either version
 166  *              2 of the License, or(at your option) any later version.
 167  *
 168  * Description of States:
 169  *
 170  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 171  *
 172  *      TCP_SYN_RECV            received a connection request, sent ack,
 173  *                              waiting for final ack in three-way handshake.
 174  *
 175  *      TCP_ESTABLISHED         connection established
 176  *
 177  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 178  *                              transmission of remaining buffered data
 179  *
 180  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 181  *                              to shutdown
 182  *
 183  *      TCP_CLOSING             both sides have shutdown but we still have
 184  *                              data we have to finish sending
 185  *
 186  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 187  *                              closed, can only be entered from FIN_WAIT2
 188  *                              or CLOSING.  Required because the other end
 189  *                              may not have gotten our last ACK causing it
 190  *                              to retransmit the data packet (which we ignore)
 191  *
 192  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 193  *                              us to finish writing our data and to shutdown
 194  *                              (we have to close() to move on to LAST_ACK)
 195  *
  196  *      TCP_LAST_ACK            our side has shutdown after remote has
 197  *                              shutdown.  There may still be data in our
 198  *                              buffer that we have to finish sending
 199  *              
 200  *      TCP_CLOSE               socket is finished
 201  */
 202 
 203 #include <linux/types.h>
 204 #include <linux/sched.h>
 205 #include <linux/mm.h>
 206 #include <linux/time.h>
 207 #include <linux/string.h>
 208 #include <linux/config.h>
 209 #include <linux/socket.h>
 210 #include <linux/sockios.h>
 211 #include <linux/termios.h>
 212 #include <linux/in.h>
 213 #include <linux/fcntl.h>
 214 #include <linux/inet.h>
 215 #include <linux/netdevice.h>
 216 #include "snmp.h"
 217 #include "ip.h"
 218 #include "protocol.h"
 219 #include "icmp.h"
 220 #include "tcp.h"
 221 #include "arp.h"
 222 #include <linux/skbuff.h>
 223 #include "sock.h"
 224 #include "route.h"
 225 #include <linux/errno.h>
 226 #include <linux/timer.h>
 227 #include <asm/system.h>
 228 #include <asm/segment.h>
 229 #include <linux/mm.h>
 230 
 231 /*
 232  *      The MSL timer is the 'normal' timer.
 233  */
 234  
 235 #define reset_msl_timer(x,y,z)  reset_timer(x,y,z)
 236 
 237 #define SEQ_TICK 3
 238 unsigned long seq_offset;
 239 struct tcp_mib  tcp_statistics;
 240 
 241 static void tcp_close(struct sock *sk, int timeout);
 242 
 243 
 244 /*
 245  *      The less said about this the better, but it works and will do for 1.2 
 246  */
 247 
 248 static struct wait_queue *master_select_wakeup;
 249 
 250 static __inline__ int min(unsigned int a, unsigned int b)
     /* [previous][next][first][last][top][bottom][index][help] */
 251 {
 252         if (a < b) 
 253                 return(a);
 254         return(b);
 255 }
 256 
#undef STATE_TRACE      /* define to trace every TCP state transition via printk */

#ifdef STATE_TRACE
/* Printable names for each TCP_* state, indexed by the state number.
   Used only by the transition trace in tcp_set_state(). */
static char *statename[]={
        "Unused","Established","Syn Sent","Syn Recv",
        "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
        "Close Wait","Last ACK","Listen","Closing"
};
#endif
 266 
/*
 *      Change a socket's TCP state, keeping the SNMP established-connection
 *      gauge (TcpCurrEstab) in step.  Also wakes anyone blocked in select()
 *      on a listening master socket when a pending child completes the
 *      three-way handshake (SYN_RECV -> ESTABLISHED).
 */
static __inline__ void tcp_set_state(struct sock *sk, int state)
{
        if(sk->state==TCP_ESTABLISHED)
                tcp_statistics.TcpCurrEstab--;
#ifdef STATE_TRACE
        if(sk->debug)
                printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
#endif  
        /* This is a hack but it doesn't occur often and its going to
           be a real pain to fix nicely.  The wakeup uses a single global
           wait queue shared by all listening sockets. */
           
        if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
        {
                wake_up_interruptible(&master_select_wakeup);
        }
        sk->state=state;
        if(state==TCP_ESTABLISHED)
                tcp_statistics.TcpCurrEstab++;
}
 286 
 287 /*
 288  *      This routine picks a TCP windows for a socket based on
 289  *      the following constraints
 290  *  
 291  *      1. The window can never be shrunk once it is offered (RFC 793)
 292  *      2. We limit memory per socket
 293  *   
 294  *      For now we use NET2E3's heuristic of offering half the memory
 295  *      we have handy. All is not as bad as this seems however because
 296  *      of two things. Firstly we will bin packets even within the window
 297  *      in order to get the data we are waiting for into the memory limit.
 298  *      Secondly we bin common duplicate forms at receive time
 299  *      Better heuristics welcome
 300  */
 301    
 302 int tcp_select_window(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 303 {
 304         int new_window = sk->prot->rspace(sk);
 305         
 306         if(sk->window_clamp)
 307                 new_window=min(sk->window_clamp,new_window);
 308         /*
 309          *      Two things are going on here.  First, we don't ever offer a
 310          *      window less than min(sk->mss, MAX_WINDOW/2).  This is the
 311          *      receiver side of SWS as specified in RFC1122.
 312          *      Second, we always give them at least the window they
 313          *      had before, in order to avoid retracting window.  This
 314          *      is technically allowed, but RFC1122 advises against it and
 315          *      in practice it causes trouble.
 316          *
 317          *      Fixme: This doesn't correctly handle the case where
 318          *      new_window > sk->window but not by enough to allow for the
 319          *      shift in sequence space. 
 320          */
 321         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
 322                 return(sk->window);
 323         return(new_window);
 324 }
 325 
 326 /*
 327  *      Find someone to 'accept'. Must be called with
 328  *      sk->inuse=1 or cli()
 329  */ 
 330 
 331 static struct sk_buff *tcp_find_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 332 {
 333         struct sk_buff *p=skb_peek(&s->receive_queue);
 334         if(p==NULL)
 335                 return NULL;
 336         do
 337         {
 338                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 339                         return p;
 340                 p=p->next;
 341         }
 342         while(p!=(struct sk_buff *)&s->receive_queue);
 343         return NULL;
 344 }
 345 
 346 /*
 347  *      Remove a completed connection and return it. This is used by
 348  *      tcp_accept() to get connections from the queue.
 349  */
 350 
 351 static struct sk_buff *tcp_dequeue_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 352 {
 353         struct sk_buff *skb;
 354         unsigned long flags;
 355         save_flags(flags);
 356         cli(); 
 357         skb=tcp_find_established(s);
 358         if(skb!=NULL)
 359                 skb_unlink(skb);        /* Take it off the queue */
 360         restore_flags(flags);
 361         return skb;
 362 }
 363 
 364 /* 
 365  *      This routine closes sockets which have been at least partially
 366  *      opened, but not yet accepted. Currently it is only called by
 367  *      tcp_close, and timeout mirrors the value there. 
 368  */
 369 
 370 static void tcp_close_pending (struct sock *sk) 
     /* [previous][next][first][last][top][bottom][index][help] */
 371 {
 372         struct sk_buff *skb;
 373 
 374         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
 375                 tcp_close(skb->sk, 0);
 376                 kfree_skb(skb, FREE_READ);
 377         }
 378         return;
 379 }
 380 
/*
 *      Enter the time wait state.
 *
 *      Both directions are shut down, sleepers are told of the state
 *      change, and the MSL timer is armed so the socket is finally
 *      destroyed after TCP_TIMEWAIT_LEN (catching any stray retransmits
 *      from the other end in the meantime).
 */

static void tcp_time_wait(struct sock *sk)
{
        tcp_set_state(sk,TCP_TIME_WAIT);
        sk->shutdown = SHUTDOWN_MASK;   /* no further sends or receives */
        if (!sk->dead)
                sk->state_change(sk);   /* wake anyone sleeping on this socket */
        reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}
 393 
 394 /*
 395  *      A socket has timed out on its send queue and wants to do a
 396  *      little retransmitting. Currently this means TCP.
 397  */
 398 
 399 void tcp_do_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 400 {
 401         struct sk_buff * skb;
 402         struct proto *prot;
 403         struct device *dev;
 404         int ct=0;
 405 
 406         prot = sk->prot;
 407         skb = sk->send_head;
 408 
 409         while (skb != NULL)
 410         {
 411                 struct tcphdr *th;
 412                 struct iphdr *iph;
 413                 int size;
 414 
 415                 dev = skb->dev;
 416                 IS_SKB(skb);
 417                 skb->when = jiffies;
 418 
 419                 /*
 420                  * In general it's OK just to use the old packet.  However we
 421                  * need to use the current ack and window fields.  Urg and
 422                  * urg_ptr could possibly stand to be updated as well, but we
 423                  * don't keep the necessary data.  That shouldn't be a problem,
 424                  * if the other end is doing the right thing.  Since we're
 425                  * changing the packet, we have to issue a new IP identifier.
 426                  */
 427 
 428                 iph = (struct iphdr *)(skb->data + dev->hard_header_len);
 429                 th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
 430                 size = skb->len - (((unsigned char *) th) - skb->data);
 431                 
 432                 /*
 433                  *      Note: We ought to check for window limits here but
 434                  *      currently this is done (less efficiently) elsewhere.
 435                  *      We do need to check for a route change but can't handle
 436                  *      that until we have the new 1.3.x buffers in.
 437                  *
 438                  */
 439 
 440                 iph->id = htons(ip_id_count++);
 441                 ip_send_check(iph);
 442 
 443                 /*
 444                  *      This is not the right way to handle this. We have to
 445                  *      issue an up to date window and ack report with this 
 446                  *      retransmit to keep the odd buggy tcp that relies on 
 447                  *      the fact BSD does this happy. 
 448                  *      We don't however need to recalculate the entire 
 449                  *      checksum, so someone wanting a small problem to play
 450                  *      with might like to implement RFC1141/RFC1624 and speed
 451                  *      this up by avoiding a full checksum.
 452                  */
 453                  
 454                 th->ack_seq = ntohl(sk->acked_seq);
 455                 th->window = ntohs(tcp_select_window(sk));
 456                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
 457                 
 458                 /*
 459                  *      If the interface is (still) up and running, kick it.
 460                  */
 461 
 462                 if (dev->flags & IFF_UP)
 463                 {
 464                         /*
 465                          *      If the packet is still being sent by the device/protocol
 466                          *      below then don't retransmit. This is both needed, and good -
 467                          *      especially with connected mode AX.25 where it stops resends
 468                          *      occurring of an as yet unsent anyway frame!
 469                          *      We still add up the counts as the round trip time wants
 470                          *      adjusting.
 471                          */
 472                         if (sk && !skb_device_locked(skb))
 473                         {
 474                                 /* Remove it from any existing driver queue first! */
 475                                 skb_unlink(skb);
 476                                 /* Now queue it */
 477                                 ip_statistics.IpOutRequests++;
 478                                 dev_queue_xmit(skb, dev, sk->priority);
 479                         }
 480                 }
 481 
 482                 /*
 483                  *      Count retransmissions
 484                  */
 485                  
 486                 ct++;
 487                 sk->prot->retransmits ++;
 488 
 489                 /*
 490                  *      Only one retransmit requested.
 491                  */
 492         
 493                 if (!all)
 494                         break;
 495 
 496                 /*
 497                  *      This should cut it off before we send too many packets.
 498                  */
 499 
 500                 if (ct >= sk->cong_window)
 501                         break;
 502                 skb = skb->link3;
 503         }
 504 }
 505 
/*
 *      Reset the retransmission timer.
 *
 *      'why' records which event the timer is now armed for (TIME_WRITE,
 *      TIME_PROBE0, ...) in sk->ip_xmit_timeout; 'when' is the delay in
 *      ticks.  NOTE(review): expires is assigned the raw delay, so
 *      add_timer() presumably treats expires as relative to now in this
 *      kernel -- confirm against its timer semantics.
 */
 
static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
{
        del_timer(&sk->retransmit_timer);
        sk->ip_xmit_timeout = why;
        if((int)when < 0)
        {
                /* Clamp a nonsense (negative) delay to a short retry and complain. */
                when=3;
                printk("Error: Negative timer in xmit_timer\n");
        }
        sk->retransmit_timer.expires=when;
        add_timer(&sk->retransmit_timer);
}
 522 
/*
 *      This is the normal code called for timeouts.  It does the retransmission
 *      and then does backoff.  tcp_do_retransmit is separated out because
 *      tcp_ack needs to send stuff from the retransmit queue without
 *      initiating a backoff.
 */


void tcp_retransmit_time(struct sock *sk, int all)
{
        tcp_do_retransmit(sk, all);

        /*
         * Increase the timeout each time we retransmit.  Note that
         * we do not increase the rtt estimate.  rto is initialized
         * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
         * that doubling rto each time is the least we can get away with.
         * In KA9Q, Karn uses this for the first few times, and then
         * goes to quadratic.  netBSD doubles, but only goes up to *64,
         * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
         * defined in the protocol as the maximum possible RTT.  I guess
         * we'll have to use something other than TCP to talk to the
         * University of Mars.
         *
         * PAWS allows us longer timeouts and large windows, so once
         * implemented ftp to mars will work nicely. We will have to fix
         * the 120 second clamps though!
         */

        sk->retransmits++;
        sk->backoff++;
        sk->rto = min(sk->rto << 1, 120*HZ);    /* exponential backoff, clamped at 120s */
        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
}
 557 
 558 
 559 /*
 560  *      A timer event has trigger a tcp retransmit timeout. The
 561  *      socket xmit queue is ready and set up to send. Because
 562  *      the ack receive code keeps the queue straight we do
 563  *      nothing clever here.
 564  */
 565 
 566 static void tcp_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 567 {
 568         if (all) 
 569         {
 570                 tcp_retransmit_time(sk, all);
 571                 return;
 572         }
 573 
 574         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 575         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 576         sk->cong_count = 0;
 577 
 578         sk->cong_window = 1;
 579 
 580         /* Do the actual retransmit. */
 581         tcp_retransmit_time(sk, all);
 582 }
 583 
/*
 *      A write timeout has occurred. Process the after effects.
 *
 *      Returns 0 if the socket was moved to TCP_CLOSE (caller must not
 *      touch it further), 1 if the connection is still considered alive.
 */

static int tcp_write_timeout(struct sock *sk)
{
        /*
         *      Look for a 'soft' timeout: every eighth retransmit on an
         *      established connection, or past TCP_RETR1 otherwise.
         */
        if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
                || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
        {
                /*
                 *      Attempt to recover if arp has changed (unlikely!) or
                 *      a route has shifted (not supported prior to 1.3).
                 */
                arp_destroy (sk->daddr, 0);
                ip_route_check (sk->daddr);
        }
        /*
         *      Has it gone just too far ?
         */
        if (sk->retransmits > TCP_RETR2) 
        {
                sk->err = ETIMEDOUT;
                sk->error_report(sk);   /* tell the user the connection died */
                /*
                 *      Time wait the socket 
                 */
                if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING) 
                {
                        tcp_set_state(sk,TCP_TIME_WAIT);
                        reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                }
                else
                {
                        /*
                         *      Clean up time.
                         */
                        tcp_set_state(sk, TCP_CLOSE);
                        return 0;
                }
        }
        return 1;
}
 629 
 630 /*
 631  *      The TCP retransmit timer. This lacks a few small details.
 632  *
 633  *      1.      An initial rtt timeout on the probe0 should cause what we can
 634  *              of the first write queue buffer to be split and sent.
 635  *      2.      On a 'major timeout' as defined by RFC1122 we shouldn't report
 636  *              ETIMEDOUT if we know an additional 'soft' error caused this.
 637  *              tcp_err should save a 'soft error' for us.
 638  */
 639 
static void retransmit_timer(unsigned long data)
{
        struct sock *sk = (struct sock*)data;
        int why = sk->ip_xmit_timeout;  /* reason the timer was armed */

        /* 
         * only process if socket is not in use
         */

        cli();
        if (sk->inuse || in_bh) 
        {
                /* Try again in 1 second */
                sk->retransmit_timer.expires = HZ;
                add_timer(&sk->retransmit_timer);
                sti();
                return;
        }

        sk->inuse = 1;
        sti();

        /* Always see if we need to send an ack. */

        if (sk->ack_backlog && !sk->zapped) 
        {
                sk->prot->read_wakeup (sk);
                if (! sk->dead)
                        sk->data_ready(sk,0);
        }

        /* Now we need to figure out why the socket was on the timer. */

        switch (why) 
        {
                /* Window probing */
                case TIME_PROBE0:
                        tcp_send_probe0(sk);
                        /* tcp_write_timeout() returns 0 only after it has moved
                         * the socket to TCP_CLOSE; don't release it in that case. */
                        if(tcp_write_timeout(sk))
                                release_sock (sk);
                        break;
                /* Retransmitting */
                case TIME_WRITE:
                        /* It could be we got here because we needed to send an ack.
                         * So we need to check for that.
                         */
                {
                        struct sk_buff *skb;
                        unsigned long flags;

                        save_flags(flags);
                        cli();
                        skb = sk->send_head;
                        if (!skb) 
                        {
                                /* Nothing unacknowledged: the ack above was all we owed. */
                                restore_flags(flags);
                        } 
                        else 
                        {
                                /*
                                 *      Kicked by a delayed ack. Reset timer
                                 *      correctly now
                                 */
                                if (jiffies < skb->when + sk->rto) 
                                {
                                        reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
                                        restore_flags(flags);
                                        release_sock (sk);
                                        break;
                                }
                                restore_flags(flags);
                                /*
                                 *      Retransmission
                                 */
                                sk->prot->retransmit (sk, 0);
                                if(!tcp_write_timeout(sk))
                                        break;
                        }
                        release_sock (sk);
                        break;
                }
                /* Sending Keepalives */
                case TIME_KEEPOPEN:
                        /* 
                         * this reset_timer() call is a hack, this is not
                         * how KEEPOPEN is supposed to work.
                         */
                        reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

                        /* Send something to keep the connection open. */
                        if (sk->prot->write_wakeup)
                                  sk->prot->write_wakeup (sk);
                        sk->retransmits++;
                        if(tcp_write_timeout(sk))
                                release_sock (sk);
                        break;
                default:
                        printk ("rexmit_timer: timer expired - reason unknown\n");
                        release_sock (sk);
                        break;
        }
}
 742 
 743 /*
 744  * This routine is called by the ICMP module when it gets some
 745  * sort of error condition.  If err < 0 then the socket should
 746  * be closed and the error returned to the user.  If err > 0
 747  * it's just the icmp type << 8 | icmp code.  After adjustment
 748  * header points to the first 8 bytes of the tcp header.  We need
 749  * to find the appropriate port.
 750  */
 751 
 752 void tcp_err(int err, unsigned char *header, unsigned long daddr,
     /* [previous][next][first][last][top][bottom][index][help] */
 753         unsigned long saddr, struct inet_protocol *protocol)
 754 {
 755         struct tcphdr *th;
 756         struct sock *sk;
 757         struct iphdr *iph=(struct iphdr *)header;
 758   
 759         header+=4*iph->ihl;
 760    
 761 
 762         th =(struct tcphdr *)header;
 763         sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
 764 
 765         if (sk == NULL) 
 766                 return;
 767   
 768         if(err<0)
 769         {
 770                 sk->err = -err;
 771                 sk->error_report(sk);
 772                 return;
 773         }
 774 
 775         if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8)) 
 776         {
 777                 /*
 778                  * FIXME:
 779                  * For now we will just trigger a linear backoff.
 780                  * The slow start code should cause a real backoff here.
 781                  */
 782                 if (sk->cong_window > 4)
 783                         sk->cong_window--;
 784                 return;
 785         }
 786 
 787 /*      sk->err = icmp_err_convert[err & 0xff].errno;  -- moved as TCP should hide non fatals internally (and does) */
 788 
 789         /*
 790          * If we've already connected we will keep trying
 791          * until we time out, or the user gives up.
 792          */
 793 
 794         if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT) 
 795         {
 796                 if (sk->state == TCP_SYN_SENT) 
 797                 {
 798                         tcp_statistics.TcpAttemptFails++;
 799                         tcp_set_state(sk,TCP_CLOSE);
 800                         sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
 801                 }
 802                 sk->err = icmp_err_convert[err & 0xff].errno;           
 803         }
 804         return;
 805 }
 806 
 807 
 808 /*
 809  *      Walk down the receive queue counting readable data until we hit the end or we find a gap
 810  *      in the received data queue (ie a frame missing that needs sending to us). Not
 811  *      sorting using two queues as data arrives makes life so much harder.
 812  */
 813 
static int tcp_readable(struct sock *sk)
{
        unsigned long counted;          /* next sequence number to account for */
        unsigned long amount;           /* readable bytes accumulated so far */
        struct sk_buff *skb;
        int sum;
        unsigned long flags;

        if(sk && sk->debug)
                printk("tcp_readable: %p - ",sk);

        /* Walk the receive queue with interrupts off so it cannot change under us. */
        save_flags(flags);
        cli();
        if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
        {
                restore_flags(flags);
                if(sk && sk->debug) 
                        printk("empty\n");
                return(0);
        }
  
        counted = sk->copied_seq;       /* Where we are at the moment */
        amount = 0;
  
        /* 
         *      Do until a push or until we are out of data. 
         */
         
        do 
        {
                if (before(counted, skb->h.th->seq))    /* Found a hole so stops here */
                        break;
                sum = skb->len -(counted - skb->h.th->seq);     /* Length - header but start from where we are up to (avoid overlaps) */
                if (skb->h.th->syn)
                        sum++;          /* SYN occupies one sequence number but no data byte */
                if (sum > 0) 
                {                                       /* Add it up, move on */
                        amount += sum;
                        if (skb->h.th->syn) 
                                amount--;               /* the SYN itself is not readable data */
                        counted += sum;
                }
                /*
                 * Don't count urg data ... but do it in the right place!
                 * Consider: "old_data (ptr is here) URG PUSH data"
                 * The old code would stop at the first push because
                 * it counted the urg (amount==1) and then does amount--
                 * *after* the loop.  This means tcp_readable() always
                 * returned zero if any URG PUSH was in the queue, even
                 * though there was normal data available. If we subtract
                 * the urg data right here, we even get it to work for more
                 * than one URG PUSH skb without normal data.
                 * This means that select() finally works now with urg data
                 * in the queue.  Note that rlogin was never affected
                 * because it doesn't use select(); it uses two processes
                 * and a blocking read().  And the queue scan in tcp_read()
                 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
                 */
                if (skb->h.th->urg)
                        amount--;       /* don't count urg data */
                if (amount && skb->h.th->psh) break;
                skb = skb->next;
        }
        while(skb != (struct sk_buff *)&sk->receive_queue);

        restore_flags(flags);
        if(sk->debug)
                printk("got %lu bytes.\n",amount);
        return(amount);
}
 884 
 885 /*
 886  * LISTEN is a special case for select..
 887  */
 888 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
 889 {
 890         if (sel_type == SEL_IN) {
 891                 int retval;
 892 
 893                 sk->inuse = 1;
 894                 retval = (tcp_find_established(sk) != NULL);
 895                 release_sock(sk);
 896                 if (!retval)
 897                         select_wait(&master_select_wakeup,wait);
 898                 return retval;
 899         }
 900         return 0;
 901 }
 902 
 903 
 904 /*
 905  *      Wait for a TCP event.
 906  *
 907  *      Note that we don't need to set "sk->inuse", as the upper select layers
 908  *      take care of normal races (between the test and the event) and we don't
 909  *      go look at any of the socket buffers directly.
 910  */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
        /* Listening sockets are a special case - see tcp_listen_select(). */
        if (sk->state == TCP_LISTEN)
                return tcp_listen_select(sk, sel_type, wait);

        switch(sel_type) {
        case SEL_IN:
                if (sk->err)
                        return 1;       /* a pending error is always 'readable' */
                if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
                        break;          /* handshake incomplete - nothing to read yet */

                if (sk->shutdown & RCV_SHUTDOWN)
                        return 1;       /* EOF is readable */
                        
                if (sk->acked_seq == sk->copied_seq)
                        break;          /* no acked data beyond what was copied out */

                /*
                 *      Data is available.  Report readable unless the only
                 *      byte present is out-of-line urgent data.
                 */
                if (sk->urg_seq != sk->copied_seq ||
                    sk->acked_seq != sk->copied_seq+1 ||
                    sk->urginline || !sk->urg_data)
                        return 1;
                break;

        case SEL_OUT:
                if (sk->shutdown & SEND_SHUTDOWN) 
                        return 0;       /* writes can never succeed again */
                if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
                        break;
                /*
                 * This is now right thanks to a small fix
                 * by Matt Dillon.
                 */

                /* Writable only if a full-sized segment still fits in the send space. */
                if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
                        break;
                return 1;

        case SEL_EX:
                if (sk->err || sk->urg_data)
                        return 1;       /* exceptional condition: error or urgent data */
                break;
        }
        /* Nothing ready - register for a wakeup on this socket. */
        select_wait(sk->sleep, wait);
        return 0;
}
 957 
 958 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
     /* [previous][next][first][last][top][bottom][index][help] */
 959 {
 960         int err;
 961         switch(cmd) 
 962         {
 963 
 964                 case TIOCINQ:
 965 #ifdef FIXME    /* FIXME: */
 966                 case FIONREAD:
 967 #endif
 968                 {
 969                         unsigned long amount;
 970 
 971                         if (sk->state == TCP_LISTEN) 
 972                                 return(-EINVAL);
 973 
 974                         sk->inuse = 1;
 975                         amount = tcp_readable(sk);
 976                         release_sock(sk);
 977                         err=verify_area(VERIFY_WRITE,(void *)arg,
 978                                                    sizeof(unsigned long));
 979                         if(err)
 980                                 return err;
 981                         put_fs_long(amount,(unsigned long *)arg);
 982                         return(0);
 983                 }
 984                 case SIOCATMARK:
 985                 {
 986                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
 987 
 988                         err = verify_area(VERIFY_WRITE,(void *) arg,
 989                                                   sizeof(unsigned long));
 990                         if (err)
 991                                 return err;
 992                         put_fs_long(answ,(int *) arg);
 993                         return(0);
 994                 }
 995                 case TIOCOUTQ:
 996                 {
 997                         unsigned long amount;
 998 
 999                         if (sk->state == TCP_LISTEN) return(-EINVAL);
1000                         amount = sk->prot->wspace(sk);
1001                         err=verify_area(VERIFY_WRITE,(void *)arg,
1002                                                    sizeof(unsigned long));
1003                         if(err)
1004                                 return err;
1005                         put_fs_long(amount,(unsigned long *)arg);
1006                         return(0);
1007                 }
1008                 default:
1009                         return(-EINVAL);
1010         }
1011 }
1012 
1013 
1014 /*
1015  *      This routine computes a TCP checksum. 
1016  */
1017  
unsigned short tcp_check(struct tcphdr *th, int len,
          unsigned long saddr, unsigned long daddr)
{     
        unsigned long sum;
   
        if (saddr == 0) saddr = ip_my_addr();   /* 0 means "use our own address" */

/*
 * stupid, gcc complains when I use just one __asm__ block,
 * something about too many reloads, but this is just two
 * instructions longer than what I want
 */
        /* First block: sum the pseudo-header (daddr + saddr + proto/length). */
        __asm__("
            addl %%ecx, %%ebx
            adcl %%edx, %%ebx
            adcl $0, %%ebx
            "
        : "=b"(sum)
        : "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
        : "bx", "cx", "dx" );
        /*
         * Second block: add the TCP header and data into the sum, 32 bytes
         * at a time, then the 4-byte, 2-byte and 1-byte tails, and finally
         * fold the carries down into the low 16 bits.
         */
        __asm__("
            movl %%ecx, %%edx
            cld
            cmpl $32, %%ecx
            jb 2f
            shrl $5, %%ecx
            clc
1:          lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            loop 1b
            adcl $0, %%ebx
            movl %%edx, %%ecx
2:          andl $28, %%ecx
            je 4f
            shrl $2, %%ecx
            clc
3:          lodsl
            adcl %%eax, %%ebx
            loop 3b
            adcl $0, %%ebx
4:          movl $0, %%eax
            testw $2, %%dx
            je 5f
            lodsw
            addl %%eax, %%ebx
            adcl $0, %%ebx
            movw $0, %%ax
5:          test $1, %%edx
            je 6f
            lodsb
            addl %%eax, %%ebx
            adcl $0, %%ebx
6:          movl %%ebx, %%eax
            shrl $16, %%eax
            addw %%ax, %%bx
            adcw $0, %%bx
            "
        : "=b"(sum)
        : "0"(sum), "c"(len), "S"(th)
        : "ax", "bx", "cx", "dx", "si" );

        /* We only want the bottom 16 bits, but we never cleared the top 16. */
  
        return((~sum) & 0xffff);
}
1097 
1098 
1099 
1100 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
     /* [previous][next][first][last][top][bottom][index][help] */
1101                 unsigned long daddr, int len, struct sock *sk)
1102 {
1103         th->check = 0;
1104         th->check = tcp_check(th, len, saddr, daddr);
1105         return;
1106 }
1107 
1108 /*
1109  *      This is the main buffer sending routine. We queue the buffer
1110  *      having checked it is sane seeming.
1111  */
1112  
1113 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
     /* [previous][next][first][last][top][bottom][index][help] */
1114 {
1115         int size;
1116         struct tcphdr * th = skb->h.th;
1117 
1118         /*
1119          *      length of packet (not counting length of pre-tcp headers) 
1120          */
1121          
1122         size = skb->len - ((unsigned char *) th - skb->data);
1123 
1124         /*
1125          *      Sanity check it.. 
1126          */
1127          
1128         if (size < sizeof(struct tcphdr) || size > skb->len) 
1129         {
1130                 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1131                         skb, skb->data, th, skb->len);
1132                 kfree_skb(skb, FREE_WRITE);
1133                 return;
1134         }
1135 
1136         /*
1137          *      If we have queued a header size packet.. (these crash a few
1138          *      tcp stacks if ack is not set)
1139          */
1140          
1141         if (size == sizeof(struct tcphdr)) 
1142         {
1143                 /* If its got a syn or fin its notionally included in the size..*/
1144                 if(!th->syn && !th->fin) 
1145                 {
1146                         printk("tcp_send_skb: attempt to queue a bogon.\n");
1147                         kfree_skb(skb,FREE_WRITE);
1148                         return;
1149                 }
1150         }
1151 
1152         /*
1153          *      Actual processing.
1154          */
1155          
1156         tcp_statistics.TcpOutSegs++;  
1157         skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1158         
1159         /*
1160          *      We must queue if
1161          *
1162          *      a) The right edge of this frame exceeds the window
1163          *      b) We are retransmitting (Nagle's rule)
1164          *      c) We have too many packets 'in flight'
1165          */
1166          
1167         if (after(skb->h.seq, sk->window_seq) ||
1168             (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1169              sk->packets_out >= sk->cong_window) 
1170         {
1171                 /* checksum will be supplied by tcp_write_xmit.  So
1172                  * we shouldn't need to set it at all.  I'm being paranoid */
1173                 th->check = 0;
1174                 if (skb->next != NULL) 
1175                 {
1176                         printk("tcp_send_partial: next != NULL\n");
1177                         skb_unlink(skb);
1178                 }
1179                 skb_queue_tail(&sk->write_queue, skb);
1180                 
1181                 /*
1182                  *      If we don't fit we have to start the zero window
1183                  *      probes. This is broken - we really need to do a partial
1184                  *      send _first_ (This is what causes the Cisco and PC/TCP
1185                  *      grief).
1186                  */
1187                  
1188                 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1189                     sk->send_head == NULL && sk->ack_backlog == 0)
1190                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1191         } 
1192         else 
1193         {
1194                 /*
1195                  *      This is going straight out
1196                  */
1197                  
1198                 th->ack_seq = ntohl(sk->acked_seq);
1199                 th->window = ntohs(tcp_select_window(sk));
1200 
1201                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1202 
1203                 sk->sent_seq = sk->write_seq;
1204                 
1205                 /*
1206                  *      This is mad. The tcp retransmit queue is put together
1207                  *      by the ip layer. This causes half the problems with
1208                  *      unroutable FIN's and other things.
1209                  */
1210                  
1211                 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1212                 
1213                 /*
1214                  *      Set for next retransmit based on expected ACK time.
1215                  *      FIXME: We set this every time which means our 
1216                  *      retransmits are really about a window behind.
1217                  */
1218 
1219                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1220         }
1221 }
1222 
1223 /*
1224  *      Locking problems lead us to a messy situation where we can have
1225  *      multiple partially complete buffers queued up. This is really bad
1226  *      as we don't want to be sending partial buffers. Fix this with
1227  *      a semaphore or similar to lock tcp_write per socket.
1228  *
1229  *      These routines are pretty self descriptive.
1230  */
1231  
1232 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1233 {
1234         struct sk_buff * skb;
1235         unsigned long flags;
1236 
1237         save_flags(flags);
1238         cli();
1239         skb = sk->partial;
1240         if (skb) {
1241                 sk->partial = NULL;
1242                 del_timer(&sk->partial_timer);
1243         }
1244         restore_flags(flags);
1245         return skb;
1246 }
1247 
1248 /*
1249  *      Empty the partial queue
1250  */
1251  
1252 static void tcp_send_partial(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1253 {
1254         struct sk_buff *skb;
1255 
1256         if (sk == NULL)
1257                 return;
1258         while ((skb = tcp_dequeue_partial(sk)) != NULL)
1259                 tcp_send_skb(sk, skb);
1260 }
1261 
1262 /*
1263  *      Queue a partial frame
1264  */
1265  
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
        struct sk_buff * tmp;
        unsigned long flags;

        /* Swap the new frame in atomically; any displaced one is sent below. */
        save_flags(flags);
        cli();
        tmp = sk->partial;
        if (tmp)
                del_timer(&sk->partial_timer);
        sk->partial = skb;
        init_timer(&sk->partial_timer);
        /*
         *      Wait up to 1 second for the buffer to fill.
         *      (expires is relative here - same convention as the 1 second
         *      retry in retransmit_timer above)
         */
        sk->partial_timer.expires = HZ;
        sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
        sk->partial_timer.data = (unsigned long) sk;
        add_timer(&sk->partial_timer);
        restore_flags(flags);
        /* Transmit the old partial frame outside the interrupts-off window. */
        if (tmp)
                tcp_send_skb(sk, tmp);
}
1289 
1290 
1291 /*
1292  *      This routine sends an ack and also updates the window. 
1293  */
1294  
static void tcp_send_ack(unsigned long sequence, unsigned long ack,
             struct sock *sk,
             struct tcphdr *th, unsigned long daddr)
{
        struct sk_buff *buff;
        struct tcphdr *t1;
        struct device *dev = NULL;
        int tmp;

        if(sk->zapped)
                return;         /* We have been reset, we may not send again */
                
        /*
         * We need to grab some memory, and put together an ack,
         * and then put it into the queue to be sent.
         */

        buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
        if (buff == NULL) 
        {
                /* 
                 *      Force it to send an ack. We don't have to do this
                 *      (ACK is unreliable) but its much better use of 
                 *      bandwidth on slow links to send a spare ack than
                 *      resend packets. 
                 */
                 
                sk->ack_backlog++;
                if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) 
                {
                        reset_xmit_timer(sk, TIME_WRITE, HZ);
                }
                return;
        }

        /*
         *      Assemble a suitable TCP frame
         */
         
        buff->len = sizeof(struct tcphdr);
        buff->sk = sk;
        buff->localroute = sk->localroute;
        t1 =(struct tcphdr *) buff->data;

        /* 
         *      Put in the IP header and routing stuff. 
         */
         
        tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                /* Header build failed - give the buffer back. */
                buff->free = 1;
                sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
                return;
        }
        buff->len += tmp;
        t1 =(struct tcphdr *)((char *)t1 +tmp);

        /* Start from the incoming header as a template. */
        memcpy(t1, th, sizeof(*t1));

        /*
         *      Swap the send and the receive. 
         */
         
        t1->dest = th->source;
        t1->source = th->dest;
        /* NOTE(review): ntohl used where htonl is meant here and for
         * ack_seq below - identical operation on i386. */
        t1->seq = ntohl(sequence);
        t1->ack = 1;
        sk->window = tcp_select_window(sk);
        t1->window = ntohs(sk->window);
        t1->res1 = 0;
        t1->res2 = 0;
        t1->rst = 0;
        t1->urg = 0;
        t1->syn = 0;
        t1->psh = 0;
        t1->fin = 0;
        
        /*
         *      If we have nothing queued for transmit and the transmit timer
         *      is on we are just doing an ACK timeout and need to switch
         *      to a keepalive.
         */
         
        if (ack == sk->acked_seq) 
        {
                sk->ack_backlog = 0;
                sk->bytes_rcv = 0;
                sk->ack_timed = 0;
                if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
                                  && sk->ip_xmit_timeout == TIME_WRITE) 
                {
                        if(sk->keepopen) {
                                reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
                        } else {
                                delete_timer(sk);
                        }
                }
        }
        
        /*
         *      Fill in the packet and send it
         */
         
        t1->ack_seq = ntohl(ack);
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
        if (sk->debug)
                 printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
        tcp_statistics.TcpOutSegs++;
        sk->prot->queue_xmit(sk, dev, buff, 1);
}
1408 
1409 
1410 /* 
1411  *      This routine builds a generic TCP header. 
1412  */
1413  
1414 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
     /* [previous][next][first][last][top][bottom][index][help] */
1415 {
1416 
1417         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1418         th->seq = htonl(sk->write_seq);
1419         th->psh =(push == 0) ? 1 : 0;
1420         th->doff = sizeof(*th)/4;
1421         th->ack = 1;
1422         th->fin = 0;
1423         sk->ack_backlog = 0;
1424         sk->bytes_rcv = 0;
1425         sk->ack_timed = 0;
1426         th->ack_seq = htonl(sk->acked_seq);
1427         sk->window = tcp_select_window(sk);
1428         th->window = htons(sk->window);
1429 
1430         return(sizeof(*th));
1431 }
1432 
/*
 *      This routine copies from a user buffer into a socket,
 *      and starts the transmit system.
 *
 *      Returns the number of user bytes queued (possibly short) or a
 *      negative errno if nothing could be copied.  Data is cut into
 *      sk->mss sized segments; a trailing sub-mss piece may be held
 *      back as a "partial" packet (Nagle) while other data is in flight.
 */

static int tcp_write(struct sock *sk, unsigned char *from,
          int len, int nonblock, unsigned flags)
{
        int copied = 0;                 /* user bytes consumed so far */
        int copy;                       /* bytes to take this iteration */
        int tmp;
        struct sk_buff *skb;
        struct sk_buff *send_tmp;       /* non-NULL => skb was sized for a full mtu */
        unsigned char *buff;
        struct proto *prot;
        struct device *dev = NULL;

        sk->inuse=1;                    /* lock the socket against the bottom half */
        prot = sk->prot;
        while(len > 0) 
        {
                if (sk->err) 
                {                       /* Stop on an error */
                        release_sock(sk);
                        if (copied) 
                                return(copied);
                        tmp = -sk->err;
                        sk->err = 0;
                        return(tmp);
                }

                /*
                 *      First thing we do is make sure that we are established. 
                 */
        
                if (sk->shutdown & SEND_SHUTDOWN) 
                {
                        /*
                         *      Send side already shut down.  If some data went out,
                         *      return the partial count and leave sk->err = EPIPE
                         *      for the next call; otherwise report -EPIPE directly.
                         */
                        release_sock(sk);
                        sk->err = EPIPE;
                        if (copied) 
                                return(copied);
                        sk->err = 0;
                        return(-EPIPE);
                }

                /* 
                 *      Wait for a connection to finish.
                 */
        
                while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
                {
                        if (sk->err) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);
                                tmp = -sk->err;
                                sk->err = 0;
                                return(tmp);
                        }

                        /*
                         *      Not even connecting any more: the socket is dead
                         *      for writing.  SIGPIPE only for keepopen sockets
                         *      (NOTE(review): BSD signals unconditionally here -
                         *      the keepopen test looks odd, confirm intent).
                         */
                        if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);

                                if (sk->err) 
                                {
                                        tmp = -sk->err;
                                        sk->err = 0;
                                        return(tmp);
                                }

                                if (sk->keepopen) 
                                {
                                        send_sig(SIGPIPE, current, 0);
                                }
                                return(-EPIPE);
                        }

                        if (nonblock || copied) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);
                                return(-EAGAIN);
                        }

                        release_sock(sk);
                        cli();
                
                        /*
                         *      Re-test under cli() so a state change between the
                         *      check and the sleep cannot make us miss the wakeup.
                         */
                        if (sk->state != TCP_ESTABLISHED &&
                                sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
                        {
                                interruptible_sleep_on(sk->sleep);
                                if (current->signal & ~current->blocked) 
                                {
                                        sti();
                                        if (copied) 
                                                return(copied);
                                        return(-ERESTARTSYS);
                                }
                        }
                        sk->inuse = 1;
                        sti();
                }

        /*
         * The following code can result in copy <= if sk->mss is ever
         * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
         * sk->mtu is constant once SYN processing is finished.  I.e. we
         * had better not get here until we've seen his SYN and at least one
         * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
         * But ESTABLISHED should guarantee that.  sk->max_window is by definition
         * non-decreasing.  Note that any ioctl to set user_mss must be done
         * before the exchange of SYN's.  If the initial ack from the other
         * end has a window of 0, max_window and thus mss will both be 0.
         */

        /* 
         *      Now we need to check if we have a half built packet. 
         */

                if ((skb = tcp_dequeue_partial(sk)) != NULL) 
                {
                        int hdrlen;

                         /* IP header + TCP header */
                        hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
                                 + sizeof(struct tcphdr);
        
                        /* Add more stuff to the end of skb->len */
                        if (!(flags & MSG_OOB)) 
                        {
                                copy = min(sk->mss - (skb->len - hdrlen), len);
                                /* FIXME: this is really a bug. */
                                if (copy <= 0) 
                                {
                                        printk("TCP: **bug**: \"copy\" <= 0!!\n");
                                        copy = 0;
                                }
          
                                memcpy_fromfs(skb->data + skb->len, from, copy);
                                skb->len += copy;
                                from += copy;
                                copied += copy;
                                len -= copy;
                                sk->write_seq += copy;
                        }
                        /*
                         *      Ship the partial once it is a full segment, is
                         *      urgent, or nothing else is in flight (Nagle);
                         *      otherwise put it back for further coalescing.
                         */
                        if ((skb->len - hdrlen) >= sk->mss ||
                                (flags & MSG_OOB) || !sk->packets_out)
                                tcp_send_skb(sk, skb);
                        else
                                tcp_enqueue_partial(skb, sk);
                        continue;
                }

        /*
         * We also need to worry about the window.
         * If window < 1/2 the maximum window we've seen from this
         *   host, don't use it.  This is sender side
         *   silly window prevention, as specified in RFC1122.
         *   (Note that this is different than earlier versions of
         *   SWS prevention, e.g. RFC813.).  What we actually do is 
         *   use the whole MSS.  Since this results in the right
         *   edge of the packet being outside the window, it will
         *   be queued for later rather than sent.
         */

                copy = sk->window_seq - sk->write_seq;
                if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
                        copy = sk->mss;
                if (copy > len)
                        copy = len;

        /*
         *      We should really check the window here also. 
         */
         
                send_tmp = NULL;
                if (copy < sk->mss && !(flags & MSG_OOB)) 
                {
                        /*
                         *      We will release the socket in case we sleep here. 
                         */
                        release_sock(sk);
                        /*
                         *      NB: following must be mtu, because mss can be increased.
                         *      mss is always <= mtu 
                         */
                        skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
                        sk->inuse = 1;
                        send_tmp = skb;
                } 
                else 
                {
                        /*
                         *      We will release the socket in case we sleep here. 
                         */
                        release_sock(sk);
                        skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
                        sk->inuse = 1;
                }

                /*
                 *      If we didn't get any memory, we need to sleep. 
                 */

                if (skb == NULL) 
                {
                        sk->socket->flags |= SO_NOSPACE;
                        if (nonblock) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);
                                return(-EAGAIN);
                        }

                        /*
                         *      FIXME: here is another race condition. 
                         */

                        tmp = sk->wmem_alloc;
                        release_sock(sk);
                        cli();
                        /*
                         *      Again we will try to avoid it: only sleep while no
                         *      write memory has been freed since we sampled it
                         *      and the connection is still up.
                         */
                        if (tmp <= sk->wmem_alloc &&
                                  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
                                && sk->err == 0) 
                        {
                                sk->socket->flags &= ~SO_NOSPACE;
                                interruptible_sleep_on(sk->sleep);
                                if (current->signal & ~current->blocked) 
                                {
                                        sti();
                                        if (copied) 
                                                return(copied);
                                        return(-ERESTARTSYS);
                                }
                        }
                        sk->inuse = 1;
                        sti();
                        continue;
                }

                skb->len = 0;
                skb->sk = sk;
                skb->free = 0;
                skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
        
                buff = skb->data;
        
                /*
                 * FIXME: we need to optimize this.
                 * Perhaps some hints here would be good.
                 */
                
                tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
                                 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
                if (tmp < 0 ) 
                {
                        /* No route to host: drop the buffer and bail out. */
                        prot->wfree(sk, skb->mem_addr, skb->mem_len);
                        release_sock(sk);
                        if (copied) 
                                return(copied);
                        return(tmp);
                }
                skb->len += tmp;
                skb->dev = dev;
                buff += tmp;
                skb->h.th =(struct tcphdr *) buff;
                tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
                if (tmp < 0) 
                {
                        prot->wfree(sk, skb->mem_addr, skb->mem_len);
                        release_sock(sk);
                        if (copied) 
                                return(copied);
                        return(tmp);
                }

                if (flags & MSG_OOB) 
                {
                        /*
                         *      Urgent pointer covers the OOB data.
                         *      (ntohs() here computes the same byte swap as
                         *      htons(); the intent is host-to-network.)
                         */
                        ((struct tcphdr *)buff)->urg = 1;
                        ((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
                }
                skb->len += tmp;
                memcpy_fromfs(buff+tmp, from, copy);

                from += copy;
                copied += copy;
                len -= copy;
                skb->len += copy;
                skb->free = 0;
                sk->write_seq += copy;
        
                /*
                 *      Sub-mss segment with data already in flight: hold it
                 *      back as the partial packet (Nagle) instead of sending.
                 */
                if (send_tmp != NULL && sk->packets_out) 
                {
                        tcp_enqueue_partial(send_tmp, sk);
                        continue;
                }
                tcp_send_skb(sk, skb);
        }
        sk->err = 0;

/*
 *      Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *      interactive fast network servers. It's meant to be on and
 *      it really improves the throughput though not the echo time
 *      on my slow slip link - Alan
 */

/*
 *      Avoid possible race on send_tmp - c/o Johannes Stille 
 */
 
        if(sk->partial && ((!sk->packets_out) 
     /* If not nagling we can send on the before case too.. */
              || (sk->nonagle && before(sk->write_seq , sk->window_seq))
        ))
                tcp_send_partial(sk);

        release_sock(sk);
        return(copied);
}
1762 
1763 /*
1764  *      This is just a wrapper. 
1765  */
1766 
1767 static int tcp_sendto(struct sock *sk, unsigned char *from,
     /* [previous][next][first][last][top][bottom][index][help] */
1768            int len, int nonblock, unsigned flags,
1769            struct sockaddr_in *addr, int addr_len)
1770 {
1771         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1772                 return -EINVAL;
1773         if (sk->state == TCP_CLOSE)
1774                 return -ENOTCONN;
1775         if (addr_len < sizeof(*addr))
1776                 return -EINVAL;
1777         if (addr->sin_family && addr->sin_family != AF_INET) 
1778                 return -EINVAL;
1779         if (addr->sin_port != sk->dummy_th.dest) 
1780                 return -EISCONN;
1781         if (addr->sin_addr.s_addr != sk->daddr) 
1782                 return -EISCONN;
1783         return tcp_write(sk, from, len, nonblock, flags);
1784 }
1785 
1786 
1787 /*
1788  *      Send an ack if one is backlogged at this point. Ought to merge
1789  *      this with tcp_send_ack().
1790  */
1791  
1792 static void tcp_read_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1793 {
1794         int tmp;
1795         struct device *dev = NULL;
1796         struct tcphdr *t1;
1797         struct sk_buff *buff;
1798 
1799         if (!sk->ack_backlog) 
1800                 return;
1801 
1802         /*
1803          * FIXME: we need to put code here to prevent this routine from
1804          * being called.  Being called once in a while is ok, so only check
1805          * if this is the second time in a row.
1806          */
1807 
1808         /*
1809          * We need to grab some memory, and put together an ack,
1810          * and then put it into the queue to be sent.
1811          */
1812 
1813         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
1814         if (buff == NULL) 
1815         {
1816                 /* Try again real soon. */
1817                 reset_xmit_timer(sk, TIME_WRITE, HZ);
1818                 return;
1819         }
1820 
1821         buff->len = sizeof(struct tcphdr);
1822         buff->sk = sk;
1823         buff->localroute = sk->localroute;
1824         
1825         /*
1826          *      Put in the IP header and routing stuff. 
1827          */
1828 
1829         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1830                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1831         if (tmp < 0) 
1832         {
1833                 buff->free = 1;
1834                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1835                 return;
1836         }
1837 
1838         buff->len += tmp;
1839         t1 =(struct tcphdr *)(buff->data +tmp);
1840 
1841         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
1842         t1->seq = htonl(sk->sent_seq);
1843         t1->ack = 1;
1844         t1->res1 = 0;
1845         t1->res2 = 0;
1846         t1->rst = 0;
1847         t1->urg = 0;
1848         t1->syn = 0;
1849         t1->psh = 0;
1850         sk->ack_backlog = 0;
1851         sk->bytes_rcv = 0;
1852         sk->window = tcp_select_window(sk);
1853         t1->window = ntohs(sk->window);
1854         t1->ack_seq = ntohl(sk->acked_seq);
1855         t1->doff = sizeof(*t1)/4;
1856         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1857         sk->prot->queue_xmit(sk, dev, buff, 1);
1858         tcp_statistics.TcpOutSegs++;
1859 }
1860 
1861 
/*
 *      FIXME:
 *      This routine frees used buffers.
 *      It should consider sending an ACK to let the
 *      other end know we now have a bigger window.
 */

static void cleanup_rbuf(struct sock *sk)
{
        unsigned long flags;
        unsigned long left;             /* receive space before reaping */
        struct sk_buff *skb;
        unsigned long rspace;           /* receive space after reaping */

        if(sk->debug)
                printk("cleaning rbuf for sk=%p\n", sk);
  
        save_flags(flags);
        cli();                          /* receive queue is touched at interrupt time */
  
        left = sk->prot->rspace(sk);
 
        /*
         *      We have to loop through all the buffer headers,
         *      and try to free up all the space we can.
         *      Stop at the first buffer still unread (!used) or still
         *      referenced by a reader (users) - everything beyond it
         *      must stay queued.
         */

        while((skb=skb_peek(&sk->receive_queue)) != NULL) 
        {
                if (!skb->used || skb->users) 
                        break;
                skb_unlink(skb);
                skb->sk = sk;           /* make sure the free is charged to this socket */
                kfree_skb(skb, FREE_READ);
        }

        restore_flags(flags);

        /*
         *      FIXME:
         *      At this point we should send an ack if the difference
         *      in the window, and the amount of space is bigger than
         *      TCP_WINDOW_DIFF.
         */

        if(sk->debug)
                printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
                                            left);
        if ((rspace=sk->prot->rspace(sk)) != left) 
        {
                /*
                 * This area has caused the most trouble.  The current strategy
                 * is to simply do nothing if the other end has room to send at
                 * least 3 full packets, because the ack from those will auto-
                 * matically update the window.  If the other end doesn't think
                 * we have much space left, but we have room for at least 1 more
                 * complete packet than it thinks we do, we will send an ack
                 * immediately.  Otherwise we will wait up to .5 seconds in case
                 * the user reads some more.
                 */
                sk->ack_backlog++;
        /*
         * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
         * if the other end is offering a window smaller than the agreed on MSS
         * (called sk->mtu here).  In theory there's no connection between send
         * and receive, and so no reason to think that they're going to send
         * small packets.  For the moment I'm using the hack of reducing the mss
         * only on the send side, so I'm putting mtu here.
         */

                if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
                {
                        /* Send an ack right now. */
                        tcp_read_wakeup(sk);
                } 
                else 
                {
                        /*
                         * Force it to send an ack soon.
                         * NOTE(review): the expiry test reads sk->timer.expires
                         * although the timer deleted/re-armed here is
                         * sk->retransmit_timer - looks like a mismatch, confirm.
                         */
                        int was_active = del_timer(&sk->retransmit_timer);
                        if (!was_active || TCP_ACK_TIME < sk->timer.expires) 
                        {
                                reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
                        } 
                        else
                                add_timer(&sk->retransmit_timer);
                }
        }
} 
1950 
1951 
1952 /*
1953  *      Handle reading urgent data. BSD has very simple semantics for
1954  *      this, no blocking and very strange errors 8)
1955  */
1956  
1957 static int tcp_read_urg(struct sock * sk, int nonblock,
     /* [previous][next][first][last][top][bottom][index][help] */
1958              unsigned char *to, int len, unsigned flags)
1959 {
1960         /*
1961          *      No URG data to read
1962          */
1963         if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
1964                 return -EINVAL; /* Yes this is right ! */
1965                 
1966         if (sk->err) 
1967         {
1968                 int tmp = -sk->err;
1969                 sk->err = 0;
1970                 return tmp;
1971         }
1972 
1973         if (sk->state == TCP_CLOSE || sk->done) 
1974         {
1975                 if (!sk->done) {
1976                         sk->done = 1;
1977                         return 0;
1978                 }
1979                 return -ENOTCONN;
1980         }
1981 
1982         if (sk->shutdown & RCV_SHUTDOWN) 
1983         {
1984                 sk->done = 1;
1985                 return 0;
1986         }
1987         sk->inuse = 1;
1988         if (sk->urg_data & URG_VALID) 
1989         {
1990                 char c = sk->urg_data;
1991                 if (!(flags & MSG_PEEK))
1992                         sk->urg_data = URG_READ;
1993                 put_fs_byte(c, to);
1994                 release_sock(sk);
1995                 return 1;
1996         }
1997         release_sock(sk);
1998         
1999         /*
2000          * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
2001          * the available implementations agree in this case:
2002          * this call should never block, independent of the
2003          * blocking state of the socket.
2004          * Mike <pall@rz.uni-karlsruhe.de>
2005          */
2006         return -EAGAIN;
2007 }
2008 
2009 
/*
 *      This routine copies from a sock struct into the user buffer.
 *
 *      Returns the number of bytes copied, 0 at end of stream, or a
 *      negative errno.  With MSG_PEEK the data is copied but the
 *      socket's copied_seq is left untouched.
 */
 
static int tcp_read(struct sock *sk, unsigned char *to,
        int len, int nonblock, unsigned flags)
{
        struct wait_queue wait = { current, NULL };
        int copied = 0;
        unsigned long peek_seq;         /* private cursor used for MSG_PEEK */
        volatile unsigned long *seq;    /* So gcc doesnt overoptimise */
        unsigned long used;             /* bytes consumed from the current skb */

        /* 
         *      This error should be checked. 
         */
         
        if (sk->state == TCP_LISTEN)
                return -ENOTCONN;

        /*
         *      Urgent data needs to be handled specially. 
         */
         
        if (flags & MSG_OOB)
                return tcp_read_urg(sk, nonblock, to, len, flags);

        /*
         *      Copying sequence to update. This is volatile to handle
         *      the multi-reader case neatly (memcpy_to/fromfs might be 
         *      inline and thus not flush cached variables otherwise).
         */
         
        peek_seq = sk->copied_seq;
        seq = &sk->copied_seq;
        if (flags & MSG_PEEK)
                seq = &peek_seq;        /* peeking must not advance the real cursor */

        add_wait_queue(sk->sleep, &wait);
        sk->inuse = 1;                  /* lock the socket */
        while (len > 0) 
        {
                struct sk_buff * skb;
                unsigned long offset;   /* where *seq falls inside this skb */
        
                /*
                 * Are we at urgent data? Stop if we have read anything.
                 */
                 
                if (copied && sk->urg_data && sk->urg_seq == *seq)
                        break;

                /*
                 *      Next get a buffer.  Set INTERRUPTIBLE before scanning so
                 *      a wakeup between the scan and schedule() is not lost.
                 */
                 
                current->state = TASK_INTERRUPTIBLE;

                skb = skb_peek(&sk->receive_queue);
                do 
                {
                        if (!skb)
                                break;
                        /* A gap ahead of us: data not yet arrived in order. */
                        if (before(*seq, skb->h.th->seq))
                                break;
                        offset = *seq - skb->h.th->seq;
                        if (skb->h.th->syn)
                                offset--;       /* SYN occupies one sequence number */
                        if (offset < skb->len)
                                goto found_ok_skb;
                        if (skb->h.th->fin)
                                goto found_fin_ok;
                        if (!(flags & MSG_PEEK))
                                skb->used = 1;  /* fully consumed: reapable by cleanup_rbuf() */
                        skb = skb->next;
                }
                while (skb != (struct sk_buff *)&sk->receive_queue);

                /* Nothing more queued but we already have data: return it. */
                if (copied)
                        break;

                if (sk->err) 
                {
                        copied = -sk->err;
                        sk->err = 0;
                        break;
                }

                /* Connection gone: first caller gets EOF, later ones ENOTCONN. */
                if (sk->state == TCP_CLOSE) 
                {
                        if (!sk->done) 
                        {
                                sk->done = 1;
                                break;
                        }
                        copied = -ENOTCONN;
                        break;
                }

                if (sk->shutdown & RCV_SHUTDOWN) 
                {
                        sk->done = 1;
                        break;
                }
                        
                if (nonblock) 
                {
                        copied = -EAGAIN;
                        break;
                }

                /* Ack consumed data, drop the lock and wait for more input. */
                cleanup_rbuf(sk);
                release_sock(sk);
                sk->socket->flags |= SO_WAITDATA;
                schedule();
                sk->socket->flags &= ~SO_WAITDATA;
                sk->inuse = 1;

                if (current->signal & ~current->blocked) 
                {
                        copied = -ERESTARTSYS;
                        break;
                }
                continue;

        found_ok_skb:
                /*
                 *      Lock the buffer. We can be fairly relaxed as
                 *      an interrupt will never steal a buffer we are 
                 *      using unless I've missed something serious in
                 *      tcp_data.
                 */
                
                skb->users++;
                
                /*
                 *      Ok so how much can we use ? 
                 */
                 
                used = skb->len - offset;
                if (len < used)
                        used = len;
                /*
                 *      Do we have urgent data here?  If so, stop short of the
                 *      urgent byte, and (unless urginline) skip over it.
                 */
                
                if (sk->urg_data) 
                {
                        unsigned long urg_offset = sk->urg_seq - *seq;
                        if (urg_offset < used) 
                        {
                                if (!urg_offset) 
                                {
                                        if (!sk->urginline) 
                                        {
                                                ++*seq;         /* step past the urgent byte */
                                                offset++;
                                                used--;
                                        }
                                }
                                else
                                        used = urg_offset;      /* read up to, not past, it */
                        }
                }
                
                /*
                 *      Copy it - We _MUST_ update *seq first so that we
                 *      don't ever double read when we have dual readers
                 */
                 
                *seq += used;

                /*
                 *      This memcpy_tofs can sleep. If it sleeps and we
                 *      do a second read it relies on the skb->users to avoid
                 *      a crash when cleanup_rbuf() gets called.
                 */
                 
                memcpy_tofs(to,((unsigned char *)skb->h.th) +
                        skb->h.th->doff*4 + offset, used);
                copied += used;
                len -= used;
                to += used;
                
                /*
                 *      We now will not sleep again until we are finished
                 *      with skb. Sorry if you are doing the SMP port
                 *      but you'll just have to fix it neatly ;)
                 */
                 
                skb->users --;
                
                if (after(sk->copied_seq,sk->urg_seq))
                        sk->urg_data = 0;       /* urgent byte is behind us now */
                if (used + offset < skb->len)
                        continue;               /* skb not yet exhausted */
                
                /*
                 *      Process the FIN.
                 */

                if (skb->h.th->fin)
                        goto found_fin_ok;
                if (flags & MSG_PEEK)
                        continue;               /* peeking never marks skbs used */
                skb->used = 1;
                continue;

        found_fin_ok:
                ++*seq;                         /* FIN consumes a sequence number */
                if (flags & MSG_PEEK)
                        break;
                        
                /*
                 *      All is done
                 */
                 
                skb->used = 1;
                sk->shutdown |= RCV_SHUTDOWN;
                break;

        }
        remove_wait_queue(sk->sleep, &wait);
        current->state = TASK_RUNNING;

        /* Clean up data we have read: This will do ACK frames */
        cleanup_rbuf(sk);
        release_sock(sk);
        return copied;
}
2240 
2241 /*
2242  *      State processing on a close. This implements the state shift for
2243  *      sending our FIN frame. Note that we only send a FIN for some 
2244  *      states. A shutdown() may have already sent the FIN, or we may be
2245  *      closed.
2246  */
2247  
static int tcp_close_state(struct sock *sk, int dead)
{
        /*
         *      Work out the state transition for shutting down our side of
         *      the connection, and whether a FIN must go out.
         *
         *      sk   - socket being closed down
         *      dead - non-zero when no process holds the socket any longer
         *             (the close() path), zero for a plain shutdown()
         *
         *      Returns 1 if the caller must transmit a FIN, 0 otherwise.
         */
        int ns=TCP_CLOSE;
        int send_fin=0;
        switch(sk->state)
        {
                case TCP_SYN_SENT:      /* No SYN back, no FIN needed */
                        break;
                case TCP_SYN_RECV:
                case TCP_ESTABLISHED:   /* Closedown begin */
                        ns=TCP_FIN_WAIT1;
                        send_fin=1;
                        break;
                case TCP_FIN_WAIT1:     /* Already closing, or FIN sent: no change */
                case TCP_FIN_WAIT2:
                case TCP_CLOSING:
                        ns=sk->state;
                        break;
                case TCP_CLOSE:
                case TCP_LISTEN:
                        break;
                case TCP_CLOSE_WAIT:    /* They have FIN'd us. We send our FIN and
                                           wait only for the ACK */
                        ns=TCP_LAST_ACK;
                        send_fin=1;
        }

        tcp_set_state(sk,ns);

        /*
         *      This is a (useful) BSD-style violation of the RFC. There is a
         *      problem with TCP as specified in that the other end could
         *      keep a socket open forever with no application left this end.
         *      We use a 3 minute timeout (about the same as BSD) then kill
         *      our end. If they send after that then tough - BUT: long enough
         *      that we won't make the old 4*rto = almost no time - whoops
         *      reset mistake.
         *
         *      If a timer is already running (del_timer returned true) we
         *      put it back untouched rather than shortening it.
         */
        if(dead && ns==TCP_FIN_WAIT2)
        {
                int timer_active=del_timer(&sk->timer);
                if(timer_active)
                        add_timer(&sk->timer);
                else
                        reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
        }

        return send_fin;
}
2297 
2298 /*
2299  *      Send a fin.
2300  */
2301 
2302 static void tcp_send_fin(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
2303 {
2304         struct proto *prot =(struct proto *)sk->prot;
2305         struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
2306         struct tcphdr *t1;
2307         struct sk_buff *buff;
2308         struct device *dev=NULL;
2309         int tmp;
2310                 
2311         release_sock(sk); /* in case the malloc sleeps. */
2312         
2313         buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2314         sk->inuse = 1;
2315 
2316         if (buff == NULL)
2317         {
2318                 /* This is a disaster if it occurs */
2319                 printk("tcp_send_fin: Impossible malloc failure");
2320                 return;
2321         }
2322 
2323         /*
2324          *      Administrivia
2325          */
2326          
2327         buff->sk = sk;
2328         buff->len = sizeof(*t1);
2329         buff->localroute = sk->localroute;
2330         t1 =(struct tcphdr *) buff->data;
2331 
2332         /*
2333          *      Put in the IP header and routing stuff. 
2334          */
2335 
2336         tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2337                            IPPROTO_TCP, sk->opt,
2338                            sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
2339         if (tmp < 0) 
2340         {
2341                 int t;
2342                 /*
2343                  *      Finish anyway, treat this as a send that got lost. 
2344                  *      (Not good).
2345                  */
2346                  
2347                 buff->free = 1;
2348                 prot->wfree(sk,buff->mem_addr, buff->mem_len);
2349                 sk->write_seq++;
2350                 t=del_timer(&sk->timer);
2351                 if(t)
2352                         add_timer(&sk->timer);
2353                 else
2354                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2355                 return;
2356         }
2357         
2358         /*
2359          *      We ought to check if the end of the queue is a buffer and
2360          *      if so simply add the fin to that buffer, not send it ahead.
2361          */
2362 
2363         t1 =(struct tcphdr *)((char *)t1 +tmp);
2364         buff->len += tmp;
2365         buff->dev = dev;
2366         memcpy(t1, th, sizeof(*t1));
2367         t1->seq = ntohl(sk->write_seq);
2368         sk->write_seq++;
2369         buff->h.seq = sk->write_seq;
2370         t1->ack = 1;
2371         t1->ack_seq = ntohl(sk->acked_seq);
2372         t1->window = ntohs(sk->window=tcp_select_window(sk));
2373         t1->fin = 1;
2374         t1->rst = 0;
2375         t1->doff = sizeof(*t1)/4;
2376         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2377 
2378         /*
2379          * If there is data in the write queue, the fin must be appended to
2380          * the write queue.
2381          */
2382         
2383         if (skb_peek(&sk->write_queue) != NULL) 
2384         {
2385                 buff->free = 0;
2386                 if (buff->next != NULL) 
2387                 {
2388                         printk("tcp_send_fin: next != NULL\n");
2389                         skb_unlink(buff);
2390                 }
2391                 skb_queue_tail(&sk->write_queue, buff);
2392         } 
2393         else 
2394         {
2395                 sk->sent_seq = sk->write_seq;
2396                 sk->prot->queue_xmit(sk, dev, buff, 0);
2397                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2398         }
2399 }
2400 
2401 /*
2402  *      Shutdown the sending side of a connection. Much like close except
2403  *      that we don't receive shut down or set sk->dead=1.
2404  */
2405 
2406 void tcp_shutdown(struct sock *sk, int how)
     /* [previous][next][first][last][top][bottom][index][help] */
2407 {
2408         /*
2409          *      We need to grab some memory, and put together a FIN,
2410          *      and then put it into the queue to be sent.
2411          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2412          */
2413 
2414         if (!(how & SEND_SHUTDOWN)) 
2415                 return;
2416          
2417         /*
2418          *      If we've already sent a FIN, or its a closed state
2419          */
2420          
2421         if (sk->state == TCP_FIN_WAIT1 ||
2422             sk->state == TCP_FIN_WAIT2 ||
2423             sk->state == TCP_CLOSING ||
2424             sk->state == TCP_LAST_ACK ||
2425             sk->state == TCP_TIME_WAIT || 
2426             sk->state == TCP_CLOSE ||
2427             sk->state == TCP_LISTEN
2428           )
2429         {
2430                 return;
2431         }
2432         sk->inuse = 1;
2433 
2434         /*
2435          * flag that the sender has shutdown
2436          */
2437 
2438         sk->shutdown |= SEND_SHUTDOWN;
2439 
2440         /*
2441          *  Clear out any half completed packets. 
2442          */
2443 
2444         if (sk->partial)
2445                 tcp_send_partial(sk);
2446                 
2447         /*
2448          *      FIN if needed
2449          */
2450          
2451         if(tcp_close_state(sk,0))
2452                 tcp_send_fin(sk);
2453                 
2454         release_sock(sk);
2455 }
2456 
2457 
2458 static int
2459 tcp_recvfrom(struct sock *sk, unsigned char *to,
     /* [previous][next][first][last][top][bottom][index][help] */
2460              int to_len, int nonblock, unsigned flags,
2461              struct sockaddr_in *addr, int *addr_len)
2462 {
2463         int result;
2464   
2465         /* 
2466          *      Have to check these first unlike the old code. If 
2467          *      we check them after we lose data on an error
2468          *      which is wrong 
2469          */
2470 
2471         if(addr_len)
2472                 *addr_len = sizeof(*addr);
2473         result=tcp_read(sk, to, to_len, nonblock, flags);
2474 
2475         if (result < 0) 
2476                 return(result);
2477   
2478         if(addr)
2479         {
2480                 addr->sin_family = AF_INET;
2481                 addr->sin_port = sk->dummy_th.dest;
2482                 addr->sin_addr.s_addr = sk->daddr;
2483         }
2484         return(result);
2485 }
2486 
2487 
2488 /*
2489  *      This routine will send an RST to the other tcp. 
2490  */
2491  
static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
          struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
        /*
         *      Build and transmit an RST in reply to the segment 'th'
         *      received from daddr (so source/destination are swapped in
         *      the reply).  Sequence numbers follow the RFC 793 reset
         *      generation rules - see the th->ack test below.
         */
        struct sk_buff *buff;
        struct tcphdr *t1;
        int tmp;
        struct device *ndev=NULL;

        /*
         *      Cannot reset a reset (Think about it).
         */

        if(th->rst)
                return;

        /*
         * We need to grab some memory, and put together an RST,
         * and then put it into the queue to be sent.
         */

        buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
        if (buff == NULL) 
                return;

        buff->len = sizeof(*t1);
        buff->sk = NULL;
        buff->dev = dev;
        buff->localroute = 0;

        t1 =(struct tcphdr *) buff->data;

        /*
         *      Put in the IP header and routing stuff. 
         */

        tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
                           sizeof(struct tcphdr),tos,ttl);
        if (tmp < 0) 
        {
                buff->free = 1;
                prot->wfree(NULL, buff->mem_addr, buff->mem_len);
                return;
        }

        t1 =(struct tcphdr *)((char *)t1 +tmp);
        buff->len += tmp;
        memcpy(t1, th, sizeof(*t1));

        /*
         *      Swap the send and the receive. 
         */

        t1->dest = th->source;
        t1->source = th->dest;
        t1->rst = 1;  
        t1->window = 0;

        /*
         *      RFC 793: if the offending segment carried an ACK, our reset
         *      takes its ack_seq as sequence number and carries no ACK;
         *      otherwise we use sequence 0 and ACK everything they sent
         *      (one extra for a SYN, which occupies sequence space).
         */

        if(th->ack)
        {
                t1->ack = 0;
                t1->seq = th->ack_seq;
                t1->ack_seq = 0;
        }
        else
        {
                t1->ack = 1;
                if(!th->syn)
                        t1->ack_seq=htonl(th->seq);
                else
                        t1->ack_seq=htonl(th->seq+1);
                t1->seq=0;
        }

        t1->syn = 0;
        t1->urg = 0;
        t1->fin = 0;
        t1->psh = 0;
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
        prot->queue_xmit(NULL, ndev, buff, 1);
        tcp_statistics.TcpOutSegs++;
}
2574 
2575 
2576 /*
2577  *      Look for tcp options. Parses everything but only knows about MSS.
2578  *      This routine is always called with the packet containing the SYN.
2579  *      However it may also be called with the ack to the SYN.  So you
2580  *      can't assume this is always the SYN.  It's always called after
2581  *      we have set up sk->mtu to our own MTU.
2582  *
2583  *      We need at minimum to add PAWS support here. Possibly large windows
2584  *      as Linux gets deployed on 100Mb/sec networks.
2585  */
2586  
2587 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
2588 {
2589         unsigned char *ptr;
2590         int length=(th->doff*4)-sizeof(struct tcphdr);
2591         int mss_seen = 0;
2592     
2593         ptr = (unsigned char *)(th + 1);
2594   
2595         while(length>0)
2596         {
2597                 int opcode=*ptr++;
2598                 int opsize=*ptr++;
2599                 switch(opcode)
2600                 {
2601                         case TCPOPT_EOL:
2602                                 return;
2603                         case TCPOPT_NOP:
2604                                 length-=2;
2605                                 continue;
2606                         
2607                         default:
2608                                 if(opsize<=2)   /* Avoid silly options looping forever */
2609                                         return;
2610                                 switch(opcode)
2611                                 {
2612                                         case TCPOPT_MSS:
2613                                                 if(opsize==4 && th->syn)
2614                                                 {
2615                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2616                                                         mss_seen = 1;
2617                                                 }
2618                                                 break;
2619                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2620                                 }
2621                                 ptr+=opsize-2;
2622                                 length-=opsize;
2623                 }
2624         }
2625         if (th->syn) 
2626         {
2627                 if (! mss_seen)
2628                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2629         }
2630 #ifdef CONFIG_INET_PCTCP
2631         sk->mss = min(sk->max_window >> 1, sk->mtu);
2632 #else    
2633         sk->mss = min(sk->max_window, sk->mtu);
2634 #endif  
2635 }
2636 
static inline unsigned long default_mask(unsigned long dst)
{
        /*
         *      Classful (A/B/C) network mask for an address.  Both the
         *      argument and the result are in network byte order; the
         *      class test macros want host order, hence the conversion.
         */
        unsigned long host = ntohl(dst);

        if (IN_CLASSA(host))
                return htonl(IN_CLASSA_NET);
        return IN_CLASSB(host) ? htonl(IN_CLASSB_NET)
                               : htonl(IN_CLASSC_NET);
}
2646 
2647 /*
2648  *      Default sequence number picking algorithm.
2649  */
2650 
extern inline long tcp_init_seq(void)
{
        /*
         *      Initial sequence number: a simple clock-driven value in the
         *      spirit of RFC 793.  NOTE(review): being derived straight
         *      from jiffies this ISN is easy for an attacker to predict;
         *      a hardened generator would mix in per-connection entropy.
         */
        return jiffies * SEQ_TICK - seq_offset; 
}
2655 
2656 /*
2657  *      This routine handles a connection request.
2658  *      It should make sure we haven't already responded.
2659  *      Because of the way BSD works, we have to send a syn/ack now.
2660  *      This also means it will be harder to close a socket which is
2661  *      listening.
2662  */
2663  
static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
                 unsigned long daddr, unsigned long saddr,
                 struct options *opt, struct device *dev, unsigned long seq)
{
        /*
         *      Handle an incoming SYN on listening socket sk: clone sk into
         *      a new sock in TCP_SYN_RECV, pick an MSS/MTU for it, reply
         *      with a SYN|ACK carrying an MSS option, and park the SYN skb
         *      on the listener's receive queue for accept() to find.
         *
         *      sk    - the listening socket
         *      skb   - the received SYN segment (consumed here: freed on
         *              every failure path, re-charged to newsk on success)
         *      daddr - our address as the SYN saw it; saddr - the peer
         *      opt   - IP options to echo into any reset we send
         *      seq   - initial send sequence number for the new connection
         */
        struct sk_buff *buff;
        struct tcphdr *t1;
        unsigned char *ptr;
        struct sock *newsk;
        struct tcphdr *th;
        struct device *ndev=NULL;
        int tmp;
        struct rtable *rt;
  
        th = skb->h.th;

        /* If the socket is dead, don't accept the connection. */
        if (!sk->dead) 
        {
                sk->data_ready(sk,0);
        }
        else 
        {
                if(sk->debug)
                        printk("Reset on %p: Connect on dead socket.\n",sk);
                tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
                tcp_statistics.TcpAttemptFails++;
                kfree_skb(skb, FREE_READ);
                return;
        }

        /*
         * Make sure we can accept more.  This will prevent a
         * flurry of syns from eating up all our memory.
         */

        if (sk->ack_backlog >= sk->max_ack_backlog) 
        {
                tcp_statistics.TcpAttemptFails++;
                kfree_skb(skb, FREE_READ);
                return;
        }

        /*
         * We need to build a new sock struct.
         * It is sort of bad to have a socket without an inode attached
         * to it, but the wake_up's will just wake up the listening socket,
         * and if the listening socket is destroyed before this is taken
         * off of the queue, this will take care of it.
         */

        newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
        if (newsk == NULL) 
        {
                /* just ignore the syn.  It will get retransmitted. */
                tcp_statistics.TcpAttemptFails++;
                kfree_skb(skb, FREE_READ);
                return;
        }

        /*
         *      Start from a copy of the listener, then reset every field
         *      that must be private to the new connection.
         */

        memcpy(newsk, sk, sizeof(*newsk));
        skb_queue_head_init(&newsk->write_queue);
        skb_queue_head_init(&newsk->receive_queue);
        newsk->send_head = NULL;
        newsk->send_tail = NULL;
        skb_queue_head_init(&newsk->back_log);
        newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
        newsk->rto = TCP_TIMEOUT_INIT;
        newsk->mdev = 0;
        newsk->max_window = 0;
        newsk->cong_window = 1;
        newsk->cong_count = 0;
        newsk->ssthresh = 0;
        newsk->backoff = 0;
        newsk->blog = 0;
        newsk->intr = 0;
        newsk->proc = 0;
        newsk->done = 0;
        newsk->partial = NULL;
        newsk->pair = NULL;
        newsk->wmem_alloc = 0;
        newsk->rmem_alloc = 0;
        newsk->localroute = sk->localroute;

        newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

        newsk->err = 0;
        newsk->shutdown = 0;
        newsk->ack_backlog = 0;
        /* SYN occupies one unit of sequence space. */
        newsk->acked_seq = skb->h.th->seq+1;
        newsk->copied_seq = skb->h.th->seq+1;
        newsk->fin_seq = skb->h.th->seq;
        newsk->state = TCP_SYN_RECV;
        newsk->timeout = 0;
        newsk->ip_xmit_timeout = 0;
        newsk->write_seq = seq; 
        newsk->window_seq = newsk->write_seq;
        newsk->rcv_ack_seq = newsk->write_seq;
        newsk->urg_data = 0;
        newsk->retransmits = 0;
        newsk->linger=0;
        newsk->destroy = 0;
        init_timer(&newsk->timer);
        init_timer(&newsk->retransmit_timer);
        newsk->timer.data = (unsigned long)newsk;
        newsk->timer.function = &net_timer;
        newsk->retransmit_timer.data = (unsigned long)newsk;
        newsk->retransmit_timer.function=&retransmit_timer;
        newsk->dummy_th.source = skb->h.th->dest;
        newsk->dummy_th.dest = skb->h.th->source;
        
        /*
         *      Swap these two, they are from our point of view. 
         */
         
        newsk->daddr = saddr;
        newsk->saddr = daddr;

        put_sock(newsk->num,newsk);
        newsk->dummy_th.res1 = 0;
        newsk->dummy_th.doff = 6;
        newsk->dummy_th.fin = 0;
        newsk->dummy_th.syn = 0;
        newsk->dummy_th.rst = 0;        
        newsk->dummy_th.psh = 0;
        newsk->dummy_th.ack = 0;
        newsk->dummy_th.urg = 0;
        newsk->dummy_th.res2 = 0;
        /* NOTE(review): acked_seq/copied_seq were already set to the same
           values above - this pair of assignments is redundant. */
        newsk->acked_seq = skb->h.th->seq + 1;
        newsk->copied_seq = skb->h.th->seq + 1;
        newsk->socket = NULL;

        /*
         *      Grab the ttl and tos values and use them 
         */

        newsk->ip_ttl=sk->ip_ttl;
        newsk->ip_tos=skb->ip_hdr->tos;

        /*
         *      Use 512 or whatever user asked for 
         */

        /*
         *      Note use of sk->user_mss, since user has no direct access to newsk 
         */

        rt=ip_rt_route(saddr, NULL,NULL);
        
        if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
                newsk->window_clamp = rt->rt_window;
        else
                newsk->window_clamp = 0;
                
        if (sk->user_mss)
                newsk->mtu = sk->user_mss;
        else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
                newsk->mtu = rt->rt_mss - HEADER_SIZE;
        else 
        {
#ifdef CONFIG_INET_SNARL        /* Sub Nets Are Local */
                if ((saddr ^ daddr) & default_mask(saddr))
#else
                if ((saddr ^ daddr) & dev->pa_mask)
#endif
                        newsk->mtu = 576 - HEADER_SIZE;
                else
                        newsk->mtu = MAX_WINDOW;
        }

        /*
         *      But not bigger than device MTU 
         */

        newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

        /*
         *      This will min with what arrived in the packet 
         */

        tcp_options(newsk,skb->h.th);

        /*
         *      Build the SYN|ACK reply.
         */

        buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
        if (buff == NULL) 
        {
                sk->err = -ENOMEM;
                newsk->dead = 1;
                release_sock(newsk);
                kfree_skb(skb, FREE_READ);
                tcp_statistics.TcpAttemptFails++;
                return;
        }
  
        buff->len = sizeof(struct tcphdr)+4;    /* header + 4 byte MSS option */
        buff->sk = newsk;
        buff->localroute = newsk->localroute;

        t1 =(struct tcphdr *) buff->data;

        /*
         *      Put in the IP header and routing stuff. 
         */

        tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
                               IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

        /*
         *      Something went wrong. 
         */

        if (tmp < 0) 
        {
                sk->err = tmp;
                buff->free = 1;
                kfree_skb(buff,FREE_WRITE);
                newsk->dead = 1;
                release_sock(newsk);
                skb->sk = sk;
                kfree_skb(skb, FREE_READ);
                tcp_statistics.TcpAttemptFails++;
                return;
        }

        buff->len += tmp;
        t1 =(struct tcphdr *)((char *)t1 +tmp);
  
        memcpy(t1, skb->h.th, sizeof(*t1));
        buff->h.seq = newsk->write_seq;
        /*
         *      Swap the send and the receive. 
         */
        t1->dest = skb->h.th->source;
        t1->source = newsk->dummy_th.source;
        t1->seq = ntohl(newsk->write_seq++);
        t1->ack = 1;
        newsk->window = tcp_select_window(newsk);
        newsk->sent_seq = newsk->write_seq;
        t1->window = ntohs(newsk->window);
        t1->res1 = 0;
        t1->res2 = 0;
        t1->rst = 0;
        t1->urg = 0;
        t1->psh = 0;
        t1->syn = 1;
        t1->ack_seq = ntohl(skb->h.th->seq+1);
        t1->doff = sizeof(*t1)/4+1;     /* header plus one 32-bit option word */
        /* MSS option: kind 2, length 4, our MSS in network order. */
        ptr =(unsigned char *)(t1+1);
        ptr[0] = 2;
        ptr[1] = 4;
        ptr[2] = ((newsk->mtu) >> 8) & 0xff;
        ptr[3] =(newsk->mtu) & 0xff;

        tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
        newsk->prot->queue_xmit(newsk, ndev, buff, 0);
        /* NOTE(review): the timer is armed twice in a row; the second call
           overrides the first (rto was initialised to TCP_TIMEOUT_INIT
           above, so both currently use the same interval). */
        reset_xmit_timer(newsk, TIME_WRITE, newsk->rto);

        reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
        skb->sk = newsk;

        /*
         *      Charge the sock_buff to newsk. 
         */
         
        sk->rmem_alloc -= skb->mem_len;
        newsk->rmem_alloc += skb->mem_len;
        
        skb_queue_tail(&sk->receive_queue,skb);
        sk->ack_backlog++;
        release_sock(newsk);
        tcp_statistics.TcpOutSegs++;
}
2934 
2935 
static void tcp_close(struct sock *sk, int timeout)
{
        /*
         *      Close a TCP socket.  timeout!=0 means the lingering period
         *      expired (or close was forced) and we drop straight to
         *      TCP_CLOSE; timeout==0 is a normal descriptor close which
         *      flushes unread data and runs the FIN state machine.
         */

        sk->inuse = 1;
        
        if(sk->state == TCP_LISTEN)
        {
                /* Special case: a listener just dumps its pending
                   embryonic connections, no FIN exchange needed. */
                tcp_set_state(sk, TCP_CLOSE);
                tcp_close_pending(sk);
                release_sock(sk);
                return;
        }
        
        sk->keepopen = 1;
        sk->shutdown = SHUTDOWN_MASK;

        if (!sk->dead) 
                sk->state_change(sk);

        if (timeout == 0) 
        {
                struct sk_buff *skb;
                
                /*
                 *  We need to flush the recv. buffs.  We do this only on the
                 *  descriptor close, not protocol-sourced closes, because the
                 *  reader process may not have drained the data yet!
                 */
                 
                while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
                        kfree_skb(skb, FREE_READ);
                /*
                 *      Get rid off any half-completed packets. 
                 */

                if (sk->partial) 
                        tcp_send_partial(sk);
        }

                
        /*
         *      Timeout is not the same thing - however the code likes
         *      to send both the same way (sigh).
         */
         
        if(timeout)
        {
                tcp_set_state(sk, TCP_CLOSE);   /* Dead */
        }
        else
        {
                /* tcp_close_state returns 1 when a FIN must be sent. */
                if(tcp_close_state(sk,1)==1)
                {
                        tcp_send_fin(sk);
                }
        }
        release_sock(sk);
}
2999 
3000 
3001 /*
3002  *      This routine takes stuff off of the write queue,
3003  *      and puts it in the xmit queue. This happens as incoming acks
3004  *      open up the remote window for us.
3005  */
3006  
static void tcp_write_xmit(struct sock *sk)
{
        /*
         *      Move segments from the write queue onto the wire as the
         *      peer's window (and our congestion window) allow.  Called
         *      when incoming acks open up send space.
         */
        struct sk_buff *skb;

        /*
         *      The bytes will have to remain here. In time closedown will
         *      empty the write queue and all will be happy 
         */

        if(sk->zapped)
                return;

        /*
         *      Anything on the transmit queue that fits the window can
         *      be added providing we are not
         *
         *      a) retransmitting (Nagle's rule)
         *      b) exceeding our congestion window.
         */
         
        while((skb = skb_peek(&sk->write_queue)) != NULL &&
                before(skb->h.seq, sk->window_seq + 1) &&
                (sk->retransmits == 0 ||
                 sk->ip_xmit_timeout != TIME_WRITE ||
                 before(skb->h.seq, sk->rcv_ack_seq + 1))
                && sk->packets_out < sk->cong_window) 
        {
                IS_SKB(skb);
                skb_unlink(skb);
                
                /*
                 *      See if we really need to send the packet. 
                 */
                 
                if (before(skb->h.seq, sk->rcv_ack_seq +1)) 
                {
                        /*
                         *      This is acked data. We can discard it. This 
                         *      cannot currently occur.
                         */
                         
                        sk->retransmits = 0;
                        kfree_skb(skb, FREE_WRITE);
                        if (!sk->dead) 
                                sk->write_space(sk);
                } 
                else
                {
                        struct tcphdr *th;
                        struct iphdr *iph;
                        int size;
/*
 * put in the ack seq and window at this point rather than earlier,
 * in order to keep them monotonic.  We really want to avoid taking
 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
 * Ack and window will in general have changed since this packet was put
 * on the write queue.
 */
                        iph = (struct iphdr *)(skb->data +
                                               skb->dev->hard_header_len);
                        th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
                        size = skb->len - (((unsigned char *) th) - skb->data);
                        
                        th->ack_seq = ntohl(sk->acked_seq);
                        th->window = ntohs(tcp_select_window(sk));

                        /* Header changed: the checksum must be redone. */
                        tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

                        sk->sent_seq = skb->h.seq;
                        
                        /*
                         *      IP manages our queue for some crazy reason
                         */
                         
                        sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
                        
                        /*
                         *      Again we slide the timer wrongly
                         */
                         
                        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
                }
        }
}
3091 
3092 
3093 /*
3094  *      This routine deals with incoming acks, but not outgoing ones.
3095  */
3096 
3097 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
     /* [previous][next][first][last][top][bottom][index][help] */
3098 {
3099         unsigned long ack;
3100         int flag = 0;
3101 
3102         /* Flag bits accumulated while this ack is processed:
3103          * 1 - there was data in packet as well as ack or new data is sent or 
3104          *     in shutdown state
3105          * 2 - data from retransmit queue was acked and removed
3106          * 4 - window shrunk or data from retransmit queue was acked and removed
3107          */
3108 
3109         if(sk->zapped)
3110                 return(1);      /* Dead, can't ack any more so why bother */
3111 
3112         /*
3113          *      Have we discovered a larger window
3114          */
3115          
3116         ack = ntohl(th->ack_seq);
3117 
3118         if (ntohs(th->window) > sk->max_window) 
3119         {
3120                 sk->max_window = ntohs(th->window);
3121 #ifdef CONFIG_INET_PCTCP
3122                 /* Hack because we don't send partial packets to non SWS
3123                    handling hosts */
3124                 sk->mss = min(sk->max_window>>1, sk->mtu);
3125 #else
3126                 sk->mss = min(sk->max_window, sk->mtu);
3127 #endif  
3128         }
3129 
3130         /*
3131          *      We have dropped back to keepalive timeouts. Thus we have
3132          *      no retransmits pending.
3133          */
3134          
3135         if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3136                 sk->retransmits = 0;
3137 
3138         /*
3139          *      If the ack is newer than sent or older than previous acks
3140          *      then we can probably ignore it.
3141          */
3142          
3143         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
3144         {
3145                 if(sk->debug)
3146                         printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);
3147                         
3148                 /*
3149                  *      Keepalive processing.
3150                  */
3151                  
3152                 if (after(ack, sk->sent_seq)) 
3153                 {
3154                         return(0);
3155                 }
3156                 
3157                 /*
3158                  *      Restart the keepalive timer.
3159                  */
3160                  
3161                 if (sk->keepopen) 
3162                 {
3163                         if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3164                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3165                 }
3166                 return(1);
3167         }
3168 
3169         /*
3170          *      If there is data set flag 1
3171          */
3172          
3173         if (len != th->doff*4) 
3174                 flag |= 1;
3175 
3176         /*
3177          *      See if our window has been shrunk. 
3178          */
3179 
3180         if (after(sk->window_seq, ack+ntohs(th->window))) 
3181         {
3182                 /*
3183                  * We may need to move packets from the send queue
3184                  * to the write queue, if the window has been shrunk on us.
3185                  * The RFC says you are not allowed to shrink your window
3186                  * like this, but if the other end does, you must be able
3187                  * to deal with it.
3188                  */
3189                 struct sk_buff *skb;
3190                 struct sk_buff *skb2;
3191                 struct sk_buff *wskb = NULL;
3192         
3193                 skb2 = sk->send_head;
3194                 sk->send_head = NULL;
3195                 sk->send_tail = NULL;
3196         
3197                 /*
3198                  *      This is an artifact of a flawed concept. We want one
3199                  *      queue and a smarter send routine when we send all.
3200                  */
3201         
3202                 flag |= 4;      /* Window changed */
3203         
3204                 sk->window_seq = ack + ntohs(th->window);
3205                 cli();
3206                 while (skb2 != NULL) 
3207                 {
3208                         skb = skb2;
3209                         skb2 = skb->link3;
3210                         skb->link3 = NULL;
3211                         if (after(skb->h.seq, sk->window_seq)) 
3212                         {
3213                                 if (sk->packets_out > 0) 
3214                                         sk->packets_out--;
3215                                 /* We may need to remove this from the dev send list. */
3216                                 if (skb->next != NULL) 
3217                                 {
3218                                         skb_unlink(skb);                                
3219                                 }
3220                                 /* Now add it to the write_queue. */
3221                                 if (wskb == NULL)
3222                                         skb_queue_head(&sk->write_queue,skb);
3223                                 else
3224                                         skb_append(wskb,skb);
3225                                 wskb = skb;
3226                         } 
3227                         else 
3228                         {
3229                                 if (sk->send_head == NULL) 
3230                                 {
3231                                         sk->send_head = skb;
3232                                         sk->send_tail = skb;
3233                                 }
3234                                 else
3235                                 {
3236                                         sk->send_tail->link3 = skb;
3237                                         sk->send_tail = skb;
3238                                 }
3239                                 skb->link3 = NULL;
3240                         }
3241                 }
3242                 sti();
3243         }
3244 
3245         /*
3246          *      Pipe has emptied
3247          */
3248          
3249         if (sk->send_tail == NULL || sk->send_head == NULL) 
3250         {
3251                 sk->send_head = NULL;
3252                 sk->send_tail = NULL;
3253                 sk->packets_out= 0;
3254         }
3255 
3256         /*
3257          *      Update the right hand window edge of the host
3258          */
3259          
3260         sk->window_seq = ack + ntohs(th->window);
3261 
3262         /*
3263          *      We don't want too many packets out there. 
3264          */
3265          
3266         if (sk->ip_xmit_timeout == TIME_WRITE && 
3267                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
3268         {
3269                 /* 
3270                  * This is Jacobson's slow start and congestion avoidance. 
3271                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
3272                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
3273                  * counter and increment it once every cwnd times.  It's possible
3274                  * that this should be done only if sk->retransmits == 0.  I'm
3275                  * interpreting "new data is acked" as including data that has
3276                  * been retransmitted but is just now being acked.
3277                  */
3278                 if (sk->cong_window < sk->ssthresh)  
3279                         /* 
3280                          *      In "safe" area, increase
3281                          */
3282                         sk->cong_window++;
3283                 else 
3284                 {
3285                         /*
3286                          *      In dangerous area, increase slowly.  In theory this is
3287                          *      sk->cong_window += 1 / sk->cong_window
3288                          */
3289                         if (sk->cong_count >= sk->cong_window) 
3290                         {
3291                                 sk->cong_window++;
3292                                 sk->cong_count = 0;
3293                         }
3294                         else 
3295                                 sk->cong_count++;
3296                 }
3297         }
3298 
3299         /*
3300          *      Remember the highest ack received.
3301          */
3302          
3303         sk->rcv_ack_seq = ack;
3304 
3305         /*
3306          *      If this ack opens up a zero window, clear backoff.  It was
3307          *      being used to time the probes, and is probably far higher than
3308          *      it needs to be for normal retransmission.
3309          */
3310 
3311         if (sk->ip_xmit_timeout == TIME_PROBE0) 
3312         {
3313                 sk->retransmits = 0;    /* Our probe was answered */
3314                 
3315                 /*
3316                  *      Was it a usable window open ?
3317                  */
3318                  
3319                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
3320                     ! before (sk->window_seq, sk->write_queue.next->h.seq)) 
3321                 {
3322                         sk->backoff = 0;
3323                         
3324                         /*
3325                          *      Recompute rto from rtt.  this eliminates any backoff.
3326                          */
3327 
3328                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3329                         if (sk->rto > 120*HZ)
3330                                 sk->rto = 120*HZ;
3331                         if (sk->rto < 20)       /* Was 1*HZ, then 1 - turns out we must allow about
3332                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3333                                                    .2 of a second is going to need huge windows (SIGH) */
3334                         sk->rto = 20;           /* NB: this is the body of the brace-less if above */
3335                 }
3336         }
3337 
3338         /* 
3339          *      See if we can take anything off of the retransmit queue.
3340          */
3341    
3342         while(sk->send_head != NULL) 
3343         {
3344                 /* Check for a bug. */
3345                 if (sk->send_head->link3 &&
3346                     after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) 
3347                         printk("INET: tcp.c: *** bug send_list out of order.\n");
3348                         
3349                 /*
3350                  *      If our packet is before the ack sequence we can
3351                  *      discard it as it's confirmed to have arrived at the other end.
3352                  */
3353                  
3354                 if (before(sk->send_head->h.seq, ack+1)) 
3355                 {
3356                         struct sk_buff *oskb;   
3357                         if (sk->retransmits) 
3358                         {       
3359                                 /*
3360                                  *      We were retransmitting.  don't count this in RTT est 
3361                                  */
3362                                 flag |= 2;
3363 
3364                                 /*
3365                                  * even though we've gotten an ack, we're still
3366                                  * retransmitting as long as we're sending from
3367                                  * the retransmit queue.  Keeping retransmits non-zero
3368                                  * prevents us from getting new data interspersed with
3369                                  * retransmissions.
3370                                  */
3371 
3372                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
3373                                         sk->retransmits = 1;
3374                                 else
3375                                         sk->retransmits = 0;
3376                         }
3377                         /*
3378                          * Note that we only reset backoff and rto in the
3379                          * rtt recomputation code.  And that doesn't happen
3380                          * if there were retransmissions in effect.  So the
3381                          * first new packet after the retransmissions is
3382                          * sent with the backoff still in effect.  Not until
3383                          * we get an ack from a non-retransmitted packet do
3384                          * we reset the backoff and rto.  This allows us to deal
3385                          * with a situation where the network delay has increased
3386                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3387                          */
3388 
3389                         /*
3390                          *      We have one less packet out there. 
3391                          */
3392                          
3393                         if (sk->packets_out > 0) 
3394                                 sk->packets_out --;
3395                         /* 
3396                          *      Wake up the process, it can probably write more. 
3397                          */
3398                         if (!sk->dead) 
3399                                 sk->write_space(sk);
3400                         oskb = sk->send_head;
3401 
3402                         if (!(flag&2))  /* Not retransmitting */
3403                         {
3404                                 long m;
3405         
3406                                 /*
3407                                  *      The following amusing code comes from Jacobson's
3408                                  *      article in SIGCOMM '88.  Note that rtt and mdev
3409                                  *      are scaled versions of rtt and mean deviation.
3410                                  *      This is designed to be as fast as possible 
3411                                  *      m stands for "measurement".
3412                                  */
3413         
3414                                 m = jiffies - oskb->when;  /* RTT */
3415                                 if(m<=0)
3416                                         m=1;            /* IS THIS RIGHT FOR <0 ??? */
3417                                 m -= (sk->rtt >> 3);    /* m is now error in rtt est */
3418                                 sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
3419                                 if (m < 0)
3420                                         m = -m;         /* m is now abs(error) */
3421                                 m -= (sk->mdev >> 2);   /* similar update on mdev */
3422                                 sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
3423         
3424                                 /*
3425                                  *      Now update timeout.  Note that this removes any backoff.
3426                                  */
3427                          
3428                                 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3429                                 if (sk->rto > 120*HZ)
3430                                         sk->rto = 120*HZ;
3431                                 if (sk->rto < 20)       /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3432                                         sk->rto = 20;
3433                                 sk->backoff = 0;
3434                         }
3435                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
3436                                            In this case as we just set it up */
3437                         cli();
3438                         oskb = sk->send_head;
3439                         IS_SKB(oskb);
3440                         sk->send_head = oskb->link3;
3441                         if (sk->send_head == NULL) 
3442                         {
3443                                 sk->send_tail = NULL;
3444                         }
3445 
3446                 /*
3447                  *      We may need to remove this from the dev send list. 
3448                  */
3449 
3450                         if (oskb->next)
3451                                 skb_unlink(oskb);
3452                         sti();
3453                         kfree_skb(oskb, FREE_WRITE); /* write. */
3454                         if (!sk->dead) 
3455                                 sk->write_space(sk);
3456                 }
3457                 else
3458                 {
3459                         break;
3460                 }
3461         }
3462 
3463         /*
3464          * XXX someone ought to look at this too.. at the moment, if skb_peek()
3465          * returns non-NULL, we completely ignore the timer stuff in the else
3466          * clause.  We ought to organize the code so that else clause can
3467          * (should) be executed regardless, possibly moving the PROBE timer
3468          * reset over.  The skb_peek() thing should only move stuff to the
3469          * write queue, NOT also manage the timer functions.
3470          */
3471 
3472         /*
3473          * Maybe we can take some stuff off of the write queue,
3474          * and put it onto the xmit queue.
3475          */
3476         if (skb_peek(&sk->write_queue) != NULL) 
3477         {
3478                 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3479                         (sk->retransmits == 0 || 
3480                          sk->ip_xmit_timeout != TIME_WRITE ||
3481                          before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3482                         && sk->packets_out < sk->cong_window) 
3483                 {
3484                         /*
3485                          *      Add more data to the send queue.
3486                          */
3487                         flag |= 1;
3488                         tcp_write_xmit(sk);
3489                 }
3490                 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3491                         sk->send_head == NULL &&
3492                         sk->ack_backlog == 0 &&
3493                         sk->state != TCP_TIME_WAIT) 
3494                 {
3495                         /*
3496                          *      Data to queue but no room.
3497                          */
3498                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3499                 }               
3500         }
3501         else
3502         {
3503                 /*
3504                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3505                  * from TCP_CLOSE we don't do anything
3506                  *
3507                  * from anything else, if there is write data (or fin) pending,
3508                  * we use a TIME_WRITE timeout, else if keepalive we reset to
3509                  * a KEEPALIVE timeout, else we delete the timer.
3510                  *
3511                  * We do not set flag for nominal write data, otherwise we may
3512                  * force a state where we start to write itsy bitsy tidbits
3513                  * of data.
3514                  */
3515 
3516                 switch(sk->state) {
3517                 case TCP_TIME_WAIT:
3518                         /*
3519                          * keep us in TIME_WAIT until we stop getting packets,
3520                          * reset the timeout.
3521                          */
3522                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3523                         break;
3524                 case TCP_CLOSE:
3525                         /*
3526                          * don't touch the timer.
3527                          */
3528                         break;
3529                 default:
3530                         /*
3531                          *      Must check send_head, write_queue, and ack_backlog
3532                          *      to determine which timeout to use.
3533                          */
3534                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3535                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3536                         } else if (sk->keepopen) {
3537                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3538                         } else {
3539                                 del_timer(&sk->retransmit_timer);
3540                                 sk->ip_xmit_timeout = 0;
3541                         }
3542                         break;
3543                 }
3544         }
3545 
3546         /*
3547          *      We have nothing queued but space to send. Send any partial
3548          *      packets immediately (end of Nagle rule application).
3549          */
3550          
3551         if (sk->packets_out == 0 && sk->partial != NULL &&
3552                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
3553         {
3554                 flag |= 1;
3555                 tcp_send_partial(sk);
3556         }
3557 
3558         /*
3559          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
3560          * we are now waiting for an acknowledge to our FIN.  The other end is
3561          * already in TIME_WAIT.
3562          *
3563          * Move to TCP_CLOSE on success.
3564          */
3565 
3566         if (sk->state == TCP_LAST_ACK) 
3567         {
3568                 if (!sk->dead)
3569                         sk->state_change(sk);
3570                 if(sk->debug)
3571                         printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
3572                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3573                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
3574                 {
3575                         flag |= 1;
3576                         tcp_set_state(sk,TCP_CLOSE);
3577                         sk->shutdown = SHUTDOWN_MASK;
3578                 }
3579         }
3580 
3581         /*
3582          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
3583          *
3584          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
3585          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3586          */
3587 
3588         if (sk->state == TCP_FIN_WAIT1) 
3589         {
3590 
3591                 if (!sk->dead) 
3592                         sk->state_change(sk);
3593                 if (sk->rcv_ack_seq == sk->write_seq) 
3594                 {
3595                         flag |= 1;
3596                         sk->shutdown |= SEND_SHUTDOWN;
3597                         tcp_set_state(sk, TCP_FIN_WAIT2);
3598                 }
3599         }
3600 
3601         /*
3602          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
3603          *
3604          *      Move to TIME_WAIT
3605          */
3606 
3607         if (sk->state == TCP_CLOSING) 
3608         {
3609 
3610                 if (!sk->dead) 
3611                         sk->state_change(sk);
3612                 if (sk->rcv_ack_seq == sk->write_seq) 
3613                 {
3614                         flag |= 1;
3615                         tcp_time_wait(sk);
3616                 }
3617         }
3618         
3619         /*
3620          *      Final ack of a three way shake 
3621          */
3622          
3623         if(sk->state==TCP_SYN_RECV)
3624         {
3625                 tcp_set_state(sk, TCP_ESTABLISHED);
3626                 tcp_options(sk,th);
3627                 sk->dummy_th.dest=th->source;
3628                 sk->copied_seq = sk->acked_seq;
3629                 if(!sk->dead)
3630                         sk->state_change(sk);
3631                 if(sk->max_window==0)
3632                 {
3633                         sk->max_window=32;      /* Sanity check */
3634                         sk->mss=min(sk->max_window,sk->mtu);
3635                 }
3636         }
3637         
3638         /*
3639          * I make no guarantees about the first clause in the following
3640          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
3641          * what conditions "!flag" would be true.  However I think the rest
3642          * of the conditions would prevent that from causing any
3643          * unnecessary retransmission. 
3644          *   Clearly if the first packet has expired it should be 
3645          * retransmitted.  The other alternative, "flag&2 && retransmits", is
3646          * harder to explain:  You have to look carefully at how and when the
3647          * timer is set and with what timeout.  The most recent transmission always
3648          * sets the timer.  So in general if the most recent thing has timed
3649          * out, everything before it has as well.  So we want to go ahead and
3650          * retransmit some more.  If we didn't explicitly test for this
3651          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3652          * would not be true.  If you look at the pattern of timing, you can
3653          * show that rto is increased fast enough that the next packet would
3654          * almost never be retransmitted immediately.  Then you'd end up
3655          * waiting for a timeout to send each packet on the retransmission
3656          * queue.  With my implementation of the Karn sampling algorithm,
3657          * the timeout would double each time.  The net result is that it would
3658          * take a hideous amount of time to recover from a single dropped packet.
3659          * It's possible that there should also be a test for TIME_WRITE, but
3660          * I think as long as "send_head != NULL" and "retransmit" is on, we've
3661          * got to be in real retransmission mode.
3662          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
3663          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3664          * As long as no further losses occur, this seems reasonable.
3665          */
3666         
3667         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3668                (((flag&2) && sk->retransmits) ||
3669                (sk->send_head->when + sk->rto < jiffies))) 
3670         {
3671                 if(sk->send_head->when + sk->rto < jiffies)
3672                         tcp_retransmit(sk,0);   
3673                 else
3674                 {
3675                         tcp_do_retransmit(sk, 1);
3676                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3677                 }
3678         }
3679 
3680         return(1);
3681 }
3682 
3683 
3684 /*
3685  *      Process the FIN bit. This now behaves as it is supposed to work
3686  *      and the FIN takes effect when it is validly part of sequence
3687  *      space. Not before when we get holes.
3688  *
3689  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3690  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
3691  *      TIME-WAIT)
3692  *
3693  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
3694  *      close and we go into CLOSING (and later onto TIME-WAIT)
3695  *
3696  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3697  *
3698  */
3699  
3700 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
3701 {
3702         sk->fin_seq = th->seq + skb->len + th->syn + th->fin;   /* sequence just past the FIN */
3703 
3704         if (!sk->dead) 
3705         {
3706                 sk->state_change(sk);
3707                 sock_wake_async(sk->socket, 1);
3708         }
3709 
3710         switch(sk->state) 
3711         {
3712                 case TCP_SYN_RECV:
3713                 case TCP_SYN_SENT:
3714                 case TCP_ESTABLISHED:
3715                         /*
3716                          * move to CLOSE_WAIT, tcp_data() already handled
3717                          * sending the ack.
3718                          */
3719                         tcp_set_state(sk,TCP_CLOSE_WAIT);
3720                         if (th->rst)
3721                                 sk->shutdown = SHUTDOWN_MASK;   /* RST came with the FIN: both directions dead */
3722                         break;
3723 
3724                 case TCP_CLOSE_WAIT:
3725                 case TCP_CLOSING:
3726                         /*
3727                          * received a retransmission of the FIN, do
3728                          * nothing.
3729                          */
3730                         break;
3731                 case TCP_TIME_WAIT:
3732                         /*
3733                          * received a retransmission of the FIN,
3734                          * restart the TIME_WAIT timer.
3735                          */
3736                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3737                         return(0);
3738                 case TCP_FIN_WAIT1:
3739                         /*
3740                          * This case occurs when a simultaneous close
3741                          * happens, we must ack the received FIN and
3742                          * enter the CLOSING state.
3743                          *
3744                          * This causes a WRITE timeout, which will either
3745                          * move on to TIME_WAIT when we timeout, or resend
3746                          * the FIN properly (maybe we get rid of that annoying
3747                          * FIN lost hang). The TIME_WRITE code is already correct
3748                          * for handling this timeout.
3749                          */
3750 
3751                         if(sk->ip_xmit_timeout != TIME_WRITE)
3752                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3753                         tcp_set_state(sk,TCP_CLOSING);
3754                         break;
3755                 case TCP_FIN_WAIT2:
3756                         /*
3757                          * received a FIN -- send ACK and enter TIME_WAIT
3758                          */
3759                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3760                         sk->shutdown|=SHUTDOWN_MASK;
3761                         tcp_set_state(sk,TCP_TIME_WAIT);
3762                         break;
3763                 case TCP_CLOSE:
3764                         /*
3765                          * already in CLOSE
3766                          */
3767                         break;
3768                 default:
3769                         tcp_set_state(sk,TCP_LAST_ACK);
3770         
3771                         /* Start the timers. */
3772                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3773                         return(0);
3774         }
3775 
3776         return(0);      /* always 0: the FIN itself never causes the frame to be dropped */
3777 }
3778 
3779 
3780 
3781 /*
3782  *      This routine handles the data.  If there is room in the buffer,
3783  *      it will have already been moved into it.  If there is no
3784  *      room, then we will just have to discard the packet.
3785  */
3786 
3787 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /* [previous][next][first][last][top][bottom][index][help] */
3788          unsigned long saddr, unsigned short len)
3789 {
3790         struct sk_buff *skb1, *skb2;
3791         struct tcphdr *th;
3792         int dup_dumped=0;
3793         unsigned long new_seq;
3794         unsigned long shut_seq;
3795 
3796         th = skb->h.th;
3797         skb->len = len -(th->doff*4);
3798 
3799         /*
3800          *      The bytes in the receive read/assembly queue has increased. Needed for the
3801          *      low memory discard algorithm 
3802          */
3803            
3804         sk->bytes_rcv += skb->len;
3805         
3806         if (skb->len == 0 && !th->fin && !th->urg && !th->psh) 
3807         {
3808                 /* 
3809                  *      Don't want to keep passing ack's back and forth. 
3810                  *      (someone sent us dataless, boring frame)
3811                  */
3812                 if (!th->ack)
3813                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3814                 kfree_skb(skb, FREE_READ);
3815                 return(0);
3816         }
3817         
3818         /*
3819          *      We no longer have anyone receiving data on this connection.
3820          */
3821 
3822 #ifndef TCP_DONT_RST_SHUTDOWN            
3823 
3824         if(sk->shutdown & RCV_SHUTDOWN)
3825         {
3826                 /*
3827                  *      FIXME: BSD has some magic to avoid sending resets to
3828                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
3829                  *      BSD stacks still have broken keepalives so we want to
3830                  *      cope with it.
3831                  */
3832 
3833                 if(skb->len)    /* We don't care if its just an ack or
3834                                    a keepalive/window probe */
3835                 {
3836                         new_seq= th->seq + skb->len + th->syn;  /* Right edge of _data_ part of frame */
3837                         
3838                         /* Do this the way 4.4BSD treats it. Not what I'd
3839                            regard as the meaning of the spec but its what BSD
3840                            does and clearly they know everything 8) */
3841 
3842                         /*
3843                          *      This is valid because of two things
3844                          *
3845                          *      a) The way tcp_data behaves at the bottom.
3846                          *      b) A fin takes effect when read not when received.
3847                          */
3848                          
3849                         shut_seq=sk->acked_seq+1;       /* Last byte */
3850                         
3851                         if(after(new_seq,shut_seq))
3852                         {
3853                                 if(sk->debug)
3854                                         printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
3855                                                 sk, new_seq, shut_seq, sk->blog);
3856                                 if(sk->dead)
3857                                 {
3858                                         sk->acked_seq = new_seq + th->fin;
3859                                         tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3860                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3861                                         tcp_statistics.TcpEstabResets++;
3862                                         tcp_set_state(sk,TCP_CLOSE);
3863                                         sk->err = EPIPE;
3864                                         sk->shutdown = SHUTDOWN_MASK;
3865                                         kfree_skb(skb, FREE_READ);
3866                                         return 0;
3867                                 }
3868                         }
3869                 }
3870         }
3871 
3872 #endif
3873 
3874         /*
3875          *      Now we have to walk the chain, and figure out where this one
3876          *      goes into it.  This is set up so that the last packet we received
3877          *      will be the first one we look at, that way if everything comes
3878          *      in order, there will be no performance loss, and if they come
3879          *      out of order we will be able to fit things in nicely.
3880          *
3881          *      [AC: This is wrong. We should assume in order first and then walk
3882          *       forwards from the first hole based upon real traffic patterns.]
3883          *      
3884          */
3885 
3886         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
3887         {
3888                 skb_queue_head(&sk->receive_queue,skb);
3889                 skb1= NULL;
3890         } 
3891         else
3892         {
3893                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
3894                 {
3895                         if(sk->debug)
3896                         {
3897                                 printk("skb1=%p :", skb1);
3898                                 printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
3899                                 printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
3900                                 printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
3901                                                 sk->acked_seq);
3902                         }
3903                         
3904                         /*
3905                          *      Optimisation: Duplicate frame or extension of previous frame from
3906                          *      same sequence point (lost ack case).
3907                          *      The frame contains duplicate data or replaces a previous frame
3908                          *      discard the previous frame (safe as sk->inuse is set) and put
3909                          *      the new one in its place.
3910                          */
3911                          
3912                         if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3913                         {
3914                                 skb_append(skb1,skb);
3915                                 skb_unlink(skb1);
3916                                 kfree_skb(skb1,FREE_READ);
3917                                 dup_dumped=1;
3918                                 skb1=NULL;
3919                                 break;
3920                         }
3921                         
3922                         /*
3923                          *      Found where it fits
3924                          */
3925                          
3926                         if (after(th->seq+1, skb1->h.th->seq))
3927                         {
3928                                 skb_append(skb1,skb);
3929                                 break;
3930                         }
3931                         
3932                         /*
3933                          *      See if we've hit the start. If so insert.
3934                          */
3935                         if (skb1 == skb_peek(&sk->receive_queue))
3936                         {
3937                                 skb_queue_head(&sk->receive_queue, skb);
3938                                 break;
3939                         }
3940                 }
3941         }
3942 
3943         /*
3944          *      Figure out what the ack value for this frame is
3945          */
3946          
3947         th->ack_seq = th->seq + skb->len;
3948         if (th->syn) 
3949                 th->ack_seq++;
3950         if (th->fin)
3951                 th->ack_seq++;
3952 
3953         if (before(sk->acked_seq, sk->copied_seq)) 
3954         {
3955                 printk("*** tcp.c:tcp_data bug acked < copied\n");
3956                 sk->acked_seq = sk->copied_seq;
3957         }
3958 
3959         /*
3960          *      Now figure out if we can ack anything. This is very messy because we really want two
3961          *      receive queues, a completed and an assembly queue. We also want only one transmit
3962          *      queue.
3963          */
3964 
3965         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) 
3966         {
3967                 if (before(th->seq, sk->acked_seq+1)) 
3968                 {
3969                         int newwindow;
3970 
3971                         if (after(th->ack_seq, sk->acked_seq)) 
3972                         {
3973                                 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3974                                 if (newwindow < 0)
3975                                         newwindow = 0;  
3976                                 sk->window = newwindow;
3977                                 sk->acked_seq = th->ack_seq;
3978                         }
3979                         skb->acked = 1;
3980 
3981                         /*
3982                          *      When we ack the fin, we do the FIN 
3983                          *      processing.
3984                          */
3985 
3986                         if (skb->h.th->fin) 
3987                         {
3988                                 tcp_fin(skb,sk,skb->h.th);
3989                         }
3990           
3991                         for(skb2 = skb->next;
3992                             skb2 != (struct sk_buff *)&sk->receive_queue;
3993                             skb2 = skb2->next) 
3994                         {
3995                                 if (before(skb2->h.th->seq, sk->acked_seq+1)) 
3996                                 {
3997                                         if (after(skb2->h.th->ack_seq, sk->acked_seq))
3998                                         {
3999                                                 newwindow = sk->window -
4000                                                  (skb2->h.th->ack_seq - sk->acked_seq);
4001                                                 if (newwindow < 0)
4002                                                         newwindow = 0;  
4003                                                 sk->window = newwindow;
4004                                                 sk->acked_seq = skb2->h.th->ack_seq;
4005                                         }
4006                                         skb2->acked = 1;
4007                                         /*
4008                                          *      When we ack the fin, we do
4009                                          *      the fin handling.
4010                                          */
4011                                         if (skb2->h.th->fin) 
4012                                         {
4013                                                 tcp_fin(skb,sk,skb->h.th);
4014                                         }
4015 
4016                                         /*
4017                                          *      Force an immediate ack.
4018                                          */
4019                                          
4020                                         sk->ack_backlog = sk->max_ack_backlog;
4021                                 }
4022                                 else
4023                                 {
4024                                         break;
4025                                 }
4026                         }
4027 
4028                         /*
4029                          *      This also takes care of updating the window.
4030                          *      This if statement needs to be simplified.
4031                          */
4032                         if (!sk->delay_acks ||
4033                             sk->ack_backlog >= sk->max_ack_backlog || 
4034                             sk->bytes_rcv > sk->max_unacked || th->fin) {
4035         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4036                         }
4037                         else 
4038                         {
4039                                 sk->ack_backlog++;
4040                                 if(sk->debug)
4041                                         printk("Ack queued.\n");
4042                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4043                         }
4044                 }
4045         }
4046 
4047         /*
4048          *      If we've missed a packet, send an ack.
4049          *      Also start a timer to send another.
4050          */
4051          
4052         if (!skb->acked) 
4053         {
4054         
4055         /*
4056          *      This is important.  If we don't have much room left,
4057          *      we need to throw out a few packets so we have a good
4058          *      window.  Note that mtu is used, not mss, because mss is really
4059          *      for the send side.  He could be sending us stuff as large as mtu.
4060          */
4061                  
4062                 while (sk->prot->rspace(sk) < sk->mtu) 
4063                 {
4064                         skb1 = skb_peek(&sk->receive_queue);
4065                         if (skb1 == NULL) 
4066                         {
4067                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4068                                 break;
4069                         }
4070 
4071                         /*
4072                          *      Don't throw out something that has been acked. 
4073                          */
4074                  
4075                         if (skb1->acked) 
4076                         {
4077                                 break;
4078                         }
4079                 
4080                         skb_unlink(skb1);
4081                         kfree_skb(skb1, FREE_READ);
4082                 }
4083                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4084                 sk->ack_backlog++;
4085                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4086         }
4087         else
4088         {
4089                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4090         }
4091 
4092         /*
4093          *      Now tell the user we may have some data. 
4094          */
4095          
4096         if (!sk->dead) 
4097         {
4098                 if(sk->debug)
4099                         printk("Data wakeup.\n");
4100                 sk->data_ready(sk,0);
4101         } 
4102         return(0);
4103 }
4104 
4105 
4106 /*
4107  *      This routine is only called when we have urgent data
4108  *      signalled. Its the 'slow' part of tcp_urg. It could be
4109  *      moved inline now as tcp_urg is only called from one
4110  *      place. We handle URGent data wrong. We have to - as
4111  *      BSD still doesn't use the correction from RFC961.
4112  */
4113  
4114 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* [previous][next][first][last][top][bottom][index][help] */
4115 {
4116         unsigned long ptr = ntohs(th->urg_ptr);
4117 
4118         if (ptr)
4119                 ptr--;
4120         ptr += th->seq;
4121 
4122         /* ignore urgent data that we've already seen and read */
4123         if (after(sk->copied_seq, ptr))
4124                 return;
4125 
4126         /* do we already have a newer (or duplicate) urgent pointer? */
4127         if (sk->urg_data && !after(ptr, sk->urg_seq))
4128                 return;
4129 
4130         /* tell the world about our new urgent pointer */
4131         if (sk->proc != 0) {
4132                 if (sk->proc > 0) {
4133                         kill_proc(sk->proc, SIGURG, 1);
4134                 } else {
4135                         kill_pg(-sk->proc, SIGURG, 1);
4136                 }
4137         }
4138         sk->urg_data = URG_NOTYET;
4139         sk->urg_seq = ptr;
4140 }
4141 
4142 /*
4143  *      This is the 'fast' part of urgent handling.
4144  */
4145  
4146 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
     /* [previous][next][first][last][top][bottom][index][help] */
4147         unsigned long saddr, unsigned long len)
4148 {
4149         unsigned long ptr;
4150 
4151         /*
4152          *      Check if we get a new urgent pointer - normally not 
4153          */
4154          
4155         if (th->urg)
4156                 tcp_check_urg(sk,th);
4157 
4158         /*
4159          *      Do we wait for any urgent data? - normally not
4160          */
4161          
4162         if (sk->urg_data != URG_NOTYET)
4163                 return 0;
4164 
4165         /*
4166          *      Is the urgent pointer pointing into this packet? 
4167          */
4168          
4169         ptr = sk->urg_seq - th->seq + th->doff*4;
4170         if (ptr >= len)
4171                 return 0;
4172 
4173         /*
4174          *      Ok, got the correct packet, update info 
4175          */
4176          
4177         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4178         if (!sk->dead)
4179                 sk->data_ready(sk,0);
4180         return 0;
4181 }
4182 
4183 /*
4184  *      This will accept the next outstanding connection. 
4185  */
4186  
static struct sock *tcp_accept(struct sock *sk, int flags)
{
        struct sock *newsk;
        struct sk_buff *skb;
  
  /*
   * We need to make sure that this socket is listening,
   * and that it has something pending.
   *
   * Returns the new connection's socket, or NULL with sk->err set
   * to EINVAL (not listening), EAGAIN (non-blocking, none ready)
   * or ERESTARTSYS (interrupted by a signal).
   */

        if (sk->state != TCP_LISTEN) 
        {
                sk->err = EINVAL;
                return(NULL); 
        }

        /* Avoid the race. */
        /* Interrupts stay off from here until we either queue for
           sleep or grab the socket, so a connection completing in
           interrupt context cannot slip between the empty check and
           the sleep. Preserve this ordering. */
        cli();
        sk->inuse = 1;

        while((skb = tcp_dequeue_established(sk)) == NULL) 
        {
                if (flags & O_NONBLOCK) 
                {
                        sti();
                        release_sock(sk);
                        sk->err = EAGAIN;
                        return(NULL);
                }

                /* Drop the lock before sleeping so the bottom half can
                   deliver the connection that will wake us. */
                release_sock(sk);
                interruptible_sleep_on(sk->sleep);
                /* Woken by a signal rather than a connection? The lock
                   was released above, so just bail out. */
                if (current->signal & ~current->blocked) 
                {
                        sti();
                        sk->err = ERESTARTSYS;
                        return(NULL);
                }
                /* Re-take the socket before re-testing the queue */
                sk->inuse = 1;
        }
        sti();

        /*
         *      Now all we need to do is return skb->sk. 
         */

        newsk = skb->sk;

        /* The skb only carried the reference; the new socket lives on */
        kfree_skb(skb, FREE_READ);
        sk->ack_backlog--;
        release_sock(sk);
        return(newsk);
}
4240 
4241 
4242 /*
4243  *      This will initiate an outgoing connection. 
4244  */
4245  
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
        struct sk_buff *buff;
        struct device *dev=NULL;
        unsigned char *ptr;
        int tmp;
        int atype;
        struct tcphdr *t1;
        struct rtable *rt;

        /* Only a fully closed socket may begin an active open */
        if (sk->state != TCP_CLOSE) 
        {
                return(-EISCONN);
        }
        
        /* Must cover at least family, port and address */
        if (addr_len < 8) 
                return(-EINVAL);

        if (usin->sin_family && usin->sin_family != AF_INET) 
                return(-EAFNOSUPPORT);

        /*
         *      connect() to INADDR_ANY means loopback (BSD'ism).
         */
        
        if(usin->sin_addr.s_addr==INADDR_ANY)
                usin->sin_addr.s_addr=ip_my_addr();
                  
        /*
         *      Don't want a TCP connection going to a broadcast address 
         */

        if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
                return -ENETUNREACH;
  
        /* Lock the socket and derive the initial send sequence from
           the clock; rcv_ack_seq trails it so the SYN's ack test works. */
        sk->inuse = 1;
        sk->daddr = usin->sin_addr.s_addr;
        sk->write_seq = jiffies * SEQ_TICK - seq_offset;
        sk->window_seq = sk->write_seq;
        sk->rcv_ack_seq = sk->write_seq -1;
        sk->err = 0;
        sk->dummy_th.dest = usin->sin_port;
        release_sock(sk);

        /* Allocate the SYN frame; may sleep (GFP_KERNEL), hence the
           unlock above and re-lock below */
        buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
        if (buff == NULL) 
        {
                return(-ENOMEM);
        }
        sk->inuse = 1;
        buff->len = 24;         /* TCP header (20) + MSS option (4) */
        buff->sk = sk;
        buff->free = 0;
        buff->localroute = sk->localroute;
        
        t1 = (struct tcphdr *) buff->data;

        /*
         *      Put in the IP header and routing stuff. 
         */
         
        rt=ip_rt_route(sk->daddr, NULL, NULL);
        

        /*
         *      We need to build the routing stuff from the things saved in skb. 
         */

        tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
                                        IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
                release_sock(sk);
                return(-ENETUNREACH);
        }

        /* NOTE(review): dev is assumed non-NULL from here on (set by
           build_header on success) - it is dereferenced below. Confirm
           against build_header's contract. */
        buff->len += tmp;
        t1 = (struct tcphdr *)((char *)t1 +tmp);

        /* Start from the template header, then fill in the SYN fields.
           ntohl performs the same swap as htonl, so seq goes out in
           network order. */
        memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
        t1->seq = ntohl(sk->write_seq++);
        sk->sent_seq = sk->write_seq;
        buff->h.seq = sk->write_seq;
        t1->ack = 0;
        t1->window = 2;
        t1->res1=0;
        t1->res2=0;
        t1->rst = 0;
        t1->urg = 0;
        t1->psh = 0;
        t1->syn = 1;
        t1->urg_ptr = 0;
        t1->doff = 6;           /* 24 bytes: header plus MSS option */
        /* use 512 or whatever user asked for */
        
        /* Honour a per-route window clamp if the routing table has one */
        if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
                sk->window_clamp=rt->rt_window;
        else
                sk->window_clamp=0;

        /* MSS selection: user override, then route, then a guess based
           on whether the destination is on the local subnet */
        if (sk->user_mss)
                sk->mtu = sk->user_mss;
        else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
                sk->mtu = rt->rt_mss;
        else 
        {
#ifdef CONFIG_INET_SNARL
                if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
                if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
                        sk->mtu = 576 - HEADER_SIZE;
                else
                        sk->mtu = MAX_WINDOW;
        }
        /*
         *      but not bigger than device MTU 
         */

        if(sk->mtu <32)
                sk->mtu = 32;   /* Sanity limit */
                
        sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
        
        /*
         *      Put in the TCP options to say MTU. 
         */

        ptr = (unsigned char *)(t1+1);
        ptr[0] = 2;             /* kind: MSS */
        ptr[1] = 4;             /* option length */
        ptr[2] = (sk->mtu) >> 8;
        ptr[3] = (sk->mtu) & 0xff;
        tcp_send_check(t1, sk->saddr, sk->daddr,
                  sizeof(struct tcphdr) + 4, sk);

        /*
         *      This must go first otherwise a really quick response will get reset. 
         */

        tcp_set_state(sk,TCP_SYN_SENT);
        sk->rto = TCP_TIMEOUT_INIT;
        init_timer(&sk->retransmit_timer);
        sk->retransmit_timer.function=&retransmit_timer;
        sk->retransmit_timer.data = (unsigned long)sk;
        reset_xmit_timer(sk, TIME_WRITE, sk->rto);      /* Timer for repeating the SYN until an answer */
        sk->retransmits = TCP_SYN_RETRIES;

        sk->prot->queue_xmit(sk, dev, buff, 0);  
        /* NOTE(review): this second reset_xmit_timer re-arms the same
           timer set just above queue_xmit - appears redundant but is
           harmless; verify before removing. */
        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
        tcp_statistics.TcpActiveOpens++;
        tcp_statistics.TcpOutSegs++;
  
        release_sock(sk);
        return(0);
}
4403 
4404 
4405 /* This functions checks to see if the tcp header is actually acceptable. */
4406 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /* [previous][next][first][last][top][bottom][index][help] */
4407              struct options *opt, unsigned long saddr, struct device *dev)
4408 {
4409         unsigned long next_seq;
4410 
4411         next_seq = len - 4*th->doff;
4412         if (th->fin)
4413                 next_seq++;
4414         /* if we have a zero window, we can't have any data in the packet.. */
4415         if (next_seq && !sk->window)
4416                 goto ignore_it;
4417         next_seq += th->seq;
4418 
4419         /*
4420          * This isn't quite right.  sk->acked_seq could be more recent
4421          * than sk->window.  This is however close enough.  We will accept
4422          * slightly more packets than we should, but it should not cause
4423          * problems unless someone is trying to forge packets.
4424          */
4425 
4426         /* have we already seen all of this packet? */
4427         if (!after(next_seq+1, sk->acked_seq))
4428                 goto ignore_it;
4429         /* or does it start beyond the window? */
4430         if (!before(th->seq, sk->acked_seq + sk->window + 1))
4431                 goto ignore_it;
4432 
4433         /* ok, at least part of this packet would seem interesting.. */
4434         return 1;
4435 
4436 ignore_it:
4437         if (th->rst)
4438                 return 0;
4439 
4440         /*
4441          *      Send a reset if we get something not ours and we are
4442          *      unsynchronized. Note: We don't do anything to our end. We
4443          *      are just killing the bogus remote connection then we will
4444          *      connect again and it will work (with luck).
4445          */
4446          
4447         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
4448         {
4449                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4450                 return 1;
4451         }
4452 
4453         /* Try to resync things. */
4454         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4455         return 0;
4456 }
4457 
4458 /*
4459  *      When we get a reset we do this.
4460  */
4461 
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
        /*
         *      The peer reset us. Mark the socket dead to TCP and
         *      pick the errno the user sees according to where in
         *      the connection's life the reset arrived.
         */
        sk->zapped = 1;
        switch (sk->state)
        {
                case TCP_SYN_SENT:
                        sk->err = ECONNREFUSED;
                        break;
                case TCP_CLOSE_WAIT:
                        sk->err = EPIPE;
                        break;
                default:
                        sk->err = ECONNRESET;
                        break;
        }
#ifdef TCP_DO_RFC1337           
        /*
         *      Time wait assassination protection [RFC1337]
         */
        if(sk->state!=TCP_TIME_WAIT)
        {       
                tcp_set_state(sk,TCP_CLOSE);
                sk->shutdown = SHUTDOWN_MASK;
        }
#else   
        tcp_set_state(sk,TCP_CLOSE);
        sk->shutdown = SHUTDOWN_MASK;
#endif  
        /* Wake anyone waiting on the socket's state */
        if (!sk->dead) 
                sk->state_change(sk);
        kfree_skb(skb, FREE_READ);
        release_sock(sk);
        return(0);
}
4489 
4490 /*
4491  *      A TCP packet has arrived.
4492  */
4493  
4494 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /* [previous][next][first][last][top][bottom][index][help] */
4495         unsigned long daddr, unsigned short len,
4496         unsigned long saddr, int redo, struct inet_protocol * protocol)
4497 {
4498         struct tcphdr *th;
4499         struct sock *sk;
4500         int syn_ok=0;
4501         
4502         if (!skb) 
4503         {
4504                 printk("IMPOSSIBLE 1\n");
4505                 return(0);
4506         }
4507 
4508         if (!dev) 
4509         {
4510                 printk("IMPOSSIBLE 2\n");
4511                 return(0);
4512         }
4513   
4514         tcp_statistics.TcpInSegs++;
4515   
4516         if(skb->pkt_type!=PACKET_HOST)
4517         {
4518                 kfree_skb(skb,FREE_READ);
4519                 return(0);
4520         }
4521   
4522         th = skb->h.th;
4523 
4524         /*
4525          *      Find the socket.
4526          */
4527 
4528         sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4529 
4530         /*
4531          *      If this socket has got a reset its to all intents and purposes 
4532          *      really dead. Count closed sockets as dead.
4533          *
4534          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4535          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
4536          *      exist so should cause resets as if the port was unreachable.
4537          */
4538          
4539         if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4540                 sk=NULL;
4541 
4542         if (!redo) 
4543         {
4544                 if (tcp_check(th, len, saddr, daddr )) 
4545                 {
4546                         skb->sk = NULL;
4547                         kfree_skb(skb,FREE_READ);
4548                         /*
4549                          *      We don't release the socket because it was
4550                          *      never marked in use.
4551                          */
4552                         return(0);
4553                 }
4554                 th->seq = ntohl(th->seq);
4555 
4556                 /* See if we know about the socket. */
4557                 if (sk == NULL) 
4558                 {
4559                         /*
4560                          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4561                          */
4562                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4563                         skb->sk = NULL;
4564                         /*
4565                          *      Discard frame
4566                          */
4567                         kfree_skb(skb, FREE_READ);
4568                         return(0);
4569                 }
4570 
4571                 skb->len = len;
4572                 skb->acked = 0;
4573                 skb->used = 0;
4574                 skb->free = 0;
4575                 skb->saddr = daddr;
4576                 skb->daddr = saddr;
4577         
4578                 /* We may need to add it to the backlog here. */
4579                 cli();
4580                 if (sk->inuse) 
4581                 {
4582                         skb_queue_tail(&sk->back_log, skb);
4583                         sti();
4584                         return(0);
4585                 }
4586                 sk->inuse = 1;
4587                 sti();
4588         }
4589         else
4590         {
4591                 if (sk==NULL) 
4592                 {
4593                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4594                         skb->sk = NULL;
4595                         kfree_skb(skb, FREE_READ);
4596                         return(0);
4597                 }
4598         }
4599 
4600 
4601         if (!sk->prot) 
4602         {
4603                 printk("IMPOSSIBLE 3\n");
4604                 return(0);
4605         }
4606 
4607 
4608         /*
4609          *      Charge the memory to the socket. 
4610          */
4611          
4612         if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf) 
4613         {
4614                 kfree_skb(skb, FREE_READ);
4615                 release_sock(sk);
4616                 return(0);
4617         }
4618 
4619         skb->sk=sk;
4620         sk->rmem_alloc += skb->mem_len;
4621 
4622         /*
4623          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4624          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4625          *      compatibility. We also set up variables more thoroughly [Karn notes in the
4626          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4627          */
4628 
4629         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
4630         {
4631         
4632                 /*
4633                  *      Now deal with unusual cases.
4634                  */
4635          
4636                 if(sk->state==TCP_LISTEN)
4637                 {
4638                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
4639                                 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4640 
4641                         /*
4642                          *      We don't care for RST, and non SYN are absorbed (old segments)
4643                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
4644                          *      netmask on a running connection it can go broadcast. Even Sun's have
4645                          *      this problem so I'm ignoring it 
4646                          */
4647                            
4648                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4649                         {
4650                                 kfree_skb(skb, FREE_READ);
4651                                 release_sock(sk);
4652                                 return 0;
4653                         }
4654                 
4655                         /*      
4656                          *      Guess we need to make a new socket up 
4657                          */
4658                 
4659                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4660                 
4661                         /*
4662                          *      Now we have several options: In theory there is nothing else
4663                          *      in the frame. KA9Q has an option to send data with the syn,
4664                          *      BSD accepts data with the syn up to the [to be] advertised window
4665                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
4666                          *      it, that fits the spec precisely and avoids incompatibilities. It
4667                          *      would be nice in future to drop through and process the data.
4668                          */
4669                          
4670                         release_sock(sk);
4671                         return 0;
4672                 }
4673         
4674                 /* retransmitted SYN? */
4675                 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4676                 {
4677                         kfree_skb(skb, FREE_READ);
4678                         release_sock(sk);
4679                         return 0;
4680                 }
4681                 
4682                 /*
4683                  *      SYN sent means we have to look for a suitable ack and either reset
4684                  *      for bad matches or go to connected 
4685                  */
4686            
4687                 if(sk->state==TCP_SYN_SENT)
4688                 {
4689                         /* Crossed SYN or previous junk segment */
4690                         if(th->ack)
4691                         {
4692                                 /* We got an ack, but its not a good ack */
4693                                 if(!tcp_ack(sk,th,saddr,len))
4694                                 {
4695                                         /* Reset the ack - its an ack from a 
4696                                            different connection  [ th->rst is checked in tcp_reset()] */
4697                                         tcp_statistics.TcpAttemptFails++;
4698                                         tcp_reset(daddr, saddr, th,
4699                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4700                                         kfree_skb(skb, FREE_READ);
4701                                         release_sock(sk);
4702                                         return(0);
4703                                 }
4704                                 if(th->rst)
4705                                         return tcp_std_reset(sk,skb);
4706                                 if(!th->syn)
4707                                 {
4708                                         /* A valid ack from a different connection
4709                                            start. Shouldn't happen but cover it */
4710                                         kfree_skb(skb, FREE_READ);
4711                                         release_sock(sk);
4712                                         return 0;
4713                                 }
4714                                 /*
4715                                  *      Ok.. its good. Set up sequence numbers and
4716                                  *      move to established.
4717                                  */
4718                                 syn_ok=1;       /* Don't reset this connection for the syn */
4719                                 sk->acked_seq=th->seq+1;
4720                                 sk->fin_seq=th->seq;
4721                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4722                                 tcp_set_state(sk, TCP_ESTABLISHED);
4723                                 tcp_options(sk,th);
4724                                 sk->dummy_th.dest=th->source;
4725                                 sk->copied_seq = sk->acked_seq;
4726                                 if(!sk->dead)
4727                                 {
4728                                         sk->state_change(sk);
4729                                         sock_wake_async(sk->socket, 0);
4730                                 }
4731                                 if(sk->max_window==0)
4732                                 {
4733                                         sk->max_window = 32;
4734                                         sk->mss = min(sk->max_window, sk->mtu);
4735                                 }
4736                         }
4737                         else
4738                         {
4739                                 /* See if SYN's cross. Drop if boring */
4740                                 if(th->syn && !th->rst)
4741                                 {
4742                                         /* Crossed SYN's are fine - but talking to
4743                                            yourself is right out... */
4744                                         if(sk->saddr==saddr && sk->daddr==daddr &&
4745                                                 sk->dummy_th.source==th->source &&
4746                                                 sk->dummy_th.dest==th->dest)
4747                                         {
4748                                                 tcp_statistics.TcpAttemptFails++;
4749                                                 return tcp_std_reset(sk,skb);
4750                                         }
4751                                         tcp_set_state(sk,TCP_SYN_RECV);
4752                                         
4753                                         /*
4754                                          *      FIXME:
4755                                          *      Must send SYN|ACK here
4756                                          */
4757                                 }               
4758                                 /* Discard junk segment */
4759                                 kfree_skb(skb, FREE_READ);
4760                                 release_sock(sk);
4761                                 return 0;
4762                         }
4763                         /*
4764                          *      SYN_RECV with data maybe.. drop through
4765                          */
4766                         goto rfc_step6;
4767                 }
4768 
4769         /*
4770          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
4771          *      a more complex suggestion for fixing these reuse issues in RFC1644
4772          *      but not yet ready for general use. Also see RFC1379.
4773          */
4774         
4775 #define BSD_TIME_WAIT
4776 #ifdef BSD_TIME_WAIT
4777                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
4778                         after(th->seq, sk->acked_seq) && !th->rst)
4779                 {
4780                         long seq=sk->write_seq;
4781                         if(sk->debug)
4782                                 printk("Doing a BSD time wait\n");
4783                         tcp_statistics.TcpEstabResets++;           
4784                         sk->rmem_alloc -= skb->mem_len;
4785                         skb->sk = NULL;
4786                         sk->err=ECONNRESET;
4787                         tcp_set_state(sk, TCP_CLOSE);
4788                         sk->shutdown = SHUTDOWN_MASK;
4789                         release_sock(sk);
4790                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4791                         if (sk && sk->state==TCP_LISTEN)
4792                         {
4793                                 sk->inuse=1;
4794                                 skb->sk = sk;
4795                                 sk->rmem_alloc += skb->mem_len;
4796                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
4797                                 release_sock(sk);
4798                                 return 0;
4799                         }
4800                         kfree_skb(skb, FREE_READ);
4801                         return 0;
4802                 }
4803 #endif  
4804         }
4805 
4806         /*
4807          *      We are now in normal data flow (see the step list in the RFC)
4808          *      Note most of these are inline now. I'll inline the lot when
4809          *      I have time to test it hard and look at what gcc outputs 
4810          */
4811         
4812         if(!tcp_sequence(sk,th,len,opt,saddr,dev))
4813         {
4814                 kfree_skb(skb, FREE_READ);
4815                 release_sock(sk);
4816                 return 0;
4817         }
4818 
4819         if(th->rst)
4820                 return tcp_std_reset(sk,skb);
4821         
4822         /*
4823          *      !syn_ok is effectively the state test in RFC793.
4824          */
4825          
4826         if(th->syn && !syn_ok)
4827         {
4828                 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
4829                 return tcp_std_reset(sk,skb);   
4830         }
4831 
4832         /*
4833          *      Process the ACK
4834          */
4835          
4836 
4837         if(th->ack && !tcp_ack(sk,th,saddr,len))
4838         {
4839                 /*
4840                  *      Our three way handshake failed.
4841                  */
4842                  
4843                 if(sk->state==TCP_SYN_RECV)
4844                 {
4845                         tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
4846                 }
4847                 kfree_skb(skb, FREE_READ);
4848                 release_sock(sk);
4849                 return 0;
4850         }
4851         
4852 rfc_step6:              /* I'll clean this up later */
4853 
4854         /*
4855          *      Process urgent data
4856          */
4857                 
4858         if(tcp_urg(sk, th, saddr, len))
4859         {
4860                 kfree_skb(skb, FREE_READ);
4861                 release_sock(sk);
4862                 return 0;
4863         }
4864         
4865         
4866         /*
4867          *      Process the encapsulated data
4868          */
4869         
4870         if(tcp_data(skb,sk, saddr, len))
4871         {
4872                 kfree_skb(skb, FREE_READ);
4873                 release_sock(sk);
4874                 return 0;
4875         }
4876 
4877         /*
4878          *      And done
4879          */     
4880         
4881         release_sock(sk);
4882         return 0;
4883 }
4884 
4885 /*
4886  *      This routine sends a packet with an out of date sequence
4887  *      number. It assumes the other end will try to ack it.
4888  */
4889 
4890 static void tcp_write_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
4891 {
4892         struct sk_buff *buff;
4893         struct tcphdr *t1;
4894         struct device *dev=NULL;
4895         int tmp;
4896 
4897         if (sk->zapped)
4898                 return; /* After a valid reset we can send no more */
4899 
4900         /*
4901          *      Write data can still be transmitted/retransmitted in the
4902          *      following states.  If any other state is encountered, return.
4903          *      [listen/close will never occur here anyway]
4904          */
4905 
4906         if (sk->state != TCP_ESTABLISHED && 
4907             sk->state != TCP_CLOSE_WAIT &&
4908             sk->state != TCP_FIN_WAIT1 && 
4909             sk->state != TCP_LAST_ACK &&
4910             sk->state != TCP_CLOSING
4911         ) 
4912         {
4913                 return;
4914         }
4915 
4916         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
4917         if (buff == NULL) 
4918                 return;
4919 
4920         buff->len = sizeof(struct tcphdr);
4921         buff->free = 1;
4922         buff->sk = sk;
4923         buff->localroute = sk->localroute;
4924 
4925         t1 = (struct tcphdr *) buff->data;
4926 
4927         /* Put in the IP header and routing stuff. */
4928         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4929                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
4930         if (tmp < 0) 
4931         {
4932                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4933                 return;
4934         }
4935 
4936         buff->len += tmp;
4937         t1 = (struct tcphdr *)((char *)t1 +tmp);
4938 
4939         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
4940 
4941         /*
4942          *      Use a previous sequence.
4943          *      This should cause the other end to send an ack.
4944          */
4945          
4946         t1->seq = htonl(sk->sent_seq-1);
4947         t1->ack = 1; 
4948         t1->res1= 0;
4949         t1->res2= 0;
4950         t1->rst = 0;
4951         t1->urg = 0;
4952         t1->psh = 0;
4953         t1->fin = 0;    /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
4954         t1->syn = 0;
4955         t1->ack_seq = ntohl(sk->acked_seq);
4956         t1->window = ntohs(tcp_select_window(sk));
4957         t1->doff = sizeof(*t1)/4;
4958         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
4959          /*
4960           *     Send it and free it.
4961           *     This will prevent the timer from automatically being restarted.
4962           */
4963         sk->prot->queue_xmit(sk, dev, buff, 1);
4964         tcp_statistics.TcpOutSegs++;
4965 }
4966 
4967 /*
4968  *      A window probe timeout has occurred.
4969  */
4970 
4971 void tcp_send_probe0(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
4972 {
4973         if (sk->zapped)
4974                 return;         /* After a valid reset we can send no more */
4975 
4976         tcp_write_wakeup(sk);
4977 
4978         sk->backoff++;
4979         sk->rto = min(sk->rto << 1, 120*HZ);
4980         reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
4981         sk->retransmits++;
4982         sk->prot->retransmits ++;
4983 }
4984 
4985 /*
4986  *      Socket option code for TCP. 
4987  */
4988   
4989 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
4990 {
4991         int val,err;
4992 
4993         if(level!=SOL_TCP)
4994                 return ip_setsockopt(sk,level,optname,optval,optlen);
4995 
4996         if (optval == NULL) 
4997                 return(-EINVAL);
4998 
4999         err=verify_area(VERIFY_READ, optval, sizeof(int));
5000         if(err)
5001                 return err;
5002         
5003         val = get_fs_long((unsigned long *)optval);
5004 
5005         switch(optname)
5006         {
5007                 case TCP_MAXSEG:
5008 /*
5009  * values greater than interface MTU won't take effect.  however at
5010  * the point when this call is done we typically don't yet know
5011  * which interface is going to be used
5012  */
5013                         if(val<1||val>MAX_WINDOW)
5014                                 return -EINVAL;
5015                         sk->user_mss=val;
5016                         return 0;
5017                 case TCP_NODELAY:
5018                         sk->nonagle=(val==0)?0:1;
5019                         return 0;
5020                 default:
5021                         return(-ENOPROTOOPT);
5022         }
5023 }
5024 
5025 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5026 {
5027         int val,err;
5028 
5029         if(level!=SOL_TCP)
5030                 return ip_getsockopt(sk,level,optname,optval,optlen);
5031                         
5032         switch(optname)
5033         {
5034                 case TCP_MAXSEG:
5035                         val=sk->user_mss;
5036                         break;
5037                 case TCP_NODELAY:
5038                         val=sk->nonagle;
5039                         break;
5040                 default:
5041                         return(-ENOPROTOOPT);
5042         }
5043         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5044         if(err)
5045                 return err;
5046         put_fs_long(sizeof(int),(unsigned long *) optlen);
5047 
5048         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5049         if(err)
5050                 return err;
5051         put_fs_long(val,(unsigned long *)optval);
5052 
5053         return(0);
5054 }       
5055 
5056 
/*
 *	TCP protocol operations vector.  This is a positional initializer:
 *	every entry must remain in the member order of struct proto (declared
 *	elsewhere, in sock.h).  NOTE(review): the per-entry comments below are
 *	inferred from the callback names - confirm against the struct proto
 *	declaration before relying on them.
 */
struct proto tcp_prot = {
	sock_wmalloc,		/* write-side buffer allocation */
	sock_rmalloc,		/* read-side buffer allocation */
	sock_wfree,		/* write-side buffer release */
	sock_rfree,		/* read-side buffer release */
	sock_rspace,		/* free space in the receive buffer */
	sock_wspace,		/* free space in the send buffer */
	tcp_close,
	tcp_read,
	tcp_write,
	tcp_sendto,
	tcp_recvfrom,
	ip_build_header,	/* TCP runs over IP */
	tcp_connect,
	tcp_accept,
	ip_queue_xmit,
	tcp_retransmit,
	tcp_write_wakeup,
	tcp_read_wakeup,
	tcp_rcv,		/* inbound segment entry point */
	tcp_select,
	tcp_ioctl,
	NULL,			/* no init hook */
	tcp_shutdown,
	tcp_setsockopt,
	tcp_getsockopt,
	128,			/* presumably max header reservation - TODO confirm */
	0,
	{NULL,},		/* presumably the per-port socket hash array - TODO confirm */
	"TCP",			/* protocol name (diagnostics) */
	0, 0
};

/* [previous][next][first][last][top][bottom][index][help] */