root/net/inet/tcp.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. min
  2. tcp_set_state
  3. tcp_select_window
  4. tcp_find_established
  5. tcp_dequeue_established
  6. tcp_close_pending
  7. tcp_time_wait
  8. tcp_do_retransmit
  9. reset_xmit_timer
  10. tcp_retransmit_time
  11. tcp_retransmit
  12. tcp_write_timeout
  13. retransmit_timer
  14. tcp_err
  15. tcp_readable
  16. tcp_listen_select
  17. tcp_select
  18. tcp_ioctl
  19. tcp_check
  20. tcp_send_check
  21. tcp_send_skb
  22. tcp_dequeue_partial
  23. tcp_send_partial
  24. tcp_enqueue_partial
  25. tcp_send_ack
  26. tcp_build_header
  27. tcp_write
  28. tcp_sendto
  29. tcp_read_wakeup
  30. cleanup_rbuf
  31. tcp_read_urg
  32. tcp_read
  33. tcp_close_state
  34. tcp_send_fin
  35. tcp_shutdown
  36. tcp_recvfrom
  37. tcp_reset
  38. tcp_options
  39. default_mask
  40. tcp_init_seq
  41. tcp_conn_request
  42. tcp_close
  43. tcp_write_xmit
  44. tcp_ack
  45. tcp_fin
  46. tcp_data
  47. tcp_check_urg
  48. tcp_urg
  49. tcp_accept
  50. tcp_connect
  51. tcp_sequence
  52. tcp_std_reset
  53. tcp_rcv
  54. tcp_write_wakeup
  55. tcp_send_probe0
  56. tcp_setsockopt
  57. tcp_getsockopt

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@no.unit.nvg>
  20  *
  21  * Fixes:       
  22  *              Alan Cox        :       Numerous verify_area() calls
  23  *              Alan Cox        :       Set the ACK bit on a reset
  24  *              Alan Cox        :       Stopped it crashing if it closed while sk->inuse=1
  25  *                                      and was trying to connect (tcp_err()).
  26  *              Alan Cox        :       All icmp error handling was broken
  27  *                                      pointers passed where wrong and the
  28  *                                      socket was looked up backwards. Nobody
  29  *                                      tested any icmp error code obviously.
  30  *              Alan Cox        :       tcp_err() now handled properly. It wakes people
  31  *                                      on errors. select behaves and the icmp error race
  32  *                                      has gone by moving it into sock.c
  33  *              Alan Cox        :       tcp_reset() fixed to work for everything not just
  34  *                                      packets for unknown sockets.
  35  *              Alan Cox        :       tcp option processing.
  36  *              Alan Cox        :       Reset tweaked (still not 100%) [Had syn rule wrong]
  37  *              Herp Rosmanith  :       More reset fixes
  38  *              Alan Cox        :       No longer acks invalid rst frames. Acking
  39  *                                      any kind of RST is right out.
  40  *              Alan Cox        :       Sets an ignore me flag on an rst receive
  41  *                                      otherwise odd bits of prattle escape still
  42  *              Alan Cox        :       Fixed another acking RST frame bug. Should stop
  43  *                                      LAN workplace lockups.
  44  *              Alan Cox        :       Some tidyups using the new skb list facilities
  45  *              Alan Cox        :       sk->keepopen now seems to work
  46  *              Alan Cox        :       Pulls options out correctly on accepts
  47  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  48  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a bit to skb ops.
  49  *              Alan Cox        :       Tidied tcp_data to avoid a potential nasty.
  50  *              Alan Cox        :       Added some better commenting, as the tcp is hard to follow
  51  *              Alan Cox        :       Removed incorrect check for 20 * psh
  52  *      Michael O'Reilly        :       ack < copied bug fix.
  53  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  54  *              Alan Cox        :       FIN with no memory -> CRASH
  55  *              Alan Cox        :       Added socket option proto entries. Also added awareness of them to accept.
  56  *              Alan Cox        :       Added TCP options (SOL_TCP)
  57  *              Alan Cox        :       Switched wakeup calls to callbacks, so the kernel can layer network sockets.
  58  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  59  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  60  *              Alan Cox        :       RST frames sent on unsynchronised state ack error/
  61  *              Alan Cox        :       Put in missing check for SYN bit.
  62  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  63  *                                      window non shrink trick.
  64  *              Alan Cox        :       Added a couple of small NET2E timer fixes
  65  *              Charles Hedrick :       TCP fixes
  66  *              Toomas Tamm     :       TCP window fixes
  67  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  68  *              Charles Hedrick :       Rewrote most of it to actually work
  69  *              Linus           :       Rewrote tcp_read() and URG handling
  70  *                                      completely
  71  *              Gerhard Koerting:       Fixed some missing timer handling
  72  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  73  *              Gerhard Koerting:       PC/TCP workarounds
  74  *              Adam Caldwell   :       Assorted timer/timing errors
  75  *              Matthew Dillon  :       Fixed another RST bug
  76  *              Alan Cox        :       Move to kernel side addressing changes.
  77  *              Alan Cox        :       Beginning work on TCP fastpathing (not yet usable)
  78  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  79  *              Alan Cox        :       TCP fast path debugging
  80  *              Alan Cox        :       Window clamping
  81  *              Michael Riepe   :       Bug in tcp_check()
  82  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  83  *              Matt Dillon     :       Yet more small nasties remove from the TCP code
  84  *                                      (Be very nice to this man if tcp finally works 100%) 8)
  85  *              Alan Cox        :       BSD accept semantics. 
  86  *              Alan Cox        :       Reset on closedown bug.
  87  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
  88  *              Michael Pall    :       Handle select() after URG properly in all cases.
  89  *              Michael Pall    :       Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin).
  90  *              Michael Pall    :       Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now.
  91  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the BSD api.
  92  *              Alan Cox        :       Changed the semantics of sk->socket to 
  93  *                                      fix a race and a signal problem with
  94  *                                      accept() and async I/O.
  95  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
  96  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
  97  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
  98  *                                      clients/servers which listen in on
  99  *                                      fixed ports.
 100  *              Alan Cox        :       Cleaned the above up and shrank it to
 101  *                                      a sensible code size.
 102  *              Alan Cox        :       Self connect lockup fix.
 103  *              Alan Cox        :       No connect to multicast.
 104  *              Ross Biro       :       Close unaccepted children on master
 105  *                                      socket close.
 106  *              Alan Cox        :       Reset tracing code.
 107  *              Alan Cox        :       Spurious resets on shutdown.
 108  *              Alan Cox        :       Giant 15 minute/60 second timer error
 109  *              Alan Cox        :       Small whoops in selecting before an accept.
 110  *              Alan Cox        :       Kept the state trace facility since its
 111  *                                      handy for debugging.
 112  *              Alan Cox        :       More reset handler fixes.
 113  *              Alan Cox        :       Started rewriting the code based on the RFC's
 114  *                                      for other useful protocol references see:  
 115  *                                      Comer, KA9Q NOS, and for a reference on the
 116  *                                      difference between specifications and how BSD
 117  *                                      works see the 4.4lite source.
 118  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 119  *                                      close.
 120  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 121  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 122  *              Alan Cox        :       Reimplemented timers as per the RFC and using multiple
 123  *                                      timers for sanity. 
 124  *              Alan Cox        :       Small bug fixes, and a lot of new
 125  *                                      comments.
 126  *              Alan Cox        :       Fixed dual reader crash by locking
 127  *                                      the buffers (much like datagram.c)
 128  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 129  *                                      now gets fed up of retrying without
 130  *                                      (even a no space) answer.
 131  *              Alan Cox        :       Extracted closing code better
 132  *              Alan Cox        :       Fixed the closing state machine to
 133  *                                      resemble the RFC.
 134  *              Alan Cox        :       More 'per spec' fixes.
 135  *
 136  *
 137  * To Fix:
 138  *              Fast path the code. Two things here - fix the window calculation
 139  *              so it doesn't iterate over the queue, also spot packets with no funny
 140  *              options arriving in order and process directly.
 141  *
 142  *              Implement RFC 1191 [Path MTU discovery]
 143  *              Look at the effect of implementing RFC 1337 suggestions and their impact.
 144  *              Rewrite output state machine to use a single queue and do low window
 145  *              situations as per the spec (RFC 1122)
 146  *              Speed up input assembly algorithm.
 147  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 148  *              could do with it working on IPv4
 149  *              User settable/learned rtt/max window/mtu
 150  *              Cope with MTU/device switches when retransmitting in tcp.
 151  *              Fix the window handling to use PR's new code.
 152  *
 153  *              Change the fundamental structure to a single send queue maintained
 154  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 155  *              active routes too]). Cut the queue off in tcp_retransmit/
 156  *              tcp_transmit.
 157  *              Change the receive queue to assemble as it goes. This lets us
 158  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 159  *              tcp_data/tcp_read as well as the window shrink crud.
 160  *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 161  *              tcp_queue_skb seem obvious routines to extract.
 162  *      
 163  *              This program is free software; you can redistribute it and/or
 164  *              modify it under the terms of the GNU General Public License
 165  *              as published by the Free Software Foundation; either version
 166  *              2 of the License, or(at your option) any later version.
 167  *
 168  * Description of States:
 169  *
 170  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 171  *
 172  *      TCP_SYN_RECV            received a connection request, sent ack,
 173  *                              waiting for final ack in three-way handshake.
 174  *
 175  *      TCP_ESTABLISHED         connection established
 176  *
 177  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 178  *                              transmission of remaining buffered data
 179  *
 180  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 181  *                              to shutdown
 182  *
 183  *      TCP_CLOSING             both sides have shutdown but we still have
 184  *                              data we have to finish sending
 185  *
 186  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 187  *                              closed, can only be entered from FIN_WAIT2
 188  *                              or CLOSING.  Required because the other end
 189  *                              may not have gotten our last ACK causing it
 190  *                              to retransmit the data packet (which we ignore)
 191  *
 192  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 193  *                              us to finish writing our data and to shutdown
 194  *                              (we have to close() to move on to LAST_ACK)
 195  *
  196  *      TCP_LAST_ACK            our side has shutdown after remote has
 197  *                              shutdown.  There may still be data in our
 198  *                              buffer that we have to finish sending
 199  *              
 200  *      TCP_CLOSE               socket is finished
 201  */
 202 
 203 #include <linux/types.h>
 204 #include <linux/sched.h>
 205 #include <linux/mm.h>
 206 #include <linux/time.h>
 207 #include <linux/string.h>
 208 #include <linux/config.h>
 209 #include <linux/socket.h>
 210 #include <linux/sockios.h>
 211 #include <linux/termios.h>
 212 #include <linux/in.h>
 213 #include <linux/fcntl.h>
 214 #include <linux/inet.h>
 215 #include <linux/netdevice.h>
 216 #include "snmp.h"
 217 #include "ip.h"
 218 #include "protocol.h"
 219 #include "icmp.h"
 220 #include "tcp.h"
 221 #include "arp.h"
 222 #include <linux/skbuff.h>
 223 #include "sock.h"
 224 #include "route.h"
 225 #include <linux/errno.h>
 226 #include <linux/timer.h>
 227 #include <asm/system.h>
 228 #include <asm/segment.h>
 229 #include <linux/mm.h>
 230 
/*
 *      The MSL timer is the 'normal' timer.
 */
 
#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)

/* NOTE(review): presumably the tick granularity for the initial sequence
   number clock (used by tcp_init_seq, not visible here) - confirm. */
#define SEQ_TICK 3
unsigned long seq_offset;
struct tcp_mib	tcp_statistics;		/* SNMP counters for TCP (see snmp.h) */

/* Forward declaration: tcp_close_pending() calls tcp_close() before its
   definition appears later in this file. */
static void tcp_close(struct sock *sk, int timeout);


/*
 *	The less said about this the better, but it works and will do for 1.2 
 */

static struct wait_queue *master_select_wakeup;
 249 
 250 static __inline__ int min(unsigned int a, unsigned int b)
     /* [previous][next][first][last][top][bottom][index][help] */
 251 {
 252         if (a < b) 
 253                 return(a);
 254         return(b);
 255 }
 256 
 257 #undef STATE_TRACE
 258 
 259 #ifdef STATE_TRACE
 260 static char *statename[]={
 261         "Unused","Established","Syn Sent","Syn Recv",
 262         "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
 263         "Close Wait","Last ACK","Listen","Closing"
 264 };
 265 #endif
 266 
/*
 *      Move a socket into a new TCP state.  Keeps the SNMP gauge of
 *      currently established connections in step, and when a three way
 *      handshake completes on an embryonic (SYN_RECV) socket wakes
 *      anyone blocked in select() on the listening master socket.
 */
static __inline__ void tcp_set_state(struct sock *sk, int state)
/* [previous][next][first][last][top][bottom][index][help] */
{
        if(sk->state==TCP_ESTABLISHED)
                tcp_statistics.TcpCurrEstab--;
#ifdef STATE_TRACE
        if(sk->debug)
                printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
#endif  
        /* This is a hack but it doesn't occur often and it's going to
           be a real pain to fix nicely */
           
        if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
        {
                wake_up_interruptible(&master_select_wakeup);
        }
        sk->state=state;
        if(state==TCP_ESTABLISHED)
                tcp_statistics.TcpCurrEstab++;
}
 286 
 287 /*
 288  *      This routine picks a TCP windows for a socket based on
 289  *      the following constraints
 290  *  
 291  *      1. The window can never be shrunk once it is offered (RFC 793)
 292  *      2. We limit memory per socket
 293  *   
 294  *      For now we use NET2E3's heuristic of offering half the memory
 295  *      we have handy. All is not as bad as this seems however because
 296  *      of two things. Firstly we will bin packets even within the window
 297  *      in order to get the data we are waiting for into the memory limit.
 298  *      Secondly we bin common duplicate forms at receive time
 299  *      Better heuristics welcome
 300  */
 301    
 302 int tcp_select_window(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 303 {
 304         int new_window = sk->prot->rspace(sk);
 305         
 306         if(sk->window_clamp)
 307                 new_window=min(sk->window_clamp,new_window);
 308         /*
 309          *      Two things are going on here.  First, we don't ever offer a
 310          *      window less than min(sk->mss, MAX_WINDOW/2).  This is the
 311          *      receiver side of SWS as specified in RFC1122.
 312          *      Second, we always give them at least the window they
 313          *      had before, in order to avoid retracting window.  This
 314          *      is technically allowed, but RFC1122 advises against it and
 315          *      in practice it causes trouble.
 316          *
 317          *      Fixme: This doesn't correctly handle the case where
 318          *      new_window > sk->window but not by enough to allow for the
 319          *      shift in sequence space. 
 320          */
 321         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
 322                 return(sk->window);
 323         return(new_window);
 324 }
 325 
 326 /*
 327  *      Find someone to 'accept'. Must be called with
 328  *      sk->inuse=1 or cli()
 329  */ 
 330 
 331 static struct sk_buff *tcp_find_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 332 {
 333         struct sk_buff *p=skb_peek(&s->receive_queue);
 334         if(p==NULL)
 335                 return NULL;
 336         do
 337         {
 338                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 339                         return p;
 340                 p=p->next;
 341         }
 342         while(p!=(struct sk_buff *)&s->receive_queue);
 343         return NULL;
 344 }
 345 
 346 /*
 347  *      Remove a completed connection and return it. This is used by
 348  *      tcp_accept() to get connections from the queue.
 349  */
 350 
 351 static struct sk_buff *tcp_dequeue_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 352 {
 353         struct sk_buff *skb;
 354         unsigned long flags;
 355         save_flags(flags);
 356         cli(); 
 357         skb=tcp_find_established(s);
 358         if(skb!=NULL)
 359                 skb_unlink(skb);        /* Take it off the queue */
 360         restore_flags(flags);
 361         return skb;
 362 }
 363 
 364 /* 
 365  *      This routine closes sockets which have been at least partially
 366  *      opened, but not yet accepted. Currently it is only called by
 367  *      tcp_close, and timeout mirrors the value there. 
 368  */
 369 
 370 static void tcp_close_pending (struct sock *sk) 
     /* [previous][next][first][last][top][bottom][index][help] */
 371 {
 372         struct sk_buff *skb;
 373 
 374         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) 
 375         {
 376                 skb->sk->dead=1;
 377                 tcp_close(skb->sk, 0);
 378                 kfree_skb(skb, FREE_READ);
 379         }
 380         return;
 381 }
 382 
 383 /*
 384  *      Enter the time wait state. 
 385  */
 386 
 387 static void tcp_time_wait(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 388 {
 389         tcp_set_state(sk,TCP_TIME_WAIT);
 390         sk->shutdown = SHUTDOWN_MASK;
 391         if (!sk->dead)
 392                 sk->state_change(sk);
 393         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 394 }
 395 
 396 /*
 397  *      A socket has timed out on its send queue and wants to do a
 398  *      little retransmitting. Currently this means TCP.
 399  */
 400 
/*
 *      Walk the retransmit queue (sk->send_head, chained via link3)
 *      and resend packets.  'all' == 0 resends only the head packet;
 *      otherwise we resend up to sk->cong_window packets.  Each packet
 *      is patched in place with a fresh IP id, the current ack and
 *      window values, and recomputed checksums before being requeued
 *      on the device.
 */
void tcp_do_retransmit(struct sock *sk, int all)
/* [previous][next][first][last][top][bottom][index][help] */
{
        struct sk_buff * skb;
        struct proto *prot;
        struct device *dev;
        int ct=0;               /* Packets retransmitted by this call */

        prot = sk->prot;
        skb = sk->send_head;

        while (skb != NULL)
        {
                struct tcphdr *th;
                struct iphdr *iph;
                int size;

                dev = skb->dev;
                IS_SKB(skb);
                skb->when = jiffies;    /* Restamp the transmit time */

                /*
                 * In general it's OK just to use the old packet.  However we
                 * need to use the current ack and window fields.  Urg and
                 * urg_ptr could possibly stand to be updated as well, but we
                 * don't keep the necessary data.  That shouldn't be a problem,
                 * if the other end is doing the right thing.  Since we're
                 * changing the packet, we have to issue a new IP identifier.
                 */

                iph = (struct iphdr *)(skb->data + dev->hard_header_len);
                th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
                size = skb->len - (((unsigned char *) th) - skb->data);
                
                /*
                 *      Note: We ought to check for window limits here but
                 *      currently this is done (less efficiently) elsewhere.
                 *      We do need to check for a route change but can't handle
                 *      that until we have the new 1.3.x buffers in.
                 *
                 */

                iph->id = htons(ip_id_count++);
                ip_send_check(iph);     /* id changed, so redo the IP header checksum */

                /*
                 *      This is not the right way to handle this. We have to
                 *      issue an up to date window and ack report with this 
                 *      retransmit to keep the odd buggy tcp that relies on 
                 *      the fact BSD does this happy. 
                 *      We don't however need to recalculate the entire 
                 *      checksum, so someone wanting a small problem to play
                 *      with might like to implement RFC1141/RFC1624 and speed
                 *      this up by avoiding a full checksum.
                 */
                 
                th->ack_seq = ntohl(sk->acked_seq);
                th->window = ntohs(tcp_select_window(sk));
                tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
                
                /*
                 *      If the interface is (still) up and running, kick it.
                 */

                if (dev->flags & IFF_UP)
                {
                        /*
                         *      If the packet is still being sent by the device/protocol
                         *      below then don't retransmit. This is both needed, and good -
                         *      especially with connected mode AX.25 where it stops resends
                         *      occurring of an as yet unsent anyway frame!
                         *      We still add up the counts as the round trip time wants
                         *      adjusting.
                         */
                        if (sk && !skb_device_locked(skb))
                        {
                                /* Remove it from any existing driver queue first! */
                                skb_unlink(skb);
                                /* Now queue it */
                                ip_statistics.IpOutRequests++;
                                dev_queue_xmit(skb, dev, sk->priority);
                        }
                }

                /*
                 *      Count retransmissions
                 */
                 
                ct++;
                sk->prot->retransmits ++;

                /*
                 *      Only one retransmit requested.
                 */
        
                if (!all)
                        break;

                /*
                 *      This should cut it off before we send too many packets.
                 */

                if (ct >= sk->cong_window)
                        break;
                skb = skb->link3;       /* Next buffer on the retransmit queue */
        }
}
 507 
 508 /*
 509  *      Reset the retransmission timer
 510  */
 511  
 512 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
     /* [previous][next][first][last][top][bottom][index][help] */
 513 {
 514         del_timer(&sk->retransmit_timer);
 515         sk->ip_xmit_timeout = why;
 516         if((int)when < 0)
 517         {
 518                 when=3;
 519                 printk("Error: Negative timer in xmit_timer\n");
 520         }
 521         sk->retransmit_timer.expires=when;
 522         add_timer(&sk->retransmit_timer);
 523 }
 524 
 525 /*
 526  *      This is the normal code called for timeouts.  It does the retransmission
 527  *      and then does backoff.  tcp_do_retransmit is separated out because
 528  *      tcp_ack needs to send stuff from the retransmit queue without
 529  *      initiating a backoff.
 530  */
 531 
 532 
/*
 *      Retransmit and then back off: resend from the retransmit queue
 *      and double the retransmission timeout (clamped at 120 seconds)
 *      before re-arming the timer.
 */
void tcp_retransmit_time(struct sock *sk, int all)
/* [previous][next][first][last][top][bottom][index][help] */
{
        tcp_do_retransmit(sk, all);

        /*
         * Increase the timeout each time we retransmit.  Note that
         * we do not increase the rtt estimate.  rto is initialized
         * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
         * that doubling rto each time is the least we can get away with.
         * In KA9Q, Karn uses this for the first few times, and then
         * goes to quadratic.  netBSD doubles, but only goes up to *64,
         * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
         * defined in the protocol as the maximum possible RTT.  I guess
         * we'll have to use something other than TCP to talk to the
         * University of Mars.
         *
         * PAWS allows us longer timeouts and large windows, so once
         * implemented ftp to mars will work nicely. We will have to fix
         * the 120 second clamps though!
         */

        sk->retransmits++;
        sk->backoff++;
        sk->rto = min(sk->rto << 1, 120*HZ);    /* Exponential backoff, 120s ceiling */
        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
}
 559 
 560 
 561 /*
  562  *      A timer event has triggered a tcp retransmit timeout. The
 563  *      socket xmit queue is ready and set up to send. Because
 564  *      the ack receive code keeps the queue straight we do
 565  *      nothing clever here.
 566  */
 567 
 568 static void tcp_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 569 {
 570         if (all) 
 571         {
 572                 tcp_retransmit_time(sk, all);
 573                 return;
 574         }
 575 
 576         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 577         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 578         sk->cong_count = 0;
 579 
 580         sk->cong_window = 1;
 581 
 582         /* Do the actual retransmit. */
 583         tcp_retransmit_time(sk, all);
 584 }
 585 
 586 /*
 587  *      A write timeout has occurred. Process the after effects.
 588  */
 589 
/*
 *      A write timeout has occurred. Process the after effects.
 *
 *      Returns 0 when the socket has been moved to TCP_CLOSE, and 1
 *      while the connection is still considered alive.
 */
static int tcp_write_timeout(struct sock *sk)
/* [previous][next][first][last][top][bottom][index][help] */
{
        /*
         *      Look for a 'soft' timeout: every 8th retransmit on an
         *      established connection, or more than TCP_RETR1 attempts
         *      while not established.
         */
        if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
                || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
        {
                /*
                 *      Attempt to recover if arp has changed (unlikely!) or
                 *      a route has shifted (not supported prior to 1.3).
                 */
                arp_destroy (sk->daddr, 0);
                ip_route_check (sk->daddr);
        }
        /*
         *      Has it gone just too far ?
         */
        if (sk->retransmits > TCP_RETR2) 
        {
                sk->err = ETIMEDOUT;
                sk->error_report(sk);
                del_timer(&sk->retransmit_timer);
                /*
                 *      Time wait the socket: a closing endpoint that has
                 *      timed out still gets the TIME_WAIT grace period.
                 */
                if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) 
                {
                        tcp_set_state(sk,TCP_TIME_WAIT);
                        reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                }
                else
                {
                        /*
                         *      Clean up time: everything else goes straight
                         *      to CLOSE.
                         */
                        tcp_set_state(sk, TCP_CLOSE);
                        return 0;
                }
        }
        return 1;
}
 632 
 633 /*
 634  *      The TCP retransmit timer. This lacks a few small details.
 635  *
 636  *      1.      An initial rtt timeout on the probe0 should cause what we can
 637  *              of the first write queue buffer to be split and sent.
 638  *      2.      On a 'major timeout' as defined by RFC1122 we shouldn't report
 639  *              ETIMEDOUT if we know an additional 'soft' error caused this.
 640  *              tcp_err should save a 'soft error' for us.
 641  */
 642 
static void retransmit_timer(unsigned long data)
/* [previous][next][first][last][top][bottom][index][help] */
{
        struct sock *sk = (struct sock*)data;
        int why = sk->ip_xmit_timeout;  /* reason this socket's timer was armed */

        /* 
         * only process if socket is not in use
         */

        cli();
        if (sk->inuse || in_bh) 
        {
                /* Try again in 1 second */
                /* NOTE(review): expires looks like a relative delay here -
                 * confirm against this kernel's add_timer() semantics. */
                sk->retransmit_timer.expires = HZ;
                add_timer(&sk->retransmit_timer);
                sti();
                return;
        }

        sk->inuse = 1;
        sti();

        /* Always see if we need to send an ack. */

        if (sk->ack_backlog && !sk->zapped) 
        {
                sk->prot->read_wakeup (sk);
                if (! sk->dead)
                        sk->data_ready(sk,0);
        }

        /* Now we need to figure out why the socket was on the timer. */

        switch (why) 
        {
                /* Window probing */
                case TIME_PROBE0:
                        tcp_send_probe0(sk);
                        tcp_write_timeout(sk);
                        break;
                /* Retransmitting */
                case TIME_WRITE:
                        /* It could be we got here because we needed to send an ack.
                         * So we need to check for that.
                         */
                {
                        struct sk_buff *skb;
                        unsigned long flags;

                        /* Examine send_head with interrupts off: the ack
                         * processing path can free it under us otherwise. */
                        save_flags(flags);
                        cli();
                        skb = sk->send_head;
                        if (!skb) 
                        {
                                restore_flags(flags);
                        } 
                        else 
                        {
                                /*
                                 *      Kicked by a delayed ack. Reset timer
                                 *      correctly now
                                 */
                                if (jiffies < skb->when + sk->rto) 
                                {
                                        reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
                                        restore_flags(flags);
                                        break;
                                }
                                restore_flags(flags);
                                /*
                                 *      Retransmission
                                 */
                                sk->prot->retransmit (sk, 0);
                                tcp_write_timeout(sk);
                        }
                        break;
                }
                /* Sending Keepalives */
                case TIME_KEEPOPEN:
                        /* 
                         * this reset_timer() call is a hack, this is not
                         * how KEEPOPEN is supposed to work.
                         */
                        reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

                        /* Send something to keep the connection open. */
                        if (sk->prot->write_wakeup)
                                  sk->prot->write_wakeup (sk);
                        sk->retransmits++;
                        tcp_write_timeout(sk);
                        break;
                default:
                        printk ("rexmit_timer: timer expired - reason unknown\n");
                        break;
        }
        release_sock(sk);
}
 740 
 741 /*
 742  * This routine is called by the ICMP module when it gets some
 743  * sort of error condition.  If err < 0 then the socket should
 744  * be closed and the error returned to the user.  If err > 0
 745  * it's just the icmp type << 8 | icmp code.  After adjustment
 746  * header points to the first 8 bytes of the tcp header.  We need
 747  * to find the appropriate port.
 748  */
 749 
 750 void tcp_err(int err, unsigned char *header, unsigned long daddr,
     /* [previous][next][first][last][top][bottom][index][help] */
 751         unsigned long saddr, struct inet_protocol *protocol)
 752 {
 753         struct tcphdr *th;
 754         struct sock *sk;
 755         struct iphdr *iph=(struct iphdr *)header;
 756   
 757         header+=4*iph->ihl;
 758    
 759 
 760         th =(struct tcphdr *)header;
 761         sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
 762 
 763         if (sk == NULL) 
 764                 return;
 765   
 766         if(err<0)
 767         {
 768                 sk->err = -err;
 769                 sk->error_report(sk);
 770                 return;
 771         }
 772 
 773         if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8)) 
 774         {
 775                 /*
 776                  * FIXME:
 777                  * For now we will just trigger a linear backoff.
 778                  * The slow start code should cause a real backoff here.
 779                  */
 780                 if (sk->cong_window > 4)
 781                         sk->cong_window--;
 782                 return;
 783         }
 784 
 785 /*      sk->err = icmp_err_convert[err & 0xff].errno;  -- moved as TCP should hide non fatals internally (and does) */
 786 
 787         /*
 788          * If we've already connected we will keep trying
 789          * until we time out, or the user gives up.
 790          */
 791 
 792         if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT) 
 793         {
 794                 if (sk->state == TCP_SYN_SENT) 
 795                 {
 796                         tcp_statistics.TcpAttemptFails++;
 797                         tcp_set_state(sk,TCP_CLOSE);
 798                         sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
 799                 }
 800                 sk->err = icmp_err_convert[err & 0xff].errno;           
 801         }
 802         return;
 803 }
 804 
 805 
 806 /*
 807  *      Walk down the receive queue counting readable data until we hit the end or we find a gap
 808  *      in the received data queue (ie a frame missing that needs sending to us). Not
 809  *      sorting using two queues as data arrives makes life so much harder.
 810  */
 811 
static int tcp_readable(struct sock *sk)
/* [previous][next][first][last][top][bottom][index][help] */
{
        unsigned long counted;          /* sequence number counted up to so far */
        unsigned long amount;           /* readable bytes found so far */
        struct sk_buff *skb;
        int sum;
        unsigned long flags;

        if(sk && sk->debug)
                printk("tcp_readable: %p - ",sk);

        /* Walk the queue with interrupts off so it cannot change under us. */
        save_flags(flags);
        cli();
        if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
        {
                restore_flags(flags);
                if(sk && sk->debug) 
                        printk("empty\n");
                return(0);
        }
  
        counted = sk->copied_seq;       /* Where we are at the moment */
        amount = 0;
  
        /* 
         *      Do until a push or until we are out of data. 
         */
         
        do 
        {
                if (before(counted, skb->h.th->seq))    /* Found a hole so stops here */
                        break;
                sum = skb->len -(counted - skb->h.th->seq);     /* Length - header but start from where we are up to (avoid overlaps) */
                if (skb->h.th->syn)
                        sum++;          /* SYN consumes a sequence number but carries no user data */
                if (sum > 0) 
                {                                       /* Add it up, move on */
                        amount += sum;
                        if (skb->h.th->syn) 
                                amount--;       /* ...so don't report the SYN as readable */
                        counted += sum;
                }
                /*
                 * Don't count urg data ... but do it in the right place!
                 * Consider: "old_data (ptr is here) URG PUSH data"
                 * The old code would stop at the first push because
                 * it counted the urg (amount==1) and then does amount--
                 * *after* the loop.  This means tcp_readable() always
                 * returned zero if any URG PUSH was in the queue, even
                 * though there was normal data available. If we subtract
                 * the urg data right here, we even get it to work for more
                 * than one URG PUSH skb without normal data.
                 * This means that select() finally works now with urg data
                 * in the queue.  Note that rlogin was never affected
                 * because it doesn't use select(); it uses two processes
                 * and a blocking read().  And the queue scan in tcp_read()
                 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
                 */
                if (skb->h.th->urg)
                        amount--;       /* don't count urg data */
                if (amount && skb->h.th->psh) break;
                skb = skb->next;
        }
        while(skb != (struct sk_buff *)&sk->receive_queue);

        restore_flags(flags);
        if(sk->debug)
                printk("got %lu bytes.\n",amount);
        return(amount);
}
 882 
 883 /*
 884  * LISTEN is a special case for select..
 885  */
 886 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
 887 {
 888         if (sel_type == SEL_IN) {
 889                 int retval;
 890 
 891                 sk->inuse = 1;
 892                 retval = (tcp_find_established(sk) != NULL);
 893                 release_sock(sk);
 894                 if (!retval)
 895                         select_wait(&master_select_wakeup,wait);
 896                 return retval;
 897         }
 898         return 0;
 899 }
 900 
 901 
 902 /*
 903  *      Wait for a TCP event.
 904  *
 905  *      Note that we don't need to set "sk->inuse", as the upper select layers
 906  *      take care of normal races (between the test and the event) and we don't
 907  *      go look at any of the socket buffers directly.
 908  */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
/* [previous][next][first][last][top][bottom][index][help] */
{
        if (sk->state == TCP_LISTEN)
                return tcp_listen_select(sk, sel_type, wait);

        switch(sel_type) {
        case SEL_IN:
                /* A pending error always shows as readable. */
                if (sk->err)
                        return 1;
                /* Not readable while the handshake is still in progress. */
                if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
                        break;

                /* A shut down receive side reads EOF immediately. */
                if (sk->shutdown & RCV_SHUTDOWN)
                        return 1;
                        
                /* Nothing acked beyond what the user has already copied. */
                if (sk->acked_seq == sk->copied_seq)
                        break;

                /*
                 * Data is pending: readable unless the only unread byte
                 * is out-of-band urgent data at the mark (and urgent data
                 * is not being delivered inline).
                 */
                if (sk->urg_seq != sk->copied_seq ||
                    sk->acked_seq != sk->copied_seq+1 ||
                    sk->urginline || !sk->urg_data)
                        return 1;
                break;

        case SEL_OUT:
                if (sk->shutdown & SEND_SHUTDOWN) 
                        return 0;
                if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
                        break;
                /*
                 * This is now right thanks to a small fix
                 * by Matt Dillon.
                 */

                /* Writable only when a whole MTU (plus slack) would fit. */
                if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
                        break;
                return 1;

        case SEL_EX:
                /* Exceptional condition: error or urgent data pending. */
                if (sk->err || sk->urg_data)
                        return 1;
                break;
        }
        /* Nothing ready - register for a wakeup on this socket. */
        select_wait(sk->sleep, wait);
        return 0;
}
 955 
 956 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
     /* [previous][next][first][last][top][bottom][index][help] */
 957 {
 958         int err;
 959         switch(cmd) 
 960         {
 961 
 962                 case TIOCINQ:
 963 #ifdef FIXME    /* FIXME: */
 964                 case FIONREAD:
 965 #endif
 966                 {
 967                         unsigned long amount;
 968 
 969                         if (sk->state == TCP_LISTEN) 
 970                                 return(-EINVAL);
 971 
 972                         sk->inuse = 1;
 973                         amount = tcp_readable(sk);
 974                         release_sock(sk);
 975                         err=verify_area(VERIFY_WRITE,(void *)arg,
 976                                                    sizeof(unsigned long));
 977                         if(err)
 978                                 return err;
 979                         put_fs_long(amount,(unsigned long *)arg);
 980                         return(0);
 981                 }
 982                 case SIOCATMARK:
 983                 {
 984                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
 985 
 986                         err = verify_area(VERIFY_WRITE,(void *) arg,
 987                                                   sizeof(unsigned long));
 988                         if (err)
 989                                 return err;
 990                         put_fs_long(answ,(int *) arg);
 991                         return(0);
 992                 }
 993                 case TIOCOUTQ:
 994                 {
 995                         unsigned long amount;
 996 
 997                         if (sk->state == TCP_LISTEN) return(-EINVAL);
 998                         amount = sk->prot->wspace(sk);
 999                         err=verify_area(VERIFY_WRITE,(void *)arg,
1000                                                    sizeof(unsigned long));
1001                         if(err)
1002                                 return err;
1003                         put_fs_long(amount,(unsigned long *)arg);
1004                         return(0);
1005                 }
1006                 default:
1007                         return(-EINVAL);
1008         }
1009 }
1010 
1011 
1012 /*
1013  *      This routine computes a TCP checksum. 
1014  */
1015  
unsigned short tcp_check(struct tcphdr *th, int len,
/* [previous][next][first][last][top][bottom][index][help] */
          unsigned long saddr, unsigned long daddr)
{     
        unsigned long sum;
   
        if (saddr == 0) saddr = ip_my_addr();

/*
 * stupid, gcc complains when I use just one __asm__ block,
 * something about too many reloads, but this is just two
 * instructions longer than what I want
 */
        /*
         * Fold the TCP pseudo header (source address, destination address,
         * protocol and segment length) into the 32 bit running sum.
         * i386 only: multi-line asm string literals require an old gcc.
         */
        __asm__("
            addl %%ecx, %%ebx
            adcl %%edx, %%ebx
            adcl $0, %%ebx
            "
        : "=b"(sum)
        : "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
        : "bx", "cx", "dx" );
        /*
         * Sum the TCP header and data: 32 bytes per loop iteration, then
         * the remaining dwords, then a trailing word and/or byte, and
         * finally fold the 32 bit sum into 16 bits with end-around carry.
         */
        __asm__("
            movl %%ecx, %%edx
            cld
            cmpl $32, %%ecx
            jb 2f
            shrl $5, %%ecx
            clc
1:          lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            loop 1b
            adcl $0, %%ebx
            movl %%edx, %%ecx
2:          andl $28, %%ecx
            je 4f
            shrl $2, %%ecx
            clc
3:          lodsl
            adcl %%eax, %%ebx
            loop 3b
            adcl $0, %%ebx
4:          movl $0, %%eax
            testw $2, %%dx
            je 5f
            lodsw
            addl %%eax, %%ebx
            adcl $0, %%ebx
            movw $0, %%ax
5:          test $1, %%edx
            je 6f
            lodsb
            addl %%eax, %%ebx
            adcl $0, %%ebx
6:          movl %%ebx, %%eax
            shrl $16, %%eax
            addw %%ax, %%bx
            adcw $0, %%bx
            "
        : "=b"(sum)
        : "0"(sum), "c"(len), "S"(th)
        : "ax", "bx", "cx", "dx", "si" );

        /* We only want the bottom 16 bits, but we never cleared the top 16. */
  
        return((~sum) & 0xffff);
}
1095 
1096 
1097 
1098 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
     /* [previous][next][first][last][top][bottom][index][help] */
1099                 unsigned long daddr, int len, struct sock *sk)
1100 {
1101         th->check = 0;
1102         th->check = tcp_check(th, len, saddr, daddr);
1103         return;
1104 }
1105 
1106 /*
1107  *      This is the main buffer sending routine. We queue the buffer
1108  *      having checked it is sane seeming.
1109  */
1110  
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
/* [previous][next][first][last][top][bottom][index][help] */
{
        int size;
        struct tcphdr * th = skb->h.th;

        /*
         *      length of packet (not counting length of pre-tcp headers) 
         */
         
        size = skb->len - ((unsigned char *) th - skb->data);

        /*
         *      Sanity check it.. 
         */
         
        if (size < sizeof(struct tcphdr) || size > skb->len) 
        {
                printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
                        skb, skb->data, th, skb->len);
                kfree_skb(skb, FREE_WRITE);
                return;
        }

        /*
         *      If we have queued a header size packet.. (these crash a few
         *      tcp stacks if ack is not set)
         */
         
        if (size == sizeof(struct tcphdr)) 
        {
                /* If its got a syn or fin its notionally included in the size..*/
                if(!th->syn && !th->fin) 
                {
                        printk("tcp_send_skb: attempt to queue a bogon.\n");
                        kfree_skb(skb,FREE_WRITE);
                        return;
                }
        }

        /*
         *      Actual processing.
         */
         
        tcp_statistics.TcpOutSegs++;  
        /* End sequence of this frame: start seq plus data length
         * (size minus the TCP header of doff 32-bit words). */
        skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
        
        /*
         *      We must queue if
         *
         *      a) The right edge of this frame exceeds the window
         *      b) We are retransmitting (Nagle's rule)
         *      c) We have too many packets 'in flight'
         */
         
        if (after(skb->h.seq, sk->window_seq) ||
            (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
             sk->packets_out >= sk->cong_window) 
        {
                /* checksum will be supplied by tcp_write_xmit.  So
                 * we shouldn't need to set it at all.  I'm being paranoid */
                th->check = 0;
                if (skb->next != NULL) 
                {
                        printk("tcp_send_partial: next != NULL\n");
                        skb_unlink(skb);
                }
                skb_queue_tail(&sk->write_queue, skb);
                
                /*
                 *      If we don't fit we have to start the zero window
                 *      probes. This is broken - we really need to do a partial
                 *      send _first_ (This is what causes the Cisco and PC/TCP
                 *      grief).
                 */
                 
                if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
                    sk->send_head == NULL && sk->ack_backlog == 0)
                        reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
        } 
        else 
        {
                /*
                 *      This is going straight out
                 */
                
                /* Piggy-back the current ack and window on the frame. */
                th->ack_seq = ntohl(sk->acked_seq);
                th->window = ntohs(tcp_select_window(sk));

                tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

                /* Everything written so far is now on the wire. */
                sk->sent_seq = sk->write_seq;
                
                /*
                 *      This is mad. The tcp retransmit queue is put together
                 *      by the ip layer. This causes half the problems with
                 *      unroutable FIN's and other things.
                 */
                 
                sk->prot->queue_xmit(sk, skb->dev, skb, 0);
                
                /*
                 *      Set for next retransmit based on expected ACK time.
                 *      FIXME: We set this every time which means our 
                 *      retransmits are really about a window behind.
                 */

                reset_xmit_timer(sk, TIME_WRITE, sk->rto);
        }
}
1220 
1221 /*
1222  *      Locking problems lead us to a messy situation where we can have
1223  *      multiple partially complete buffers queued up. This is really bad
1224  *      as we don't want to be sending partial buffers. Fix this with
1225  *      a semaphore or similar to lock tcp_write per socket.
1226  *
1227  *      These routines are pretty self descriptive.
1228  */
1229  
1230 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1231 {
1232         struct sk_buff * skb;
1233         unsigned long flags;
1234 
1235         save_flags(flags);
1236         cli();
1237         skb = sk->partial;
1238         if (skb) {
1239                 sk->partial = NULL;
1240                 del_timer(&sk->partial_timer);
1241         }
1242         restore_flags(flags);
1243         return skb;
1244 }
1245 
1246 /*
1247  *      Empty the partial queue
1248  */
1249  
1250 static void tcp_send_partial(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1251 {
1252         struct sk_buff *skb;
1253 
1254         if (sk == NULL)
1255                 return;
1256         while ((skb = tcp_dequeue_partial(sk)) != NULL)
1257                 tcp_send_skb(sk, skb);
1258 }
1259 
1260 /*
1261  *      Queue a partial frame
1262  */
1263  
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
/* [previous][next][first][last][top][bottom][index][help] */
{
        struct sk_buff * tmp;
        unsigned long flags;

        /*
         *      Install skb as the socket's pending partial frame. Any frame
         *      that was already queued is flushed out below, after interrupts
         *      have been restored.
         */
        save_flags(flags);
        cli();
        tmp = sk->partial;
        if (tmp)
                del_timer(&sk->partial_timer);
        sk->partial = skb;
        init_timer(&sk->partial_timer);
        /*
         *      Wait up to 1 second for the buffer to fill.
         */
        /* NOTE(review): expires appears to be a relative delay here -
         * confirm against this kernel's add_timer() semantics. */
        sk->partial_timer.expires = HZ;
        sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
        sk->partial_timer.data = (unsigned long) sk;
        add_timer(&sk->partial_timer);
        restore_flags(flags);
        if (tmp)
                tcp_send_skb(sk, tmp);
}
1287 
1288 
1289 /*
1290  *      This routine sends an ack and also updates the window. 
1291  */
1292  
static void tcp_send_ack(unsigned long sequence, unsigned long ack,
/* [previous][next][first][last][top][bottom][index][help] */
             struct sock *sk,
             struct tcphdr *th, unsigned long daddr)
{
        /*
         *      Build and transmit a bare ACK: 'sequence' becomes the sequence
         *      number, 'ack' the acknowledgement, and the ports are taken by
         *      swapping those of the received header 'th'.
         */
        struct sk_buff *buff;
        struct tcphdr *t1;
        struct device *dev = NULL;
        int tmp;

        if(sk->zapped)
                return;         /* We have been reset, we may not send again */
                
        /*
         * We need to grab some memory, and put together an ack,
         * and then put it into the queue to be sent.
         */

        buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
        if (buff == NULL) 
        {
                /* 
                 *      Force it to send an ack. We don't have to do this
                 *      (ACK is unreliable) but its much better use of 
                 *      bandwidth on slow links to send a spare ack than
                 *      resend packets. 
                 */
                 
                sk->ack_backlog++;
                if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) 
                {
                        reset_xmit_timer(sk, TIME_WRITE, HZ);
                }
                return;
        }

        /*
         *      Assemble a suitable TCP frame
         */
         
        buff->len = sizeof(struct tcphdr);
        buff->sk = sk;
        buff->localroute = sk->localroute;
        t1 =(struct tcphdr *) buff->data;

        /* 
         *      Put in the IP header and routing stuff. 
         */
         
        tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                /* Header build failed: release the buffer and give up. */
                buff->free = 1;
                sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
                return;
        }
        buff->len += tmp;
        t1 =(struct tcphdr *)((char *)t1 +tmp);

        memcpy(t1, th, sizeof(*t1));

        /*
         *      Swap the send and the receive. 
         */
         
        t1->dest = th->source;
        t1->source = th->dest;
        t1->seq = ntohl(sequence);
        t1->ack = 1;
        sk->window = tcp_select_window(sk);
        t1->window = ntohs(sk->window);
        t1->res1 = 0;
        t1->res2 = 0;
        t1->rst = 0;
        t1->urg = 0;
        t1->syn = 0;
        t1->psh = 0;
        t1->fin = 0;
        
        /*
         *      If we have nothing queued for transmit and the transmit timer
         *      is on we are just doing an ACK timeout and need to switch
         *      to a keepalive.
         */
         
        if (ack == sk->acked_seq) 
        {
                /* This ack covers everything received: clear the backlog. */
                sk->ack_backlog = 0;
                sk->bytes_rcv = 0;
                sk->ack_timed = 0;
                if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
                                  && sk->ip_xmit_timeout == TIME_WRITE) 
                {
                        if(sk->keepopen) {
                                reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
                        } else {
                                delete_timer(sk);
                        }
                }
        }
        
        /*
         *      Fill in the packet and send it
         */
         
        t1->ack_seq = ntohl(ack);
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
        if (sk->debug)
                 printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
        tcp_statistics.TcpOutSegs++;
        sk->prot->queue_xmit(sk, dev, buff, 1);
}
1406 
1407 
1408 /* 
1409  *      This routine builds a generic TCP header. 
1410  */
1411  
1412 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
     /* [previous][next][first][last][top][bottom][index][help] */
1413 {
1414 
1415         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1416         th->seq = htonl(sk->write_seq);
1417         th->psh =(push == 0) ? 1 : 0;
1418         th->doff = sizeof(*th)/4;
1419         th->ack = 1;
1420         th->fin = 0;
1421         sk->ack_backlog = 0;
1422         sk->bytes_rcv = 0;
1423         sk->ack_timed = 0;
1424         th->ack_seq = htonl(sk->acked_seq);
1425         sk->window = tcp_select_window(sk);
1426         th->window = htons(sk->window);
1427 
1428         return(sizeof(*th));
1429 }
1430 
/*
 *      This routine copies from a user buffer into a socket,
 *      and starts the transmit system.
 *
 *      sk       - the socket to send on.  We take the socket lock
 *                 (sk->inuse) ourselves and always release it before
 *                 sleeping or returning.
 *      from     - user space buffer; copied with memcpy_fromfs().
 *      len      - number of bytes the caller wants sent.
 *      nonblock - if non-zero, return -EAGAIN instead of sleeping.
 *      flags    - MSG_OOB and/or MSG_DONTROUTE.
 *
 *      Returns the number of bytes queued for transmission, or a
 *      negative errno if nothing at all could be queued.  Once some
 *      data has been accepted we always report that count and defer
 *      any error to the next call.
 */

static int tcp_write(struct sock *sk, unsigned char *from,
          int len, int nonblock, unsigned flags)
{
        int copied = 0;                 /* bytes accepted so far */
        int copy;                       /* bytes taken this iteration */
        int tmp;
        struct sk_buff *skb;
        struct sk_buff *send_tmp;       /* newly built sub-mss buffer, may be queued partial */
        unsigned char *buff;
        struct proto *prot;
        struct device *dev = NULL;

        sk->inuse=1;                    /* lock the socket against the bottom half */
        prot = sk->prot;
        while(len > 0) 
        {
                if (sk->err) 
                {                       /* Stop on an error */
                        release_sock(sk);
                        if (copied) 
                                return(copied);
                        tmp = -sk->err;
                        sk->err = 0;
                        return(tmp);
                }

                /*
                 *      First thing we do is make sure that we are established. 
                 */
        
                if (sk->shutdown & SEND_SHUTDOWN) 
                {
                        release_sock(sk);
                        sk->err = EPIPE;
                        /* NOTE(review): if data was already copied we return the
                         * count and deliberately leave sk->err = EPIPE set, so the
                         * next write reports the broken pipe. */
                        if (copied) 
                                return(copied);
                        sk->err = 0;
                        return(-EPIPE);
                }

                /* 
                 *      Wait for a connection to finish.
                 */
        
                while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
                {
                        if (sk->err) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);
                                tmp = -sk->err;
                                sk->err = 0;
                                return(tmp);
                        }

                        /* Not connecting either: the connection is gone */
                        if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);

                                if (sk->err) 
                                {
                                        tmp = -sk->err;
                                        sk->err = 0;
                                        return(tmp);
                                }

                                /* Writing on a dead connection raises SIGPIPE
                                 * (here gated on keepopen - TODO confirm intent) */
                                if (sk->keepopen) 
                                {
                                        send_sig(SIGPIPE, current, 0);
                                }
                                return(-EPIPE);
                        }

                        if (nonblock || copied) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);
                                return(-EAGAIN);
                        }

                        /*
                         *      Sleep until the handshake completes.  Interrupts are
                         *      disabled across the re-check so a state change cannot
                         *      slip in between the test and the sleep.
                         */
                        release_sock(sk);
                        cli();
                
                        if (sk->state != TCP_ESTABLISHED &&
                                sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
                        {
                                interruptible_sleep_on(sk->sleep);
                                if (current->signal & ~current->blocked) 
                                {
                                        sti();
                                        if (copied) 
                                                return(copied);
                                        return(-ERESTARTSYS);
                                }
                        }
                        sk->inuse = 1;
                        sti();
                }

        /*
         * The following code can result in copy <= if sk->mss is ever
         * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
         * sk->mtu is constant once SYN processing is finished.  I.e. we
         * had better not get here until we've seen his SYN and at least one
         * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
         * But ESTABLISHED should guarantee that.  sk->max_window is by definition
         * non-decreasing.  Note that any ioctl to set user_mss must be done
         * before the exchange of SYN's.  If the initial ack from the other
         * end has a window of 0, max_window and thus mss will both be 0.
         */

        /* 
         *      Now we need to check if we have a half built packet. 
         */

                if ((skb = tcp_dequeue_partial(sk)) != NULL) 
                {
                        int hdrlen;

                         /* IP header + TCP header */
                        hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
                                 + sizeof(struct tcphdr);
        
                        /* Add more stuff to the end of skb->len */
                        if (!(flags & MSG_OOB)) 
                        {
                                copy = min(sk->mss - (skb->len - hdrlen), len);
                                /* FIXME: this is really a bug. */
                                if (copy <= 0) 
                                {
                                        printk("TCP: **bug**: \"copy\" <= 0!!\n");
                                        copy = 0;
                                }
          
                                memcpy_fromfs(skb->data + skb->len, from, copy);
                                skb->len += copy;
                                from += copy;
                                copied += copy;
                                len -= copy;
                                sk->write_seq += copy;
                        }
                        /* Send it now if it filled up to a whole mss, is urgent,
                         * or nothing is in flight; otherwise keep accumulating. */
                        if ((skb->len - hdrlen) >= sk->mss ||
                                (flags & MSG_OOB) || !sk->packets_out)
                                tcp_send_skb(sk, skb);
                        else
                                tcp_enqueue_partial(skb, sk);
                        continue;
                }

        /*
         * We also need to worry about the window.
         * If window < 1/2 the maximum window we've seen from this
         *   host, don't use it.  This is sender side
         *   silly window prevention, as specified in RFC1122.
         *   (Note that this is different than earlier versions of
         *   SWS prevention, e.g. RFC813.).  What we actually do is 
         *   use the whole MSS.  Since the results in the right
         *   edge of the packet being outside the window, it will
         *   be queued for later rather than sent.
         */

                copy = sk->window_seq - sk->write_seq;
                if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
                        copy = sk->mss;
                if (copy > len)
                        copy = len;

        /*
         *      We should really check the window here also. 
         */
         
                send_tmp = NULL;
                if (copy < sk->mss && !(flags & MSG_OOB)) 
                {
                        /*
                         *      We will release the socket incase we sleep here. 
                         */
                        release_sock(sk);
                        /*
                         *      NB: following must be mtu, because mss can be increased.
                         *      mss is always <= mtu 
                         */
                        skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
                        sk->inuse = 1;
                        send_tmp = skb;
                } 
                else 
                {
                        /*
                         *      We will release the socket incase we sleep here. 
                         */
                        release_sock(sk);
                        skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
                        sk->inuse = 1;
                }

                /*
                 *      If we didn't get any memory, we need to sleep. 
                 */

                if (skb == NULL) 
                {
                        sk->socket->flags |= SO_NOSPACE;
                        if (nonblock) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);
                                return(-EAGAIN);
                        }

                        /*
                         *      FIXME: here is another race condition. 
                         */

                        tmp = sk->wmem_alloc;
                        release_sock(sk);
                        cli();
                        /*
                         *      Again we will try to avoid it: only sleep if no write
                         *      memory has been freed since we sampled wmem_alloc and
                         *      the connection is still usable.
                         */
                        if (tmp <= sk->wmem_alloc &&
                                  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
                                && sk->err == 0) 
                        {
                                sk->socket->flags &= ~SO_NOSPACE;
                                interruptible_sleep_on(sk->sleep);
                                if (current->signal & ~current->blocked) 
                                {
                                        sti();
                                        if (copied) 
                                                return(copied);
                                        return(-ERESTARTSYS);
                                }
                        }
                        sk->inuse = 1;
                        sti();
                        continue;
                }

                skb->len = 0;
                skb->sk = sk;
                skb->free = 0;
                skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
        
                buff = skb->data;
        
                /*
                 * FIXME: we need to optimize this.
                 * Perhaps some hints here would be good.
                 */
                
                tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
                                 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
                if (tmp < 0 ) 
                {
                        prot->wfree(sk, skb->mem_addr, skb->mem_len);
                        release_sock(sk);
                        if (copied) 
                                return(copied);
                        return(tmp);
                }
                skb->len += tmp;
                skb->dev = dev;
                buff += tmp;
                skb->h.th =(struct tcphdr *) buff;
                tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
                if (tmp < 0) 
                {
                        prot->wfree(sk, skb->mem_addr, skb->mem_len);
                        release_sock(sk);
                        if (copied) 
                                return(copied);
                        return(tmp);
                }

                if (flags & MSG_OOB) 
                {
                        ((struct tcphdr *)buff)->urg = 1;
                        /* NOTE(review): ntohs here is the same byte swap as the
                         * intended htons, so this works, but htons would state
                         * the intent correctly. */
                        ((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
                }
                skb->len += tmp;
                memcpy_fromfs(buff+tmp, from, copy);

                from += copy;
                copied += copy;
                len -= copy;
                skb->len += copy;
                skb->free = 0;
                sk->write_seq += copy;
        
                /* Sub-mss buffer with data in flight: hold it back as the
                 * partial packet (Nagle) rather than sending immediately. */
                if (send_tmp != NULL && sk->packets_out) 
                {
                        tcp_enqueue_partial(send_tmp, sk);
                        continue;
                }
                tcp_send_skb(sk, skb);
        }
        sk->err = 0;

/*
 *      Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *      interactive fast network servers. It's meant to be on and
 *      it really improves the throughput though not the echo time
 *      on my slow slip link - Alan
 */

/*
 *      Avoid possible race on send_tmp - c/o Johannes Stille 
 */
 
        if(sk->partial && ((!sk->packets_out) 
     /* If not nagling we can send on the before case too.. */
              || (sk->nonagle && before(sk->write_seq , sk->window_seq))
        ))
                tcp_send_partial(sk);

        release_sock(sk);
        return(copied);
}
1760 
1761 /*
1762  *      This is just a wrapper. 
1763  */
1764 
1765 static int tcp_sendto(struct sock *sk, unsigned char *from,
     /* [previous][next][first][last][top][bottom][index][help] */
1766            int len, int nonblock, unsigned flags,
1767            struct sockaddr_in *addr, int addr_len)
1768 {
1769         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1770                 return -EINVAL;
1771         if (sk->state == TCP_CLOSE)
1772                 return -ENOTCONN;
1773         if (addr_len < sizeof(*addr))
1774                 return -EINVAL;
1775         if (addr->sin_family && addr->sin_family != AF_INET) 
1776                 return -EINVAL;
1777         if (addr->sin_port != sk->dummy_th.dest) 
1778                 return -EISCONN;
1779         if (addr->sin_addr.s_addr != sk->daddr) 
1780                 return -EISCONN;
1781         return tcp_write(sk, from, len, nonblock, flags);
1782 }
1783 
1784 
1785 /*
1786  *      Send an ack if one is backlogged at this point. Ought to merge
1787  *      this with tcp_send_ack().
1788  */
1789  
1790 static void tcp_read_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1791 {
1792         int tmp;
1793         struct device *dev = NULL;
1794         struct tcphdr *t1;
1795         struct sk_buff *buff;
1796 
1797         if (!sk->ack_backlog) 
1798                 return;
1799 
1800         /*
1801          * FIXME: we need to put code here to prevent this routine from
1802          * being called.  Being called once in a while is ok, so only check
1803          * if this is the second time in a row.
1804          */
1805 
1806         /*
1807          * We need to grab some memory, and put together an ack,
1808          * and then put it into the queue to be sent.
1809          */
1810 
1811         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
1812         if (buff == NULL) 
1813         {
1814                 /* Try again real soon. */
1815                 reset_xmit_timer(sk, TIME_WRITE, HZ);
1816                 return;
1817         }
1818 
1819         buff->len = sizeof(struct tcphdr);
1820         buff->sk = sk;
1821         buff->localroute = sk->localroute;
1822         
1823         /*
1824          *      Put in the IP header and routing stuff. 
1825          */
1826 
1827         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1828                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1829         if (tmp < 0) 
1830         {
1831                 buff->free = 1;
1832                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1833                 return;
1834         }
1835 
1836         buff->len += tmp;
1837         t1 =(struct tcphdr *)(buff->data +tmp);
1838 
1839         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
1840         t1->seq = htonl(sk->sent_seq);
1841         t1->ack = 1;
1842         t1->res1 = 0;
1843         t1->res2 = 0;
1844         t1->rst = 0;
1845         t1->urg = 0;
1846         t1->syn = 0;
1847         t1->psh = 0;
1848         sk->ack_backlog = 0;
1849         sk->bytes_rcv = 0;
1850         sk->window = tcp_select_window(sk);
1851         t1->window = ntohs(sk->window);
1852         t1->ack_seq = ntohl(sk->acked_seq);
1853         t1->doff = sizeof(*t1)/4;
1854         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1855         sk->prot->queue_xmit(sk, dev, buff, 1);
1856         tcp_statistics.TcpOutSegs++;
1857 }
1858 
1859 
/*
 *      FIXME:
 *      This routine frees used buffers.
 *      It should consider sending an ACK to let the
 *      other end know we now have a bigger window.
 *
 *      Walks the receive queue freeing consumed skbs, and if that
 *      actually recovered space decides whether to ACK the new
 *      window immediately or to schedule a delayed ACK.
 */

static void cleanup_rbuf(struct sock *sk)
{
        unsigned long flags;
        unsigned long left;             /* receive space before freeing */
        struct sk_buff *skb;
        unsigned long rspace;           /* receive space after freeing */

        if(sk->debug)
                printk("cleaning rbuf for sk=%p\n", sk);
  
        save_flags(flags);
        cli();
  
        left = sk->prot->rspace(sk);
 
        /*
         *      We have to loop through all the buffer headers,
         *      and try to free up all the space we can.
         *      Stop at the first skb still unread (!used) or still
         *      referenced by a sleeping reader (users).
         */

        while((skb=skb_peek(&sk->receive_queue)) != NULL) 
        {
                if (!skb->used || skb->users) 
                        break;
                skb_unlink(skb);
                skb->sk = sk;
                kfree_skb(skb, FREE_READ);
        }

        restore_flags(flags);

        /*
         *      FIXME:
         *      At this point we should send an ack if the difference
         *      in the window, and the amount of space is bigger than
         *      TCP_WINDOW_DIFF.
         */

        if(sk->debug)
                printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
                                            left);
        if ((rspace=sk->prot->rspace(sk)) != left) 
        {
                /*
                 * This area has caused the most trouble.  The current strategy
                 * is to simply do nothing if the other end has room to send at
                 * least 3 full packets, because the ack from those will auto-
                 * matically update the window.  If the other end doesn't think
                 * we have much space left, but we have room for at least 1 more
                 * complete packet than it thinks we do, we will send an ack
                 * immediately.  Otherwise we will wait up to .5 seconds in case
                 * the user reads some more.
                 */
                sk->ack_backlog++;
        /*
         * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
         * if the other end is offering a window smaller than the agreed on MSS
         * (called sk->mtu here).  In theory there's no connection between send
         * and receive, and so no reason to think that they're going to send
         * small packets.  For the moment I'm using the hack of reducing the mss
         * only on the send side, so I'm putting mtu here.
         */

                if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
                {
                        /* Send an ack right now. */
                        tcp_read_wakeup(sk);
                } 
                else 
                {
                        /* Force it to send an ack soon. */
                        /* NOTE(review): the expiry compared here is
                         * sk->timer.expires although the timer deleted is
                         * sk->retransmit_timer - looks suspicious, confirm
                         * which timer is intended. */
                        int was_active = del_timer(&sk->retransmit_timer);
                        if (!was_active || TCP_ACK_TIME < sk->timer.expires) 
                        {
                                reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
                        } 
                        else
                                add_timer(&sk->retransmit_timer);
                }
        }
} 
1948 
1949 
1950 /*
1951  *      Handle reading urgent data. BSD has very simple semantics for
1952  *      this, no blocking and very strange errors 8)
1953  */
1954  
1955 static int tcp_read_urg(struct sock * sk, int nonblock,
     /* [previous][next][first][last][top][bottom][index][help] */
1956              unsigned char *to, int len, unsigned flags)
1957 {
1958         /*
1959          *      No URG data to read
1960          */
1961         if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
1962                 return -EINVAL; /* Yes this is right ! */
1963                 
1964         if (sk->err) 
1965         {
1966                 int tmp = -sk->err;
1967                 sk->err = 0;
1968                 return tmp;
1969         }
1970 
1971         if (sk->state == TCP_CLOSE || sk->done) 
1972         {
1973                 if (!sk->done) {
1974                         sk->done = 1;
1975                         return 0;
1976                 }
1977                 return -ENOTCONN;
1978         }
1979 
1980         if (sk->shutdown & RCV_SHUTDOWN) 
1981         {
1982                 sk->done = 1;
1983                 return 0;
1984         }
1985         sk->inuse = 1;
1986         if (sk->urg_data & URG_VALID) 
1987         {
1988                 char c = sk->urg_data;
1989                 if (!(flags & MSG_PEEK))
1990                         sk->urg_data = URG_READ;
1991                 put_fs_byte(c, to);
1992                 release_sock(sk);
1993                 return 1;
1994         }
1995         release_sock(sk);
1996         
1997         /*
1998          * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1999          * the available implementations agree in this case:
2000          * this call should never block, independent of the
2001          * blocking state of the socket.
2002          * Mike <pall@rz.uni-karlsruhe.de>
2003          */
2004         return -EAGAIN;
2005 }
2006 
2007 
/*
 *      This routine copies from a sock struct into the user buffer.
 *
 *      Supports multiple concurrent readers: the sequence cursor is
 *      either the socket's copied_seq (consuming read) or a local
 *      copy (MSG_PEEK), accessed through a volatile pointer so the
 *      shared cursor is re-read around potentially sleeping copies.
 *      Returns bytes copied, 0 at end of stream, or a negative errno.
 */
 
static int tcp_read(struct sock *sk, unsigned char *to,
        int len, int nonblock, unsigned flags)
{
        struct wait_queue wait = { current, NULL };
        int copied = 0;
        unsigned long peek_seq;
        volatile unsigned long *seq;    /* So gcc doesn't overoptimise */
        unsigned long used;

        /* 
         *      This error should be checked. 
         */
         
        if (sk->state == TCP_LISTEN)
                return -ENOTCONN;

        /*
         *      Urgent data needs to be handled specially. 
         */
         
        if (flags & MSG_OOB)
                return tcp_read_urg(sk, nonblock, to, len, flags);

        /*
         *      Copying sequence to update. This is volatile to handle
         *      the multi-reader case neatly (memcpy_to/fromfs might be 
         *      inline and thus not flush cached variables otherwise).
         */
         
        peek_seq = sk->copied_seq;
        seq = &sk->copied_seq;
        if (flags & MSG_PEEK)
                seq = &peek_seq;        /* peeking must not advance the socket cursor */

        add_wait_queue(sk->sleep, &wait);
        sk->inuse = 1;
        while (len > 0) 
        {
                struct sk_buff * skb;
                unsigned long offset;
        
                /*
                 * Are we at urgent data? Stop if we have read anything.
                 */
                 
                if (copied && sk->urg_data && sk->urg_seq == *seq)
                        break;

                /*
                 *      Next get a buffer.  Set INTERRUPTIBLE before scanning
                 *      so a wakeup between scan and schedule() is not lost.
                 */
                 
                current->state = TASK_INTERRUPTIBLE;

                skb = skb_peek(&sk->receive_queue);
                do 
                {
                        if (!skb)
                                break;
                        /* Gap before this skb: nothing contiguous to read */
                        if (before(*seq, skb->h.th->seq))
                                break;
                        offset = *seq - skb->h.th->seq;
                        /* A SYN occupies one sequence number but no data */
                        if (skb->h.th->syn)
                                offset--;
                        if (offset < skb->len)
                                goto found_ok_skb;
                        if (skb->h.th->fin)
                                goto found_fin_ok;
                        if (!(flags & MSG_PEEK))
                                skb->used = 1;  /* fully consumed: cleanup_rbuf may free it */
                        skb = skb->next;
                }
                while (skb != (struct sk_buff *)&sk->receive_queue);

                /* Nothing more to read right now; return what we have */
                if (copied)
                        break;

                if (sk->err) 
                {
                        copied = -sk->err;
                        sk->err = 0;
                        break;
                }

                if (sk->state == TCP_CLOSE) 
                {
                        if (!sk->done) 
                        {
                                sk->done = 1;   /* report EOF once */
                                break;
                        }
                        copied = -ENOTCONN;
                        break;
                }

                if (sk->shutdown & RCV_SHUTDOWN) 
                {
                        sk->done = 1;
                        break;
                }
                        
                if (nonblock) 
                {
                        copied = -EAGAIN;
                        break;
                }

                /* ACK what we've consumed, then sleep for more data */
                cleanup_rbuf(sk);
                release_sock(sk);
                sk->socket->flags |= SO_WAITDATA;
                schedule();
                sk->socket->flags &= ~SO_WAITDATA;
                sk->inuse = 1;

                if (current->signal & ~current->blocked) 
                {
                        copied = -ERESTARTSYS;
                        break;
                }
                continue;

        found_ok_skb:
                /*
                 *      Lock the buffer. We can be fairly relaxed as
                 *      an interrupt will never steal a buffer we are 
                 *      using unless I've missed something serious in
                 *      tcp_data.
                 */
                
                skb->users++;
                
                /*
                 *      Ok so how much can we use ? 
                 */
                 
                used = skb->len - offset;
                if (len < used)
                        used = len;
                /*
                 *      Do we have urgent data here?  If the urgent byte falls
                 *      inside this chunk, either skip over it (non-inline,
                 *      exactly at the cursor) or stop the copy just before it.
                 */
                
                if (sk->urg_data) 
                {
                        unsigned long urg_offset = sk->urg_seq - *seq;
                        if (urg_offset < used) 
                        {
                                if (!urg_offset) 
                                {
                                        if (!sk->urginline) 
                                        {
                                                ++*seq;
                                                offset++;
                                                used--;
                                        }
                                }
                                else
                                        used = urg_offset;
                        }
                }
                
                /*
                 *      Copy it - We _MUST_ update *seq first so that we
                 *      don't ever double read when we have dual readers
                 */
                 
                *seq += used;

                /*
                 *      This memcpy_tofs can sleep. If it sleeps and we
                 *      do a second read it relies on the skb->users to avoid
                 *      a crash when cleanup_rbuf() gets called.
                 */
                 
                memcpy_tofs(to,((unsigned char *)skb->h.th) +
                        skb->h.th->doff*4 + offset, used);
                copied += used;
                len -= used;
                to += used;
                
                /*
                 *      We now will not sleep again until we are finished
                 *      with skb. Sorry if you are doing the SMP port
                 *      but you'll just have to fix it neatly ;)
                 */
                 
                skb->users --;
                
                if (after(sk->copied_seq,sk->urg_seq))
                        sk->urg_data = 0;       /* urgent byte passed: clear it */
                if (used + offset < skb->len)
                        continue;               /* more data left in this skb */
                
                /*
                 *      Process the FIN.
                 */

                if (skb->h.th->fin)
                        goto found_fin_ok;
                if (flags & MSG_PEEK)
                        continue;
                skb->used = 1;                  /* done with it: eligible for freeing */
                continue;

        found_fin_ok:
                ++*seq;                         /* FIN consumes one sequence number */
                if (flags & MSG_PEEK)
                        break;
                        
                /*
                 *      All is done
                 */
                 
                skb->used = 1;
                sk->shutdown |= RCV_SHUTDOWN;
                break;

        }
        remove_wait_queue(sk->sleep, &wait);
        current->state = TASK_RUNNING;

        /* Clean up data we have read: This will do ACK frames */
        cleanup_rbuf(sk);
        release_sock(sk);
        return copied;
}
2238 
2239 /*
2240  *      State processing on a close. This implements the state shift for
2241  *      sending our FIN frame. Note that we only send a FIN for some 
2242  *      states. A shutdown() may have already sent the FIN, or we may be
2243  *      closed.
2244  */
2245  
2246 static int tcp_close_state(struct sock *sk, int dead)
     /* [previous][next][first][last][top][bottom][index][help] */
2247 {
2248         int ns=TCP_CLOSE;
2249         int send_fin=0;
2250         switch(sk->state)
2251         {
2252                 case TCP_SYN_SENT:      /* No SYN back, no FIN needed */
2253                         break;
2254                 case TCP_SYN_RECV:
2255                 case TCP_ESTABLISHED:   /* Closedown begin */
2256                         ns=TCP_FIN_WAIT1;
2257                         send_fin=1;
2258                         break;
2259                 case TCP_FIN_WAIT1:     /* Already closing, or FIN sent: no change */
2260                 case TCP_FIN_WAIT2:
2261                 case TCP_CLOSING:
2262                         ns=sk->state;
2263                         break;
2264                 case TCP_CLOSE:
2265                 case TCP_LISTEN:
2266                         break;
2267                 case TCP_CLOSE_WAIT:    /* They have FIN'd us. We send our FIN and
2268                                            wait only for the ACK */
2269                         ns=TCP_LAST_ACK;
2270                         send_fin=1;
2271         }
2272         
2273         tcp_set_state(sk,ns);
2274                 
2275         /*
2276          *      This is a (useful) BSD violating of the RFC. There is a
2277          *      problem with TCP as specified in that the other end could
2278          *      keep a socket open forever with no application left this end.
2279          *      We use a 3 minute timeout (about the same as BSD) then kill
2280          *      our end. If they send after that then tough - BUT: long enough
2281          *      that we won't make the old 4*rto = almost no time - whoops
2282          *      reset mistake.
2283          */
2284         if(dead && ns==TCP_FIN_WAIT2)
2285         {
2286                 int timer_active=del_timer(&sk->timer);
2287                 if(timer_active)
2288                         add_timer(&sk->timer);
2289                 else
2290                         reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2291         }
2292         
2293         return send_fin;
2294 }
2295 
2296 /*
2297  *      Send a fin.
2298  */
2299 
static void tcp_send_fin(struct sock *sk)
/* [previous][next][first][last][top][bottom][index][help] */
{
	/*
	 *	Build and transmit (or queue) a FIN segment for this socket.
	 *	The FIN consumes one sequence number; write_seq is advanced
	 *	even on the failure paths so the sequence space stays right.
	 */
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;
		
	release_sock(sk); /* in case the malloc sleeps. */
	
	buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */
	 
	buff->sk = sk;
	buff->len = sizeof(*t1);
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff. 
	 */

	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
	if (tmp < 0) 
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost. 
		 *	(Not good).
		 */
		 
		buff->free = 1;
		prot->wfree(sk,buff->mem_addr, buff->mem_len);
		sk->write_seq++;	/* account for the FIN that was never sent */
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);	/* a timer was pending: restore it untouched */
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}
	
	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */

	t1 =(struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	buff->dev = dev;
	/* Clone the template header, then fill in the FIN specifics */
	memcpy(t1, th, sizeof(*t1));
	t1->seq = ntohl(sk->write_seq);
	sk->write_seq++;	/* the FIN itself occupies one sequence number */
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;	/* no options: 20 bytes of header */
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 * If there is data in the write queue, the fin must be appended to
	 * the write queue.
	 */
	
	if (skb_peek(&sk->write_queue) != NULL) 
	{
		buff->free = 0;
		if (buff->next != NULL) 
		{
			/* NOTE(review): a freshly allocated buffer should never
			   already be linked on a queue - defensive unlink */
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	} 
	else 
	{
		/* Write queue empty: transmit now and arm the retransmit timer */
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2398 
2399 /*
2400  *      Shutdown the sending side of a connection. Much like close except
2401  *      that we don't receive shut down or set sk->dead=1.
2402  */
2403 
2404 void tcp_shutdown(struct sock *sk, int how)
     /* [previous][next][first][last][top][bottom][index][help] */
2405 {
2406         /*
2407          *      We need to grab some memory, and put together a FIN,
2408          *      and then put it into the queue to be sent.
2409          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2410          */
2411 
2412         if (!(how & SEND_SHUTDOWN)) 
2413                 return;
2414          
2415         /*
2416          *      If we've already sent a FIN, or its a closed state
2417          */
2418          
2419         if (sk->state == TCP_FIN_WAIT1 ||
2420             sk->state == TCP_FIN_WAIT2 ||
2421             sk->state == TCP_CLOSING ||
2422             sk->state == TCP_LAST_ACK ||
2423             sk->state == TCP_TIME_WAIT || 
2424             sk->state == TCP_CLOSE ||
2425             sk->state == TCP_LISTEN
2426           )
2427         {
2428                 return;
2429         }
2430         sk->inuse = 1;
2431 
2432         /*
2433          * flag that the sender has shutdown
2434          */
2435 
2436         sk->shutdown |= SEND_SHUTDOWN;
2437 
2438         /*
2439          *  Clear out any half completed packets. 
2440          */
2441 
2442         if (sk->partial)
2443                 tcp_send_partial(sk);
2444                 
2445         /*
2446          *      FIN if needed
2447          */
2448          
2449         if(tcp_close_state(sk,0))
2450                 tcp_send_fin(sk);
2451                 
2452         release_sock(sk);
2453 }
2454 
2455 
2456 static int
2457 tcp_recvfrom(struct sock *sk, unsigned char *to,
     /* [previous][next][first][last][top][bottom][index][help] */
2458              int to_len, int nonblock, unsigned flags,
2459              struct sockaddr_in *addr, int *addr_len)
2460 {
2461         int result;
2462   
2463         /* 
2464          *      Have to check these first unlike the old code. If 
2465          *      we check them after we lose data on an error
2466          *      which is wrong 
2467          */
2468 
2469         if(addr_len)
2470                 *addr_len = sizeof(*addr);
2471         result=tcp_read(sk, to, to_len, nonblock, flags);
2472 
2473         if (result < 0) 
2474                 return(result);
2475   
2476         if(addr)
2477         {
2478                 addr->sin_family = AF_INET;
2479                 addr->sin_port = sk->dummy_th.dest;
2480                 addr->sin_addr.s_addr = sk->daddr;
2481         }
2482         return(result);
2483 }
2484 
2485 
2486 /*
2487  *      This routine will send an RST to the other tcp. 
2488  */
2489  
static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
/* [previous][next][first][last][top][bottom][index][help] */
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	/*
	 *	Build and queue an RST in reply to the offending segment 'th'.
	 *	No socket is attached to the reply (buff->sk == NULL).
	 */
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (Think about it).
	 */
	 
	if(th->rst)
		return;
  
	/*
	 * We need to grab some memory, and put together an RST,
	 * and then put it into the queue to be sent.
	 */

	buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL) 
		return;		/* out of memory: drop silently, peer will retry */

	buff->len = sizeof(*t1);
	buff->sk = NULL;	/* reply owned by no socket */
	buff->dev = dev;
	buff->localroute = 0;

	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff. 
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl);
	if (tmp < 0) 
	{
		/* No route: release the buffer and give up */
		buff->free = 1;
		prot->wfree(NULL, buff->mem_addr, buff->mem_len);
		return;
	}

	t1 =(struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	/* Start from a copy of the offending header, then patch it up */
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive. 
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;  
	t1->window = 0;
  
	/*
	 *	Sequence rules (RFC 793): if the offender carried an ACK, take
	 *	our sequence from its ack field and send no ACK of our own;
	 *	otherwise ACK exactly what arrived (a SYN counts as one octet).
	 */

	if(th->ack)
	{
		t1->ack = 0;
		t1->seq = th->ack_seq;
		t1->ack_seq = 0;
	}
	else
	{
		t1->ack = 1;
		if(!th->syn)
			t1->ack_seq=htonl(th->seq);
		else
			t1->ack_seq=htonl(th->seq+1);
		t1->seq=0;
	}

	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2572 
2573 
2574 /*
2575  *      Look for tcp options. Parses everything but only knows about MSS.
2576  *      This routine is always called with the packet containing the SYN.
2577  *      However it may also be called with the ack to the SYN.  So you
2578  *      can't assume this is always the SYN.  It's always called after
2579  *      we have set up sk->mtu to our own MTU.
2580  *
2581  *      We need at minimum to add PAWS support here. Possibly large windows
2582  *      as Linux gets deployed on 100Mb/sec networks.
2583  */
2584  
2585 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
2586 {
2587         unsigned char *ptr;
2588         int length=(th->doff*4)-sizeof(struct tcphdr);
2589         int mss_seen = 0;
2590     
2591         ptr = (unsigned char *)(th + 1);
2592   
2593         while(length>0)
2594         {
2595                 int opcode=*ptr++;
2596                 int opsize=*ptr++;
2597                 switch(opcode)
2598                 {
2599                         case TCPOPT_EOL:
2600                                 return;
2601                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
2602                                 length--;
2603                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
2604                                 continue;
2605                         
2606                         default:
2607                                 if(opsize<=2)   /* Avoid silly options looping forever */
2608                                         return;
2609                                 switch(opcode)
2610                                 {
2611                                         case TCPOPT_MSS:
2612                                                 if(opsize==4 && th->syn)
2613                                                 {
2614                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2615                                                         mss_seen = 1;
2616                                                 }
2617                                                 break;
2618                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2619                                 }
2620                                 ptr+=opsize-2;
2621                                 length-=opsize;
2622                 }
2623         }
2624         if (th->syn) 
2625         {
2626                 if (! mss_seen)
2627                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2628         }
2629 #ifdef CONFIG_INET_PCTCP
2630         sk->mss = min(sk->max_window >> 1, sk->mtu);
2631 #else    
2632         sk->mss = min(sk->max_window, sk->mtu);
2633 #endif  
2634 }
2635 
/*
 *	Classful netmask for an address; both argument and result are in
 *	network byte order.
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);

	if (IN_CLASSA(host))
		return htonl(IN_CLASSA_NET);
	return IN_CLASSB(host) ? htonl(IN_CLASSB_NET) : htonl(IN_CLASSC_NET);
}
2645 
2646 /*
2647  *      Default sequence number picking algorithm.
2648  *      As close as possible to RFC 793, which
2649  *      suggests using a 250kHz clock.
2650  *      Further reading shows this assumes 2MB/s networks.
2651  *      For 10MB/s ethernet, a 1MHz clock is appropriate.
2652  *      That's funny, Linux has one built in!  Use it!
2653  */
2654 
extern inline unsigned long tcp_init_seq(void)
{
	/* Microsecond clock as the initial sequence source (1MHz, versus
	   the 250kHz clock RFC 793 suggests for 2MB/s networks). */
	struct timeval now;

	do_gettimeofday(&now);
	return now.tv_sec * 1000000 + now.tv_usec;
}
2661 
2662 /*
2663  *      This routine handles a connection request.
2664  *      It should make sure we haven't already responded.
2665  *      Because of the way BSD works, we have to send a syn/ack now.
2666  *      This also means it will be harder to close a socket which is
2667  *      listening.
2668  */
2669  
static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
/* [previous][next][first][last][top][bottom][index][help] */
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev, unsigned long seq)
{
	/*
	 *	Handle an incoming SYN on listening socket 'sk': create a new
	 *	sock in SYN_RECV, send the SYN/ACK, and queue the SYN skb on
	 *	the listener so accept() can find the new connection.
	 */
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	struct device *ndev=NULL;
	int tmp;
	struct rtable *rt;
  
	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead) 
	{
		sk->data_ready(sk,0);
	}
	else 
	{
		if(sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * Make sure we can accept more.  This will prevent a
	 * flurry of syns from eating up all our memory.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog) 
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * We need to build a new sock struct.
	 * It is sort of bad to have a socket without an inode attached
	 * to it, but the wake_up's will just wake up the listening socket,
	 * and if the listening socket is destroyed before this is taken
	 * off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL) 
	{
		/* just ignore the syn.  It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/* Inherit the listener's settings, then reset all per-connection state */
	memcpy(newsk, sk, sizeof(*newsk));
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;		/* slow start: one segment */
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	/* The SYN consumes one sequence number, so start past it */
	newsk->acked_seq = skb->h.th->seq+1;
	newsk->copied_seq = skb->h.th->seq+1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq; 
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger=0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function=&retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;
	
	/*
	 *	Swap these two, they are from our point of view. 
	 */
	 
	newsk->daddr = saddr;
	newsk->saddr = daddr;

	put_sock(newsk->num,newsk);
	newsk->dummy_th.res1 = 0;
	newsk->dummy_th.doff = 6;
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;	
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	/* NOTE(review): acked_seq/copied_seq were already set above - this
	   repeat is harmless but redundant */
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them 
	 */

	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for 
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk 
	 */

	rt=ip_rt_route(saddr, NULL,NULL);
	
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;
		
	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
		newsk->mtu = rt->rt_mss - HEADER_SIZE;
	else 
	{
		/* No route hint: 576-byte conservative MTU off-net, big on-net */
#ifdef CONFIG_INET_SNARL	/* Sub Nets Are Local */
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			newsk->mtu = 576 - HEADER_SIZE;
		else
			newsk->mtu = MAX_WINDOW;
	}

	/*
	 *	But not bigger than device MTU 
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	This will min with what arrived in the packet 
	 */

	tcp_options(newsk,skb->h.th);

	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL) 
	{
		/* NOTE(review): other paths store positive errno values in
		   sk->err - confirm the sign convention here */
		sk->err = -ENOMEM;
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		/* And this will destroy it */
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}
  
	buff->len = sizeof(struct tcphdr)+4;	/* header plus 4 bytes of MSS option */
	buff->sk = newsk;
	buff->localroute = newsk->localroute;

	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff. 
	 */

	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

	/*
	 *	Something went wrong. 
	 */

	if (tmp < 0) 
	{
		sk->err = tmp;
		buff->free = 1;
		kfree_skb(buff,FREE_WRITE);
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		release_sock(newsk);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);
  
	/* Build the SYN/ACK reply from the incoming header */
	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->write_seq;
	/*
	 *	Swap the send and the receive. 
	 */
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->write_seq++);	/* our SYN takes a sequence number too */
	t1->ack = 1;
	newsk->window = tcp_select_window(newsk);
	newsk->sent_seq = newsk->write_seq;
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->ack_seq = ntohl(skb->h.th->seq+1);
	t1->doff = sizeof(*t1)/4+1;	/* one extra 32-bit word for the MSS option */
	/* MSS option: kind 2, length 4, value in network byte order */
	ptr =(unsigned char *)(t1+1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] =(newsk->mtu) & 0xff;

	tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
	reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
	skb->sk = newsk;

	/*
	 *	Charge the sock_buff to newsk. 
	 */
	 
	sk->rmem_alloc -= skb->mem_len;
	newsk->rmem_alloc += skb->mem_len;
	
	/* Queue the SYN on the listener so accept() can pick it up */
	skb_queue_tail(&sk->receive_queue,skb);
	sk->ack_backlog++;
	release_sock(newsk);
	tcp_statistics.TcpOutSegs++;
}
2941 
2942 
2943 static void tcp_close(struct sock *sk, int timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
2944 {
2945         /*
2946          * We need to grab some memory, and put together a FIN, 
2947          * and then put it into the queue to be sent.
2948          */
2949         
2950         sk->inuse = 1;
2951         
2952         if(sk->state == TCP_LISTEN)
2953         {
2954                 /* Special case */
2955                 tcp_set_state(sk, TCP_CLOSE);
2956                 tcp_close_pending(sk);
2957                 release_sock(sk);
2958                 return;
2959         }
2960         
2961         sk->keepopen = 1;
2962         sk->shutdown = SHUTDOWN_MASK;
2963 
2964         if (!sk->dead) 
2965                 sk->state_change(sk);
2966 
2967         if (timeout == 0) 
2968         {
2969                 struct sk_buff *skb;
2970                 
2971                 /*
2972                  *  We need to flush the recv. buffs.  We do this only on the
2973                  *  descriptor close, not protocol-sourced closes, because the
2974                  *  reader process may not have drained the data yet!
2975                  */
2976                  
2977                 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
2978                         kfree_skb(skb, FREE_READ);
2979                 /*
2980                  *      Get rid off any half-completed packets. 
2981                  */
2982 
2983                 if (sk->partial) 
2984                         tcp_send_partial(sk);
2985         }
2986 
2987                 
2988         /*
2989          *      Timeout is not the same thing - however the code likes
2990          *      to send both the same way (sigh).
2991          */
2992          
2993         if(timeout)
2994         {
2995                 tcp_set_state(sk, TCP_CLOSE);   /* Dead */
2996         }
2997         else
2998         {
2999                 if(tcp_close_state(sk,1)==1)
3000                 {
3001                         tcp_send_fin(sk);
3002                 }
3003         }
3004         release_sock(sk);
3005 }
3006 
3007 
3008 /*
3009  *      This routine takes stuff off of the write queue,
3010  *      and puts it in the xmit queue. This happens as incoming acks
3011  *      open up the remote window for us.
3012  */
3013  
static void tcp_write_xmit(struct sock *sk)
/* [previous][next][first][last][top][bottom][index][help] */
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy 
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */
	 
	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window) 
	{
		IS_SKB(skb);
		skb_unlink(skb);	/* take it off the write queue */
		
		/*
		 *	See if we really need to send the packet. 
		 */
		 
		if (before(skb->h.seq, sk->rcv_ack_seq +1)) 
		{
			/*
			 *	This is acked data. We can discard it. This 
			 *	cannot currently occur.
			 */
			 
			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead) 
				sk->write_space(sk);
		} 
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
/*
 * put in the ack seq and window at this point rather than earlier,
 * in order to keep them monotonic.  We really want to avoid taking
 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
 * Ack and window will in general have changed since this packet was put
 * on the write queue.
 */
			iph = (struct iphdr *)(skb->data +
					       skb->dev->hard_header_len);
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			/* TCP segment length (header + data), as used by the checksum */
			size = skb->len - (((unsigned char *) th) - skb->data);
			
			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Header fields changed, so the checksum must be redone */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;
			
			/*
			 *	IP manages our queue for some crazy reason
			 */
			 
			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
			
			/*
			 *	Again we slide the timer wrongly
			 */
			 
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3098 
3099 
3100 /*
3101  *      This routine deals with incoming acks, but not outgoing ones.
3102  */
3103 
3104 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
     /* [previous][next][first][last][top][bottom][index][help] */
3105 {
3106         unsigned long ack;
3107         int flag = 0;
3108 
             /*
              *      Process an incoming ACK (and the window it advertises) on
              *      'sk'.  'th' is the TCP header of the received segment and
              *      'len' the length of the TCP segment including the header.
              *      'saddr' is not referenced in this function at present.
              *
              *      Returns 0 only when the ack is ahead of anything we have
              *      sent (keepalive/bogus ack case); returns 1 otherwise.
              *
              *      'flag' bit legend:
              */
3109         /* 
3110          * 1 - there was data in packet as well as ack or new data is sent or 
3111          *     in shutdown state
3112          * 2 - data from retransmit queue was acked and removed
3113          * 4 - window shrunk or data from retransmit queue was acked and removed
3114          */
3115 
3116         if(sk->zapped)
3117                 return(1);      /* Dead, can't ack any more so why bother */
3118 
3119         /*
3120          *      Have we discovered a larger window
3121          */
3122          
3123         ack = ntohl(th->ack_seq);       /* acknowledgement number in host byte order */
3124 
3125         if (ntohs(th->window) > sk->max_window) 
3126         {
3127                 sk->max_window = ntohs(th->window);
3128 #ifdef CONFIG_INET_PCTCP
3129                 /* Hack because we don't send partial packets to non SWS
3130                    handling hosts */
3131                 sk->mss = min(sk->max_window>>1, sk->mtu);
3132 #else
3133                 sk->mss = min(sk->max_window, sk->mtu);
3134 #endif  
3135         }
3136 
3137         /*
3138          *      We have dropped back to keepalive timeouts. Thus we have
3139          *      no retransmits pending.
3140          */
3141          
3142         if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3143                 sk->retransmits = 0;
3144 
3145         /*
3146          *      If the ack is newer than sent or older than previous acks
3147          *      then we can probably ignore it.
3148          */
3149          
3150         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
3151         {
3152                 if(sk->debug)
3153                         printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);
3154                         
3155                 /*
3156                  *      Keepalive processing.
3157                  */
3158                  
3159                 if (after(ack, sk->sent_seq)) 
3160                 {
3161                         return(0);
3162                 }
3163                 
3164                 /*
3165                  *      Restart the keepalive timer.
3166                  */
3167                  
3168                 if (sk->keepopen) 
3169                 {
3170                         if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3171                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3172                 }
3173                 return(1);
3174         }
3175 
3176         /*
3177          *      If there is data set flag 1
3178          */
3179          
3180         if (len != th->doff*4) 
3181                 flag |= 1;
3182 
3183         /*
3184          *      See if our window has been shrunk. 
3185          */
3186 
3187         if (after(sk->window_seq, ack+ntohs(th->window))) 
3188         {
3189                 /*
3190                  * We may need to move packets from the send queue
3191                  * to the write queue, if the window has been shrunk on us.
3192                  * The RFC says you are not allowed to shrink your window
3193                  * like this, but if the other end does, you must be able
3194                  * to deal with it.
3195                  */
3196                 struct sk_buff *skb;
3197                 struct sk_buff *skb2;
3198                 struct sk_buff *wskb = NULL;
3199         
3200                 skb2 = sk->send_head;
3201                 sk->send_head = NULL;
3202                 sk->send_tail = NULL;
3203         
3204                 /*
3205                  *      This is an artifact of a flawed concept. We want one
3206                  *      queue and a smarter send routine when we send all.
3207                  */
3208         
3209                 flag |= 4;      /* Window changed */
3210         
3211                 sk->window_seq = ack + ntohs(th->window);
3212                 cli();          /* walk/rebuild the retransmit list atomically */
3213                 while (skb2 != NULL) 
3214                 {
3215                         skb = skb2;
3216                         skb2 = skb->link3;
3217                         skb->link3 = NULL;
3218                         if (after(skb->h.seq, sk->window_seq)) 
3219                         {
3220                                 if (sk->packets_out > 0) 
3221                                         sk->packets_out--;
3222                                 /* We may need to remove this from the dev send list. */
3223                                 if (skb->next != NULL) 
3224                                 {
3225                                         skb_unlink(skb);                                
3226                                 }
3227                                 /* Now add it to the write_queue. */
3228                                 if (wskb == NULL)
3229                                         skb_queue_head(&sk->write_queue,skb);
3230                                 else
3231                                         skb_append(wskb,skb);
3232                                 wskb = skb;
3233                         } 
3234                         else 
3235                         {
3236                                 if (sk->send_head == NULL) 
3237                                 {
3238                                         sk->send_head = skb;
3239                                         sk->send_tail = skb;
3240                                 }
3241                                 else
3242                                 {
3243                                         sk->send_tail->link3 = skb;
3244                                         sk->send_tail = skb;
3245                                 }
3246                                 skb->link3 = NULL;
3247                         }
3248                 }
3249                 sti();
3250         }
3251 
3252         /*
3253          *      Pipe has emptied
3254          */
3255          
3256         if (sk->send_tail == NULL || sk->send_head == NULL) 
3257         {
3258                 sk->send_head = NULL;
3259                 sk->send_tail = NULL;
3260                 sk->packets_out= 0;
3261         }
3262 
3263         /*
3264          *      Update the right hand window edge of the host
3265          */
3266          
3267         sk->window_seq = ack + ntohs(th->window);
3268 
3269         /*
3270          *      We don't want too many packets out there. 
3271          */
3272          
3273         if (sk->ip_xmit_timeout == TIME_WRITE && 
3274                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
3275         {
3276                 /* 
3277                  * This is Jacobson's slow start and congestion avoidance. 
3278                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
3279                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
3280                  * counter and increment it once every cwnd times.  It's possible
3281                  * that this should be done only if sk->retransmits == 0.  I'm
3282                  * interpreting "new data is acked" as including data that has
3283                  * been retransmitted but is just now being acked.
3284                  */
3285                 if (sk->cong_window < sk->ssthresh)  
3286                         /* 
3287                          *      In "safe" area, increase
3288                          */
3289                         sk->cong_window++;
3290                 else 
3291                 {
3292                         /*
3293                          *      In dangerous area, increase slowly.  In theory this is
3294                          *      sk->cong_window += 1 / sk->cong_window
3295                          */
3296                         if (sk->cong_count >= sk->cong_window) 
3297                         {
3298                                 sk->cong_window++;
3299                                 sk->cong_count = 0;
3300                         }
3301                         else 
3302                                 sk->cong_count++;
3303                 }
3304         }
3305 
3306         /*
3307          *      Remember the highest ack received.
3308          */
3309          
3310         sk->rcv_ack_seq = ack;
3311 
3312         /*
3313          *      If this ack opens up a zero window, clear backoff.  It was
3314          *      being used to time the probes, and is probably far higher than
3315          *      it needs to be for normal retransmission.
3316          */
3317 
3318         if (sk->ip_xmit_timeout == TIME_PROBE0) 
3319         {
3320                 sk->retransmits = 0;    /* Our probe was answered */
3321                 
3322                 /*
3323                  *      Was it a usable window open ?
3324                  */
3325                  
3326                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
3327                     ! before (sk->window_seq, sk->write_queue.next->h.seq)) 
3328                 {
3329                         sk->backoff = 0;
3330                         
3331                         /*
3332                          *      Recompute rto from rtt.  this eliminates any backoff.
3333                          */
3334 
3335                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3336                         if (sk->rto > 120*HZ)
3337                                 sk->rto = 120*HZ;
3338                         if (sk->rto < 20)       /* Was 1*HZ, then 1 - turns out we must allow about
3339                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3340                                                    .2 of a second is going to need huge windows (SIGH).
3341                                                    NB: the assignment on the next line is this if's body. */
3341                         sk->rto = 20;
3342                 }
3343         }
3344 
3345         /* 
3346          *      See if we can take anything off of the retransmit queue.
3347          */
3348    
3349         while(sk->send_head != NULL) 
3350         {
3351                 /* Check for a bug. */
3352                 if (sk->send_head->link3 &&
3353                     after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) 
3354                         printk("INET: tcp.c: *** bug send_list out of order.\n");
3355                         
3356                 /*
3357                  *      If our packet is before the ack sequence we can
3358                  *      discard it as its confirmed to have arrived the other end.
3359                  */
3360                  
3361                 if (before(sk->send_head->h.seq, ack+1)) 
3362                 {
3363                         struct sk_buff *oskb;   
3364                         if (sk->retransmits) 
3365                         {       
3366                                 /*
3367                                  *      We were retransmitting.  don't count this in RTT est 
3368                                  */
3369                                 flag |= 2;
3370 
3371                                 /*
3372                                  * even though we've gotten an ack, we're still
3373                                  * retransmitting as long as we're sending from
3374                                  * the retransmit queue.  Keeping retransmits non-zero
3375                                  * prevents us from getting new data interspersed with
3376                                  * retransmissions.
3377                                  */
3378 
3379                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
3380                                         sk->retransmits = 1;
3381                                 else
3382                                         sk->retransmits = 0;
3383                         }
3384                         /*
3385                          * Note that we only reset backoff and rto in the
3386                          * rtt recomputation code.  And that doesn't happen
3387                          * if there were retransmissions in effect.  So the
3388                          * first new packet after the retransmissions is
3389                          * sent with the backoff still in effect.  Not until
3390                          * we get an ack from a non-retransmitted packet do
3391                          * we reset the backoff and rto.  This allows us to deal
3392                          * with a situation where the network delay has increased
3393                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3394                          */
3395 
3396                         /*
3397                          *      We have one less packet out there. 
3398                          */
3399                          
3400                         if (sk->packets_out > 0) 
3401                                 sk->packets_out --;
3402                         /* 
3403                          *      Wake up the process, it can probably write more. 
3404                          */
3405                         if (!sk->dead) 
3406                                 sk->write_space(sk);
3407                         oskb = sk->send_head;
3408 
3409                         if (!(flag&2))  /* Not retransmitting */
3410                         {
3411                                 long m;
3412         
3413                                 /*
3414                                  *      The following amusing code comes from Jacobson's
3415                                  *      article in SIGCOMM '88.  Note that rtt and mdev
3416                                  *      are scaled versions of rtt and mean deviation.
3417                                  *      This is designed to be as fast as possible 
3418                                  *      m stands for "measurement".
3419                                  */
3420         
3421                                 m = jiffies - oskb->when;  /* RTT */
3422                                 if(m<=0)
3423                                         m=1;            /* IS THIS RIGHT FOR <0 ??? */
3424                                 m -= (sk->rtt >> 3);    /* m is now error in rtt est */
3425                                 sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
3426                                 if (m < 0)
3427                                         m = -m;         /* m is now abs(error) */
3428                                 m -= (sk->mdev >> 2);   /* similar update on mdev */
3429                                 sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
3430         
3431                                 /*
3432                                  *      Now update timeout.  Note that this removes any backoff.
3433                                  */
3434                          
3435                                 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3436                                 if (sk->rto > 120*HZ)
3437                                         sk->rto = 120*HZ;
3438                                 if (sk->rto < 20)       /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3439                                         sk->rto = 20;
3440                                 sk->backoff = 0;
3441                         }
3442                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
3443                                            In this case as we just set it up */
3444                         cli();          /* protect the unlink of the acked skb from interrupts */
3445                         oskb = sk->send_head;
3446                         IS_SKB(oskb);
3447                         sk->send_head = oskb->link3;
3448                         if (sk->send_head == NULL) 
3449                         {
3450                                 sk->send_tail = NULL;
3451                         }
3452 
3453                 /*
3454                  *      We may need to remove this from the dev send list. 
3455                  */
3456 
3457                         if (oskb->next)
3458                                 skb_unlink(oskb);
3459                         sti();
3460                         kfree_skb(oskb, FREE_WRITE); /* write. */
3461                         if (!sk->dead) 
3462                                 sk->write_space(sk);
3463                 }
3464                 else
3465                 {
3466                         break;
3467                 }
3468         }
3469 
3470         /*
3471          * XXX someone ought to look at this too.. at the moment, if skb_peek()
3472          * returns non-NULL, we completely ignore the timer stuff in the else
3473          * clause.  We ought to organize the code so that else clause can
3474          * (should) be executed regardless, possibly moving the PROBE timer
3475          * reset over.  The skb_peek() thing should only move stuff to the
3476          * write queue, NOT also manage the timer functions.
3477          */
3478 
3479         /*
3480          * Maybe we can take some stuff off of the write queue,
3481          * and put it onto the xmit queue.
3482          */
3483         if (skb_peek(&sk->write_queue) != NULL) 
3484         {
3485                 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3486                         (sk->retransmits == 0 || 
3487                          sk->ip_xmit_timeout != TIME_WRITE ||
3488                          before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3489                         && sk->packets_out < sk->cong_window) 
3490                 {
3491                         /*
3492                          *      Add more data to the send queue.
3493                          */
3494                         flag |= 1;
3495                         tcp_write_xmit(sk);
3496                 }
3497                 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3498                         sk->send_head == NULL &&
3499                         sk->ack_backlog == 0 &&
3500                         sk->state != TCP_TIME_WAIT) 
3501                 {
3502                         /*
3503                          *      Data to queue but no room.
3504                          */
3505                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3506                 }               
3507         }
3508         else
3509         {
3510                 /*
3511                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3512                  * from TCP_CLOSE we don't do anything
3513                  *
3514                  * from anything else, if there is write data (or fin) pending,
3515                  * we use a TIME_WRITE timeout, else if keepalive we reset to
3516                  * a KEEPALIVE timeout, else we delete the timer.
3517                  *
3518                  * We do not set flag for nominal write data, otherwise we may
3519                  * force a state where we start to write itsy bitsy tidbits
3520                  * of data.
3521                  */
3522 
3523                 switch(sk->state) {
3524                 case TCP_TIME_WAIT:
3525                         /*
3526                          * keep us in TIME_WAIT until we stop getting packets,
3527                          * reset the timeout.
3528                          */
3529                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3530                         break;
3531                 case TCP_CLOSE:
3532                         /*
3533                          * don't touch the timer.
3534                          */
3535                         break;
3536                 default:
3537                         /*
3538                          *      Must check send_head, write_queue, and ack_backlog
3539                          *      to determine which timeout to use.
3540                          */
3541                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3542                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3543                         } else if (sk->keepopen) {
3544                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3545                         } else {
3546                                 del_timer(&sk->retransmit_timer);
3547                                 sk->ip_xmit_timeout = 0;
3548                         }
3549                         break;
3550                 }
3551         }
3552 
3553         /*
3554          *      We have nothing queued but space to send. Send any partial
3555          *      packets immediately (end of Nagle rule application).
3556          */
3557          
3558         if (sk->packets_out == 0 && sk->partial != NULL &&
3559                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
3560         {
3561                 flag |= 1;
3562                 tcp_send_partial(sk);
3563         }
3564 
3565         /*
3566          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
3567          * we are now waiting for an acknowledge to our FIN.  The other end is
3568          * already in TIME_WAIT.
3569          *
3570          * Move to TCP_CLOSE on success.
3571          */
3572 
3573         if (sk->state == TCP_LAST_ACK) 
3574         {
3575                 if (!sk->dead)
3576                         sk->state_change(sk);
3577                 if(sk->debug)
3578                         printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
3579                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3580                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
3581                 {
3582                         flag |= 1;
3583                         tcp_set_state(sk,TCP_CLOSE);
3584                         sk->shutdown = SHUTDOWN_MASK;
3585                 }
3586         }
3587 
3588         /*
3589          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
3590          *
3591          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
3592          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3593          */
3594 
3595         if (sk->state == TCP_FIN_WAIT1) 
3596         {
3597 
3598                 if (!sk->dead) 
3599                         sk->state_change(sk);
3600                 if (sk->rcv_ack_seq == sk->write_seq) 
3601                 {
3602                         flag |= 1;
3603                         sk->shutdown |= SEND_SHUTDOWN;
3604                         tcp_set_state(sk, TCP_FIN_WAIT2);
3605                 }
3606         }
3607 
3608         /*
3609          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
3610          *
3611          *      Move to TIME_WAIT
3612          */
3613 
3614         if (sk->state == TCP_CLOSING) 
3615         {
3616 
3617                 if (!sk->dead) 
3618                         sk->state_change(sk);
3619                 if (sk->rcv_ack_seq == sk->write_seq) 
3620                 {
3621                         flag |= 1;
3622                         tcp_time_wait(sk);
3623                 }
3624         }
3625         
3626         /*
3627          *      Final ack of a three way shake 
3628          */
3629          
3630         if(sk->state==TCP_SYN_RECV)
3631         {
3632                 tcp_set_state(sk, TCP_ESTABLISHED);
3633                 tcp_options(sk,th);
3634                 sk->dummy_th.dest=th->source;
3635                 sk->copied_seq = sk->acked_seq;
3636                 if(!sk->dead)
3637                         sk->state_change(sk);
3638                 if(sk->max_window==0)
3639                 {
3640                         sk->max_window=32;      /* Sanity check */
3641                         sk->mss=min(sk->max_window,sk->mtu);
3642                 }
3643         }
3644         
3645         /*
3646          * I make no guarantees about the first clause in the following
3647          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
3648          * what conditions "!flag" would be true.  However I think the rest
3649          * of the conditions would prevent that from causing any
3650          * unnecessary retransmission. 
3651          *   Clearly if the first packet has expired it should be 
3652          * retransmitted.  The other alternative, "flag&2 && retransmits", is
3653          * harder to explain:  You have to look carefully at how and when the
3654          * timer is set and with what timeout.  The most recent transmission always
3655          * sets the timer.  So in general if the most recent thing has timed
3656          * out, everything before it has as well.  So we want to go ahead and
3657          * retransmit some more.  If we didn't explicitly test for this
3658          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3659          * would not be true.  If you look at the pattern of timing, you can
3660          * show that rto is increased fast enough that the next packet would
3661          * almost never be retransmitted immediately.  Then you'd end up
3662          * waiting for a timeout to send each packet on the retransmission
3663          * queue.  With my implementation of the Karn sampling algorithm,
3664          * the timeout would double each time.  The net result is that it would
3665          * take a hideous amount of time to recover from a single dropped packet.
3666          * It's possible that there should also be a test for TIME_WRITE, but
3667          * I think as long as "send_head != NULL" and "retransmit" is on, we've
3668          * got to be in real retransmission mode.
3669          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
3670          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3671          * As long as no further losses occur, this seems reasonable.
3672          */
3673         
3674         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3675                (((flag&2) && sk->retransmits) ||
3676                (sk->send_head->when + sk->rto < jiffies))) 
3677         {
3678                 if(sk->send_head->when + sk->rto < jiffies)
3679                         tcp_retransmit(sk,0);   
3680                 else
3681                 {
3682                         tcp_do_retransmit(sk, 1);
3683                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3684                 }
3685         }
3686 
3687         return(1);
3688 }
3689 
3690 
3691 /*
3692  *      Process the FIN bit. This now behaves as it is supposed to work
3693  *      and the FIN takes effect when it is validly part of sequence
3694  *      space. Not before when we get holes.
3695  *
3696  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3697  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
3698  *      TIME-WAIT)
3699  *
3700  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
3701  *      close and we go into CLOSING (and later onto TIME-WAIT)
3702  *
3703  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3704  *
3705  */
3706  
3707 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
3708 {
3709         sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3710 
3711         if (!sk->dead) 
3712         {
3713                 sk->state_change(sk);
3714                 sock_wake_async(sk->socket, 1);
3715         }
3716 
3717         switch(sk->state) 
3718         {
3719                 case TCP_SYN_RECV:
3720                 case TCP_SYN_SENT:
3721                 case TCP_ESTABLISHED:
3722                         /*
3723                          * move to CLOSE_WAIT, tcp_data() already handled
3724                          * sending the ack.
3725                          */
3726                         tcp_set_state(sk,TCP_CLOSE_WAIT);
3727                         if (th->rst)
3728                                 sk->shutdown = SHUTDOWN_MASK;
3729                         break;
3730 
3731                 case TCP_CLOSE_WAIT:
3732                 case TCP_CLOSING:
3733                         /*
3734                          * received a retransmission of the FIN, do
3735                          * nothing.
3736                          */
3737                         break;
3738                 case TCP_TIME_WAIT:
3739                         /*
3740                          * received a retransmission of the FIN,
3741                          * restart the TIME_WAIT timer.
3742                          */
3743                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3744                         return(0);
3745                 case TCP_FIN_WAIT1:
3746                         /*
3747                          * This case occurs when a simultaneous close
3748                          * happens, we must ack the received FIN and
3749                          * enter the CLOSING state.
3750                          *
3751                          * This causes a WRITE timeout, which will either
3752                          * move on to TIME_WAIT when we timeout, or resend
3753                          * the FIN properly (maybe we get rid of that annoying
3754                          * FIN lost hang). The TIME_WRITE code is already correct
3755                          * for handling this timeout.
3756                          */
3757 
3758                         if(sk->ip_xmit_timeout != TIME_WRITE)
3759                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3760                         tcp_set_state(sk,TCP_CLOSING);
3761                         break;
3762                 case TCP_FIN_WAIT2:
3763                         /*
3764                          * received a FIN -- send ACK and enter TIME_WAIT
3765                          */
3766                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3767                         sk->shutdown|=SHUTDOWN_MASK;
3768                         tcp_set_state(sk,TCP_TIME_WAIT);
3769                         break;
3770                 case TCP_CLOSE:
3771                         /*
3772                          * already in CLOSE
3773                          */
3774                         break;
3775                 default:
3776                         tcp_set_state(sk,TCP_LAST_ACK);
3777         
3778                         /* Start the timers. */
3779                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3780                         return(0);
3781         }
3782 
3783         return(0);
3784 }
3785 
3786 
3787 
3788 /*
3789  *      This routine handles the data.  If there is room in the buffer,
3790  *      it will already have been moved into it.  If there is no
3791  *      room, then we will just have to discard the packet.
3792  */
3793 
3794 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /* [previous][next][first][last][top][bottom][index][help] */
3795          unsigned long saddr, unsigned short len)
3796 {
3797         struct sk_buff *skb1, *skb2;
3798         struct tcphdr *th;
3799         int dup_dumped=0;
3800         unsigned long new_seq;
3801         unsigned long shut_seq;
3802 
3803         th = skb->h.th;
3804         skb->len = len -(th->doff*4);
3805 
3806         /*
3807          *      The bytes in the receive read/assembly queue has increased. Needed for the
3808          *      low memory discard algorithm 
3809          */
3810            
3811         sk->bytes_rcv += skb->len;
3812         
3813         if (skb->len == 0 && !th->fin && !th->urg && !th->psh) 
3814         {
3815                 /* 
3816                  *      Don't want to keep passing ack's back and forth. 
3817                  *      (someone sent us dataless, boring frame)
3818                  */
3819                 if (!th->ack)
3820                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3821                 kfree_skb(skb, FREE_READ);
3822                 return(0);
3823         }
3824         
3825         /*
3826          *      We no longer have anyone receiving data on this connection.
3827          */
3828 
3829 #ifndef TCP_DONT_RST_SHUTDOWN            
3830 
3831         if(sk->shutdown & RCV_SHUTDOWN)
3832         {
3833                 /*
3834                  *      FIXME: BSD has some magic to avoid sending resets to
3835                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
3836                  *      BSD stacks still have broken keepalives so we want to
3837                  *      cope with it.
3838                  */
3839 
3840                 if(skb->len)    /* We don't care if its just an ack or
3841                                    a keepalive/window probe */
3842                 {
3843                         new_seq= th->seq + skb->len + th->syn;  /* Right edge of _data_ part of frame */
3844                         
3845                         /* Do this the way 4.4BSD treats it. Not what I'd
3846                            regard as the meaning of the spec but its what BSD
3847                            does and clearly they know everything 8) */
3848 
3849                         /*
3850                          *      This is valid because of two things
3851                          *
3852                          *      a) The way tcp_data behaves at the bottom.
3853                          *      b) A fin takes effect when read not when received.
3854                          */
3855                          
3856                         shut_seq=sk->acked_seq+1;       /* Last byte */
3857                         
3858                         if(after(new_seq,shut_seq))
3859                         {
3860                                 if(sk->debug)
3861                                         printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
3862                                                 sk, new_seq, shut_seq, sk->blog);
3863                                 if(sk->dead)
3864                                 {
3865                                         sk->acked_seq = new_seq + th->fin;
3866                                         tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3867                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3868                                         tcp_statistics.TcpEstabResets++;
3869                                         tcp_set_state(sk,TCP_CLOSE);
3870                                         sk->err = EPIPE;
3871                                         sk->shutdown = SHUTDOWN_MASK;
3872                                         kfree_skb(skb, FREE_READ);
3873                                         return 0;
3874                                 }
3875                         }
3876                 }
3877         }
3878 
3879 #endif
3880 
3881         /*
3882          *      Now we have to walk the chain, and figure out where this one
3883          *      goes into it.  This is set up so that the last packet we received
3884          *      will be the first one we look at, that way if everything comes
3885          *      in order, there will be no performance loss, and if they come
3886          *      out of order we will be able to fit things in nicely.
3887          *
3888          *      [AC: This is wrong. We should assume in order first and then walk
3889          *       forwards from the first hole based upon real traffic patterns.]
3890          *      
3891          */
3892 
3893         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
3894         {
3895                 skb_queue_head(&sk->receive_queue,skb);
3896                 skb1= NULL;
3897         } 
3898         else
3899         {
3900                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
3901                 {
3902                         if(sk->debug)
3903                         {
3904                                 printk("skb1=%p :", skb1);
3905                                 printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
3906                                 printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
3907                                 printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
3908                                                 sk->acked_seq);
3909                         }
3910                         
3911                         /*
3912                          *      Optimisation: Duplicate frame or extension of previous frame from
3913                          *      same sequence point (lost ack case).
3914                          *      The frame contains duplicate data or replaces a previous frame
3915                          *      discard the previous frame (safe as sk->inuse is set) and put
3916                          *      the new one in its place.
3917                          */
3918                          
3919                         if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3920                         {
3921                                 skb_append(skb1,skb);
3922                                 skb_unlink(skb1);
3923                                 kfree_skb(skb1,FREE_READ);
3924                                 dup_dumped=1;
3925                                 skb1=NULL;
3926                                 break;
3927                         }
3928                         
3929                         /*
3930                          *      Found where it fits
3931                          */
3932                          
3933                         if (after(th->seq+1, skb1->h.th->seq))
3934                         {
3935                                 skb_append(skb1,skb);
3936                                 break;
3937                         }
3938                         
3939                         /*
3940                          *      See if we've hit the start. If so insert.
3941                          */
3942                         if (skb1 == skb_peek(&sk->receive_queue))
3943                         {
3944                                 skb_queue_head(&sk->receive_queue, skb);
3945                                 break;
3946                         }
3947                 }
3948         }
3949 
3950         /*
3951          *      Figure out what the ack value for this frame is
3952          */
3953          
3954         th->ack_seq = th->seq + skb->len;
3955         if (th->syn) 
3956                 th->ack_seq++;
3957         if (th->fin)
3958                 th->ack_seq++;
3959 
3960         if (before(sk->acked_seq, sk->copied_seq)) 
3961         {
3962                 printk("*** tcp.c:tcp_data bug acked < copied\n");
3963                 sk->acked_seq = sk->copied_seq;
3964         }
3965 
3966         /*
3967          *      Now figure out if we can ack anything. This is very messy because we really want two
3968          *      receive queues, a completed and an assembly queue. We also want only one transmit
3969          *      queue.
3970          */
3971 
3972         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) 
3973         {
3974                 if (before(th->seq, sk->acked_seq+1)) 
3975                 {
3976                         int newwindow;
3977 
3978                         if (after(th->ack_seq, sk->acked_seq)) 
3979                         {
3980                                 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3981                                 if (newwindow < 0)
3982                                         newwindow = 0;  
3983                                 sk->window = newwindow;
3984                                 sk->acked_seq = th->ack_seq;
3985                         }
3986                         skb->acked = 1;
3987 
3988                         /*
3989                          *      When we ack the fin, we do the FIN 
3990                          *      processing.
3991                          */
3992 
3993                         if (skb->h.th->fin) 
3994                         {
3995                                 tcp_fin(skb,sk,skb->h.th);
3996                         }
3997           
3998                         for(skb2 = skb->next;
3999                             skb2 != (struct sk_buff *)&sk->receive_queue;
4000                             skb2 = skb2->next) 
4001                         {
4002                                 if (before(skb2->h.th->seq, sk->acked_seq+1)) 
4003                                 {
4004                                         if (after(skb2->h.th->ack_seq, sk->acked_seq))
4005                                         {
4006                                                 newwindow = sk->window -
4007                                                  (skb2->h.th->ack_seq - sk->acked_seq);
4008                                                 if (newwindow < 0)
4009                                                         newwindow = 0;  
4010                                                 sk->window = newwindow;
4011                                                 sk->acked_seq = skb2->h.th->ack_seq;
4012                                         }
4013                                         skb2->acked = 1;
4014                                         /*
4015                                          *      When we ack the fin, we do
4016                                          *      the fin handling.
4017                                          */
4018                                         if (skb2->h.th->fin) 
4019                                         {
4020                                                 tcp_fin(skb,sk,skb->h.th);
4021                                         }
4022 
4023                                         /*
4024                                          *      Force an immediate ack.
4025                                          */
4026                                          
4027                                         sk->ack_backlog = sk->max_ack_backlog;
4028                                 }
4029                                 else
4030                                 {
4031                                         break;
4032                                 }
4033                         }
4034 
4035                         /*
4036                          *      This also takes care of updating the window.
4037                          *      This if statement needs to be simplified.
4038                          */
4039                         if (!sk->delay_acks ||
4040                             sk->ack_backlog >= sk->max_ack_backlog || 
4041                             sk->bytes_rcv > sk->max_unacked || th->fin) {
4042         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4043                         }
4044                         else 
4045                         {
4046                                 sk->ack_backlog++;
4047                                 if(sk->debug)
4048                                         printk("Ack queued.\n");
4049                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4050                         }
4051                 }
4052         }
4053 
4054         /*
4055          *      If we've missed a packet, send an ack.
4056          *      Also start a timer to send another.
4057          */
4058          
4059         if (!skb->acked) 
4060         {
4061         
4062         /*
4063          *      This is important.  If we don't have much room left,
4064          *      we need to throw out a few packets so we have a good
4065          *      window.  Note that mtu is used, not mss, because mss is really
4066          *      for the send side.  He could be sending us stuff as large as mtu.
4067          */
4068                  
4069                 while (sk->prot->rspace(sk) < sk->mtu) 
4070                 {
4071                         skb1 = skb_peek(&sk->receive_queue);
4072                         if (skb1 == NULL) 
4073                         {
4074                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4075                                 break;
4076                         }
4077 
4078                         /*
4079                          *      Don't throw out something that has been acked. 
4080                          */
4081                  
4082                         if (skb1->acked) 
4083                         {
4084                                 break;
4085                         }
4086                 
4087                         skb_unlink(skb1);
4088                         kfree_skb(skb1, FREE_READ);
4089                 }
4090                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4091                 sk->ack_backlog++;
4092                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4093         }
4094         else
4095         {
4096                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4097         }
4098 
4099         /*
4100          *      Now tell the user we may have some data. 
4101          */
4102          
4103         if (!sk->dead) 
4104         {
4105                 if(sk->debug)
4106                         printk("Data wakeup.\n");
4107                 sk->data_ready(sk,0);
4108         } 
4109         return(0);
4110 }
4111 
4112 
4113 /*
4114  *      This routine is only called when we have urgent data
4115  *      signalled. Its the 'slow' part of tcp_urg. It could be
4116  *      moved inline now as tcp_urg is only called from one
4117  *      place. We handle URGent data wrong. We have to - as
4118  *      BSD still doesn't use the correction from RFC961.
4119  */
4120  
4121 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* [previous][next][first][last][top][bottom][index][help] */
4122 {
4123         unsigned long ptr = ntohs(th->urg_ptr);
4124 
4125         if (ptr)
4126                 ptr--;
4127         ptr += th->seq;
4128 
4129         /* ignore urgent data that we've already seen and read */
4130         if (after(sk->copied_seq, ptr))
4131                 return;
4132 
4133         /* do we already have a newer (or duplicate) urgent pointer? */
4134         if (sk->urg_data && !after(ptr, sk->urg_seq))
4135                 return;
4136 
4137         /* tell the world about our new urgent pointer */
4138         if (sk->proc != 0) {
4139                 if (sk->proc > 0) {
4140                         kill_proc(sk->proc, SIGURG, 1);
4141                 } else {
4142                         kill_pg(-sk->proc, SIGURG, 1);
4143                 }
4144         }
4145         sk->urg_data = URG_NOTYET;
4146         sk->urg_seq = ptr;
4147 }
4148 
4149 /*
4150  *      This is the 'fast' part of urgent handling.
4151  */
4152  
4153 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
     /* [previous][next][first][last][top][bottom][index][help] */
4154         unsigned long saddr, unsigned long len)
4155 {
4156         unsigned long ptr;
4157 
4158         /*
4159          *      Check if we get a new urgent pointer - normally not 
4160          */
4161          
4162         if (th->urg)
4163                 tcp_check_urg(sk,th);
4164 
4165         /*
4166          *      Do we wait for any urgent data? - normally not
4167          */
4168          
4169         if (sk->urg_data != URG_NOTYET)
4170                 return 0;
4171 
4172         /*
4173          *      Is the urgent pointer pointing into this packet? 
4174          */
4175          
4176         ptr = sk->urg_seq - th->seq + th->doff*4;
4177         if (ptr >= len)
4178                 return 0;
4179 
4180         /*
4181          *      Ok, got the correct packet, update info 
4182          */
4183          
4184         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4185         if (!sk->dead)
4186                 sk->data_ready(sk,0);
4187         return 0;
4188 }
4189 
4190 /*
4191  *      This will accept the next outstanding connection. 
4192  */
4193  
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;
  
	/*
	 *	Accept the next pending connection on a listening socket.
	 *	Returns the new (established) sock, or NULL with sk->err set
	 *	to EINVAL / EAGAIN / ERESTARTSYS on failure.
	 */

	if (sk->state != TCP_LISTEN) 
	{
		sk->err = EINVAL;
		return(NULL); 
	}

	/*
	 *	Avoid the race: disable interrupts while we mark the socket
	 *	in use, so the bottom half cannot slip a frame past us here.
	 */
	cli();
	sk->inuse = 1;

	/*
	 *	Wait until an established connection is queued.
	 *	NOTE(review): on both early-return paths below sk->err is
	 *	written after release_sock() - presumably benign in this
	 *	kernel's locking model, but worth confirming.
	 */
	while((skb = tcp_dequeue_established(sk)) == NULL) 
	{
		if (flags & O_NONBLOCK) 
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Sleep until a connection request wakes us, then retry. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked) 
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		/* Re-mark the socket busy before re-checking the queue. */
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk.  The skb itself was
	 *	only the carrier for the new socket.
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4247 
4248 
4249 /*
4250  *      This will initiate an outgoing connection. 
4251  */
4252  
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	/*
	 *	Initiate an active open: validate the address, pick an
	 *	initial sequence number, build and transmit the SYN (with
	 *	an MSS option) and move the socket to SYN_SENT.
	 */

	if (sk->state != TCP_CLOSE) 
	{
		return(-EISCONN);
	}
	
	/* NOTE(review): 8 bytes covers family/port/address only, not the
	   full sockaddr_in - presumably a deliberate minimal check. */
	if (addr_len < 8) 
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET) 
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */
	
	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();
		  
	/*
	 *	Don't want a TCP connection going to a broadcast address 
	 *	(or a multicast one).
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
		return -ENETUNREACH;
  
	/* Set up the sequence space before we can receive anything. */
	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	/* Drop the lock: wmalloc() with GFP_KERNEL may sleep. */
	release_sock(sk);

	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL) 
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->len = 24;		/* 20 byte TCP header + 4 byte MSS option */
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;
	
	t1 = (struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff. 
	 */
	 
	rt=ip_rt_route(sk->daddr, NULL, NULL);
	

	/*
	 *	We need to build the routing stuff from the things saved in skb. 
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0) 
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);

	/* Start from the template header, then fill in the SYN specifics. */
	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	/* NOTE(review): window is stored without htons() here - looks like
	   it relies on 2 being a harmless tiny value either way; confirm. */
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;		/* 24 bytes: header plus the MSS option */
	/* use 512 or whatever user asked for */
	
	/* Clamp our advertised window if the route asks for it. */
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/*
	 *	MSS selection: explicit user setting wins, then a per-route
	 *	MSS, otherwise guess from whether the peer is on our subnet.
	 */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
		sk->mtu = rt->rt_mss;
	else 
	{
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - HEADER_SIZE;
		else
			sk->mtu = MAX_WINDOW;
	}
	/*
	 *	but not bigger than device MTU 
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */
		
	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
	
	/*
	 *	Put in the TCP options to say MTU. 
	 */

	ptr = (unsigned char *)(t1+1);
	ptr[0] = 2;		/* kind: MSS */
	ptr[1] = 4;		/* option length */
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will get reset. 
	 */

	tcp_set_state(sk,TCP_SYN_SENT);
	sk->rto = TCP_TIMEOUT_INIT;
#if 0 /* we already did this */
	init_timer(&sk->retransmit_timer); 
#endif
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	/* NOTE(review): this timer is re-armed identically just after
	   queue_xmit below, so one of the two calls is redundant. */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = TCP_SYN_RETRIES;

	sk->prot->queue_xmit(sk, dev, buff, 0);  
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;
  
	release_sock(sk);
	return(0);
}
4412 
4413 
4414 /* This functions checks to see if the tcp header is actually acceptable. */
4415 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /* [previous][next][first][last][top][bottom][index][help] */
4416              struct options *opt, unsigned long saddr, struct device *dev)
4417 {
4418         unsigned long next_seq;
4419 
4420         next_seq = len - 4*th->doff;
4421         if (th->fin)
4422                 next_seq++;
4423         /* if we have a zero window, we can't have any data in the packet.. */
4424         if (next_seq && !sk->window)
4425                 goto ignore_it;
4426         next_seq += th->seq;
4427 
4428         /*
4429          * This isn't quite right.  sk->acked_seq could be more recent
4430          * than sk->window.  This is however close enough.  We will accept
4431          * slightly more packets than we should, but it should not cause
4432          * problems unless someone is trying to forge packets.
4433          */
4434 
4435         /* have we already seen all of this packet? */
4436         if (!after(next_seq+1, sk->acked_seq))
4437                 goto ignore_it;
4438         /* or does it start beyond the window? */
4439         if (!before(th->seq, sk->acked_seq + sk->window + 1))
4440                 goto ignore_it;
4441 
4442         /* ok, at least part of this packet would seem interesting.. */
4443         return 1;
4444 
4445 ignore_it:
4446         if (th->rst)
4447                 return 0;
4448 
4449         /*
4450          *      Send a reset if we get something not ours and we are
4451          *      unsynchronized. Note: We don't do anything to our end. We
4452          *      are just killing the bogus remote connection then we will
4453          *      connect again and it will work (with luck).
4454          */
4455          
4456         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
4457         {
4458                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4459                 return 1;
4460         }
4461 
4462         /* Try to resync things. */
4463         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4464         return 0;
4465 }
4466 
4467 /*
4468  *      When we get a reset we do this.
4469  */
4470 
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Map the connection state onto the error the user will see. */
	switch (sk->state) {
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}
#ifdef TCP_DO_RFC1337		
	/*
	 *	Time wait assassination protection [RFC1337]:
	 *	never let a stray reset kill a TIME-WAIT socket.
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{	
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else	
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif	
	/* Wake anyone sleeping on this socket, drop the frame, unlock. */
	if (!sk->dead) 
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
4498 
4499 /*
4500  *      A TCP packet has arrived.
4501  */
4502  
4503 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /* [previous][next][first][last][top][bottom][index][help] */
4504         unsigned long daddr, unsigned short len,
4505         unsigned long saddr, int redo, struct inet_protocol * protocol)
4506 {
4507         struct tcphdr *th;
4508         struct sock *sk;
4509         int syn_ok=0;
4510         
4511         if (!skb) 
4512         {
4513                 printk("IMPOSSIBLE 1\n");
4514                 return(0);
4515         }
4516 
4517         if (!dev) 
4518         {
4519                 printk("IMPOSSIBLE 2\n");
4520                 return(0);
4521         }
4522   
4523         tcp_statistics.TcpInSegs++;
4524   
4525         if(skb->pkt_type!=PACKET_HOST)
4526         {
4527                 kfree_skb(skb,FREE_READ);
4528                 return(0);
4529         }
4530   
4531         th = skb->h.th;
4532 
4533         /*
4534          *      Find the socket.
4535          */
4536 
4537         sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4538 
4539         /*
4540          *      If this socket has got a reset its to all intents and purposes 
4541          *      really dead. Count closed sockets as dead.
4542          *
4543          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4544          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
4545          *      exist so should cause resets as if the port was unreachable.
4546          */
4547          
4548         if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4549                 sk=NULL;
4550 
4551         if (!redo) 
4552         {
4553                 if (tcp_check(th, len, saddr, daddr )) 
4554                 {
4555                         skb->sk = NULL;
4556                         kfree_skb(skb,FREE_READ);
4557                         /*
4558                          *      We don't release the socket because it was
4559                          *      never marked in use.
4560                          */
4561                         return(0);
4562                 }
4563                 th->seq = ntohl(th->seq);
4564 
4565                 /* See if we know about the socket. */
4566                 if (sk == NULL) 
4567                 {
4568                         /*
4569                          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4570                          */
4571                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4572                         skb->sk = NULL;
4573                         /*
4574                          *      Discard frame
4575                          */
4576                         kfree_skb(skb, FREE_READ);
4577                         return(0);
4578                 }
4579 
4580                 skb->len = len;
4581                 skb->acked = 0;
4582                 skb->used = 0;
4583                 skb->free = 0;
4584                 skb->saddr = daddr;
4585                 skb->daddr = saddr;
4586         
4587                 /* We may need to add it to the backlog here. */
4588                 cli();
4589                 if (sk->inuse) 
4590                 {
4591                         skb_queue_tail(&sk->back_log, skb);
4592                         sti();
4593                         return(0);
4594                 }
4595                 sk->inuse = 1;
4596                 sti();
4597         }
4598         else
4599         {
4600                 if (sk==NULL) 
4601                 {
4602                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4603                         skb->sk = NULL;
4604                         kfree_skb(skb, FREE_READ);
4605                         return(0);
4606                 }
4607         }
4608 
4609 
4610         if (!sk->prot) 
4611         {
4612                 printk("IMPOSSIBLE 3\n");
4613                 return(0);
4614         }
4615 
4616 
4617         /*
4618          *      Charge the memory to the socket. 
4619          */
4620          
4621         if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf) 
4622         {
4623                 kfree_skb(skb, FREE_READ);
4624                 release_sock(sk);
4625                 return(0);
4626         }
4627 
4628         skb->sk=sk;
4629         sk->rmem_alloc += skb->mem_len;
4630 
4631         /*
4632          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4633          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4634          *      compatibility. We also set up variables more thoroughly [Karn notes in the
4635          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4636          */
4637 
4638         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
4639         {
4640         
4641                 /*
4642                  *      Now deal with unusual cases.
4643                  */
4644          
4645                 if(sk->state==TCP_LISTEN)
4646                 {
4647                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
4648                                 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4649 
4650                         /*
4651                          *      We don't care for RST, and non SYN are absorbed (old segments)
4652                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
4653                          *      netmask on a running connection it can go broadcast. Even Sun's have
4654                          *      this problem so I'm ignoring it 
4655                          */
4656                            
4657                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4658                         {
4659                                 kfree_skb(skb, FREE_READ);
4660                                 release_sock(sk);
4661                                 return 0;
4662                         }
4663                 
4664                         /*      
4665                          *      Guess we need to make a new socket up 
4666                          */
4667                 
4668                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4669                 
4670                         /*
4671                          *      Now we have several options: In theory there is nothing else
4672                          *      in the frame. KA9Q has an option to send data with the syn,
4673                          *      BSD accepts data with the syn up to the [to be] advertised window
4674                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
4675                          *      it, that fits the spec precisely and avoids incompatibilities. It
4676                          *      would be nice in future to drop through and process the data.
4677                          */
4678                          
4679                         release_sock(sk);
4680                         return 0;
4681                 }
4682         
4683                 /* retransmitted SYN? */
4684                 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4685                 {
4686                         kfree_skb(skb, FREE_READ);
4687                         release_sock(sk);
4688                         return 0;
4689                 }
4690                 
4691                 /*
4692                  *      SYN sent means we have to look for a suitable ack and either reset
4693                  *      for bad matches or go to connected 
4694                  */
4695            
4696                 if(sk->state==TCP_SYN_SENT)
4697                 {
4698                         /* Crossed SYN or previous junk segment */
4699                         if(th->ack)
4700                         {
4701                                 /* We got an ack, but its not a good ack */
4702                                 if(!tcp_ack(sk,th,saddr,len))
4703                                 {
4704                                         /* Reset the ack - its an ack from a 
4705                                            different connection  [ th->rst is checked in tcp_reset()] */
4706                                         tcp_statistics.TcpAttemptFails++;
4707                                         tcp_reset(daddr, saddr, th,
4708                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4709                                         kfree_skb(skb, FREE_READ);
4710                                         release_sock(sk);
4711                                         return(0);
4712                                 }
4713                                 if(th->rst)
4714                                         return tcp_std_reset(sk,skb);
4715                                 if(!th->syn)
4716                                 {
4717                                         /* A valid ack from a different connection
4718                                            start. Shouldn't happen but cover it */
4719                                         kfree_skb(skb, FREE_READ);
4720                                         release_sock(sk);
4721                                         return 0;
4722                                 }
4723                                 /*
4724                                  *      Ok.. its good. Set up sequence numbers and
4725                                  *      move to established.
4726                                  */
4727                                 syn_ok=1;       /* Don't reset this connection for the syn */
4728                                 sk->acked_seq=th->seq+1;
4729                                 sk->fin_seq=th->seq;
4730                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4731                                 tcp_set_state(sk, TCP_ESTABLISHED);
4732                                 tcp_options(sk,th);
4733                                 sk->dummy_th.dest=th->source;
4734                                 sk->copied_seq = sk->acked_seq;
4735                                 if(!sk->dead)
4736                                 {
4737                                         sk->state_change(sk);
4738                                         sock_wake_async(sk->socket, 0);
4739                                 }
4740                                 if(sk->max_window==0)
4741                                 {
4742                                         sk->max_window = 32;
4743                                         sk->mss = min(sk->max_window, sk->mtu);
4744                                 }
4745                         }
4746                         else
4747                         {
4748                                 /* See if SYN's cross. Drop if boring */
4749                                 if(th->syn && !th->rst)
4750                                 {
4751                                         /* Crossed SYN's are fine - but talking to
4752                                            yourself is right out... */
4753                                         if(sk->saddr==saddr && sk->daddr==daddr &&
4754                                                 sk->dummy_th.source==th->source &&
4755                                                 sk->dummy_th.dest==th->dest)
4756                                         {
4757                                                 tcp_statistics.TcpAttemptFails++;
4758                                                 return tcp_std_reset(sk,skb);
4759                                         }
4760                                         tcp_set_state(sk,TCP_SYN_RECV);
4761                                         
4762                                         /*
4763                                          *      FIXME:
4764                                          *      Must send SYN|ACK here
4765                                          */
4766                                 }               
4767                                 /* Discard junk segment */
4768                                 kfree_skb(skb, FREE_READ);
4769                                 release_sock(sk);
4770                                 return 0;
4771                         }
4772                         /*
4773                          *      SYN_RECV with data maybe.. drop through
4774                          */
4775                         goto rfc_step6;
4776                 }
4777 
4778         /*
4779          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
4780          *      a more complex suggestion for fixing these reuse issues in RFC1644
4781          *      but not yet ready for general use. Also see RFC1379.
4782          */
4783         
4784 #define BSD_TIME_WAIT
4785 #ifdef BSD_TIME_WAIT
4786                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
4787                         after(th->seq, sk->acked_seq) && !th->rst)
4788                 {
4789                         long seq=sk->write_seq;
4790                         if(sk->debug)
4791                                 printk("Doing a BSD time wait\n");
4792                         tcp_statistics.TcpEstabResets++;           
4793                         sk->rmem_alloc -= skb->mem_len;
4794                         skb->sk = NULL;
4795                         sk->err=ECONNRESET;
4796                         tcp_set_state(sk, TCP_CLOSE);
4797                         sk->shutdown = SHUTDOWN_MASK;
4798                         release_sock(sk);
4799                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4800                         if (sk && sk->state==TCP_LISTEN)
4801                         {
4802                                 sk->inuse=1;
4803                                 skb->sk = sk;
4804                                 sk->rmem_alloc += skb->mem_len;
4805                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
4806                                 release_sock(sk);
4807                                 return 0;
4808                         }
4809                         kfree_skb(skb, FREE_READ);
4810                         return 0;
4811                 }
4812 #endif  
4813         }
4814 
4815         /*
4816          *      We are now in normal data flow (see the step list in the RFC)
4817          *      Note most of these are inline now. I'll inline the lot when
4818          *      I have time to test it hard and look at what gcc outputs 
4819          */
4820         
4821         if(!tcp_sequence(sk,th,len,opt,saddr,dev))
4822         {
4823                 kfree_skb(skb, FREE_READ);
4824                 release_sock(sk);
4825                 return 0;
4826         }
4827 
4828         if(th->rst)
4829                 return tcp_std_reset(sk,skb);
4830         
4831         /*
4832          *      !syn_ok is effectively the state test in RFC793.
4833          */
4834          
4835         if(th->syn && !syn_ok)
4836         {
4837                 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
4838                 return tcp_std_reset(sk,skb);   
4839         }
4840 
4841         /*
4842          *      Process the ACK
4843          */
4844          
4845 
4846         if(th->ack && !tcp_ack(sk,th,saddr,len))
4847         {
4848                 /*
4849                  *      Our three way handshake failed.
4850                  */
4851                  
4852                 if(sk->state==TCP_SYN_RECV)
4853                 {
4854                         tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
4855                 }
4856                 kfree_skb(skb, FREE_READ);
4857                 release_sock(sk);
4858                 return 0;
4859         }
4860         
4861 rfc_step6:              /* I'll clean this up later */
4862 
4863         /*
4864          *      Process urgent data
4865          */
4866                 
4867         if(tcp_urg(sk, th, saddr, len))
4868         {
4869                 kfree_skb(skb, FREE_READ);
4870                 release_sock(sk);
4871                 return 0;
4872         }
4873         
4874         
4875         /*
4876          *      Process the encapsulated data
4877          */
4878         
4879         if(tcp_data(skb,sk, saddr, len))
4880         {
4881                 kfree_skb(skb, FREE_READ);
4882                 release_sock(sk);
4883                 return 0;
4884         }
4885 
4886         /*
4887          *      And done
4888          */     
4889         
4890         release_sock(sk);
4891         return 0;
4892 }
4893 
4894 /*
4895  *      This routine sends a packet with an out of date sequence
4896  *      number. It assumes the other end will try to ack it.
4897  */
4898 
4899 static void tcp_write_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
4900 {
4901         struct sk_buff *buff;
4902         struct tcphdr *t1;
4903         struct device *dev=NULL;
4904         int tmp;
4905 
4906         if (sk->zapped)
4907                 return; /* After a valid reset we can send no more */
4908 
4909         /*
4910          *      Write data can still be transmitted/retransmitted in the
4911          *      following states.  If any other state is encountered, return.
4912          *      [listen/close will never occur here anyway]
4913          */
4914 
4915         if (sk->state != TCP_ESTABLISHED && 
4916             sk->state != TCP_CLOSE_WAIT &&
4917             sk->state != TCP_FIN_WAIT1 && 
4918             sk->state != TCP_LAST_ACK &&
4919             sk->state != TCP_CLOSING
4920         ) 
4921         {
4922                 return;
4923         }
4924 
4925         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
4926         if (buff == NULL) 
4927                 return;
4928 
4929         buff->len = sizeof(struct tcphdr);
4930         buff->free = 1;
4931         buff->sk = sk;
4932         buff->localroute = sk->localroute;
4933 
4934         t1 = (struct tcphdr *) buff->data;
4935 
4936         /* Put in the IP header and routing stuff. */
4937         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4938                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
4939         if (tmp < 0) 
4940         {
4941                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4942                 return;
4943         }
4944 
4945         buff->len += tmp;
4946         t1 = (struct tcphdr *)((char *)t1 +tmp);
4947 
4948         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
4949 
4950         /*
4951          *      Use a previous sequence.
4952          *      This should cause the other end to send an ack.
4953          */
4954          
4955         t1->seq = htonl(sk->sent_seq-1);
4956         t1->ack = 1; 
4957         t1->res1= 0;
4958         t1->res2= 0;
4959         t1->rst = 0;
4960         t1->urg = 0;
4961         t1->psh = 0;
4962         t1->fin = 0;    /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
4963         t1->syn = 0;
4964         t1->ack_seq = ntohl(sk->acked_seq);
4965         t1->window = ntohs(tcp_select_window(sk));
4966         t1->doff = sizeof(*t1)/4;
4967         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
4968          /*
4969           *     Send it and free it.
4970           *     This will prevent the timer from automatically being restarted.
4971           */
4972         sk->prot->queue_xmit(sk, dev, buff, 1);
4973         tcp_statistics.TcpOutSegs++;
4974 }
4975 
4976 /*
4977  *      A window probe timeout has occurred.
4978  */
4979 
4980 void tcp_send_probe0(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
4981 {
4982         if (sk->zapped)
4983                 return;         /* After a valid reset we can send no more */
4984 
4985         tcp_write_wakeup(sk);
4986 
4987         sk->backoff++;
4988         sk->rto = min(sk->rto << 1, 120*HZ);
4989         reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
4990         sk->retransmits++;
4991         sk->prot->retransmits ++;
4992 }
4993 
4994 /*
4995  *      Socket option code for TCP. 
4996  */
4997   
4998 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
4999 {
5000         int val,err;
5001 
5002         if(level!=SOL_TCP)
5003                 return ip_setsockopt(sk,level,optname,optval,optlen);
5004 
5005         if (optval == NULL) 
5006                 return(-EINVAL);
5007 
5008         err=verify_area(VERIFY_READ, optval, sizeof(int));
5009         if(err)
5010                 return err;
5011         
5012         val = get_fs_long((unsigned long *)optval);
5013 
5014         switch(optname)
5015         {
5016                 case TCP_MAXSEG:
5017 /*
5018  * values greater than interface MTU won't take effect.  however at
5019  * the point when this call is done we typically don't yet know
5020  * which interface is going to be used
5021  */
5022                         if(val<1||val>MAX_WINDOW)
5023                                 return -EINVAL;
5024                         sk->user_mss=val;
5025                         return 0;
5026                 case TCP_NODELAY:
5027                         sk->nonagle=(val==0)?0:1;
5028                         return 0;
5029                 default:
5030                         return(-ENOPROTOOPT);
5031         }
5032 }
5033 
5034 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5035 {
5036         int val,err;
5037 
5038         if(level!=SOL_TCP)
5039                 return ip_getsockopt(sk,level,optname,optval,optlen);
5040                         
5041         switch(optname)
5042         {
5043                 case TCP_MAXSEG:
5044                         val=sk->user_mss;
5045                         break;
5046                 case TCP_NODELAY:
5047                         val=sk->nonagle;
5048                         break;
5049                 default:
5050                         return(-ENOPROTOOPT);
5051         }
5052         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5053         if(err)
5054                 return err;
5055         put_fs_long(sizeof(int),(unsigned long *) optlen);
5056 
5057         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5058         if(err)
5059                 return err;
5060         put_fs_long(val,(unsigned long *)optval);
5061 
5062         return(0);
5063 }       
5064 
5065 
/*
 *      The TCP protocol operations table registered with the INET layer.
 *      Entries are positional and must match the field order of struct
 *      proto (declared elsewhere); the function pointers are the TCP
 *      handlers defined above in this file, plus the generic sock_* memory
 *      helpers and the IP-layer header/transmit routines.
 */
struct proto tcp_prot = {
        sock_wmalloc,
        sock_rmalloc,
        sock_wfree,
        sock_rfree,
        sock_rspace,
        sock_wspace,
        tcp_close,
        tcp_read,
        tcp_write,
        tcp_sendto,
        tcp_recvfrom,
        ip_build_header,
        tcp_connect,
        tcp_accept,
        ip_queue_xmit,
        tcp_retransmit,
        tcp_write_wakeup,
        tcp_read_wakeup,
        tcp_rcv,
        tcp_select,
        tcp_ioctl,
        NULL,                   /* this slot unused by TCP */
        tcp_shutdown,
        tcp_setsockopt,
        tcp_getsockopt,
        128,                    /* NOTE(review): numeric slot - confirm meaning against struct proto */
        0,                      /* NOTE(review): likewise - verify field name in struct proto */
        {NULL,},
        "TCP",                  /* protocol name */
        0, 0
};

/* [previous][next][first][last][top][bottom][index][help] */