root/net/inet/tcp.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. min
  2. tcp_set_state
  3. tcp_select_window
  4. tcp_find_established
  5. tcp_dequeue_established
  6. tcp_close_pending
  7. tcp_time_wait
  8. tcp_do_retransmit
  9. reset_xmit_timer
  10. tcp_retransmit_time
  11. tcp_retransmit
  12. tcp_write_timeout
  13. retransmit_timer
  14. tcp_err
  15. tcp_readable
  16. tcp_listen_select
  17. tcp_select
  18. tcp_ioctl
  19. tcp_check
  20. tcp_send_check
  21. tcp_send_skb
  22. tcp_dequeue_partial
  23. tcp_send_partial
  24. tcp_enqueue_partial
  25. tcp_send_ack
  26. tcp_build_header
  27. tcp_write
  28. tcp_sendto
  29. tcp_read_wakeup
  30. cleanup_rbuf
  31. tcp_read_urg
  32. tcp_read
  33. tcp_close_state
  34. tcp_send_fin
  35. tcp_shutdown
  36. tcp_recvfrom
  37. tcp_reset
  38. tcp_options
  39. default_mask
  40. tcp_init_seq
  41. tcp_conn_request
  42. tcp_close
  43. tcp_write_xmit
  44. tcp_ack
  45. tcp_fin
  46. tcp_data
  47. tcp_check_urg
  48. tcp_urg
  49. tcp_accept
  50. tcp_connect
  51. tcp_sequence
  52. tcp_std_reset
  53. tcp_rcv
  54. tcp_write_wakeup
  55. tcp_send_probe0
  56. tcp_setsockopt
  57. tcp_getsockopt

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@no.unit.nvg>
  20  *
  21  * Fixes:       
  22  *              Alan Cox        :       Numerous verify_area() calls
  23  *              Alan Cox        :       Set the ACK bit on a reset
  24  *              Alan Cox        :       Stopped it crashing if it closed while sk->inuse=1
  25  *                                      and was trying to connect (tcp_err()).
  26  *              Alan Cox        :       All icmp error handling was broken
  27  *                                      pointers passed where wrong and the
  28  *                                      socket was looked up backwards. Nobody
  29  *                                      tested any icmp error code obviously.
  30  *              Alan Cox        :       tcp_err() now handled properly. It wakes people
  31  *                                      on errors. select behaves and the icmp error race
  32  *                                      has gone by moving it into sock.c
  33  *              Alan Cox        :       tcp_reset() fixed to work for everything not just
  34  *                                      packets for unknown sockets.
  35  *              Alan Cox        :       tcp option processing.
  36  *              Alan Cox        :       Reset tweaked (still not 100%) [Had syn rule wrong]
  37  *              Herp Rosmanith  :       More reset fixes
  38  *              Alan Cox        :       No longer acks invalid rst frames. Acking
  39  *                                      any kind of RST is right out.
  40  *              Alan Cox        :       Sets an ignore me flag on an rst receive
  41  *                                      otherwise odd bits of prattle escape still
  42  *              Alan Cox        :       Fixed another acking RST frame bug. Should stop
  43  *                                      LAN workplace lockups.
  44  *              Alan Cox        :       Some tidyups using the new skb list facilities
  45  *              Alan Cox        :       sk->keepopen now seems to work
  46  *              Alan Cox        :       Pulls options out correctly on accepts
  47  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  48  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a bit to skb ops.
  49  *              Alan Cox        :       Tidied tcp_data to avoid a potential nasty.
  50  *              Alan Cox        :       Added some better commenting, as the tcp is hard to follow
  51  *              Alan Cox        :       Removed incorrect check for 20 * psh
  52  *      Michael O'Reilly        :       ack < copied bug fix.
  53  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  54  *              Alan Cox        :       FIN with no memory -> CRASH
  55  *              Alan Cox        :       Added socket option proto entries. Also added awareness of them to accept.
  56  *              Alan Cox        :       Added TCP options (SOL_TCP)
  57  *              Alan Cox        :       Switched wakeup calls to callbacks, so the kernel can layer network sockets.
  58  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  59  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  60  *              Alan Cox        :       RST frames sent on unsynchronised state ack error/
  61  *              Alan Cox        :       Put in missing check for SYN bit.
  62  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  63  *                                      window non shrink trick.
  64  *              Alan Cox        :       Added a couple of small NET2E timer fixes
  65  *              Charles Hedrick :       TCP fixes
  66  *              Toomas Tamm     :       TCP window fixes
  67  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  68  *              Charles Hedrick :       Rewrote most of it to actually work
  69  *              Linus           :       Rewrote tcp_read() and URG handling
  70  *                                      completely
  71  *              Gerhard Koerting:       Fixed some missing timer handling
  72  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  73  *              Gerhard Koerting:       PC/TCP workarounds
  74  *              Adam Caldwell   :       Assorted timer/timing errors
  75  *              Matthew Dillon  :       Fixed another RST bug
  76  *              Alan Cox        :       Move to kernel side addressing changes.
  77  *              Alan Cox        :       Beginning work on TCP fastpathing (not yet usable)
  78  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  79  *              Alan Cox        :       TCP fast path debugging
  80  *              Alan Cox        :       Window clamping
  81  *              Michael Riepe   :       Bug in tcp_check()
  82  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  83  *              Matt Dillon     :       Yet more small nasties remove from the TCP code
  84  *                                      (Be very nice to this man if tcp finally works 100%) 8)
  85  *              Alan Cox        :       BSD accept semantics. 
  86  *              Alan Cox        :       Reset on closedown bug.
  87  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
  88  *              Michael Pall    :       Handle select() after URG properly in all cases.
  89  *              Michael Pall    :       Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin).
  90  *              Michael Pall    :       Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now.
  91  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the BSD api.
  92  *              Alan Cox        :       Changed the semantics of sk->socket to 
  93  *                                      fix a race and a signal problem with
  94  *                                      accept() and async I/O.
  95  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
  96  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
  97  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
  98  *                                      clients/servers which listen in on
  99  *                                      fixed ports.
 100  *              Alan Cox        :       Cleaned the above up and shrank it to
 101  *                                      a sensible code size.
 102  *              Alan Cox        :       Self connect lockup fix.
 103  *              Alan Cox        :       No connect to multicast.
 104  *              Ross Biro       :       Close unaccepted children on master
 105  *                                      socket close.
 106  *              Alan Cox        :       Reset tracing code.
 107  *              Alan Cox        :       Spurious resets on shutdown.
 108  *              Alan Cox        :       Giant 15 minute/60 second timer error
 109  *              Alan Cox        :       Small whoops in selecting before an accept.
 110  *              Alan Cox        :       Kept the state trace facility since it's
 111  *                                      handy for debugging.
 112  *              Alan Cox        :       More reset handler fixes.
 113  *              Alan Cox        :       Started rewriting the code based on the RFC's
 114  *                                      for other useful protocol references see:  
 115  *                                      Comer, KA9Q NOS, and for a reference on the
 116  *                                      difference between specifications and how BSD
 117  *                                      works see the 4.4lite source.
 118  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 119  *                                      close.
 120  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 121  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 122  *              Alan Cox        :       Reimplemented timers as per the RFC and using multiple
 123  *                                      timers for sanity. 
 124  *              Alan Cox        :       Small bug fixes, and a lot of new
 125  *                                      comments.
 126  *              Alan Cox        :       Fixed dual reader crash by locking
 127  *                                      the buffers (much like datagram.c)
 128  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 129  *                                      now gets fed up of retrying without
 130  *                                      (even a no space) answer.
 131  *              Alan Cox        :       Extracted closing code better
 132  *              Alan Cox        :       Fixed the closing state machine to
 133  *                                      resemble the RFC.
 134  *              Alan Cox        :       More 'per spec' fixes.
 135  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 136  *                                      only frames. At least one pc tcp stack
 137  *                                      generates them.
 138  *
 139  *
 140  * To Fix:
 141  *              Fast path the code. Two things here - fix the window calculation
 142  *              so it doesn't iterate over the queue, also spot packets with no funny
 143  *              options arriving in order and process directly.
 144  *
 145  *              Implement RFC 1191 [Path MTU discovery]
 146  *              Look at the effect of implementing RFC 1337 suggestions and their impact.
 147  *              Rewrite output state machine to use a single queue and do low window
 148  *              situations as per the spec (RFC 1122)
 149  *              Speed up input assembly algorithm.
 150  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 151  *              could do with it working on IPv4
 152  *              User settable/learned rtt/max window/mtu
 153  *              Cope with MTU/device switches when retransmitting in tcp.
 154  *              Fix the window handling to use PR's new code.
 155  *
 156  *              Change the fundamental structure to a single send queue maintained
 157  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 158  *              active routes too]). Cut the queue off in tcp_retransmit/
 159  *              tcp_transmit.
 160  *              Change the receive queue to assemble as it goes. This lets us
 161  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 162  *              tcp_data/tcp_read as well as the window shrink crud.
 163  *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 164  *              tcp_queue_skb seem obvious routines to extract.
 165  *      
 166  *              This program is free software; you can redistribute it and/or
 167  *              modify it under the terms of the GNU General Public License
 168  *              as published by the Free Software Foundation; either version
 169  *              2 of the License, or(at your option) any later version.
 170  *
 171  * Description of States:
 172  *
 173  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 174  *
 175  *      TCP_SYN_RECV            received a connection request, sent ack,
 176  *                              waiting for final ack in three-way handshake.
 177  *
 178  *      TCP_ESTABLISHED         connection established
 179  *
 180  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 181  *                              transmission of remaining buffered data
 182  *
 183  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 184  *                              to shutdown
 185  *
 186  *      TCP_CLOSING             both sides have shutdown but we still have
 187  *                              data we have to finish sending
 188  *
 189  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 190  *                              closed, can only be entered from FIN_WAIT2
 191  *                              or CLOSING.  Required because the other end
 192  *                              may not have gotten our last ACK causing it
 193  *                              to retransmit the data packet (which we ignore)
 194  *
 195  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 196  *                              us to finish writing our data and to shutdown
 197  *                              (we have to close() to move on to LAST_ACK)
 198  *
  199  *      TCP_LAST_ACK            our side has shutdown after remote has
 200  *                              shutdown.  There may still be data in our
 201  *                              buffer that we have to finish sending
 202  *              
 203  *      TCP_CLOSE               socket is finished
 204  */
 205 
 206 #include <linux/types.h>
 207 #include <linux/sched.h>
 208 #include <linux/mm.h>
 209 #include <linux/time.h>
 210 #include <linux/string.h>
 211 #include <linux/config.h>
 212 #include <linux/socket.h>
 213 #include <linux/sockios.h>
 214 #include <linux/termios.h>
 215 #include <linux/in.h>
 216 #include <linux/fcntl.h>
 217 #include <linux/inet.h>
 218 #include <linux/netdevice.h>
 219 #include "snmp.h"
 220 #include "ip.h"
 221 #include "protocol.h"
 222 #include "icmp.h"
 223 #include "tcp.h"
 224 #include "arp.h"
 225 #include <linux/skbuff.h>
 226 #include "sock.h"
 227 #include "route.h"
 228 #include <linux/errno.h>
 229 #include <linux/timer.h>
 230 #include <asm/system.h>
 231 #include <asm/segment.h>
 232 #include <linux/mm.h>
 233 
 234 /*
 235  *      The MSL timer is the 'normal' timer.
 236  */
 237  
 238 #define reset_msl_timer(x,y,z)  reset_timer(x,y,z)
 239 
 240 #define SEQ_TICK 3
 241 unsigned long seq_offset;
 242 struct tcp_mib  tcp_statistics;
 243 
 244 static void tcp_close(struct sock *sk, int timeout);
 245 
 246 
 247 /*
 248  *      The less said about this the better, but it works and will do for 1.2 
 249  */
 250 
 251 static struct wait_queue *master_select_wakeup;
 252 
 253 static __inline__ int min(unsigned int a, unsigned int b)
     /* [previous][next][first][last][top][bottom][index][help] */
 254 {
 255         if (a < b) 
 256                 return(a);
 257         return(b);
 258 }
 259 
 260 #undef STATE_TRACE
 261 
 262 #ifdef STATE_TRACE
 263 static char *statename[]={
 264         "Unused","Established","Syn Sent","Syn Recv",
 265         "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
 266         "Close Wait","Last ACK","Listen","Closing"
 267 };
 268 #endif
 269 
 270 static __inline__ void tcp_set_state(struct sock *sk, int state)
     /* [previous][next][first][last][top][bottom][index][help] */
 271 {
 272         if(sk->state==TCP_ESTABLISHED)
 273                 tcp_statistics.TcpCurrEstab--;
 274 #ifdef STATE_TRACE
 275         if(sk->debug)
 276                 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
 277 #endif  
 278         /* This is a hack but it doesn't occur often and it's going to
 279            be a real        to fix nicely */
 280            
 281         if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
 282         {
 283                 wake_up_interruptible(&master_select_wakeup);
 284         }
 285         sk->state=state;
 286         if(state==TCP_ESTABLISHED)
 287                 tcp_statistics.TcpCurrEstab++;
 288 }
 289 
 290 /*
 291  *      This routine picks a TCP windows for a socket based on
 292  *      the following constraints
 293  *  
 294  *      1. The window can never be shrunk once it is offered (RFC 793)
 295  *      2. We limit memory per socket
 296  *   
 297  *      For now we use NET2E3's heuristic of offering half the memory
 298  *      we have handy. All is not as bad as this seems however because
 299  *      of two things. Firstly we will bin packets even within the window
 300  *      in order to get the data we are waiting for into the memory limit.
 301  *      Secondly we bin common duplicate forms at receive time
 302  *      Better heuristics welcome
 303  */
 304    
 305 int tcp_select_window(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 306 {
 307         int new_window = sk->prot->rspace(sk);
 308         
 309         if(sk->window_clamp)
 310                 new_window=min(sk->window_clamp,new_window);
 311         /*
 312          *      Two things are going on here.  First, we don't ever offer a
 313          *      window less than min(sk->mss, MAX_WINDOW/2).  This is the
 314          *      receiver side of SWS as specified in RFC1122.
 315          *      Second, we always give them at least the window they
 316          *      had before, in order to avoid retracting window.  This
 317          *      is technically allowed, but RFC1122 advises against it and
 318          *      in practice it causes trouble.
 319          *
 320          *      Fixme: This doesn't correctly handle the case where
 321          *      new_window > sk->window but not by enough to allow for the
 322          *      shift in sequence space. 
 323          */
 324         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
 325                 return(sk->window);
 326         return(new_window);
 327 }
 328 
 329 /*
 330  *      Find someone to 'accept'. Must be called with
 331  *      sk->inuse=1 or cli()
 332  */ 
 333 
 334 static struct sk_buff *tcp_find_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 335 {
 336         struct sk_buff *p=skb_peek(&s->receive_queue);
 337         if(p==NULL)
 338                 return NULL;
 339         do
 340         {
 341                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 342                         return p;
 343                 p=p->next;
 344         }
 345         while(p!=(struct sk_buff *)&s->receive_queue);
 346         return NULL;
 347 }
 348 
 349 /*
 350  *      Remove a completed connection and return it. This is used by
 351  *      tcp_accept() to get connections from the queue.
 352  */
 353 
 354 static struct sk_buff *tcp_dequeue_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 355 {
 356         struct sk_buff *skb;
 357         unsigned long flags;
 358         save_flags(flags);
 359         cli(); 
 360         skb=tcp_find_established(s);
 361         if(skb!=NULL)
 362                 skb_unlink(skb);        /* Take it off the queue */
 363         restore_flags(flags);
 364         return skb;
 365 }
 366 
 367 /* 
 368  *      This routine closes sockets which have been at least partially
 369  *      opened, but not yet accepted. Currently it is only called by
 370  *      tcp_close, and timeout mirrors the value there. 
 371  */
 372 
 373 static void tcp_close_pending (struct sock *sk) 
     /* [previous][next][first][last][top][bottom][index][help] */
 374 {
 375         struct sk_buff *skb;
 376 
 377         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) 
 378         {
 379                 skb->sk->dead=1;
 380                 tcp_close(skb->sk, 0);
 381                 kfree_skb(skb, FREE_READ);
 382         }
 383         return;
 384 }
 385 
 386 /*
 387  *      Enter the time wait state. 
 388  */
 389 
 390 static void tcp_time_wait(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 391 {
 392         tcp_set_state(sk,TCP_TIME_WAIT);
 393         sk->shutdown = SHUTDOWN_MASK;
 394         if (!sk->dead)
 395                 sk->state_change(sk);
 396         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 397 }
 398 
/*
 *	A socket has timed out on its send queue and wants to do a
 *	little retransmitting. Currently this means TCP.
 *
 *	Walks sk->send_head (the retransmit queue via skb->link3),
 *	refreshing each frame's ack/window/IP-id fields and handing it
 *	back to the device.  If 'all' is zero only the first frame is
 *	resent; otherwise up to sk->cong_window frames are resent.
 */

void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;	/* NOTE(review): assigned below but never used */
	struct device *dev;
	int ct=0;		/* frames retransmitted so far this call */

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;	/* restamp for RTT/timeout accounting */

		/*
		 * In general it's OK just to use the old packet.  However we
		 * need to use the current ack and window fields.  Urg and
		 * urg_ptr could possibly stand to be updated as well, but we
		 * don't keep the necessary data.  That shouldn't be a problem,
		 * if the other end is doing the right thing.  Since we're
		 * changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)(skb->data + dev->hard_header_len);
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		/* size = TCP header + payload (everything past the IP header) */
		size = skb->len - (((unsigned char *) th) - skb->data);
		
		/*
		 *	Note: We ought to check for window limits here but
		 *	currently this is done (less efficiently) elsewhere.
		 *	We do need to check for a route change but can't handle
		 *	that until we have the new 1.3.x buffers in.
		 *
		 */

		iph->id = htons(ip_id_count++);
		ip_send_check(iph);	/* IP header checksum must be redone after id change */

		/*
		 *	This is not the right way to handle this. We have to
		 *	issue an up to date window and ack report with this 
		 *	retransmit to keep the odd buggy tcp that relies on 
		 *	the fact BSD does this happy. 
		 *	We don't however need to recalculate the entire 
		 *	checksum, so someone wanting a small problem to play
		 *	with might like to implement RFC1141/RFC1624 and speed
		 *	this up by avoiding a full checksum.
		 */
		 
		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));
		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
		
		/*
		 *	If the interface is (still) up and running, kick it.
		 */

		if (dev->flags & IFF_UP)
		{
			/*
			 *	If the packet is still being sent by the device/protocol
			 *	below then don't retransmit. This is both needed, and good -
			 *	especially with connected mode AX.25 where it stops resends
			 *	occurring of an as yet unsent anyway frame!
			 *	We still add up the counts as the round trip time wants
			 *	adjusting.
			 */
			if (sk && !skb_device_locked(skb))
			{
				/* Remove it from any existing driver queue first! */
				skb_unlink(skb);
				/* Now queue it */
				ip_statistics.IpOutRequests++;
				dev_queue_xmit(skb, dev, sk->priority);
			}
		}

		/*
		 *	Count retransmissions
		 */
		 
		ct++;
		sk->prot->retransmits ++;

		/*
		 *	Only one retransmit requested.
		 */
	
		if (!all)
			break;

		/*
		 *	This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;	/* next frame on the retransmit queue */
	}
}
 510 
 511 /*
 512  *      Reset the retransmission timer
 513  */
 514  
 515 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
     /* [previous][next][first][last][top][bottom][index][help] */
 516 {
 517         del_timer(&sk->retransmit_timer);
 518         sk->ip_xmit_timeout = why;
 519         if((int)when < 0)
 520         {
 521                 when=3;
 522                 printk("Error: Negative timer in xmit_timer\n");
 523         }
 524         sk->retransmit_timer.expires=when;
 525         add_timer(&sk->retransmit_timer);
 526 }
 527 
 528 /*
 529  *      This is the normal code called for timeouts.  It does the retransmission
 530  *      and then does backoff.  tcp_do_retransmit is separated out because
 531  *      tcp_ack needs to send stuff from the retransmit queue without
 532  *      initiating a backoff.
 533  */
 534 
 535 
 536 void tcp_retransmit_time(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 537 {
 538         tcp_do_retransmit(sk, all);
 539 
 540         /*
 541          * Increase the timeout each time we retransmit.  Note that
 542          * we do not increase the rtt estimate.  rto is initialized
 543          * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
 544          * that doubling rto each time is the least we can get away with.
 545          * In KA9Q, Karn uses this for the first few times, and then
 546          * goes to quadratic.  netBSD doubles, but only goes up to *64,
 547          * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
 548          * defined in the protocol as the maximum possible RTT.  I guess
 549          * we'll have to use something other than TCP to talk to the
 550          * University of Mars.
 551          *
 552          * PAWS allows us longer timeouts and large windows, so once
 553          * implemented ftp to mars will work nicely. We will have to fix
 554          * the 120 second clamps though!
 555          */
 556 
 557         sk->retransmits++;
 558         sk->backoff++;
 559         sk->rto = min(sk->rto << 1, 120*HZ);
 560         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
 561 }
 562 
 563 
 564 /*
 565  *      A timer event has trigger a tcp retransmit timeout. The
 566  *      socket xmit queue is ready and set up to send. Because
 567  *      the ack receive code keeps the queue straight we do
 568  *      nothing clever here.
 569  */
 570 
 571 static void tcp_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 572 {
 573         if (all) 
 574         {
 575                 tcp_retransmit_time(sk, all);
 576                 return;
 577         }
 578 
 579         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 580         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 581         sk->cong_count = 0;
 582 
 583         sk->cong_window = 1;
 584 
 585         /* Do the actual retransmit. */
 586         tcp_retransmit_time(sk, all);
 587 }
 588 
/*
 *	A write timeout has occurred. Process the after effects.
 *
 *	Returns 0 when the socket has been moved to TCP_CLOSE (the
 *	caller must stop using its timers), 1 otherwise.
 */

static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout: periodically (every 8th
	 *	retransmit when established, or past TCP_RETR1 otherwise)
	 *	try to recover the path instead of giving up.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		ip_route_check (sk->daddr);
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2) 
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 *	Time wait the socket: a closing socket still needs
		 *	the 2MSL wait even though the peer stopped answering.
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) 
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			return 0;
		}
	}
	return 1;
}
 635 
/*
 *      The TCP retransmit timer. This lacks a few small details.
 *
 *      1.      An initial rtt timeout on the probe0 should cause what we can
 *              of the first write queue buffer to be split and sent.
 *      2.      On a 'major timeout' as defined by RFC1122 we shouldn't report
 *              ETIMEDOUT if we know an additional 'soft' error caused this.
 *              tcp_err should save a 'soft error' for us.
 *
 *      Dispatches on sk->ip_xmit_timeout: zero-window probing, data
 *      retransmission (also used for delayed ACKs), and keepalives.
 */

static void retransmit_timer(unsigned long data)
{
        struct sock *sk = (struct sock*)data;
        int why = sk->ip_xmit_timeout;  /* reason the timer was armed (TIME_*) */

        /* 
         * only process if socket is not in use
         */

        cli();
        if (sk->inuse || in_bh) 
        {
                /* Try again in 1 second */
                /* NOTE(review): expires set to a bare HZ — assumes add_timer
                 * treats expires as relative jiffies in this kernel; confirm. */
                sk->retransmit_timer.expires = HZ;
                add_timer(&sk->retransmit_timer);
                sti();
                return;
        }

        sk->inuse = 1;  /* lock the socket against the bottom half */
        sti();

        /* Always see if we need to send an ack. */

        if (sk->ack_backlog && !sk->zapped) 
        {
                sk->prot->read_wakeup (sk);
                if (! sk->dead)
                        sk->data_ready(sk,0);
        }

        /* Now we need to figure out why the socket was on the timer. */

        switch (why) 
        {
                /* Window probing */
                case TIME_PROBE0:
                        tcp_send_probe0(sk);
                        tcp_write_timeout(sk);
                        break;
                /* Retransmitting */
                case TIME_WRITE:
                        /* It could be we got here because we needed to send an ack.
                         * So we need to check for that.
                         */
                {
                        struct sk_buff *skb;
                        unsigned long flags;

                        save_flags(flags);
                        cli();
                        skb = sk->send_head;    /* oldest unacked frame, if any */
                        if (!skb) 
                        {
                                restore_flags(flags);
                        } 
                        else 
                        {
                                /*
                                 *      Kicked by a delayed ack. Reset timer
                                 *      correctly now
                                 */
                                if (jiffies < skb->when + sk->rto) 
                                {
                                        reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
                                        restore_flags(flags);
                                        break;
                                }
                                restore_flags(flags);
                                /*
                                 *      Retransmission
                                 */
                                sk->prot->retransmit (sk, 0);
                                tcp_write_timeout(sk);
                        }
                        break;
                }
                /* Sending Keepalives */
                case TIME_KEEPOPEN:
                        /* 
                         * this reset_timer() call is a hack, this is not
                         * how KEEPOPEN is supposed to work.
                         */
                        reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

                        /* Send something to keep the connection open. */
                        if (sk->prot->write_wakeup)
                                  sk->prot->write_wakeup (sk);
                        sk->retransmits++;
                        tcp_write_timeout(sk);
                        break;
                default:
                        printk ("rexmit_timer: timer expired - reason unknown\n");
                        break;
        }
        release_sock(sk);
}
 743 
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 */

void tcp_err(int err, unsigned char *header, unsigned long daddr,
        unsigned long saddr, struct inet_protocol *protocol)
{
        struct tcphdr *th;
        struct sock *sk;
        struct iphdr *iph=(struct iphdr *)header;
  
        /* Skip past the embedded IP header to reach the TCP header copy. */
        header+=4*iph->ihl;
   

        th =(struct tcphdr *)header;
        /* The embedded packet is one we sent, so th->source is our local port. */
        sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

        if (sk == NULL) 
                return;
  
        /* err < 0: fatal by convention — report it straight to the user. */
        if(err<0)
        {
                sk->err = -err;
                sk->error_report(sk);
                return;
        }

        if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8)) 
        {
                /*
                 * FIXME:
                 * For now we will just trigger a linear backoff.
                 * The slow start code should cause a real backoff here.
                 */
                if (sk->cong_window > 4)
                        sk->cong_window--;
                return;
        }

/*      sk->err = icmp_err_convert[err & 0xff].errno;  -- moved as TCP should hide non fatals internally (and does) */

        /*
         * If we've already connected we will keep trying
         * until we time out, or the user gives up.
         * Only a fatal ICMP error, or any error during SYN_SENT,
         * is surfaced.
         */

        if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT) 
        {
                if (sk->state == TCP_SYN_SENT) 
                {
                        tcp_statistics.TcpAttemptFails++;
                        tcp_set_state(sk,TCP_CLOSE);
                        sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
                }
                sk->err = icmp_err_convert[err & 0xff].errno;           
        }
        return;
}
 807 
 808 
/*
 *      Walk down the receive queue counting readable data until we hit the end or we find a gap
 *      in the received data queue (ie a frame missing that needs sending to us). Not
 *      sorting using two queues as data arrives makes life so much harder.
 *
 *      Returns the number of in-sequence bytes available past sk->copied_seq
 *      (urgent bytes excluded), stopping at the first PSH once data is counted.
 */

static int tcp_readable(struct sock *sk)
{
        unsigned long counted;  /* sequence number we have counted up to */
        unsigned long amount;   /* readable byte total accumulated so far */
        struct sk_buff *skb;
        int sum;
        unsigned long flags;

        if(sk && sk->debug)
                printk("tcp_readable: %p - ",sk);

        /* Queue is modified from interrupt context, so walk it with irqs off. */
        save_flags(flags);
        cli();
        if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
        {
                restore_flags(flags);
                if(sk && sk->debug) 
                        printk("empty\n");
                return(0);
        }
  
        counted = sk->copied_seq;       /* Where we are at the moment */
        amount = 0;
  
        /* 
         *      Do until a push or until we are out of data. 
         */
         
        do 
        {
                if (before(counted, skb->h.th->seq))    /* Found a hole so stops here */
                        break;
                sum = skb->len -(counted - skb->h.th->seq);     /* Length - header but start from where we are up to (avoid overlaps) */
                if (skb->h.th->syn)
                        sum++;          /* SYN occupies one sequence number */
                if (sum > 0) 
                {                                       /* Add it up, move on */
                        amount += sum;
                        if (skb->h.th->syn) 
                                amount--;       /* SYN is not readable data */
                        counted += sum;
                }
                /*
                 * Don't count urg data ... but do it in the right place!
                 * Consider: "old_data (ptr is here) URG PUSH data"
                 * The old code would stop at the first push because
                 * it counted the urg (amount==1) and then does amount--
                 * *after* the loop.  This means tcp_readable() always
                 * returned zero if any URG PUSH was in the queue, even
                 * though there was normal data available. If we subtract
                 * the urg data right here, we even get it to work for more
                 * than one URG PUSH skb without normal data.
                 * This means that select() finally works now with urg data
                 * in the queue.  Note that rlogin was never affected
                 * because it doesn't use select(); it uses two processes
                 * and a blocking read().  And the queue scan in tcp_read()
                 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
                 */
                if (skb->h.th->urg)
                        amount--;       /* don't count urg data */
                if (amount && skb->h.th->psh) break;
                skb = skb->next;
        }
        while(skb != (struct sk_buff *)&sk->receive_queue);

        restore_flags(flags);
        if(sk->debug)
                printk("got %lu bytes.\n",amount);
        return(amount);
}
 885 
 886 /*
 887  * LISTEN is a special case for select..
 888  */
 889 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
 890 {
 891         if (sel_type == SEL_IN) {
 892                 int retval;
 893 
 894                 sk->inuse = 1;
 895                 retval = (tcp_find_established(sk) != NULL);
 896                 release_sock(sk);
 897                 if (!retval)
 898                         select_wait(&master_select_wakeup,wait);
 899                 return retval;
 900         }
 901         return 0;
 902 }
 903 
 904 
/*
 *      Wait for a TCP event.
 *
 *      Note that we don't need to set "sk->inuse", as the upper select layers
 *      take care of normal races (between the test and the event) and we don't
 *      go look at any of the socket buffers directly.
 *
 *      Returns 1 when the requested condition already holds, otherwise
 *      registers on sk->sleep and returns 0.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
        if (sk->state == TCP_LISTEN)
                return tcp_listen_select(sk, sel_type, wait);

        switch(sel_type) {
        case SEL_IN:
                if (sk->err)
                        return 1;       /* pending error is always "readable" */
                if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
                        break;          /* not connected yet - wait */

                if (sk->shutdown & RCV_SHUTDOWN)
                        return 1;       /* EOF is readable */
                        
                if (sk->acked_seq == sk->copied_seq)
                        break;          /* nothing new acknowledged - wait */

                /*
                 * Readable unless the ONLY pending byte is an out-of-band
                 * urgent byte that is not delivered inline.
                 */
                if (sk->urg_seq != sk->copied_seq ||
                    sk->acked_seq != sk->copied_seq+1 ||
                    sk->urginline || !sk->urg_data)
                        return 1;
                break;

        case SEL_OUT:
                if (sk->shutdown & SEND_SHUTDOWN) 
                        return 0;       /* never writable after shutdown */
                if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
                        break;          /* connect in progress - wait */
                /*
                 * This is now right thanks to a small fix
                 * by Matt Dillon.  Writable only if a full-sized frame
                 * (mtu plus header slack) fits in the send buffer.
                 */

                if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
                        break;
                return 1;

        case SEL_EX:
                if (sk->err || sk->urg_data)
                        return 1;       /* exceptional: error or OOB data */
                break;
        }
        select_wait(sk->sleep, wait);
        return 0;
}
 958 
/*
 *      IOCTL handler for TCP sockets.
 *      TIOCINQ  - bytes readable, TIOCOUTQ - send buffer space,
 *      SIOCATMARK - whether we sit at the urgent mark.
 */
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
        int err;
        switch(cmd) 
        {

                case TIOCINQ:
#ifdef FIXME    /* FIXME: */
                case FIONREAD:
#endif
                {
                        unsigned long amount;

                        if (sk->state == TCP_LISTEN) 
                                return(-EINVAL);

                        /* Lock the socket so the receive queue is stable. */
                        sk->inuse = 1;
                        amount = tcp_readable(sk);
                        release_sock(sk);
                        err=verify_area(VERIFY_WRITE,(void *)arg,
                                                   sizeof(unsigned long));
                        if(err)
                                return err;
                        put_fs_long(amount,(unsigned long *)arg);
                        return(0);
                }
                case SIOCATMARK:
                {
                        /* At the mark if the next byte to copy is the urgent byte. */
                        int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

                        /* NOTE(review): verifies sizeof(unsigned long) but stores
                         * through an (int *) — harmless on i386, confirm intent. */
                        err = verify_area(VERIFY_WRITE,(void *) arg,
                                                  sizeof(unsigned long));
                        if (err)
                                return err;
                        put_fs_long(answ,(int *) arg);
                        return(0);
                }
                case TIOCOUTQ:
                {
                        unsigned long amount;

                        /* NOTE(review): unlike TIOCINQ this path does not take
                         * sk->inuse; presumably wspace() is safe unlocked — confirm. */
                        if (sk->state == TCP_LISTEN) return(-EINVAL);
                        amount = sk->prot->wspace(sk);
                        err=verify_area(VERIFY_WRITE,(void *)arg,
                                                   sizeof(unsigned long));
                        if(err)
                                return err;
                        put_fs_long(amount,(unsigned long *)arg);
                        return(0);
                }
                default:
                        return(-EINVAL);
        }
}
1013 
1014 
/*
 *      This routine computes a TCP checksum. 
 *
 *      i386-only: one's-complement sum of the TCP pseudo header
 *      (saddr, daddr, protocol, length) followed by the TCP header
 *      and payload, folded to 16 bits and complemented.
 */
 
unsigned short tcp_check(struct tcphdr *th, int len,
          unsigned long saddr, unsigned long daddr)
{     
        unsigned long sum;
   
        if (saddr == 0) saddr = ip_my_addr();

/*
 * stupid, gcc complains when I use just one __asm__ block,
 * something about too many reloads, but this is just two
 * instructions longer than what I want
 */
        /* First block: sum the pseudo header (daddr + saddr + len/proto word). */
        __asm__("
            addl %%ecx, %%ebx
            adcl %%edx, %%ebx
            adcl $0, %%ebx
            "
        : "=b"(sum)
        : "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
        : "bx", "cx", "dx" );
        /*
         * Second block: add the segment itself into the running sum —
         * 32-byte unrolled loop, then remaining dwords, then the odd
         * word/byte tail — and finally fold the carries into 16 bits.
         */
        __asm__("
            movl %%ecx, %%edx
            cld
            cmpl $32, %%ecx
            jb 2f
            shrl $5, %%ecx
            clc
1:          lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            loop 1b
            adcl $0, %%ebx
            movl %%edx, %%ecx
2:          andl $28, %%ecx
            je 4f
            shrl $2, %%ecx
            clc
3:          lodsl
            adcl %%eax, %%ebx
            loop 3b
            adcl $0, %%ebx
4:          movl $0, %%eax
            testw $2, %%dx
            je 5f
            lodsw
            addl %%eax, %%ebx
            adcl $0, %%ebx
            movw $0, %%ax
5:          test $1, %%edx
            je 6f
            lodsb
            addl %%eax, %%ebx
            adcl $0, %%ebx
6:          movl %%ebx, %%eax
            shrl $16, %%eax
            addw %%ax, %%bx
            adcw $0, %%bx
            "
        : "=b"(sum)
        : "0"(sum), "c"(len), "S"(th)
        : "ax", "bx", "cx", "dx", "si" );

        /* We only want the bottom 16 bits, but we never cleared the top 16. */
  
        return((~sum) & 0xffff);
}
1098 
1099 
1100 
1101 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
     /* [previous][next][first][last][top][bottom][index][help] */
1102                 unsigned long daddr, int len, struct sock *sk)
1103 {
1104         th->check = 0;
1105         th->check = tcp_check(th, len, saddr, daddr);
1106         return;
1107 }
1108 
1109 /*
1110  *      This is the main buffer sending routine. We queue the buffer
1111  *      having checked it is sane seeming.
1112  */
1113  
1114 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
     /* [previous][next][first][last][top][bottom][index][help] */
1115 {
1116         int size;
1117         struct tcphdr * th = skb->h.th;
1118 
1119         /*
1120          *      length of packet (not counting length of pre-tcp headers) 
1121          */
1122          
1123         size = skb->len - ((unsigned char *) th - skb->data);
1124 
1125         /*
1126          *      Sanity check it.. 
1127          */
1128          
1129         if (size < sizeof(struct tcphdr) || size > skb->len) 
1130         {
1131                 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1132                         skb, skb->data, th, skb->len);
1133                 kfree_skb(skb, FREE_WRITE);
1134                 return;
1135         }
1136 
1137         /*
1138          *      If we have queued a header size packet.. (these crash a few
1139          *      tcp stacks if ack is not set)
1140          */
1141          
1142         if (size == sizeof(struct tcphdr)) 
1143         {
1144                 /* If it's got a syn or fin it's notionally included in the size..*/
1145                 if(!th->syn && !th->fin) 
1146                 {
1147                         printk("tcp_send_skb: attempt to queue a bogon.\n");
1148                         kfree_skb(skb,FREE_WRITE);
1149                         return;
1150                 }
1151         }
1152 
1153         /*
1154          *      Actual processing.
1155          */
1156          
1157         tcp_statistics.TcpOutSegs++;  
1158         skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1159         
1160         /*
1161          *      We must queue if
1162          *
1163          *      a) The right edge of this frame exceeds the window
1164          *      b) We are retransmitting (Nagle's rule)
1165          *      c) We have too many packets 'in flight'
1166          */
1167          
1168         if (after(skb->h.seq, sk->window_seq) ||
1169             (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1170              sk->packets_out >= sk->cong_window) 
1171         {
1172                 /* checksum will be supplied by tcp_write_xmit.  So
1173                  * we shouldn't need to set it at all.  I'm being paranoid */
1174                 th->check = 0;
1175                 if (skb->next != NULL) 
1176                 {
1177                         printk("tcp_send_partial: next != NULL\n");
1178                         skb_unlink(skb);
1179                 }
1180                 skb_queue_tail(&sk->write_queue, skb);
1181                 
1182                 /*
1183                  *      If we don't fit we have to start the zero window
1184                  *      probes. This is broken - we really need to do a partial
1185                  *      send _first_ (This is what causes the Cisco and PC/TCP
1186                  *      grief).
1187                  */
1188                  
1189                 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1190                     sk->send_head == NULL && sk->ack_backlog == 0)
1191                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1192         } 
1193         else 
1194         {
1195                 /*
1196                  *      This is going straight out
1197                  */
1198                  
1199                 th->ack_seq = ntohl(sk->acked_seq);
1200                 th->window = ntohs(tcp_select_window(sk));
1201 
1202                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1203 
1204                 sk->sent_seq = sk->write_seq;
1205                 
1206                 /*
1207                  *      This is mad. The tcp retransmit queue is put together
1208                  *      by the ip layer. This causes half the problems with
1209                  *      unroutable FIN's and other things.
1210                  */
1211                  
1212                 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1213                 
1214                 /*
1215                  *      Set for next retransmit based on expected ACK time.
1216                  *      FIXME: We set this every time which means our 
1217                  *      retransmits are really about a window behind.
1218                  */
1219 
1220                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1221         }
1222 }
1223 
1224 /*
1225  *      Locking problems lead us to a messy situation where we can have
1226  *      multiple partially complete buffers queued up. This is really bad
1227  *      as we don't want to be sending partial buffers. Fix this with
1228  *      a semaphore or similar to lock tcp_write per socket.
1229  *
1230  *      These routines are pretty self descriptive.
1231  */
1232  
1233 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1234 {
1235         struct sk_buff * skb;
1236         unsigned long flags;
1237 
1238         save_flags(flags);
1239         cli();
1240         skb = sk->partial;
1241         if (skb) {
1242                 sk->partial = NULL;
1243                 del_timer(&sk->partial_timer);
1244         }
1245         restore_flags(flags);
1246         return skb;
1247 }
1248 
1249 /*
1250  *      Empty the partial queue
1251  */
1252  
1253 static void tcp_send_partial(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1254 {
1255         struct sk_buff *skb;
1256 
1257         if (sk == NULL)
1258                 return;
1259         while ((skb = tcp_dequeue_partial(sk)) != NULL)
1260                 tcp_send_skb(sk, skb);
1261 }
1262 
/*
 *      Queue a partial frame.  Any previously queued partial frame is
 *      displaced and sent immediately; the new one is flushed by a
 *      one second timer if it is not completed first.
 */
 
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
        struct sk_buff * tmp;
        unsigned long flags;

        save_flags(flags);
        cli();
        tmp = sk->partial;      /* remember any frame already pending */
        if (tmp)
                del_timer(&sk->partial_timer);
        sk->partial = skb;
        init_timer(&sk->partial_timer);
        /*
         *      Wait up to 1 second for the buffer to fill.
         */
        sk->partial_timer.expires = HZ;
        sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
        sk->partial_timer.data = (unsigned long) sk;
        add_timer(&sk->partial_timer);
        restore_flags(flags);
        /* Send the displaced frame outside the cli section. */
        if (tmp)
                tcp_send_skb(sk, tmp);
}
1290 
1291 
/*
 *      This routine sends an ack and also updates the window. 
 *
 *      'sequence' is our send sequence, 'ack' the acknowledgement number,
 *      'th' the header of the segment being acknowledged (used to swap
 *      ports) and 'daddr' the destination address.  If no memory is
 *      available the ack is deferred via the retransmit timer instead.
 */
 
static void tcp_send_ack(unsigned long sequence, unsigned long ack,
             struct sock *sk,
             struct tcphdr *th, unsigned long daddr)
{
        struct sk_buff *buff;
        struct tcphdr *t1;
        struct device *dev = NULL;
        int tmp;

        if(sk->zapped)
                return;         /* We have been reset, we may not send again */
                
        /*
         * We need to grab some memory, and put together an ack,
         * and then put it into the queue to be sent.
         */

        buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
        if (buff == NULL) 
        {
                /* 
                 *      Force it to send an ack. We don't have to do this
                 *      (ACK is unreliable) but it's much better use of 
                 *      bandwidth on slow links to send a spare ack than
                 *      resend packets. 
                 */
                 
                sk->ack_backlog++;
                if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) 
                {
                        reset_xmit_timer(sk, TIME_WRITE, HZ);
                }
                return;
        }

        /*
         *      Assemble a suitable TCP frame
         */
         
        buff->len = sizeof(struct tcphdr);
        buff->sk = sk;
        buff->localroute = sk->localroute;
        t1 =(struct tcphdr *) buff->data;

        /* 
         *      Put in the IP header and routing stuff. 
         */
         
        tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                /* Header build failed: give the buffer back. */
                buff->free = 1;
                sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
                return;
        }
        buff->len += tmp;
        t1 =(struct tcphdr *)((char *)t1 +tmp);

        /* Start from the incoming header, then overwrite the fields we need. */
        memcpy(t1, th, sizeof(*t1));

        /*
         *      Swap the send and the receive. 
         */
         
        t1->dest = th->source;
        t1->source = th->dest;
        t1->seq = ntohl(sequence);      /* ntohl == htonl on every supported arch */
        t1->ack = 1;
        sk->window = tcp_select_window(sk);
        t1->window = ntohs(sk->window);
        t1->res1 = 0;
        t1->res2 = 0;
        t1->rst = 0;
        t1->urg = 0;
        t1->syn = 0;
        t1->psh = 0;
        t1->fin = 0;
        
        /*
         *      If we have nothing queued for transmit and the transmit timer
         *      is on we are just doing an ACK timeout and need to switch
         *      to a keepalive.
         */
         
        if (ack == sk->acked_seq) 
        {
                sk->ack_backlog = 0;
                sk->bytes_rcv = 0;
                sk->ack_timed = 0;
                if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
                                  && sk->ip_xmit_timeout == TIME_WRITE) 
                {
                        if(sk->keepopen) {
                                reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
                        } else {
                                delete_timer(sk);
                        }
                }
        }
        
        /*
         *      Fill in the packet and send it
         */
         
        t1->ack_seq = ntohl(ack);
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
        if (sk->debug)
                 printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
        tcp_statistics.TcpOutSegs++;
        sk->prot->queue_xmit(sk, dev, buff, 1);
}
1409 
1410 
/* 
 *      This routine builds a generic TCP header. 
 *
 *      Copies the socket's template header (dummy_th), fills in the
 *      current sequence/ack/window values, and clears the pending
 *      ack bookkeeping.  'push' == 0 means set the PSH flag.
 *      Returns the header length in bytes.
 */
 
extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{

        memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
        th->seq = htonl(sk->write_seq);
        th->psh =(push == 0) ? 1 : 0;   /* push when the caller is NOT deferring */
        th->doff = sizeof(*th)/4;       /* no TCP options: 20-byte header */
        th->ack = 1;
        th->fin = 0;
        /* Sending this segment acknowledges everything pending. */
        sk->ack_backlog = 0;
        sk->bytes_rcv = 0;
        sk->ack_timed = 0;
        th->ack_seq = htonl(sk->acked_seq);
        sk->window = tcp_select_window(sk);
        th->window = htons(sk->window);

        return(sizeof(*th));
}
1433 
1434 /*
1435  *      This routine copies from a user buffer into a socket,
1436  *      and starts the transmit system.
1437  */
1438 
/*
 *	Copy 'len' bytes of user data from 'from' into TCP segments on
 *	'sk' and start transmission.  Returns the number of bytes
 *	accepted, or a negative errno; if some data was already copied
 *	when an error or signal occurs, the partial count is returned
 *	instead of the error.
 */
static int tcp_write(struct sock *sk, unsigned char *from,
          int len, int nonblock, unsigned flags)
{
        int copied = 0;         /* bytes accepted from the user so far */
        int copy;               /* bytes to take on this iteration */
        int tmp;
        struct sk_buff *skb;
        struct sk_buff *send_tmp;       /* freshly built partial packet */
        unsigned char *buff;
        struct proto *prot;
        struct device *dev = NULL;

        sk->inuse=1;            /* lock the socket against the bottom half */
        prot = sk->prot;
        while(len > 0) 
        {
                if (sk->err) 
                {                       /* Stop on an error */
                        release_sock(sk);
                        if (copied) 
                                return(copied);
                        tmp = -sk->err;
                        sk->err = 0;
                        return(tmp);
                }

                /*
                 *      First thing we do is make sure that we are established. 
                 */
        
                if (sk->shutdown & SEND_SHUTDOWN) 
                {
                        release_sock(sk);
                        sk->err = EPIPE;
                        if (copied) 
                                return(copied);
                        sk->err = 0;
                        return(-EPIPE);
                }

                /* 
                 *      Wait for a connection to finish.
                 */
        
                while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
                {
                        if (sk->err) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);
                                tmp = -sk->err;
                                sk->err = 0;
                                return(tmp);
                        }

                        /* Neither connected nor connecting: write is hopeless. */
                        if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);

                                if (sk->err) 
                                {
                                        tmp = -sk->err;
                                        sk->err = 0;
                                        return(tmp);
                                }

                                if (sk->keepopen) 
                                {
                                        send_sig(SIGPIPE, current, 0);
                                }
                                return(-EPIPE);
                        }

                        if (nonblock || copied) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);
                                return(-EAGAIN);
                        }

                        /*
                         *      Sleep until the handshake finishes.  Interrupts
                         *      are disabled across the state re-check to avoid
                         *      missing a wakeup between the test and the sleep.
                         */
                        release_sock(sk);
                        cli();
                
                        if (sk->state != TCP_ESTABLISHED &&
                                sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
                        {
                                interruptible_sleep_on(sk->sleep);
                                if (current->signal & ~current->blocked) 
                                {
                                        sti();
                                        if (copied) 
                                                return(copied);
                                        return(-ERESTARTSYS);
                                }
                        }
                        sk->inuse = 1;
                        sti();
                }

        /*
         * The following code can result in copy <= 0 if sk->mss is ever
         * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
         * sk->mtu is constant once SYN processing is finished.  I.e. we
         * had better not get here until we've seen his SYN and at least one
         * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
         * But ESTABLISHED should guarantee that.  sk->max_window is by definition
         * non-decreasing.  Note that any ioctl to set user_mss must be done
         * before the exchange of SYN's.  If the initial ack from the other
         * end has a window of 0, max_window and thus mss will both be 0.
         */

        /* 
         *      Now we need to check if we have a half built packet. 
         */

                if ((skb = tcp_dequeue_partial(sk)) != NULL) 
                {
                        int hdrlen;

                         /* IP header + TCP header */
                        hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
                                 + sizeof(struct tcphdr);
        
                        /* Add more stuff to the end of skb->len */
                        if (!(flags & MSG_OOB)) 
                        {
                                /* Top the partial packet up to a full MSS if we can. */
                                copy = min(sk->mss - (skb->len - hdrlen), len);
                                /* FIXME: this is really a bug. */
                                if (copy <= 0) 
                                {
                                        printk("TCP: **bug**: \"copy\" <= 0!!\n");
                                        copy = 0;
                                }
          
                                memcpy_fromfs(skb->data + skb->len, from, copy);
                                skb->len += copy;
                                from += copy;
                                copied += copy;
                                len -= copy;
                                sk->write_seq += copy;
                        }
                        /*
                         *      Send it now if it is full, urgent, or nothing
                         *      else is in flight (Nagle); otherwise re-queue
                         *      it as the partial packet.
                         */
                        if ((skb->len - hdrlen) >= sk->mss ||
                                (flags & MSG_OOB) || !sk->packets_out)
                                tcp_send_skb(sk, skb);
                        else
                                tcp_enqueue_partial(skb, sk);
                        continue;
                }

        /*
         * We also need to worry about the window.
         * If window < 1/2 the maximum window we've seen from this
         *   host, don't use it.  This is sender side
         *   silly window prevention, as specified in RFC1122.
         *   (Note that this is different than earlier versions of
         *   SWS prevention, e.g. RFC813.).  What we actually do is 
         *   use the whole MSS.  Since the results in the right
         *   edge of the packet being outside the window, it will
         *   be queued for later rather than sent.
         */

                copy = sk->window_seq - sk->write_seq;
                if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
                        copy = sk->mss;
                if (copy > len)
                        copy = len;

        /*
         *      We should really check the window here also. 
         */
         
                send_tmp = NULL;
                if (copy < sk->mss && !(flags & MSG_OOB)) 
                {
                        /*
                         *      We will release the socket in case we sleep here. 
                         */
                        release_sock(sk);
                        /*
                         *      NB: following must be mtu, because mss can be increased.
                         *      mss is always <= mtu 
                         */
                        skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
                        sk->inuse = 1;
                        send_tmp = skb;
                } 
                else 
                {
                        /*
                         *      We will release the socket in case we sleep here. 
                         */
                        release_sock(sk);
                        skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
                        sk->inuse = 1;
                }

                /*
                 *      If we didn't get any memory, we need to sleep. 
                 */

                if (skb == NULL) 
                {
                        sk->socket->flags |= SO_NOSPACE;
                        if (nonblock) 
                        {
                                release_sock(sk);
                                if (copied) 
                                        return(copied);
                                return(-EAGAIN);
                        }

                        /*
                         *      FIXME: here is another race condition. 
                         */

                        tmp = sk->wmem_alloc;
                        release_sock(sk);
                        cli();
                        /*
                         *      Again we will try to avoid it.  Only sleep if no
                         *      write memory has been freed since we sampled it
                         *      and the connection is still usable.
                         */
                        if (tmp <= sk->wmem_alloc &&
                                  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
                                && sk->err == 0) 
                        {
                                sk->socket->flags &= ~SO_NOSPACE;
                                interruptible_sleep_on(sk->sleep);
                                if (current->signal & ~current->blocked) 
                                {
                                        sti();
                                        if (copied) 
                                                return(copied);
                                        return(-ERESTARTSYS);
                                }
                        }
                        sk->inuse = 1;
                        sti();
                        continue;
                }

                skb->len = 0;
                skb->sk = sk;
                skb->free = 0;
                skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
        
                buff = skb->data;
        
                /*
                 * FIXME: we need to optimize this.
                 * Perhaps some hints here would be good.
                 */
                
                tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
                                 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
                if (tmp < 0 ) 
                {
                        prot->wfree(sk, skb->mem_addr, skb->mem_len);
                        release_sock(sk);
                        if (copied) 
                                return(copied);
                        return(tmp);
                }
                skb->len += tmp;
                skb->dev = dev;
                buff += tmp;
                skb->h.th =(struct tcphdr *) buff;
                /* len-copy == 0 means no more queued data, so set PSH. */
                tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
                if (tmp < 0) 
                {
                        prot->wfree(sk, skb->mem_addr, skb->mem_len);
                        release_sock(sk);
                        if (copied) 
                                return(copied);
                        return(tmp);
                }

                if (flags & MSG_OOB) 
                {
                        ((struct tcphdr *)buff)->urg = 1;
                        ((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
                }
                skb->len += tmp;
                memcpy_fromfs(buff+tmp, from, copy);

                from += copy;
                copied += copy;
                len -= copy;
                skb->len += copy;
                skb->free = 0;
                sk->write_seq += copy;
        
                /*
                 *      Sub-MSS packet with data already in flight: hold it
                 *      back as the partial packet (Nagle's algorithm).
                 */
                if (send_tmp != NULL && sk->packets_out) 
                {
                        tcp_enqueue_partial(send_tmp, sk);
                        continue;
                }
                tcp_send_skb(sk, skb);
        }
        sk->err = 0;

/*
 *      Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *      interactive fast network servers. It's meant to be on and
 *      it really improves the throughput though not the echo time
 *      on my slow slip link - Alan
 */

/*
 *      Avoid possible race on send_tmp - c/o Johannes Stille 
 */
 
        if(sk->partial && ((!sk->packets_out) 
     /* If not nagling we can send on the before case too.. */
              || (sk->nonagle && before(sk->write_seq , sk->window_seq))
        ))
                tcp_send_partial(sk);

        release_sock(sk);
        return(copied);
}
1763 
1764 /*
1765  *      This is just a wrapper. 
1766  */
1767 
1768 static int tcp_sendto(struct sock *sk, unsigned char *from,
     /* [previous][next][first][last][top][bottom][index][help] */
1769            int len, int nonblock, unsigned flags,
1770            struct sockaddr_in *addr, int addr_len)
1771 {
1772         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1773                 return -EINVAL;
1774         if (sk->state == TCP_CLOSE)
1775                 return -ENOTCONN;
1776         if (addr_len < sizeof(*addr))
1777                 return -EINVAL;
1778         if (addr->sin_family && addr->sin_family != AF_INET) 
1779                 return -EINVAL;
1780         if (addr->sin_port != sk->dummy_th.dest) 
1781                 return -EISCONN;
1782         if (addr->sin_addr.s_addr != sk->daddr) 
1783                 return -EISCONN;
1784         return tcp_write(sk, from, len, nonblock, flags);
1785 }
1786 
1787 
1788 /*
1789  *      Send an ack if one is backlogged at this point. Ought to merge
1790  *      this with tcp_send_ack().
1791  */
1792  
1793 static void tcp_read_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1794 {
1795         int tmp;
1796         struct device *dev = NULL;
1797         struct tcphdr *t1;
1798         struct sk_buff *buff;
1799 
1800         if (!sk->ack_backlog) 
1801                 return;
1802 
1803         /*
1804          * FIXME: we need to put code here to prevent this routine from
1805          * being called.  Being called once in a while is ok, so only check
1806          * if this is the second time in a row.
1807          */
1808 
1809         /*
1810          * We need to grab some memory, and put together an ack,
1811          * and then put it into the queue to be sent.
1812          */
1813 
1814         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
1815         if (buff == NULL) 
1816         {
1817                 /* Try again real soon. */
1818                 reset_xmit_timer(sk, TIME_WRITE, HZ);
1819                 return;
1820         }
1821 
1822         buff->len = sizeof(struct tcphdr);
1823         buff->sk = sk;
1824         buff->localroute = sk->localroute;
1825         
1826         /*
1827          *      Put in the IP header and routing stuff. 
1828          */
1829 
1830         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1831                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1832         if (tmp < 0) 
1833         {
1834                 buff->free = 1;
1835                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1836                 return;
1837         }
1838 
1839         buff->len += tmp;
1840         t1 =(struct tcphdr *)(buff->data +tmp);
1841 
1842         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
1843         t1->seq = htonl(sk->sent_seq);
1844         t1->ack = 1;
1845         t1->res1 = 0;
1846         t1->res2 = 0;
1847         t1->rst = 0;
1848         t1->urg = 0;
1849         t1->syn = 0;
1850         t1->psh = 0;
1851         sk->ack_backlog = 0;
1852         sk->bytes_rcv = 0;
1853         sk->window = tcp_select_window(sk);
1854         t1->window = ntohs(sk->window);
1855         t1->ack_seq = ntohl(sk->acked_seq);
1856         t1->doff = sizeof(*t1)/4;
1857         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1858         sk->prot->queue_xmit(sk, dev, buff, 1);
1859         tcp_statistics.TcpOutSegs++;
1860 }
1861 
1862 
1863 /*
1864  *      FIXME:
1865  *      This routine frees used buffers.
1866  *      It should consider sending an ACK to let the
1867  *      other end know we now have a bigger window.
1868  */
1869 
/*
 *	Free fully-consumed buffers on the receive queue and, if that
 *	opened up receive space, arrange for a window-update ack: either
 *	immediately via tcp_read_wakeup() or soon via the retransmit
 *	timer.  Called with the socket locked.
 */
static void cleanup_rbuf(struct sock *sk)
{
        unsigned long flags;
        unsigned long left;     /* receive space before freeing */
        struct sk_buff *skb;
        unsigned long rspace;   /* receive space after freeing */

        if(sk->debug)
                printk("cleaning rbuf for sk=%p\n", sk);
  
        save_flags(flags);
        cli();
  
        left = sk->prot->rspace(sk);
 
        /*
         *      We have to loop through all the buffer headers,
         *      and try to free up all the space we can.
         */

        while((skb=skb_peek(&sk->receive_queue)) != NULL) 
        {
                /* Stop at the first buffer still unread or held by a reader. */
                if (!skb->used || skb->users) 
                        break;
                skb_unlink(skb);
                skb->sk = sk;
                kfree_skb(skb, FREE_READ);
        }

        restore_flags(flags);

        /*
         *      FIXME:
         *      At this point we should send an ack if the difference
         *      in the window, and the amount of space is bigger than
         *      TCP_WINDOW_DIFF.
         */

        if(sk->debug)
                printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
                                            left);
        if ((rspace=sk->prot->rspace(sk)) != left) 
        {
                /*
                 * This area has caused the most trouble.  The current strategy
                 * is to simply do nothing if the other end has room to send at
                 * least 3 full packets, because the ack from those will auto-
                 * matically update the window.  If the other end doesn't think
                 * we have much space left, but we have room for at least 1 more
                 * complete packet than it thinks we do, we will send an ack
                 * immediately.  Otherwise we will wait up to .5 seconds in case
                 * the user reads some more.
                 */
                sk->ack_backlog++;
        /*
         * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
         * if the other end is offering a window smaller than the agreed on MSS
         * (called sk->mtu here).  In theory there's no connection between send
         * and receive, and so no reason to think that they're going to send
         * small packets.  For the moment I'm using the hack of reducing the mss
         * only on the send side, so I'm putting mtu here.
         */

                if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
                {
                        /* Send an ack right now. */
                        tcp_read_wakeup(sk);
                } 
                else 
                {
                        /* Force it to send an ack soon. */
                        /* Keep an earlier timer if it would fire sooner. */
                        int was_active = del_timer(&sk->retransmit_timer);
                        if (!was_active || TCP_ACK_TIME < sk->timer.expires) 
                        {
                                reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
                        } 
                        else
                                add_timer(&sk->retransmit_timer);
                }
        }
} 
1951 
1952 
1953 /*
1954  *      Handle reading urgent data. BSD has very simple semantics for
1955  *      this, no blocking and very strange errors 8)
1956  */
1957  
/*
 *	Read the single byte of out-of-band (urgent) data, BSD style.
 *	Never blocks: returns 1 with the byte stored via put_fs_byte(),
 *	0 at end-of-data, or a negative errno.  'nonblock' is unused by
 *	design (OOB reads never block) and 'len' is not checked -
 *	exactly one byte is written when URG data is valid.
 *	NOTE(review): a len == 0 caller would still get one byte written;
 *	presumably callers always pass len >= 1 - verify at call sites.
 */
static int tcp_read_urg(struct sock * sk, int nonblock,
             unsigned char *to, int len, unsigned flags)
{
        /*
         *      No URG data to read
         */
        if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
                return -EINVAL; /* Yes this is right ! */
                
        if (sk->err) 
        {
                int tmp = -sk->err;
                sk->err = 0;
                return tmp;
        }

        if (sk->state == TCP_CLOSE || sk->done) 
        {
                /* First read after close reports clean EOF; later ones error. */
                if (!sk->done) {
                        sk->done = 1;
                        return 0;
                }
                return -ENOTCONN;
        }

        if (sk->shutdown & RCV_SHUTDOWN) 
        {
                sk->done = 1;
                return 0;
        }
        sk->inuse = 1;
        if (sk->urg_data & URG_VALID) 
        {
                /* Low byte of urg_data holds the urgent byte itself. */
                char c = sk->urg_data;
                if (!(flags & MSG_PEEK))
                        sk->urg_data = URG_READ;        /* consume it */
                put_fs_byte(c, to);
                release_sock(sk);
                return 1;
        }
        release_sock(sk);
        
        /*
         * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
         * the available implementations agree in this case:
         * this call should never block, independent of the
         * blocking state of the socket.
         * Mike <pall@rz.uni-karlsruhe.de>
         */
        return -EAGAIN;
}
2009 
2010 
2011 /*
2012  *      This routine copies from a sock struct into the user buffer. 
2013  */
2014  
/*
 *	Copy received data from the socket's receive queue into the user
 *	buffer 'to'.  Supports MSG_PEEK (leave data on the queue) and
 *	MSG_OOB (delegated to tcp_read_urg).  Returns bytes copied, or a
 *	negative errno; a partial count is returned in preference to an
 *	error once any data has been copied.
 */
static int tcp_read(struct sock *sk, unsigned char *to,
        int len, int nonblock, unsigned flags)
{
        struct wait_queue wait = { current, NULL };
        int copied = 0;
        unsigned long peek_seq;
        volatile unsigned long *seq;    /* So gcc doesn't overoptimise */
        unsigned long used;

        /* 
         *      This error should be checked. 
         */
         
        if (sk->state == TCP_LISTEN)
                return -ENOTCONN;

        /*
         *      Urgent data needs to be handled specially. 
         */
         
        if (flags & MSG_OOB)
                return tcp_read_urg(sk, nonblock, to, len, flags);

        /*
         *      Copying sequence to update. This is volatile to handle
         *      the multi-reader case neatly (memcpy_to/fromfs might be 
         *      inline and thus not flush cached variables otherwise).
         *      A PEEK advances a private copy so the queue state is
         *      left untouched.
         */
         
        peek_seq = sk->copied_seq;
        seq = &sk->copied_seq;
        if (flags & MSG_PEEK)
                seq = &peek_seq;

        add_wait_queue(sk->sleep, &wait);
        sk->inuse = 1;
        while (len > 0) 
        {
                struct sk_buff * skb;
                unsigned long offset;   /* offset of *seq into skb's data */
        
                /*
                 * Are we at urgent data? Stop if we have read anything.
                 */
                 
                if (copied && sk->urg_data && sk->urg_seq == *seq)
                        break;

                /*
                 *      Next get a buffer.
                 */
                 
                current->state = TASK_INTERRUPTIBLE;

                /* Walk the receive queue looking for the segment at *seq. */
                skb = skb_peek(&sk->receive_queue);
                do 
                {
                        if (!skb)
                                break;
                        if (before(*seq, skb->h.th->seq))
                                break;
                        offset = *seq - skb->h.th->seq;
                        if (skb->h.th->syn)
                                offset--;       /* SYN occupies one sequence number */
                        if (offset < skb->len)
                                goto found_ok_skb;
                        if (skb->h.th->fin)
                                goto found_fin_ok;
                        if (!(flags & MSG_PEEK))
                                skb->used = 1;  /* fully consumed, cleanup_rbuf may free */
                        skb = skb->next;
                }
                while (skb != (struct sk_buff *)&sk->receive_queue);

                /* No data at *seq.  Return what we have, or decide to wait. */
                if (copied)
                        break;

                if (sk->err) 
                {
                        copied = -sk->err;
                        sk->err = 0;
                        break;
                }

                if (sk->state == TCP_CLOSE) 
                {
                        /* First read after close is a clean EOF. */
                        if (!sk->done) 
                        {
                                sk->done = 1;
                                break;
                        }
                        copied = -ENOTCONN;
                        break;
                }

                if (sk->shutdown & RCV_SHUTDOWN) 
                {
                        sk->done = 1;
                        break;
                }
                        
                if (nonblock) 
                {
                        copied = -EAGAIN;
                        break;
                }

                /* Ack what we've consumed, then sleep until more arrives. */
                cleanup_rbuf(sk);
                release_sock(sk);
                sk->socket->flags |= SO_WAITDATA;
                schedule();
                sk->socket->flags &= ~SO_WAITDATA;
                sk->inuse = 1;

                if (current->signal & ~current->blocked) 
                {
                        copied = -ERESTARTSYS;
                        break;
                }
                continue;

        found_ok_skb:
                /*
                 *      Lock the buffer. We can be fairly relaxed as
                 *      an interrupt will never steal a buffer we are 
                 *      using unless I've missed something serious in
                 *      tcp_data.
                 */
                
                skb->users++;
                
                /*
                 *      Ok so how much can we use ? 
                 */
                 
                used = skb->len - offset;
                if (len < used)
                        used = len;
                /*
                 *      Do we have urgent data here? 
                 */
                
                if (sk->urg_data) 
                {
                        unsigned long urg_offset = sk->urg_seq - *seq;
                        if (urg_offset < used) 
                        {
                                if (!urg_offset) 
                                {
                                        /* Urgent byte is at *seq: skip it unless inline. */
                                        if (!sk->urginline) 
                                        {
                                                ++*seq;
                                                offset++;
                                                used--;
                                        }
                                }
                                else
                                        /* Stop the copy just short of the urgent byte. */
                                        used = urg_offset;
                        }
                }
                
                /*
                 *      Copy it - We _MUST_ update *seq first so that we
                 *      don't ever double read when we have dual readers
                 */
                 
                *seq += used;

                /*
                 *      This memcpy_tofs can sleep. If it sleeps and we
                 *      do a second read it relies on the skb->users to avoid
                 *      a crash when cleanup_rbuf() gets called.
                 */
                 
                memcpy_tofs(to,((unsigned char *)skb->h.th) +
                        skb->h.th->doff*4 + offset, used);
                copied += used;
                len -= used;
                to += used;
                
                /*
                 *      We now will not sleep again until we are finished
                 *      with skb. Sorry if you are doing the SMP port
                 *      but you'll just have to fix it neatly ;)
                 */
                 
                skb->users --;
                
                /* Once we've read past the urgent point, forget the OOB state. */
                if (after(sk->copied_seq,sk->urg_seq))
                        sk->urg_data = 0;
                if (used + offset < skb->len)
                        continue;
                
                /*
                 *      Process the FIN.
                 */

                if (skb->h.th->fin)
                        goto found_fin_ok;
                if (flags & MSG_PEEK)
                        continue;
                skb->used = 1;
                continue;

        found_fin_ok:
                ++*seq;         /* FIN occupies one sequence number */
                if (flags & MSG_PEEK)
                        break;
                        
                /*
                 *      All is done
                 */
                 
                skb->used = 1;
                sk->shutdown |= RCV_SHUTDOWN;
                break;

        }
        remove_wait_queue(sk->sleep, &wait);
        current->state = TASK_RUNNING;

        /* Clean up data we have read: This will do ACK frames */
        cleanup_rbuf(sk);
        release_sock(sk);
        return copied;
}
2241 
2242 /*
2243  *      State processing on a close. This implements the state shift for
2244  *      sending our FIN frame. Note that we only send a FIN for some 
2245  *      states. A shutdown() may have already sent the FIN, or we may be
2246  *      closed.
2247  */
2248  
/*
 *	Shift the connection state for a close()/shutdown().  Returns
 *	non-zero when the caller must transmit a FIN, zero otherwise.
 *	'dead' is set when no application is left holding the socket,
 *	in which case a FIN_WAIT2 timeout is armed so the peer cannot
 *	pin the socket open forever.
 */
static int tcp_close_state(struct sock *sk, int dead)
{
        int ns=TCP_CLOSE;       /* next state; default is fully closed */
        int send_fin=0;         /* tell the caller to send a FIN? */
        switch(sk->state)
        {
                case TCP_SYN_SENT:      /* No SYN back, no FIN needed */
                        break;
                case TCP_SYN_RECV:
                case TCP_ESTABLISHED:   /* Closedown begin */
                        ns=TCP_FIN_WAIT1;
                        send_fin=1;
                        break;
                case TCP_FIN_WAIT1:     /* Already closing, or FIN sent: no change */
                case TCP_FIN_WAIT2:
                case TCP_CLOSING:
                        ns=sk->state;
                        break;
                case TCP_CLOSE:
                case TCP_LISTEN:
                        break;
                case TCP_CLOSE_WAIT:    /* They have FIN'd us. We send our FIN and
                                           wait only for the ACK */
                        ns=TCP_LAST_ACK;
                        send_fin=1;
                        /* last case - falls out of the switch */
        }
        
        tcp_set_state(sk,ns);
                
        /*
         *      This is a (useful) BSD violation of the RFC. There is a
         *      problem with TCP as specified in that the other end could
         *      keep a socket open forever with no application left this end.
         *      We use a 3 minute timeout (about the same as BSD) then kill
         *      our end. If they send after that then tough - BUT: long enough
         *      that we won't make the old 4*rto = almost no time - whoops
         *      reset mistake.
         */
        if(dead && ns==TCP_FIN_WAIT2)
        {
                /* Only arm the FIN timeout if no timer is already pending. */
                int timer_active=del_timer(&sk->timer);
                if(timer_active)
                        add_timer(&sk->timer);
                else
                        reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
        }
        
        return send_fin;
}
2298 
2299 /*
2300  *      Send a fin.
2301  */
2302 
/*
 *	Build and transmit (or queue) a FIN segment for this socket.
 *	The FIN consumes one unit of sequence space (sk->write_seq is
 *	advanced).  If unsent data is still sitting on the write queue the
 *	FIN is appended behind it rather than sent ahead of the data.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;	/* template header for this connection */
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;	/* re-acquire the socket after the possible sleep */

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */

	buff->sk = sk;
	buff->len = sizeof(*t1);
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff. 
	 */

	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
	if (tmp < 0) 
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost. 
		 *	(Not good).  We still burn the sequence number for
		 *	the FIN and fall back to a TIME_WAIT-style timeout
		 *	if no other timer is already pending.
		 */
		 
		buff->free = 1;
		prot->wfree(sk,buff->mem_addr, buff->mem_len);
		sk->write_seq++;
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}
	
	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */

	t1 =(struct tcphdr *)((char *)t1 +tmp);	/* TCP header sits after the IP/link headers */
	buff->len += tmp;
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	t1->seq = ntohl(sk->write_seq);
	sk->write_seq++;		/* the FIN occupies one sequence number */
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 * If there is data in the write queue, the fin must be appended to
	 * the write queue.
	 */
	
	if (skb_peek(&sk->write_queue) != NULL) 
	{
		buff->free = 0;
		if (buff->next != NULL) 
		{
			/* Should never happen: the buffer is freshly built
			   and must not already be on a list. */
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	} 
	else 
	{
		/* Queue empty: transmit at once and start the retransmit timer */
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2401 
2402 /*
2403  *      Shutdown the sending side of a connection. Much like close except
2404  *      that we don't receive shut down or set sk->dead=1.
2405  */
2406 
2407 void tcp_shutdown(struct sock *sk, int how)
     /* [previous][next][first][last][top][bottom][index][help] */
2408 {
2409         /*
2410          *      We need to grab some memory, and put together a FIN,
2411          *      and then put it into the queue to be sent.
2412          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2413          */
2414 
2415         if (!(how & SEND_SHUTDOWN)) 
2416                 return;
2417          
2418         /*
2419          *      If we've already sent a FIN, or it's a closed state
2420          */
2421          
2422         if (sk->state == TCP_FIN_WAIT1 ||
2423             sk->state == TCP_FIN_WAIT2 ||
2424             sk->state == TCP_CLOSING ||
2425             sk->state == TCP_LAST_ACK ||
2426             sk->state == TCP_TIME_WAIT || 
2427             sk->state == TCP_CLOSE ||
2428             sk->state == TCP_LISTEN
2429           )
2430         {
2431                 return;
2432         }
2433         sk->inuse = 1;
2434 
2435         /*
2436          * flag that the sender has shutdown
2437          */
2438 
2439         sk->shutdown |= SEND_SHUTDOWN;
2440 
2441         /*
2442          *  Clear out any half completed packets. 
2443          */
2444 
2445         if (sk->partial)
2446                 tcp_send_partial(sk);
2447                 
2448         /*
2449          *      FIN if needed
2450          */
2451          
2452         if(tcp_close_state(sk,0))
2453                 tcp_send_fin(sk);
2454                 
2455         release_sock(sk);
2456 }
2457 
2458 
2459 static int
2460 tcp_recvfrom(struct sock *sk, unsigned char *to,
     /* [previous][next][first][last][top][bottom][index][help] */
2461              int to_len, int nonblock, unsigned flags,
2462              struct sockaddr_in *addr, int *addr_len)
2463 {
2464         int result;
2465   
2466         /* 
2467          *      Have to check these first unlike the old code. If 
2468          *      we check them after we lose data on an error
2469          *      which is wrong 
2470          */
2471 
2472         if(addr_len)
2473                 *addr_len = sizeof(*addr);
2474         result=tcp_read(sk, to, to_len, nonblock, flags);
2475 
2476         if (result < 0) 
2477                 return(result);
2478   
2479         if(addr)
2480         {
2481                 addr->sin_family = AF_INET;
2482                 addr->sin_port = sk->dummy_th.dest;
2483                 addr->sin_addr.s_addr = sk->daddr;
2484         }
2485         return(result);
2486 }
2487 
2488 
2489 /*
2490  *      This routine will send an RST to the other tcp. 
2491  */
2492  
/*
 *	Send an RST in reply to an offending segment.  There is no socket
 *	context here: the reply is constructed purely from the incoming
 *	header, with source/destination swapped.
 */
static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (Think about it).
	 */
	 
	if(th->rst)
		return;
  
	/*
	 * We need to grab some memory, and put together an RST,
	 * and then put it into the queue to be sent.
	 */

	buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL) 
		return;		/* no memory: silently drop, peer will retry */

	buff->len = sizeof(*t1);
	buff->sk = NULL;	/* no owning socket to charge this buffer to */
	buff->dev = dev;
	buff->localroute = 0;

	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff. 
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl);
	if (tmp < 0) 
	{
		/* Could not build a route/header: release and give up */
		buff->free = 1;
		prot->wfree(NULL, buff->mem_addr, buff->mem_len);
		return;
	}

	t1 =(struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive. 
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;  
	t1->window = 0;
  
	if(th->ack)
	{
		/*
		 *	The incoming segment carried an ACK: take our
		 *	sequence number from its ack field, no ACK of ours.
		 */
		t1->ack = 0;
		t1->seq = th->ack_seq;
		t1->ack_seq = 0;
	}
	else
	{
		/*
		 *	No ACK on the incoming segment: use seq 0 and ACK
		 *	what they sent (a SYN occupies one sequence number,
		 *	hence the +1).
		 */
		t1->ack = 1;
		if(!th->syn)
			t1->ack_seq=htonl(th->seq);
		else
			t1->ack_seq=htonl(th->seq+1);
		t1->seq=0;
	}

	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2575 
2576 
2577 /*
2578  *      Look for tcp options. Parses everything but only knows about MSS.
2579  *      This routine is always called with the packet containing the SYN.
2580  *      However it may also be called with the ack to the SYN.  So you
2581  *      can't assume this is always the SYN.  It's always called after
2582  *      we have set up sk->mtu to our own MTU.
2583  *
2584  *      We need at minimum to add PAWS support here. Possibly large windows
2585  *      as Linux gets deployed on 100Mb/sec networks.
2586  */
2587  
2588 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
2589 {
2590         unsigned char *ptr;
2591         int length=(th->doff*4)-sizeof(struct tcphdr);
2592         int mss_seen = 0;
2593     
2594         ptr = (unsigned char *)(th + 1);
2595   
2596         while(length>0)
2597         {
2598                 int opcode=*ptr++;
2599                 int opsize=*ptr++;
2600                 switch(opcode)
2601                 {
2602                         case TCPOPT_EOL:
2603                                 return;
2604                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
2605                                 length--;
2606                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
2607                                 continue;
2608                         
2609                         default:
2610                                 if(opsize<=2)   /* Avoid silly options looping forever */
2611                                         return;
2612                                 switch(opcode)
2613                                 {
2614                                         case TCPOPT_MSS:
2615                                                 if(opsize==4 && th->syn)
2616                                                 {
2617                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2618                                                         mss_seen = 1;
2619                                                 }
2620                                                 break;
2621                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2622                                 }
2623                                 ptr+=opsize-2;
2624                                 length-=opsize;
2625                 }
2626         }
2627         if (th->syn) 
2628         {
2629                 if (! mss_seen)
2630                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2631         }
2632 #ifdef CONFIG_INET_PCTCP
2633         sk->mss = min(sk->max_window >> 1, sk->mtu);
2634 #else    
2635         sk->mss = min(sk->max_window, sk->mtu);
2636 #endif  
2637 }
2638 
/*
 *	Return the classful network mask (network byte order) implied by
 *	the destination address dst (also network byte order).
 */
static inline unsigned long default_mask(unsigned long dst)
{
	unsigned long host = ntohl(dst);	/* class bits live in the top byte */

	if (IN_CLASSA(host))
		return htonl(IN_CLASSA_NET);
	if (IN_CLASSB(host))
		return htonl(IN_CLASSB_NET);
	return htonl(IN_CLASSC_NET);
}
2648 
2649 /*
2650  *      Default sequence number picking algorithm.
2651  *      As close as possible to RFC 793, which
2652  *      suggests using a 250kHz clock.
2653  *      Further reading shows this assumes 2MB/s networks.
2654  *      For 10MB/s ethernet, a 1MHz clock is appropriate.
2655  *      That's funny, Linux has one built in!  Use it!
2656  */
2657 
2658 extern inline unsigned long tcp_init_seq(void)
     /* [previous][next][first][last][top][bottom][index][help] */
2659 {
2660         struct timeval tv;
2661         do_gettimeofday(&tv);
2662         return tv.tv_usec+tv.tv_sec*1000000;
2663 }
2664 
2665 /*
2666  *      This routine handles a connection request.
2667  *      It should make sure we haven't already responded.
2668  *      Because of the way BSD works, we have to send a syn/ack now.
2669  *      This also means it will be harder to close a socket which is
2670  *      listening.
2671  */
2672  
/*
 *	Handle an incoming connection request (SYN) on a listening socket.
 *	Builds a new embryonic sock in SYN_RECV, sends the SYN|ACK at once
 *	(BSD semantics), and parks the SYN skb on the listener's receive
 *	queue for accept() to pick up.  The new sock has no inode attached
 *	yet; wake ups go to the listening socket until accept time.
 */
static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev, unsigned long seq)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	struct device *ndev=NULL;
	int tmp;
	struct rtable *rt;
  
	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead) 
	{
		sk->data_ready(sk,0);	/* wake the listener: a connection is pending */
	}
	else 
	{
		if(sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * Make sure we can accept more.  This will prevent a
	 * flurry of syns from eating up all our memory.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog) 
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * We need to build a new sock struct.
	 * It is sort of bad to have a socket without an inode attached
	 * to it, but the wake_up's will just wake up the listening socket,
	 * and if the listening socket is destroyed before this is taken
	 * off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL) 
	{
		/* just ignore the syn.  It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Clone the listener, then reset every per-connection field:
	 *	queues, timers, congestion state, sequence numbers.
	 */
	memcpy(newsk, sk, sizeof(*newsk));
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;		/* slow start: one segment */
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->h.th->seq+1;	/* the SYN consumes one sequence number */
	newsk->copied_seq = skb->h.th->seq+1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;		/* our initial sequence number (caller-chosen) */
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger=0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function=&retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;
	
	/*
	 *	Swap these two, they are from our point of view. 
	 */
	 
	newsk->daddr = saddr;
	newsk->saddr = daddr;

	put_sock(newsk->num,newsk);	/* enter the new sock in the port hash */
	newsk->dummy_th.res1 = 0;
	newsk->dummy_th.doff = 6;
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;	
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	newsk->acked_seq = skb->h.th->seq + 1;	/* NOTE(review): already set above; kept as-is */
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them 
	 */

	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for 
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk 
	 */

	rt=ip_rt_route(saddr, NULL,NULL);
	
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;
		
	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
		newsk->mtu = rt->rt_mss - HEADER_SIZE;
	else 
	{
		/* No explicit MSS: be conservative (576) off-net, generous on-net */
#ifdef CONFIG_INET_SNARL	/* Sub Nets Are Local */
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			newsk->mtu = 576 - HEADER_SIZE;
		else
			newsk->mtu = MAX_WINDOW;
	}

	/*
	 *	But not bigger than device MTU 
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	This will min with what arrived in the packet 
	 */

	tcp_options(newsk,skb->h.th);

	/*
	 *	Build and send the SYN|ACK reply.
	 */
	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL) 
	{
		sk->err = -ENOMEM;
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		/* And this will destroy it */
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}
  
	buff->len = sizeof(struct tcphdr)+4;	/* header plus 4 bytes of MSS option */
	buff->sk = newsk;
	buff->localroute = newsk->localroute;

	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff. 
	 */

	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

	/*
	 *	Something went wrong. 
	 */

	if (tmp < 0) 
	{
		sk->err = tmp;
		buff->free = 1;
		kfree_skb(buff,FREE_WRITE);
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		release_sock(newsk);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);
  
	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->write_seq;
	/*
	 *	Swap the send and the receive. 
	 */
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->write_seq++);	/* our SYN also takes a sequence number */
	t1->ack = 1;
	newsk->window = tcp_select_window(newsk);
	newsk->sent_seq = newsk->write_seq;
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->ack_seq = ntohl(skb->h.th->seq+1);
	t1->doff = sizeof(*t1)/4+1;	/* header plus one 32-bit option word */
	/* MSS option: kind 2, length 4, 16-bit MSS value */
	ptr =(unsigned char *)(t1+1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] =(newsk->mtu) & 0xff;

	tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
	reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
	skb->sk = newsk;

	/*
	 *	Charge the sock_buff to newsk. 
	 */
	 
	sk->rmem_alloc -= skb->mem_len;
	newsk->rmem_alloc += skb->mem_len;
	
	skb_queue_tail(&sk->receive_queue,skb);
	sk->ack_backlog++;
	release_sock(newsk);
	tcp_statistics.TcpOutSegs++;
}
2944 
2945 
/*
 *	Close a TCP socket.  timeout non-zero means a forced close (go
 *	straight to CLOSE, e.g. on linger expiry); timeout zero is the
 *	normal descriptor close, which flushes unread data and performs
 *	the FIN state shift.
 */
static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 * We need to grab some memory, and put together a FIN, 
	 * and then put it into the queue to be sent.
	 */
	
	sk->inuse = 1;
	
	if(sk->state == TCP_LISTEN)
	{
		/* Special case: a listener has no peer, so just drop to
		   CLOSE and reap any embryonic connections still queued. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}
	
	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;	/* both directions are now shut */

	if (!sk->dead) 
		sk->state_change(sk);

	if (timeout == 0) 
	{
		struct sk_buff *skb;
		
		/*
		 *  We need to flush the recv. buffs.  We do this only on the
		 *  descriptor close, not protocol-sourced closes, because the
		 *  reader process may not have drained the data yet!
		 */
		 
		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *	Get rid of any half-completed packets. 
		 */

		if (sk->partial) 
			tcp_send_partial(sk);
	}

		
	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */
	 
	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* tcp_close_state returns 1 when a FIN must go out */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
3009 
3010 
3011 /*
3012  *      This routine takes stuff off of the write queue,
3013  *      and puts it in the xmit queue. This happens as incoming acks
3014  *      open up the remote window for us.
3015  */
3016  
/*
 *	Move segments from the write queue onto the wire as the incoming
 *	ACKs open the peer's window.  A segment is sent only if it fits
 *	the advertised window, we are not retransmitting (Nagle) and the
 *	congestion window is not exhausted.
 */
static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy 
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */
	 
	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window) 
	{
		IS_SKB(skb);
		skb_unlink(skb);
		
		/*
		 *	See if we really need to send the packet. 
		 */
		 
		if (before(skb->h.seq, sk->rcv_ack_seq +1)) 
		{
			/*
			 *	This is acked data. We can discard it. This 
			 *	cannot currently occur.
			 */
			 
			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead) 
				sk->write_space(sk);
		} 
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
/*
 * put in the ack seq and window at this point rather than earlier,
 * in order to keep them monotonic.  We really want to avoid taking
 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
 * Ack and window will in general have changed since this packet was put
 * on the write queue.
 */
			iph = (struct iphdr *)(skb->data +
					       skb->dev->hard_header_len);
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			/* checksum covers the TCP header + payload only */
			size = skb->len - (((unsigned char *) th) - skb->data);
			
			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;
			
			/*
			 *	IP manages our queue for some crazy reason
			 */
			 
			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
			
			/*
			 *	Again we slide the timer wrongly
			 */
			 
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3101 
3102 
3103 /*
3104  *      This routine deals with incoming acks, but not outgoing ones.
3105  */
3106 
/*
 *	Process an incoming ACK for this socket.  Handles window updates
 *	(including illegal window shrinks), slow start / congestion
 *	avoidance, RTT estimation (Jacobson '88 with Karn's sampling rule),
 *	retransmit-queue trimming, xmit/probe/keepalive timer management,
 *	and the ACK-driven state transitions of the close handshake
 *	(LAST_ACK, FIN_WAIT1, CLOSING, SYN_RECV).
 *
 *	Returns 0 only for an ACK above sent_seq (keepalive processing
 *	wants the caller to see it); returns 1 otherwise.
 */
extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
        unsigned long ack;
        int flag = 0;

        /* 
         * Bits accumulated in 'flag' (consumed by the retransmit test at
         * the bottom of this function):
         * 1 - there was data in packet as well as ack or new data is sent or 
         *     in shutdown state
         * 2 - data from retransmit queue was acked and removed
         * 4 - window shrunk or data from retransmit queue was acked and removed
         */

        if(sk->zapped)
                return(1);      /* Dead, cant ack any more so why bother */

        /*
         *      Have we discovered a larger window
         */
         
        ack = ntohl(th->ack_seq);

        if (ntohs(th->window) > sk->max_window) 
        {
                sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
                /* Hack because we don't send partial packets to non SWS
                   handling hosts */
                sk->mss = min(sk->max_window>>1, sk->mtu);
#else
                sk->mss = min(sk->max_window, sk->mtu);
#endif  
        }

        /*
         *      We have dropped back to keepalive timeouts. Thus we have
         *      no retransmits pending.
         */
         
        if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
                sk->retransmits = 0;

        /*
         *      If the ack is newer than sent or older than previous acks
         *      then we can probably ignore it.
         */
         
        if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
        {
                if(sk->debug)
                        printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);
                        
                /*
                 *      Keepalive processing.
                 */
                 
                if (after(ack, sk->sent_seq)) 
                {
                        return(0);
                }
                
                /*
                 *      Restart the keepalive timer.
                 */
                 
                if (sk->keepopen) 
                {
                        if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
                                reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
                }
                return(1);
        }

        /*
         *      If there is data set flag 1
         */
         
        if (len != th->doff*4) 
                flag |= 1;

        /*
         *      See if our window has been shrunk. 
         */

        if (after(sk->window_seq, ack+ntohs(th->window))) 
        {
                /*
                 * We may need to move packets from the send queue
                 * to the write queue, if the window has been shrunk on us.
                 * The RFC says you are not allowed to shrink your window
                 * like this, but if the other end does, you must be able
                 * to deal with it.
                 */
                struct sk_buff *skb;
                struct sk_buff *skb2;
                struct sk_buff *wskb = NULL;    /* tail of packets pushed back onto write_queue */
        
                skb2 = sk->send_head;
                sk->send_head = NULL;
                sk->send_tail = NULL;
        
                /*
                 *      This is an artifact of a flawed concept. We want one
                 *      queue and a smarter send routine when we send all.
                 */
        
                flag |= 4;      /* Window changed */
        
                sk->window_seq = ack + ntohs(th->window);
                cli();
                /* Walk the old retransmit queue: packets now beyond the
                   right window edge go back onto write_queue (in order),
                   the rest are relinked as the new retransmit queue. */
                while (skb2 != NULL) 
                {
                        skb = skb2;
                        skb2 = skb->link3;
                        skb->link3 = NULL;
                        if (after(skb->h.seq, sk->window_seq)) 
                        {
                                if (sk->packets_out > 0) 
                                        sk->packets_out--;
                                /* We may need to remove this from the dev send list. */
                                if (skb->next != NULL) 
                                {
                                        skb_unlink(skb);                                
                                }
                                /* Now add it to the write_queue. */
                                if (wskb == NULL)
                                        skb_queue_head(&sk->write_queue,skb);
                                else
                                        skb_append(wskb,skb);
                                wskb = skb;
                        } 
                        else 
                        {
                                if (sk->send_head == NULL) 
                                {
                                        sk->send_head = skb;
                                        sk->send_tail = skb;
                                }
                                else
                                {
                                        sk->send_tail->link3 = skb;
                                        sk->send_tail = skb;
                                }
                                skb->link3 = NULL;
                        }
                }
                sti();
        }

        /*
         *      Pipe has emptied
         */
         
        if (sk->send_tail == NULL || sk->send_head == NULL) 
        {
                sk->send_head = NULL;
                sk->send_tail = NULL;
                sk->packets_out= 0;
        }

        /*
         *      Update the right hand window edge of the host
         */
         
        sk->window_seq = ack + ntohs(th->window);

        /*
         *      We don't want too many packets out there. 
         */
         
        if (sk->ip_xmit_timeout == TIME_WRITE && 
                sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
        {
                /* 
                 * This is Jacobson's slow start and congestion avoidance. 
                 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
                 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
                 * counter and increment it once every cwnd times.  It's possible
                 * that this should be done only if sk->retransmits == 0.  I'm
                 * interpreting "new data is acked" as including data that has
                 * been retransmitted but is just now being acked.
                 */
                if (sk->cong_window < sk->ssthresh)  
                        /* 
                         *      In "safe" area, increase
                         */
                        sk->cong_window++;
                else 
                {
                        /*
                         *      In dangerous area, increase slowly.  In theory this is
                         *      sk->cong_window += 1 / sk->cong_window
                         */
                        if (sk->cong_count >= sk->cong_window) 
                        {
                                sk->cong_window++;
                                sk->cong_count = 0;
                        }
                        else 
                                sk->cong_count++;
                }
        }

        /*
         *      Remember the highest ack received.
         */
         
        sk->rcv_ack_seq = ack;

        /*
         *      If this ack opens up a zero window, clear backoff.  It was
         *      being used to time the probes, and is probably far higher than
         *      it needs to be for normal retransmission.
         */

        if (sk->ip_xmit_timeout == TIME_PROBE0) 
        {
                sk->retransmits = 0;    /* Our probe was answered */
                
                /*
                 *      Was it a usable window open ?
                 */
                 
                if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
                    ! before (sk->window_seq, sk->write_queue.next->h.seq)) 
                {
                        sk->backoff = 0;
                        
                        /*
                         *      Recompute rto from rtt.  this eliminates any backoff.
                         */

                        sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
                        if (sk->rto > 120*HZ)
                                sk->rto = 120*HZ;
                        if (sk->rto < 20)       /* Was 1*HZ, then 1 - turns out we must allow about
                                                   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
                                                   .2 of a second is going to need huge windows (SIGH) */
                        sk->rto = 20;   /* NOTE(review): misleading indentation, but correct -
                                           this is the sole statement of the 'if' above */
                }
        }

        /* 
         *      See if we can take anything off of the retransmit queue.
         */
   
        while(sk->send_head != NULL) 
        {
                /* Check for a bug. */
                if (sk->send_head->link3 &&
                    after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) 
                        printk("INET: tcp.c: *** bug send_list out of order.\n");
                        
                /*
                 *      If our packet is before the ack sequence we can
                 *      discard it as it's confirmed to have arrived the other end.
                 */
                 
                if (before(sk->send_head->h.seq, ack+1)) 
                {
                        struct sk_buff *oskb;   
                        if (sk->retransmits) 
                        {       
                                /*
                                 *      We were retransmitting.  don't count this in RTT est 
                                 */
                                flag |= 2;

                                /*
                                 * even though we've gotten an ack, we're still
                                 * retransmitting as long as we're sending from
                                 * the retransmit queue.  Keeping retransmits non-zero
                                 * prevents us from getting new data interspersed with
                                 * retransmissions.
                                 */

                                if (sk->send_head->link3)       /* Any more queued retransmits? */
                                        sk->retransmits = 1;
                                else
                                        sk->retransmits = 0;
                        }
                        /*
                         * Note that we only reset backoff and rto in the
                         * rtt recomputation code.  And that doesn't happen
                         * if there were retransmissions in effect.  So the
                         * first new packet after the retransmissions is
                         * sent with the backoff still in effect.  Not until
                         * we get an ack from a non-retransmitted packet do
                         * we reset the backoff and rto.  This allows us to deal
                         * with a situation where the network delay has increased
                         * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
                         */

                        /*
                         *      We have one less packet out there. 
                         */
                         
                        if (sk->packets_out > 0) 
                                sk->packets_out --;
                        /* 
                         *      Wake up the process, it can probably write more. 
                         */
                        if (!sk->dead) 
                                sk->write_space(sk);
                        oskb = sk->send_head;

                        if (!(flag&2))  /* Not retransmitting */
                        {
                                long m;
        
                                /*
                                 *      The following amusing code comes from Jacobson's
                                 *      article in SIGCOMM '88.  Note that rtt and mdev
                                 *      are scaled versions of rtt and mean deviation.
                                 *      This is designed to be as fast as possible 
                                 *      m stands for "measurement".
                                 */
        
                                m = jiffies - oskb->when;  /* RTT */
                                if(m<=0)
                                        m=1;            /* IS THIS RIGHT FOR <0 ??? */
                                m -= (sk->rtt >> 3);    /* m is now error in rtt est */
                                sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
                                if (m < 0)
                                        m = -m;         /* m is now abs(error) */
                                m -= (sk->mdev >> 2);   /* similar update on mdev */
                                sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
        
                                /*
                                 *      Now update timeout.  Note that this removes any backoff.
                                 */
                         
                                sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
                                if (sk->rto > 120*HZ)
                                        sk->rto = 120*HZ;
                                if (sk->rto < 20)       /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
                                        sk->rto = 20;
                                sk->backoff = 0;
                        }
                        flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
                                           In this case as we just set it up */
                        cli();
                        oskb = sk->send_head;
                        IS_SKB(oskb);
                        sk->send_head = oskb->link3;
                        if (sk->send_head == NULL) 
                        {
                                sk->send_tail = NULL;
                        }

                /*
                 *      We may need to remove this from the dev send list. 
                 */

                        if (oskb->next)
                                skb_unlink(oskb);
                        sti();
                        kfree_skb(oskb, FREE_WRITE); /* write. */
                        if (!sk->dead) 
                                sk->write_space(sk);
                }
                else
                {
                        break;
                }
        }

        /*
         * XXX someone ought to look at this too.. at the moment, if skb_peek()
         * returns non-NULL, we completely ignore the timer stuff in the else
         * clause.  We ought to organize the code so that else clause can
         * (should) be executed regardless, possibly moving the PROBE timer
         * reset over.  The skb_peek() thing should only move stuff to the
         * write queue, NOT also manage the timer functions.
         */

        /*
         * Maybe we can take some stuff off of the write queue,
         * and put it onto the xmit queue.
         */
        if (skb_peek(&sk->write_queue) != NULL) 
        {
                if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
                        (sk->retransmits == 0 || 
                         sk->ip_xmit_timeout != TIME_WRITE ||
                         before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
                        && sk->packets_out < sk->cong_window) 
                {
                        /*
                         *      Add more data to the send queue.
                         */
                        flag |= 1;
                        tcp_write_xmit(sk);
                }
                else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
                        sk->send_head == NULL &&
                        sk->ack_backlog == 0 &&
                        sk->state != TCP_TIME_WAIT) 
                {
                        /*
                         *      Data to queue but no room.
                         */
                        reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
                }               
        }
        else
        {
                /*
                 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
                 * from TCP_CLOSE we don't do anything
                 *
                 * from anything else, if there is write data (or fin) pending,
                 * we use a TIME_WRITE timeout, else if keepalive we reset to
                 * a KEEPALIVE timeout, else we delete the timer.
                 *
                 * We do not set flag for nominal write data, otherwise we may
                 * force a state where we start to write itsy bitsy tidbits
                 * of data.
                 */

                switch(sk->state) {
                case TCP_TIME_WAIT:
                        /*
                         * keep us in TIME_WAIT until we stop getting packets,
                         * reset the timeout.
                         */
                        reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                        break;
                case TCP_CLOSE:
                        /*
                         * don't touch the timer.
                         */
                        break;
                default:
                        /*
                         *      Must check send_head, write_queue, and ack_backlog
                         *      to determine which timeout to use.
                         */
                        if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
                                reset_xmit_timer(sk, TIME_WRITE, sk->rto);
                        } else if (sk->keepopen) {
                                reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
                        } else {
                                del_timer(&sk->retransmit_timer);
                                sk->ip_xmit_timeout = 0;
                        }
                        break;
                }
        }

        /*
         *      We have nothing queued but space to send. Send any partial
         *      packets immediately (end of Nagle rule application).
         */
         
        if (sk->packets_out == 0 && sk->partial != NULL &&
                skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
        {
                flag |= 1;
                tcp_send_partial(sk);
        }

        /*
         * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
         * we are now waiting for an acknowledge to our FIN.  The other end is
         * already in TIME_WAIT.
         *
         * Move to TCP_CLOSE on success.
         */

        if (sk->state == TCP_LAST_ACK) 
        {
                if (!sk->dead)
                        sk->state_change(sk);
                if(sk->debug)
                        printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
                                sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
                if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
                {
                        flag |= 1;
                        tcp_set_state(sk,TCP_CLOSE);
                        sk->shutdown = SHUTDOWN_MASK;
                }
        }

        /*
         *      Incoming ACK to a FIN we sent in the case of our initiating the close.
         *
         *      Move to FIN_WAIT2 to await a FIN from the other end. Set
         *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
         */

        if (sk->state == TCP_FIN_WAIT1) 
        {

                if (!sk->dead) 
                        sk->state_change(sk);
                if (sk->rcv_ack_seq == sk->write_seq) 
                {
                        flag |= 1;
                        sk->shutdown |= SEND_SHUTDOWN;
                        tcp_set_state(sk, TCP_FIN_WAIT2);
                }
        }

        /*
         *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
         *
         *      Move to TIME_WAIT
         */

        if (sk->state == TCP_CLOSING) 
        {

                if (!sk->dead) 
                        sk->state_change(sk);
                if (sk->rcv_ack_seq == sk->write_seq) 
                {
                        flag |= 1;
                        tcp_time_wait(sk);
                }
        }
        
        /*
         *      Final ack of a three way shake 
         */
         
        if(sk->state==TCP_SYN_RECV)
        {
                tcp_set_state(sk, TCP_ESTABLISHED);
                tcp_options(sk,th);
                sk->dummy_th.dest=th->source;
                sk->copied_seq = sk->acked_seq;
                if(!sk->dead)
                        sk->state_change(sk);
                if(sk->max_window==0)
                {
                        sk->max_window=32;      /* Sanity check */
                        sk->mss=min(sk->max_window,sk->mtu);
                }
        }
        
        /*
         * I make no guarantees about the first clause in the following
         * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
         * what conditions "!flag" would be true.  However I think the rest
         * of the conditions would prevent that from causing any
         * unnecessary retransmission. 
         *   Clearly if the first packet has expired it should be 
         * retransmitted.  The other alternative, "flag&2 && retransmits", is
         * harder to explain:  You have to look carefully at how and when the
         * timer is set and with what timeout.  The most recent transmission always
         * sets the timer.  So in general if the most recent thing has timed
         * out, everything before it has as well.  So we want to go ahead and
         * retransmit some more.  If we didn't explicitly test for this
         * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
         * would not be true.  If you look at the pattern of timing, you can
         * show that rto is increased fast enough that the next packet would
         * almost never be retransmitted immediately.  Then you'd end up
         * waiting for a timeout to send each packet on the retransmission
         * queue.  With my implementation of the Karn sampling algorithm,
         * the timeout would double each time.  The net result is that it would
         * take a hideous amount of time to recover from a single dropped packet.
         * It's possible that there should also be a test for TIME_WRITE, but
         * I think as long as "send_head != NULL" and "retransmit" is on, we've
         * got to be in real retransmission mode.
         *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
         * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
         * As long as no further losses occur, this seems reasonable.
         */
        
        if (((!flag) || (flag&4)) && sk->send_head != NULL &&
               (((flag&2) && sk->retransmits) ||
               (sk->send_head->when + sk->rto < jiffies))) 
        {
                if(sk->send_head->when + sk->rto < jiffies)
                        tcp_retransmit(sk,0);   
                else
                {
                        tcp_do_retransmit(sk, 1);
                        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
                }
        }

        return(1);
}
3692 
3693 
3694 /*
3695  *      Process the FIN bit. This now behaves as it is supposed to work
3696  *      and the FIN takes effect when it is validly part of sequence
3697  *      space. Not before when we get holes.
3698  *
3699  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3700  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
3701  *      TIME-WAIT)
3702  *
3703  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
3704  *      close and we go into CLOSING (and later onto TIME-WAIT)
3705  *
3706  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3707  *
3708  */
3709  
3710 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
3711 {
3712         sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3713 
3714         if (!sk->dead) 
3715         {
3716                 sk->state_change(sk);
3717                 sock_wake_async(sk->socket, 1);
3718         }
3719 
3720         switch(sk->state) 
3721         {
3722                 case TCP_SYN_RECV:
3723                 case TCP_SYN_SENT:
3724                 case TCP_ESTABLISHED:
3725                         /*
3726                          * move to CLOSE_WAIT, tcp_data() already handled
3727                          * sending the ack.
3728                          */
3729                         tcp_set_state(sk,TCP_CLOSE_WAIT);
3730                         if (th->rst)
3731                                 sk->shutdown = SHUTDOWN_MASK;
3732                         break;
3733 
3734                 case TCP_CLOSE_WAIT:
3735                 case TCP_CLOSING:
3736                         /*
3737                          * received a retransmission of the FIN, do
3738                          * nothing.
3739                          */
3740                         break;
3741                 case TCP_TIME_WAIT:
3742                         /*
3743                          * received a retransmission of the FIN,
3744                          * restart the TIME_WAIT timer.
3745                          */
3746                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3747                         return(0);
3748                 case TCP_FIN_WAIT1:
3749                         /*
3750                          * This case occurs when a simultaneous close
3751                          * happens, we must ack the received FIN and
3752                          * enter the CLOSING state.
3753                          *
3754                          * This causes a WRITE timeout, which will either
3755                          * move on to TIME_WAIT when we timeout, or resend
3756                          * the FIN properly (maybe we get rid of that annoying
3757                          * FIN lost hang). The TIME_WRITE code is already correct
3758                          * for handling this timeout.
3759                          */
3760 
3761                         if(sk->ip_xmit_timeout != TIME_WRITE)
3762                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3763                         tcp_set_state(sk,TCP_CLOSING);
3764                         break;
3765                 case TCP_FIN_WAIT2:
3766                         /*
3767                          * received a FIN -- send ACK and enter TIME_WAIT
3768                          */
3769                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3770                         sk->shutdown|=SHUTDOWN_MASK;
3771                         tcp_set_state(sk,TCP_TIME_WAIT);
3772                         break;
3773                 case TCP_CLOSE:
3774                         /*
3775                          * already in CLOSE
3776                          */
3777                         break;
3778                 default:
3779                         tcp_set_state(sk,TCP_LAST_ACK);
3780         
3781                         /* Start the timers. */
3782                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3783                         return(0);
3784         }
3785 
3786         return(0);
3787 }
3788 
3789 
3790 
3791 /*
3792  *      This routine handles the data.  If there is room in the buffer,
3793  *      it will be have already been moved into it.  If there is no
3794  *      room, then we will just have to discard the packet.
3795  */
3796 
3797 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /* [previous][next][first][last][top][bottom][index][help] */
3798          unsigned long saddr, unsigned short len)
3799 {
3800         struct sk_buff *skb1, *skb2;
3801         struct tcphdr *th;
3802         int dup_dumped=0;
3803         unsigned long new_seq;
3804         unsigned long shut_seq;
3805 
3806         th = skb->h.th;
3807         skb->len = len -(th->doff*4);
3808 
3809         /*
3810          *      The bytes in the receive read/assembly queue has increased. Needed for the
3811          *      low memory discard algorithm 
3812          */
3813            
3814         sk->bytes_rcv += skb->len;
3815         
3816         if (skb->len == 0 && !th->fin) 
3817         {
3818                 /* 
3819                  *      Don't want to keep passing ack's back and forth. 
3820                  *      (someone sent us dataless, boring frame)
3821                  */
3822                 if (!th->ack)
3823                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3824                 kfree_skb(skb, FREE_READ);
3825                 return(0);
3826         }
3827         
3828         /*
3829          *      We no longer have anyone receiving data on this connection.
3830          */
3831 
3832 #ifndef TCP_DONT_RST_SHUTDOWN            
3833 
3834         if(sk->shutdown & RCV_SHUTDOWN)
3835         {
3836                 /*
3837                  *      FIXME: BSD has some magic to avoid sending resets to
3838                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
3839                  *      BSD stacks still have broken keepalives so we want to
3840                  *      cope with it.
3841                  */
3842 
3843                 if(skb->len)    /* We don't care if it's just an ack or
3844                                    a keepalive/window probe */
3845                 {
3846                         new_seq= th->seq + skb->len + th->syn;  /* Right edge of _data_ part of frame */
3847                         
3848                         /* Do this the way 4.4BSD treats it. Not what I'd
3849                            regard as the meaning of the spec but it's what BSD
3850                            does and clearly they know everything 8) */
3851 
3852                         /*
3853                          *      This is valid because of two things
3854                          *
3855                          *      a) The way tcp_data behaves at the bottom.
3856                          *      b) A fin takes effect when read not when received.
3857                          */
3858                          
3859                         shut_seq=sk->acked_seq+1;       /* Last byte */
3860                         
3861                         if(after(new_seq,shut_seq))
3862                         {
3863                                 if(sk->debug)
3864                                         printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
3865                                                 sk, new_seq, shut_seq, sk->blog);
3866                                 if(sk->dead)
3867                                 {
3868                                         sk->acked_seq = new_seq + th->fin;
3869                                         tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3870                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3871                                         tcp_statistics.TcpEstabResets++;
3872                                         tcp_set_state(sk,TCP_CLOSE);
3873                                         sk->err = EPIPE;
3874                                         sk->shutdown = SHUTDOWN_MASK;
3875                                         kfree_skb(skb, FREE_READ);
3876                                         return 0;
3877                                 }
3878                         }
3879                 }
3880         }
3881 
3882 #endif
3883 
3884         /*
3885          *      Now we have to walk the chain, and figure out where this one
3886          *      goes into it.  This is set up so that the last packet we received
3887          *      will be the first one we look at, that way if everything comes
3888          *      in order, there will be no performance loss, and if they come
3889          *      out of order we will be able to fit things in nicely.
3890          *
3891          *      [AC: This is wrong. We should assume in order first and then walk
3892          *       forwards from the first hole based upon real traffic patterns.]
3893          *      
3894          */
3895 
3896         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
3897         {
3898                 skb_queue_head(&sk->receive_queue,skb);
3899                 skb1= NULL;
3900         } 
3901         else
3902         {
3903                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
3904                 {
3905                         if(sk->debug)
3906                         {
3907                                 printk("skb1=%p :", skb1);
3908                                 printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
3909                                 printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
3910                                 printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
3911                                                 sk->acked_seq);
3912                         }
3913                         
3914                         /*
3915                          *      Optimisation: Duplicate frame or extension of previous frame from
3916                          *      same sequence point (lost ack case).
3917                          *      The frame contains duplicate data or replaces a previous frame
3918                          *      discard the previous frame (safe as sk->inuse is set) and put
3919                          *      the new one in its place.
3920                          */
3921                          
3922                         if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3923                         {
3924                                 skb_append(skb1,skb);
3925                                 skb_unlink(skb1);
3926                                 kfree_skb(skb1,FREE_READ);
3927                                 dup_dumped=1;
3928                                 skb1=NULL;
3929                                 break;
3930                         }
3931                         
3932                         /*
3933                          *      Found where it fits
3934                          */
3935                          
3936                         if (after(th->seq+1, skb1->h.th->seq))
3937                         {
3938                                 skb_append(skb1,skb);
3939                                 break;
3940                         }
3941                         
3942                         /*
3943                          *      See if we've hit the start. If so insert.
3944                          */
3945                         if (skb1 == skb_peek(&sk->receive_queue))
3946                         {
3947                                 skb_queue_head(&sk->receive_queue, skb);
3948                                 break;
3949                         }
3950                 }
3951         }
3952 
3953         /*
3954          *      Figure out what the ack value for this frame is
3955          */
3956          
3957         th->ack_seq = th->seq + skb->len;
3958         if (th->syn) 
3959                 th->ack_seq++;
3960         if (th->fin)
3961                 th->ack_seq++;
3962 
3963         if (before(sk->acked_seq, sk->copied_seq)) 
3964         {
3965                 printk("*** tcp.c:tcp_data bug acked < copied\n");
3966                 sk->acked_seq = sk->copied_seq;
3967         }
3968 
3969         /*
3970          *      Now figure out if we can ack anything. This is very messy because we really want two
3971          *      receive queues, a completed and an assembly queue. We also want only one transmit
3972          *      queue.
3973          */
3974 
3975         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) 
3976         {
3977                 if (before(th->seq, sk->acked_seq+1)) 
3978                 {
3979                         int newwindow;
3980 
3981                         if (after(th->ack_seq, sk->acked_seq)) 
3982                         {
3983                                 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3984                                 if (newwindow < 0)
3985                                         newwindow = 0;  
3986                                 sk->window = newwindow;
3987                                 sk->acked_seq = th->ack_seq;
3988                         }
3989                         skb->acked = 1;
3990 
3991                         /*
3992                          *      When we ack the fin, we do the FIN 
3993                          *      processing.
3994                          */
3995 
3996                         if (skb->h.th->fin) 
3997                         {
3998                                 tcp_fin(skb,sk,skb->h.th);
3999                         }
4000           
4001                         for(skb2 = skb->next;
4002                             skb2 != (struct sk_buff *)&sk->receive_queue;
4003                             skb2 = skb2->next) 
4004                         {
4005                                 if (before(skb2->h.th->seq, sk->acked_seq+1)) 
4006                                 {
4007                                         if (after(skb2->h.th->ack_seq, sk->acked_seq))
4008                                         {
4009                                                 newwindow = sk->window -
4010                                                  (skb2->h.th->ack_seq - sk->acked_seq);
4011                                                 if (newwindow < 0)
4012                                                         newwindow = 0;  
4013                                                 sk->window = newwindow;
4014                                                 sk->acked_seq = skb2->h.th->ack_seq;
4015                                         }
4016                                         skb2->acked = 1;
4017                                         /*
4018                                          *      When we ack the fin, we do
4019                                          *      the fin handling.
4020                                          */
4021                                         if (skb2->h.th->fin) 
4022                                         {
4023                                                 tcp_fin(skb,sk,skb->h.th);
4024                                         }
4025 
4026                                         /*
4027                                          *      Force an immediate ack.
4028                                          */
4029                                          
4030                                         sk->ack_backlog = sk->max_ack_backlog;
4031                                 }
4032                                 else
4033                                 {
4034                                         break;
4035                                 }
4036                         }
4037 
4038                         /*
4039                          *      This also takes care of updating the window.
4040                          *      This if statement needs to be simplified.
4041                          */
4042                         if (!sk->delay_acks ||
4043                             sk->ack_backlog >= sk->max_ack_backlog || 
4044                             sk->bytes_rcv > sk->max_unacked || th->fin) {
4045         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4046                         }
4047                         else 
4048                         {
4049                                 sk->ack_backlog++;
4050                                 if(sk->debug)
4051                                         printk("Ack queued.\n");
4052                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4053                         }
4054                 }
4055         }
4056 
4057         /*
4058          *      If we've missed a packet, send an ack.
4059          *      Also start a timer to send another.
4060          */
4061          
4062         if (!skb->acked) 
4063         {
4064         
4065         /*
4066          *      This is important.  If we don't have much room left,
4067          *      we need to throw out a few packets so we have a good
4068          *      window.  Note that mtu is used, not mss, because mss is really
4069          *      for the send side.  He could be sending us stuff as large as mtu.
4070          */
4071                  
4072                 while (sk->prot->rspace(sk) < sk->mtu) 
4073                 {
4074                         skb1 = skb_peek(&sk->receive_queue);
4075                         if (skb1 == NULL) 
4076                         {
4077                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4078                                 break;
4079                         }
4080 
4081                         /*
4082                          *      Don't throw out something that has been acked. 
4083                          */
4084                  
4085                         if (skb1->acked) 
4086                         {
4087                                 break;
4088                         }
4089                 
4090                         skb_unlink(skb1);
4091                         kfree_skb(skb1, FREE_READ);
4092                 }
4093                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4094                 sk->ack_backlog++;
4095                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4096         }
4097         else
4098         {
4099                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4100         }
4101 
4102         /*
4103          *      Now tell the user we may have some data. 
4104          */
4105          
4106         if (!sk->dead) 
4107         {
4108                 if(sk->debug)
4109                         printk("Data wakeup.\n");
4110                 sk->data_ready(sk,0);
4111         } 
4112         return(0);
4113 }
4114 
4115 
4116 /*
4117  *      This routine is only called when we have urgent data
4118  *      signalled. Its the 'slow' part of tcp_urg. It could be
4119  *      moved inline now as tcp_urg is only called from one
4120  *      place. We handle URGent data wrong. We have to - as
4121  *      BSD still doesn't use the correction from RFC961.
4122  */
4123  
4124 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* [previous][next][first][last][top][bottom][index][help] */
4125 {
4126         unsigned long ptr = ntohs(th->urg_ptr);
4127 
4128         if (ptr)
4129                 ptr--;
4130         ptr += th->seq;
4131 
4132         /* ignore urgent data that we've already seen and read */
4133         if (after(sk->copied_seq, ptr))
4134                 return;
4135 
4136         /* do we already have a newer (or duplicate) urgent pointer? */
4137         if (sk->urg_data && !after(ptr, sk->urg_seq))
4138                 return;
4139 
4140         /* tell the world about our new urgent pointer */
4141         if (sk->proc != 0) {
4142                 if (sk->proc > 0) {
4143                         kill_proc(sk->proc, SIGURG, 1);
4144                 } else {
4145                         kill_pg(-sk->proc, SIGURG, 1);
4146                 }
4147         }
4148         sk->urg_data = URG_NOTYET;
4149         sk->urg_seq = ptr;
4150 }
4151 
4152 /*
4153  *      This is the 'fast' part of urgent handling.
4154  */
4155  
4156 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
     /* [previous][next][first][last][top][bottom][index][help] */
4157         unsigned long saddr, unsigned long len)
4158 {
4159         unsigned long ptr;
4160 
4161         /*
4162          *      Check if we get a new urgent pointer - normally not 
4163          */
4164          
4165         if (th->urg)
4166                 tcp_check_urg(sk,th);
4167 
4168         /*
4169          *      Do we wait for any urgent data? - normally not
4170          */
4171          
4172         if (sk->urg_data != URG_NOTYET)
4173                 return 0;
4174 
4175         /*
4176          *      Is the urgent pointer pointing into this packet? 
4177          */
4178          
4179         ptr = sk->urg_seq - th->seq + th->doff*4;
4180         if (ptr >= len)
4181                 return 0;
4182 
4183         /*
4184          *      Ok, got the correct packet, update info 
4185          */
4186          
4187         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4188         if (!sk->dead)
4189                 sk->data_ready(sk,0);
4190         return 0;
4191 }
4192 
4193 /*
4194  *      This will accept the next outstanding connection. 
4195  */
4196  
/*
 * Accept the next established connection on a listening socket.
 * Blocks unless O_NONBLOCK is set; returns the new socket, or NULL
 * with sk->err set (EINVAL/EAGAIN/ERESTARTSYS).
 */
static struct sock *tcp_accept(struct sock *sk, int flags)
{
        struct sock *newsk;
        struct sk_buff *skb;
  
  /*
   * We need to make sure that this socket is listening,
   * and that it has something pending.
   */

        if (sk->state != TCP_LISTEN) 
        {
                sk->err = EINVAL;
                return(NULL); 
        }

        /* Avoid the race. */
        /* Interrupts stay off until we hold the socket (sk->inuse),
           so a connection completing in the bottom half can't slip
           between the dequeue check and the sleep below. */
        cli();
        sk->inuse = 1;

        while((skb = tcp_dequeue_established(sk)) == NULL) 
        {
                if (flags & O_NONBLOCK) 
                {
                        sti();
                        release_sock(sk);
                        sk->err = EAGAIN;
                        return(NULL);
                }

                /* Drop the socket lock while we sleep so the arriving
                   connection can be processed, then re-take it. */
                release_sock(sk);
                interruptible_sleep_on(sk->sleep);
                if (current->signal & ~current->blocked) 
                {
                        /* Interrupted by a signal: the syscall layer
                           will restart the accept(). */
                        sti();
                        sk->err = ERESTARTSYS;
                        return(NULL);
                }
                sk->inuse = 1;
        }
        sti();

        /*
         *      Now all we need to do is return skb->sk. 
         */

        newsk = skb->sk;

        /* The queued skb only carried the new socket; free it and
           account for the consumed backlog slot. */
        kfree_skb(skb, FREE_READ);
        sk->ack_backlog--;
        release_sock(sk);
        return(newsk);
}
4250 
4251 
4252 /*
4253  *      This will initiate an outgoing connection. 
4254  */
4255  
/*
 * Initiate an outgoing connection: validate the address, pick initial
 * sequence numbers, build and transmit the SYN (with an MSS option),
 * and move the socket to SYN_SENT with the retransmit timer armed.
 * Returns 0 or a negative errno.
 */
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
        struct sk_buff *buff;
        struct device *dev=NULL;
        unsigned char *ptr;
        int tmp;
        int atype;
        struct tcphdr *t1;
        struct rtable *rt;

        if (sk->state != TCP_CLOSE) 
        {
                return(-EISCONN);
        }
        
        /* Need at least family + port + address. */
        if (addr_len < 8) 
                return(-EINVAL);

        if (usin->sin_family && usin->sin_family != AF_INET) 
                return(-EAFNOSUPPORT);

        /*
         *      connect() to INADDR_ANY means loopback (BSD'ism).
         */
        
        if(usin->sin_addr.s_addr==INADDR_ANY)
                usin->sin_addr.s_addr=ip_my_addr();
                  
        /*
         *      Don't want a TCP connection going to a broadcast address 
         */

        if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
                return -ENETUNREACH;
  
        /* Set up sequence state before we can receive any reply:
           rcv_ack_seq is one behind write_seq so the SYN's ack is "new". */
        sk->inuse = 1;
        sk->daddr = usin->sin_addr.s_addr;
        sk->write_seq = tcp_init_seq();
        sk->window_seq = sk->write_seq;
        sk->rcv_ack_seq = sk->write_seq -1;
        sk->err = 0;
        sk->dummy_th.dest = usin->sin_port;
        release_sock(sk);

        /* Sleeping allocation: the socket is unlocked across it. */
        buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
        if (buff == NULL) 
        {
                return(-ENOMEM);
        }
        sk->inuse = 1;
        buff->len = 24;         /* TCP header (20) + MSS option (4) */
        buff->sk = sk;
        buff->free = 0;
        buff->localroute = sk->localroute;
        
        t1 = (struct tcphdr *) buff->data;

        /*
         *      Put in the IP header and routing stuff. 
         */
         
        /* Route is only consulted below for window/MSS clamps;
           a NULL result is tolerated. */
        rt=ip_rt_route(sk->daddr, NULL, NULL);
        

        /*
         *      We need to build the routing stuff from the things saved in skb. 
         */

        tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
                                        IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
                release_sock(sk);
                return(-ENETUNREACH);
        }

        buff->len += tmp;
        t1 = (struct tcphdr *)((char *)t1 +tmp);

        memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
        /* NOTE(review): htonl is the intended conversion here; ntohl is
           its own inverse on all supported byte orders, so behavior is
           the same. */
        t1->seq = ntohl(sk->write_seq++);
        sk->sent_seq = sk->write_seq;
        buff->h.seq = sk->write_seq;
        t1->ack = 0;
        t1->window = 2;
        t1->res1=0;
        t1->res2=0;
        t1->rst = 0;
        t1->urg = 0;
        t1->psh = 0;
        t1->syn = 1;
        t1->urg_ptr = 0;
        t1->doff = 6;           /* 24 bytes: header + MSS option */
        /* use 512 or whatever user asked for */
        
        /* Per-route window clamp, if the route supplies one. */
        if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
                sk->window_clamp=rt->rt_window;
        else
                sk->window_clamp=0;

        /* MSS selection: user setting > route MSS > subnet heuristic. */
        if (sk->user_mss)
                sk->mtu = sk->user_mss;
        else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
                sk->mtu = rt->rt_mss;
        else 
        {
#ifdef CONFIG_INET_SNARL
                if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
                if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
                        sk->mtu = 576 - HEADER_SIZE;    /* Off-subnet: be conservative */
                else
                        sk->mtu = MAX_WINDOW;
        }
        /*
         *      but not bigger than device MTU 
         */

        if(sk->mtu <32)
                sk->mtu = 32;   /* Sanity limit */
                
        sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
        
        /*
         *      Put in the TCP options to say MTU. 
         */

        /* Option kind 2 (MSS), length 4, value in network byte order. */
        ptr = (unsigned char *)(t1+1);
        ptr[0] = 2;
        ptr[1] = 4;
        ptr[2] = (sk->mtu) >> 8;
        ptr[3] = (sk->mtu) & 0xff;
        tcp_send_check(t1, sk->saddr, sk->daddr,
                  sizeof(struct tcphdr) + 4, sk);

        /*
         *      This must go first otherwise a really quick response will get reset. 
         */

        tcp_set_state(sk,TCP_SYN_SENT);
        sk->rto = TCP_TIMEOUT_INIT;
#if 0 /* we already did this */
        init_timer(&sk->retransmit_timer); 
#endif
        sk->retransmit_timer.function=&retransmit_timer;
        sk->retransmit_timer.data = (unsigned long)sk;
        reset_xmit_timer(sk, TIME_WRITE, sk->rto);      /* Timer for repeating the SYN until an answer */
        sk->retransmits = TCP_SYN_RETRIES;

        sk->prot->queue_xmit(sk, dev, buff, 0);  
        /* NOTE(review): this second reset_xmit_timer looks redundant with
           the one a few lines up - harmless, but worth confirming. */
        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
        tcp_statistics.TcpActiveOpens++;
        tcp_statistics.TcpOutSegs++;
  
        release_sock(sk);
        return(0);
}
4415 
4416 
4417 /* This functions checks to see if the tcp header is actually acceptable. */
4418 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /* [previous][next][first][last][top][bottom][index][help] */
4419              struct options *opt, unsigned long saddr, struct device *dev)
4420 {
4421         unsigned long next_seq;
4422 
4423         next_seq = len - 4*th->doff;
4424         if (th->fin)
4425                 next_seq++;
4426         /* if we have a zero window, we can't have any data in the packet.. */
4427         if (next_seq && !sk->window)
4428                 goto ignore_it;
4429         next_seq += th->seq;
4430 
4431         /*
4432          * This isn't quite right.  sk->acked_seq could be more recent
4433          * than sk->window.  This is however close enough.  We will accept
4434          * slightly more packets than we should, but it should not cause
4435          * problems unless someone is trying to forge packets.
4436          */
4437 
4438         /* have we already seen all of this packet? */
4439         if (!after(next_seq+1, sk->acked_seq))
4440                 goto ignore_it;
4441         /* or does it start beyond the window? */
4442         if (!before(th->seq, sk->acked_seq + sk->window + 1))
4443                 goto ignore_it;
4444 
4445         /* ok, at least part of this packet would seem interesting.. */
4446         return 1;
4447 
4448 ignore_it:
4449         if (th->rst)
4450                 return 0;
4451 
4452         /*
4453          *      Send a reset if we get something not ours and we are
4454          *      unsynchronized. Note: We don't do anything to our end. We
4455          *      are just killing the bogus remote connection then we will
4456          *      connect again and it will work (with luck).
4457          */
4458          
4459         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
4460         {
4461                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4462                 return 1;
4463         }
4464 
4465         /* Try to resync things. */
4466         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4467         return 0;
4468 }
4469 
4470 /*
4471  *      When we get a reset we do this.
4472  */
4473 
/*
 * Standard handling for an incoming RST: mark the socket zapped, pick
 * the state-appropriate error, move to CLOSE (unless RFC1337 time-wait
 * assassination protection applies), wake the owner, and free the skb.
 * Always returns 0.
 */
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;

	/* Error code depends on where in the handshake/teardown we are. */
	switch (sk->state) {
		case TCP_SYN_SENT:
			sk->err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->err = EPIPE;
			break;
		default:
			sk->err = ECONNRESET;
			break;
	}

#ifdef TCP_DO_RFC1337		
	/*
	 *	Time wait assassination protection [RFC1337]:
	 *	a TIME_WAIT socket ignores the reset.
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{	
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else	
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif	

	if (!sk->dead) 
		sk->state_change(sk);

	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
4501 
4502 /*
4503  *      A TCP packet has arrived.
4504  */
4505  
4506 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /* [previous][next][first][last][top][bottom][index][help] */
4507         unsigned long daddr, unsigned short len,
4508         unsigned long saddr, int redo, struct inet_protocol * protocol)
4509 {
4510         struct tcphdr *th;
4511         struct sock *sk;
4512         int syn_ok=0;
4513         
4514         if (!skb) 
4515         {
4516                 printk("IMPOSSIBLE 1\n");
4517                 return(0);
4518         }
4519 
4520         if (!dev) 
4521         {
4522                 printk("IMPOSSIBLE 2\n");
4523                 return(0);
4524         }
4525   
4526         tcp_statistics.TcpInSegs++;
4527   
4528         if(skb->pkt_type!=PACKET_HOST)
4529         {
4530                 kfree_skb(skb,FREE_READ);
4531                 return(0);
4532         }
4533   
4534         th = skb->h.th;
4535 
4536         /*
4537          *      Find the socket.
4538          */
4539 
4540         sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4541 
4542         /*
4543          *      If this socket has got a reset it's to all intents and purposes 
4544          *      really dead. Count closed sockets as dead.
4545          *
4546          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4547          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
4548          *      exist so should cause resets as if the port was unreachable.
4549          */
4550          
4551         if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4552                 sk=NULL;
4553 
4554         if (!redo) 
4555         {
4556                 if (tcp_check(th, len, saddr, daddr )) 
4557                 {
4558                         skb->sk = NULL;
4559                         kfree_skb(skb,FREE_READ);
4560                         /*
4561                          *      We don't release the socket because it was
4562                          *      never marked in use.
4563                          */
4564                         return(0);
4565                 }
4566                 th->seq = ntohl(th->seq);
4567 
4568                 /* See if we know about the socket. */
4569                 if (sk == NULL) 
4570                 {
4571                         /*
4572                          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4573                          */
4574                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4575                         skb->sk = NULL;
4576                         /*
4577                          *      Discard frame
4578                          */
4579                         kfree_skb(skb, FREE_READ);
4580                         return(0);
4581                 }
4582 
4583                 skb->len = len;
4584                 skb->acked = 0;
4585                 skb->used = 0;
4586                 skb->free = 0;
4587                 skb->saddr = daddr;
4588                 skb->daddr = saddr;
4589         
4590                 /* We may need to add it to the backlog here. */
4591                 cli();
4592                 if (sk->inuse) 
4593                 {
4594                         skb_queue_tail(&sk->back_log, skb);
4595                         sti();
4596                         return(0);
4597                 }
4598                 sk->inuse = 1;
4599                 sti();
4600         }
4601         else
4602         {
4603                 if (sk==NULL) 
4604                 {
4605                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4606                         skb->sk = NULL;
4607                         kfree_skb(skb, FREE_READ);
4608                         return(0);
4609                 }
4610         }
4611 
4612 
4613         if (!sk->prot) 
4614         {
4615                 printk("IMPOSSIBLE 3\n");
4616                 return(0);
4617         }
4618 
4619 
4620         /*
4621          *      Charge the memory to the socket. 
4622          */
4623          
4624         if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf) 
4625         {
4626                 kfree_skb(skb, FREE_READ);
4627                 release_sock(sk);
4628                 return(0);
4629         }
4630 
4631         skb->sk=sk;
4632         sk->rmem_alloc += skb->mem_len;
4633 
4634         /*
4635          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4636          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4637          *      compatibility. We also set up variables more thoroughly [Karn notes in the
4638          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4639          */
4640 
4641         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
4642         {
4643         
4644                 /*
4645                  *      Now deal with unusual cases.
4646                  */
4647          
4648                 if(sk->state==TCP_LISTEN)
4649                 {
4650                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
4651                                 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4652 
4653                         /*
4654                          *      We don't care for RST, and non SYN are absorbed (old segments)
4655                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
4656                          *      netmask on a running connection it can go broadcast. Even Sun's have
4657                          *      this problem so I'm ignoring it 
4658                          */
4659                            
4660                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4661                         {
4662                                 kfree_skb(skb, FREE_READ);
4663                                 release_sock(sk);
4664                                 return 0;
4665                         }
4666                 
4667                         /*      
4668                          *      Guess we need to make a new socket up 
4669                          */
4670                 
4671                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4672                 
4673                         /*
4674                          *      Now we have several options: In theory there is nothing else
4675                          *      in the frame. KA9Q has an option to send data with the syn,
4676                          *      BSD accepts data with the syn up to the [to be] advertised window
4677                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
4678                          *      it, that fits the spec precisely and avoids incompatibilities. It
4679                          *      would be nice in future to drop through and process the data.
4680                          */
4681                          
4682                         release_sock(sk);
4683                         return 0;
4684                 }
4685         
4686                 /* retransmitted SYN? */
4687                 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4688                 {
4689                         kfree_skb(skb, FREE_READ);
4690                         release_sock(sk);
4691                         return 0;
4692                 }
4693                 
4694                 /*
4695                  *      SYN sent means we have to look for a suitable ack and either reset
4696                  *      for bad matches or go to connected 
4697                  */
4698            
4699                 if(sk->state==TCP_SYN_SENT)
4700                 {
4701                         /* Crossed SYN or previous junk segment */
4702                         if(th->ack)
4703                         {
4704                                 /* We got an ack, but it's not a good ack */
4705                                 if(!tcp_ack(sk,th,saddr,len))
4706                                 {
4707                                         /* Reset the ack - its an ack from a 
4708                                            different connection  [ th->rst is checked in tcp_reset()] */
4709                                         tcp_statistics.TcpAttemptFails++;
4710                                         tcp_reset(daddr, saddr, th,
4711                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4712                                         kfree_skb(skb, FREE_READ);
4713                                         release_sock(sk);
4714                                         return(0);
4715                                 }
4716                                 if(th->rst)
4717                                         return tcp_std_reset(sk,skb);
4718                                 if(!th->syn)
4719                                 {
4720                                         /* A valid ack from a different connection
4721                                            start. Shouldn't happen but cover it */
4722                                         kfree_skb(skb, FREE_READ);
4723                                         release_sock(sk);
4724                                         return 0;
4725                                 }
4726                                 /*
4727                                  *      Ok.. it's good. Set up sequence numbers and
4728                                  *      move to established.
4729                                  */
4730                                 syn_ok=1;       /* Don't reset this connection for the syn */
4731                                 sk->acked_seq=th->seq+1;
4732                                 sk->fin_seq=th->seq;
4733                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4734                                 tcp_set_state(sk, TCP_ESTABLISHED);
4735                                 tcp_options(sk,th);
4736                                 sk->dummy_th.dest=th->source;
4737                                 sk->copied_seq = sk->acked_seq;
4738                                 if(!sk->dead)
4739                                 {
4740                                         sk->state_change(sk);
4741                                         sock_wake_async(sk->socket, 0);
4742                                 }
4743                                 if(sk->max_window==0)
4744                                 {
4745                                         sk->max_window = 32;
4746                                         sk->mss = min(sk->max_window, sk->mtu);
4747                                 }
4748                         }
4749                         else
4750                         {
4751                                 /* See if SYN's cross. Drop if boring */
4752                                 if(th->syn && !th->rst)
4753                                 {
4754                                         /* Crossed SYN's are fine - but talking to
4755                                            yourself is right out... */
4756                                         if(sk->saddr==saddr && sk->daddr==daddr &&
4757                                                 sk->dummy_th.source==th->source &&
4758                                                 sk->dummy_th.dest==th->dest)
4759                                         {
4760                                                 tcp_statistics.TcpAttemptFails++;
4761                                                 return tcp_std_reset(sk,skb);
4762                                         }
4763                                         tcp_set_state(sk,TCP_SYN_RECV);
4764                                         
4765                                         /*
4766                                          *      FIXME:
4767                                          *      Must send SYN|ACK here
4768                                          */
4769                                 }               
4770                                 /* Discard junk segment */
4771                                 kfree_skb(skb, FREE_READ);
4772                                 release_sock(sk);
4773                                 return 0;
4774                         }
4775                         /*
4776                          *      SYN_RECV with data maybe.. drop through
4777                          */
4778                         goto rfc_step6;
4779                 }
4780 
4781         /*
4782          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
4783          *      a more complex suggestion for fixing these reuse issues in RFC1644
4784          *      but not yet ready for general use. Also see RFC1379.
4785          */
4786         
4787 #define BSD_TIME_WAIT
4788 #ifdef BSD_TIME_WAIT
4789                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
4790                         after(th->seq, sk->acked_seq) && !th->rst)
4791                 {
4792                         long seq=sk->write_seq;
4793                         if(sk->debug)
4794                                 printk("Doing a BSD time wait\n");
4795                         tcp_statistics.TcpEstabResets++;           
4796                         sk->rmem_alloc -= skb->mem_len;
4797                         skb->sk = NULL;
4798                         sk->err=ECONNRESET;
4799                         tcp_set_state(sk, TCP_CLOSE);
4800                         sk->shutdown = SHUTDOWN_MASK;
4801                         release_sock(sk);
4802                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4803                         if (sk && sk->state==TCP_LISTEN)
4804                         {
4805                                 sk->inuse=1;
4806                                 skb->sk = sk;
4807                                 sk->rmem_alloc += skb->mem_len;
4808                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
4809                                 release_sock(sk);
4810                                 return 0;
4811                         }
4812                         kfree_skb(skb, FREE_READ);
4813                         return 0;
4814                 }
4815 #endif  
4816         }
4817 
4818         /*
4819          *      We are now in normal data flow (see the step list in the RFC)
4820          *      Note most of these are inline now. I'll inline the lot when
4821          *      I have time to test it hard and look at what gcc outputs 
4822          */
4823         
4824         if(!tcp_sequence(sk,th,len,opt,saddr,dev))
4825         {
4826                 kfree_skb(skb, FREE_READ);
4827                 release_sock(sk);
4828                 return 0;
4829         }
4830 
4831         if(th->rst)
4832                 return tcp_std_reset(sk,skb);
4833         
4834         /*
4835          *      !syn_ok is effectively the state test in RFC793.
4836          */
4837          
4838         if(th->syn && !syn_ok)
4839         {
4840                 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
4841                 return tcp_std_reset(sk,skb);   
4842         }
4843 
4844         /*
4845          *      Process the ACK
4846          */
4847          
4848 
4849         if(th->ack && !tcp_ack(sk,th,saddr,len))
4850         {
4851                 /*
4852                  *      Our three way handshake failed.
4853                  */
4854                  
4855                 if(sk->state==TCP_SYN_RECV)
4856                 {
4857                         tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
4858                 }
4859                 kfree_skb(skb, FREE_READ);
4860                 release_sock(sk);
4861                 return 0;
4862         }
4863         
4864 rfc_step6:              /* I'll clean this up later */
4865 
4866         /*
4867          *      Process urgent data
4868          */
4869                 
4870         if(tcp_urg(sk, th, saddr, len))
4871         {
4872                 kfree_skb(skb, FREE_READ);
4873                 release_sock(sk);
4874                 return 0;
4875         }
4876         
4877         
4878         /*
4879          *      Process the encapsulated data
4880          */
4881         
4882         if(tcp_data(skb,sk, saddr, len))
4883         {
4884                 kfree_skb(skb, FREE_READ);
4885                 release_sock(sk);
4886                 return 0;
4887         }
4888 
4889         /*
4890          *      And done
4891          */     
4892         
4893         release_sock(sk);
4894         return 0;
4895 }
4896 
4897 /*
4898  *      This routine sends a packet with an out of date sequence
4899  *      number. It assumes the other end will try to ack it.
4900  */
4901 
4902 static void tcp_write_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
4903 {
4904         struct sk_buff *buff;
4905         struct tcphdr *t1;
4906         struct device *dev=NULL;
4907         int tmp;
4908 
4909         if (sk->zapped)
4910                 return; /* After a valid reset we can send no more */
4911 
4912         /*
4913          *      Write data can still be transmitted/retransmitted in the
4914          *      following states.  If any other state is encountered, return.
4915          *      [listen/close will never occur here anyway]
4916          */
4917 
4918         if (sk->state != TCP_ESTABLISHED && 
4919             sk->state != TCP_CLOSE_WAIT &&
4920             sk->state != TCP_FIN_WAIT1 && 
4921             sk->state != TCP_LAST_ACK &&
4922             sk->state != TCP_CLOSING
4923         ) 
4924         {
4925                 return;
4926         }
4927 
4928         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
4929         if (buff == NULL) 
4930                 return;
4931 
4932         buff->len = sizeof(struct tcphdr);
4933         buff->free = 1;
4934         buff->sk = sk;
4935         buff->localroute = sk->localroute;
4936 
4937         t1 = (struct tcphdr *) buff->data;
4938 
4939         /* Put in the IP header and routing stuff. */
4940         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4941                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
4942         if (tmp < 0) 
4943         {
4944                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4945                 return;
4946         }
4947 
4948         buff->len += tmp;
4949         t1 = (struct tcphdr *)((char *)t1 +tmp);
4950 
4951         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
4952 
4953         /*
4954          *      Use a previous sequence.
4955          *      This should cause the other end to send an ack.
4956          */
4957          
4958         t1->seq = htonl(sk->sent_seq-1);
4959         t1->ack = 1; 
4960         t1->res1= 0;
4961         t1->res2= 0;
4962         t1->rst = 0;
4963         t1->urg = 0;
4964         t1->psh = 0;
4965         t1->fin = 0;    /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
4966         t1->syn = 0;
4967         t1->ack_seq = ntohl(sk->acked_seq);
4968         t1->window = ntohs(tcp_select_window(sk));
4969         t1->doff = sizeof(*t1)/4;
4970         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
4971          /*
4972           *     Send it and free it.
4973           *     This will prevent the timer from automatically being restarted.
4974           */
4975         sk->prot->queue_xmit(sk, dev, buff, 1);
4976         tcp_statistics.TcpOutSegs++;
4977 }
4978 
4979 /*
4980  *      A window probe timeout has occurred.
4981  */
4982 
4983 void tcp_send_probe0(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
4984 {
4985         if (sk->zapped)
4986                 return;         /* After a valid reset we can send no more */
4987 
4988         tcp_write_wakeup(sk);
4989 
4990         sk->backoff++;
4991         sk->rto = min(sk->rto << 1, 120*HZ);
4992         reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
4993         sk->retransmits++;
4994         sk->prot->retransmits ++;
4995 }
4996 
4997 /*
4998  *      Socket option code for TCP. 
4999  */
5000   
5001 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5002 {
5003         int val,err;
5004 
5005         if(level!=SOL_TCP)
5006                 return ip_setsockopt(sk,level,optname,optval,optlen);
5007 
5008         if (optval == NULL) 
5009                 return(-EINVAL);
5010 
5011         err=verify_area(VERIFY_READ, optval, sizeof(int));
5012         if(err)
5013                 return err;
5014         
5015         val = get_fs_long((unsigned long *)optval);
5016 
5017         switch(optname)
5018         {
5019                 case TCP_MAXSEG:
5020 /*
5021  * values greater than interface MTU won't take effect.  however at
5022  * the point when this call is done we typically don't yet know
5023  * which interface is going to be used
5024  */
5025                         if(val<1||val>MAX_WINDOW)
5026                                 return -EINVAL;
5027                         sk->user_mss=val;
5028                         return 0;
5029                 case TCP_NODELAY:
5030                         sk->nonagle=(val==0)?0:1;
5031                         return 0;
5032                 default:
5033                         return(-ENOPROTOOPT);
5034         }
5035 }
5036 
5037 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5038 {
5039         int val,err;
5040 
5041         if(level!=SOL_TCP)
5042                 return ip_getsockopt(sk,level,optname,optval,optlen);
5043                         
5044         switch(optname)
5045         {
5046                 case TCP_MAXSEG:
5047                         val=sk->user_mss;
5048                         break;
5049                 case TCP_NODELAY:
5050                         val=sk->nonagle;
5051                         break;
5052                 default:
5053                         return(-ENOPROTOOPT);
5054         }
5055         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5056         if(err)
5057                 return err;
5058         put_fs_long(sizeof(int),(unsigned long *) optlen);
5059 
5060         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5061         if(err)
5062                 return err;
5063         put_fs_long(val,(unsigned long *)optval);
5064 
5065         return(0);
5066 }       
5067 
5068 
/*
 *	The TCP protocol operations table.  Positional initializer for
 *	struct proto (declared elsewhere); slot meanings below are inferred
 *	from the handler names - verify against the struct proto definition.
 */
struct proto tcp_prot = {
	sock_wmalloc,		/* write-buffer allocation */
	sock_rmalloc,		/* read-buffer allocation */
	sock_wfree,		/* write-buffer free */
	sock_rfree,		/* read-buffer free */
	sock_rspace,		/* available receive space */
	sock_wspace,		/* available send space */
	tcp_close,
	tcp_read,
	tcp_write,
	tcp_sendto,
	tcp_recvfrom,
	ip_build_header,	/* delegate header building to IP */
	tcp_connect,
	tcp_accept,
	ip_queue_xmit,		/* delegate transmit queueing to IP */
	tcp_retransmit,
	tcp_write_wakeup,
	tcp_read_wakeup,
	tcp_rcv,
	tcp_select,
	tcp_ioctl,
	NULL,			/* no init hook - NOTE(review): presumed init slot, confirm */
	tcp_shutdown,
	tcp_setsockopt,
	tcp_getsockopt,
	128,			/* NOTE(review): presumably max_header - confirm */
	0,			/* retransmit counter, starts at zero */
	{NULL,},		/* socket hash array, initially empty */
	"TCP",			/* protocol name */
	0, 0			/* usage counters, initially zero */
};

/* [previous][next][first][last][top][bottom][index][help] */