root/net/inet/tcp.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. min
  2. tcp_set_state
  3. tcp_select_window
  4. tcp_find_established
  5. tcp_dequeue_established
  6. tcp_close_pending
  7. tcp_time_wait
  8. tcp_do_retransmit
  9. reset_xmit_timer
  10. tcp_retransmit_time
  11. tcp_retransmit
  12. tcp_write_timeout
  13. retransmit_timer
  14. tcp_err
  15. tcp_readable
  16. tcp_listen_select
  17. tcp_select
  18. tcp_ioctl
  19. tcp_check
  20. tcp_send_check
  21. tcp_send_skb
  22. tcp_dequeue_partial
  23. tcp_send_partial
  24. tcp_enqueue_partial
  25. tcp_send_ack
  26. tcp_build_header
  27. tcp_write
  28. tcp_sendto
  29. tcp_read_wakeup
  30. cleanup_rbuf
  31. tcp_read_urg
  32. tcp_read
  33. tcp_close_state
  34. tcp_send_fin
  35. tcp_shutdown
  36. tcp_recvfrom
  37. tcp_reset
  38. tcp_options
  39. default_mask
  40. tcp_init_seq
  41. tcp_conn_request
  42. tcp_close
  43. tcp_write_xmit
  44. tcp_ack
  45. tcp_fin
  46. tcp_data
  47. tcp_check_urg
  48. tcp_urg
  49. tcp_accept
  50. tcp_connect
  51. tcp_sequence
  52. tcp_std_reset
  53. tcp_rcv
  54. tcp_write_wakeup
  55. tcp_send_probe0
  56. tcp_setsockopt
  57. tcp_getsockopt

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@no.unit.nvg>
  20  *
  21  * Fixes:       
  22  *              Alan Cox        :       Numerous verify_area() calls
  23  *              Alan Cox        :       Set the ACK bit on a reset
  24  *              Alan Cox        :       Stopped it crashing if it closed while sk->inuse=1
  25  *                                      and was trying to connect (tcp_err()).
  26  *              Alan Cox        :       All icmp error handling was broken
  27  *                                      pointers passed where wrong and the
  28  *                                      socket was looked up backwards. Nobody
  29  *                                      tested any icmp error code obviously.
  30  *              Alan Cox        :       tcp_err() now handled properly. It wakes people
  31  *                                      on errors. select behaves and the icmp error race
  32  *                                      has gone by moving it into sock.c
  33  *              Alan Cox        :       tcp_reset() fixed to work for everything not just
  34  *                                      packets for unknown sockets.
  35  *              Alan Cox        :       tcp option processing.
  36  *              Alan Cox        :       Reset tweaked (still not 100%) [Had syn rule wrong]
  37  *              Herp Rosmanith  :       More reset fixes
  38  *              Alan Cox        :       No longer acks invalid rst frames. Acking
  39  *                                      any kind of RST is right out.
  40  *              Alan Cox        :       Sets an ignore me flag on an rst receive
  41  *                                      otherwise odd bits of prattle escape still
  42  *              Alan Cox        :       Fixed another acking RST frame bug. Should stop
  43  *                                      LAN workplace lockups.
  44  *              Alan Cox        :       Some tidyups using the new skb list facilities
  45  *              Alan Cox        :       sk->keepopen now seems to work
  46  *              Alan Cox        :       Pulls options out correctly on accepts
  47  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  48  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a bit to skb ops.
  49  *              Alan Cox        :       Tidied tcp_data to avoid a potential nasty.
  50  *              Alan Cox        :       Added some better commenting, as the tcp is hard to follow
  51  *              Alan Cox        :       Removed incorrect check for 20 * psh
  52  *      Michael O'Reilly        :       ack < copied bug fix.
  53  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  54  *              Alan Cox        :       FIN with no memory -> CRASH
  55  *              Alan Cox        :       Added socket option proto entries. Also added awareness of them to accept.
  56  *              Alan Cox        :       Added TCP options (SOL_TCP)
  57  *              Alan Cox        :       Switched wakeup calls to callbacks, so the kernel can layer network sockets.
  58  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  59  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  60  *              Alan Cox        :       RST frames sent on unsynchronised state ack error/
  61  *              Alan Cox        :       Put in missing check for SYN bit.
  62  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  63  *                                      window non shrink trick.
  64  *              Alan Cox        :       Added a couple of small NET2E timer fixes
  65  *              Charles Hedrick :       TCP fixes
  66  *              Toomas Tamm     :       TCP window fixes
  67  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  68  *              Charles Hedrick :       Rewrote most of it to actually work
  69  *              Linus           :       Rewrote tcp_read() and URG handling
  70  *                                      completely
  71  *              Gerhard Koerting:       Fixed some missing timer handling
  72  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  73  *              Gerhard Koerting:       PC/TCP workarounds
  74  *              Adam Caldwell   :       Assorted timer/timing errors
  75  *              Matthew Dillon  :       Fixed another RST bug
  76  *              Alan Cox        :       Move to kernel side addressing changes.
  77  *              Alan Cox        :       Beginning work on TCP fastpathing (not yet usable)
  78  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  79  *              Alan Cox        :       TCP fast path debugging
  80  *              Alan Cox        :       Window clamping
  81  *              Michael Riepe   :       Bug in tcp_check()
  82  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  83  *              Matt Dillon     :       Yet more small nasties remove from the TCP code
  84  *                                      (Be very nice to this man if tcp finally works 100%) 8)
  85  *              Alan Cox        :       BSD accept semantics. 
  86  *              Alan Cox        :       Reset on closedown bug.
  87  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
  88  *              Michael Pall    :       Handle select() after URG properly in all cases.
  89  *              Michael Pall    :       Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin).
  90  *              Michael Pall    :       Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now.
  91  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the BSD api.
  92  *              Alan Cox        :       Changed the semantics of sk->socket to 
  93  *                                      fix a race and a signal problem with
  94  *                                      accept() and async I/O.
  95  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
  96  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
  97  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
  98  *                                      clients/servers which listen in on
  99  *                                      fixed ports.
 100  *              Alan Cox        :       Cleaned the above up and shrank it to
 101  *                                      a sensible code size.
 102  *              Alan Cox        :       Self connect lockup fix.
 103  *              Alan Cox        :       No connect to multicast.
 104  *              Ross Biro       :       Close unaccepted children on master
 105  *                                      socket close.
 106  *              Alan Cox        :       Reset tracing code.
 107  *              Alan Cox        :       Spurious resets on shutdown.
 108  *              Alan Cox        :       Giant 15 minute/60 second timer error
 109  *              Alan Cox        :       Small whoops in selecting before an accept.
 110  *              Alan Cox        :       Kept the state trace facility since it's
 111  *                                      handy for debugging.
 112  *              Alan Cox        :       More reset handler fixes.
 113  *              Alan Cox        :       Started rewriting the code based on the RFC's
 114  *                                      for other useful protocol references see:  
 115  *                                      Comer, KA9Q NOS, and for a reference on the
 116  *                                      difference between specifications and how BSD
 117  *                                      works see the 4.4lite source.
 118  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 119  *                                      close.
 120  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 121  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 122  *              Alan Cox        :       Reimplemented timers as per the RFC and using multiple
 123  *                                      timers for sanity. 
 124  *              Alan Cox        :       Small bug fixes, and a lot of new
 125  *                                      comments.
 126  *              Alan Cox        :       Fixed dual reader crash by locking
 127  *                                      the buffers (much like datagram.c)
 128  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 129  *                                      now gets fed up of retrying without
 130  *                                      (even a no space) answer.
 131  *              Alan Cox        :       Extracted closing code better
 132  *              Alan Cox        :       Fixed the closing state machine to
 133  *                                      resemble the RFC.
 134  *              Alan Cox        :       More 'per spec' fixes.
 135  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 136  *                                      only frames. At least one pc tcp stack
 137  *                                      generates them.
 138  *
 139  *
 140  * To Fix:
 141  *              Fast path the code. Two things here - fix the window calculation
 142  *              so it doesn't iterate over the queue, also spot packets with no funny
 143  *              options arriving in order and process directly.
 144  *
 145  *              Implement RFC 1191 [Path MTU discovery]
 146  *              Look at the effect of implementing RFC 1337 suggestions and their impact.
 147  *              Rewrite output state machine to use a single queue and do low window
 148  *              situations as per the spec (RFC 1122)
 149  *              Speed up input assembly algorithm.
 150  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 151  *              could do with it working on IPv4
 152  *              User settable/learned rtt/max window/mtu
 153  *              Cope with MTU/device switches when retransmitting in tcp.
 154  *              Fix the window handling to use PR's new code.
 155  *
 156  *              Change the fundamental structure to a single send queue maintained
 157  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 158  *              active routes too]). Cut the queue off in tcp_retransmit/
 159  *              tcp_transmit.
 160  *              Change the receive queue to assemble as it goes. This lets us
 161  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 162  *              tcp_data/tcp_read as well as the window shrink crud.
 163  *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 164  *              tcp_queue_skb seem obvious routines to extract.
 165  *      
 166  *              This program is free software; you can redistribute it and/or
 167  *              modify it under the terms of the GNU General Public License
 168  *              as published by the Free Software Foundation; either version
 169  *              2 of the License, or(at your option) any later version.
 170  *
 171  * Description of States:
 172  *
 173  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 174  *
 175  *      TCP_SYN_RECV            received a connection request, sent ack,
 176  *                              waiting for final ack in three-way handshake.
 177  *
 178  *      TCP_ESTABLISHED         connection established
 179  *
 180  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 181  *                              transmission of remaining buffered data
 182  *
 183  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 184  *                              to shutdown
 185  *
 186  *      TCP_CLOSING             both sides have shutdown but we still have
 187  *                              data we have to finish sending
 188  *
 189  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 190  *                              closed, can only be entered from FIN_WAIT2
 191  *                              or CLOSING.  Required because the other end
 192  *                              may not have gotten our last ACK causing it
 193  *                              to retransmit the data packet (which we ignore)
 194  *
 195  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 196  *                              us to finish writing our data and to shutdown
 197  *                              (we have to close() to move on to LAST_ACK)
 198  *
  199  *      TCP_LAST_ACK            our side has shutdown after remote has
 200  *                              shutdown.  There may still be data in our
 201  *                              buffer that we have to finish sending
 202  *              
 203  *      TCP_CLOSE               socket is finished
 204  */
 205 
 206 #include <linux/types.h>
 207 #include <linux/sched.h>
 208 #include <linux/mm.h>
 209 #include <linux/time.h>
 210 #include <linux/string.h>
 211 #include <linux/config.h>
 212 #include <linux/socket.h>
 213 #include <linux/sockios.h>
 214 #include <linux/termios.h>
 215 #include <linux/in.h>
 216 #include <linux/fcntl.h>
 217 #include <linux/inet.h>
 218 #include <linux/netdevice.h>
 219 #include "snmp.h"
 220 #include "ip.h"
 221 #include "protocol.h"
 222 #include "icmp.h"
 223 #include "tcp.h"
 224 #include "arp.h"
 225 #include <linux/skbuff.h>
 226 #include "sock.h"
 227 #include "route.h"
 228 #include <linux/errno.h>
 229 #include <linux/timer.h>
 230 #include <asm/system.h>
 231 #include <asm/segment.h>
 232 #include <linux/mm.h>
 233 
 234 /*
 235  *      The MSL timer is the 'normal' timer.
 236  */
 237  
 238 #define reset_msl_timer(x,y,z)  reset_timer(x,y,z)
 239 
 240 #define SEQ_TICK 3
 241 unsigned long seq_offset;
 242 struct tcp_mib  tcp_statistics;
 243 
 244 static void tcp_close(struct sock *sk, int timeout);
 245 
 246 
 247 /*
 248  *      The less said about this the better, but it works and will do for 1.2 
 249  */
 250 
 251 static struct wait_queue *master_select_wakeup;
 252 
 253 static __inline__ int min(unsigned int a, unsigned int b)
     /* [previous][next][first][last][top][bottom][index][help] */
 254 {
 255         if (a < b) 
 256                 return(a);
 257         return(b);
 258 }
 259 
 260 #undef STATE_TRACE
 261 
 262 #ifdef STATE_TRACE
 263 static char *statename[]={
 264         "Unused","Established","Syn Sent","Syn Recv",
 265         "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
 266         "Close Wait","Last ACK","Listen","Closing"
 267 };
 268 #endif
 269 
/*
 *      Move a socket to a new TCP state, keeping the current-established
 *      MIB counter in step and waking any select() sleeping on the
 *      listening master when an embryonic child completes its handshake.
 */
static __inline__ void tcp_set_state(struct sock *sk, int state)
{
        /* Leaving ESTABLISHED: one fewer current connection in the MIB */
        if(sk->state==TCP_ESTABLISHED)
                tcp_statistics.TcpCurrEstab--;
#ifdef STATE_TRACE
        if(sk->debug)
                printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
#endif  
        /* This is a hack but it doesn't occur often and it's going to
           be a real pain to fix nicely: a select() on a listening master
           socket sleeps on the shared master_select_wakeup queue, so kick
           it here when one of its children reaches ESTABLISHED. */
           
        if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
        {
                wake_up_interruptible(&master_select_wakeup);
        }
        sk->state=state;
        /* Entering ESTABLISHED: one more current connection in the MIB */
        if(state==TCP_ESTABLISHED)
                tcp_statistics.TcpCurrEstab++;
}
 289 
 290 /*
 291  *      This routine picks a TCP windows for a socket based on
 292  *      the following constraints
 293  *  
 294  *      1. The window can never be shrunk once it is offered (RFC 793)
 295  *      2. We limit memory per socket
 296  *   
 297  *      For now we use NET2E3's heuristic of offering half the memory
 298  *      we have handy. All is not as bad as this seems however because
 299  *      of two things. Firstly we will bin packets even within the window
 300  *      in order to get the data we are waiting for into the memory limit.
 301  *      Secondly we bin common duplicate forms at receive time
 302  *      Better heuristics welcome
 303  */
 304    
 305 int tcp_select_window(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 306 {
 307         int new_window = sk->prot->rspace(sk);
 308         
 309         if(sk->window_clamp)
 310                 new_window=min(sk->window_clamp,new_window);
 311         /*
 312          *      Two things are going on here.  First, we don't ever offer a
 313          *      window less than min(sk->mss, MAX_WINDOW/2).  This is the
 314          *      receiver side of SWS as specified in RFC1122.
 315          *      Second, we always give them at least the window they
 316          *      had before, in order to avoid retracting window.  This
 317          *      is technically allowed, but RFC1122 advises against it and
 318          *      in practice it causes trouble.
 319          *
 320          *      Fixme: This doesn't correctly handle the case where
 321          *      new_window > sk->window but not by enough to allow for the
 322          *      shift in sequence space. 
 323          */
 324         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
 325                 return(sk->window);
 326         return(new_window);
 327 }
 328 
 329 /*
 330  *      Find someone to 'accept'. Must be called with
 331  *      sk->inuse=1 or cli()
 332  */ 
 333 
 334 static struct sk_buff *tcp_find_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 335 {
 336         struct sk_buff *p=skb_peek(&s->receive_queue);
 337         if(p==NULL)
 338                 return NULL;
 339         do
 340         {
 341                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 342                         return p;
 343                 p=p->next;
 344         }
 345         while(p!=(struct sk_buff *)&s->receive_queue);
 346         return NULL;
 347 }
 348 
 349 /*
 350  *      Remove a completed connection and return it. This is used by
 351  *      tcp_accept() to get connections from the queue.
 352  */
 353 
 354 static struct sk_buff *tcp_dequeue_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 355 {
 356         struct sk_buff *skb;
 357         unsigned long flags;
 358         save_flags(flags);
 359         cli(); 
 360         skb=tcp_find_established(s);
 361         if(skb!=NULL)
 362                 skb_unlink(skb);        /* Take it off the queue */
 363         restore_flags(flags);
 364         return skb;
 365 }
 366 
 367 /* 
 368  *      This routine closes sockets which have been at least partially
 369  *      opened, but not yet accepted. Currently it is only called by
 370  *      tcp_close, and timeout mirrors the value there. 
 371  */
 372 
 373 static void tcp_close_pending (struct sock *sk) 
     /* [previous][next][first][last][top][bottom][index][help] */
 374 {
 375         struct sk_buff *skb;
 376 
 377         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) 
 378         {
 379                 skb->sk->dead=1;
 380                 tcp_close(skb->sk, 0);
 381                 kfree_skb(skb, FREE_READ);
 382         }
 383         return;
 384 }
 385 
/*
 *      Enter the TIME_WAIT state: shut both directions down, notify any
 *      waiting process, and arm the MSL timer that will finally dispose
 *      of the socket.
 */

static void tcp_time_wait(struct sock *sk)
{
        tcp_set_state(sk,TCP_TIME_WAIT);
        sk->shutdown = SHUTDOWN_MASK;
        if (!sk->dead)
                sk->state_change(sk);   /* wake anyone sleeping on this socket */
        reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}
 398 
 399 /*
 400  *      A socket has timed out on its send queue and wants to do a
 401  *      little retransmitting. Currently this means TCP.
 402  */
 403 
 404 void tcp_do_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 405 {
 406         struct sk_buff * skb;
 407         struct proto *prot;
 408         struct device *dev;
 409         int ct=0;
 410 
 411         prot = sk->prot;
 412         skb = sk->send_head;
 413 
 414         while (skb != NULL)
 415         {
 416                 struct tcphdr *th;
 417                 struct iphdr *iph;
 418                 int size;
 419 
 420                 dev = skb->dev;
 421                 IS_SKB(skb);
 422                 skb->when = jiffies;
 423 
 424                 /*
 425                  * In general it's OK just to use the old packet.  However we
 426                  * need to use the current ack and window fields.  Urg and
 427                  * urg_ptr could possibly stand to be updated as well, but we
 428                  * don't keep the necessary data.  That shouldn't be a problem,
 429                  * if the other end is doing the right thing.  Since we're
 430                  * changing the packet, we have to issue a new IP identifier.
 431                  */
 432 
 433                 iph = (struct iphdr *)(skb->data + dev->hard_header_len);
 434                 th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
 435                 size = skb->len - (((unsigned char *) th) - skb->data);
 436                 
 437                 /*
 438                  *      Note: We ought to check for window limits here but
 439                  *      currently this is done (less efficiently) elsewhere.
 440                  *      We do need to check for a route change but can't handle
 441                  *      that until we have the new 1.3.x buffers in.
 442                  *
 443                  */
 444 
 445                 iph->id = htons(ip_id_count++);
 446                 ip_send_check(iph);
 447 
 448                 /*
 449                  *      This is not the right way to handle this. We have to
 450                  *      issue an up to date window and ack report with this 
 451                  *      retransmit to keep the odd buggy tcp that relies on 
 452                  *      the fact BSD does this happy. 
 453                  *      We don't however need to recalculate the entire 
 454                  *      checksum, so someone wanting a small problem to play
 455                  *      with might like to implement RFC1141/RFC1624 and speed
 456                  *      this up by avoiding a full checksum.
 457                  */
 458                  
 459                 th->ack_seq = ntohl(sk->acked_seq);
 460                 th->window = ntohs(tcp_select_window(sk));
 461                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
 462                 
 463                 /*
 464                  *      If the interface is (still) up and running, kick it.
 465                  */
 466 
 467                 if (dev->flags & IFF_UP)
 468                 {
 469                         /*
 470                          *      If the packet is still being sent by the device/protocol
 471                          *      below then don't retransmit. This is both needed, and good -
 472                          *      especially with connected mode AX.25 where it stops resends
 473                          *      occurring of an as yet unsent anyway frame!
 474                          *      We still add up the counts as the round trip time wants
 475                          *      adjusting.
 476                          */
 477                         if (sk && !skb_device_locked(skb))
 478                         {
 479                                 /* Remove it from any existing driver queue first! */
 480                                 skb_unlink(skb);
 481                                 /* Now queue it */
 482                                 ip_statistics.IpOutRequests++;
 483                                 dev_queue_xmit(skb, dev, sk->priority);
 484                         }
 485                 }
 486 
 487                 /*
 488                  *      Count retransmissions
 489                  */
 490                  
 491                 ct++;
 492                 sk->prot->retransmits ++;
 493 
 494                 /*
 495                  *      Only one retransmit requested.
 496                  */
 497         
 498                 if (!all)
 499                         break;
 500 
 501                 /*
 502                  *      This should cut it off before we send too many packets.
 503                  */
 504 
 505                 if (ct >= sk->cong_window)
 506                         break;
 507                 skb = skb->link3;
 508         }
 509 }
 510 
 511 /*
 512  *      Reset the retransmission timer
 513  */
 514  
 515 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
     /* [previous][next][first][last][top][bottom][index][help] */
 516 {
 517         del_timer(&sk->retransmit_timer);
 518         sk->ip_xmit_timeout = why;
 519         if((int)when < 0)
 520         {
 521                 when=3;
 522                 printk("Error: Negative timer in xmit_timer\n");
 523         }
 524         sk->retransmit_timer.expires=when;
 525         add_timer(&sk->retransmit_timer);
 526 }
 527 
/*
 *      This is the normal code called for timeouts.  It does the retransmission
 *      and then does backoff.  tcp_do_retransmit is separated out because
 *      tcp_ack needs to send stuff from the retransmit queue without
 *      initiating a backoff.
 */


void tcp_retransmit_time(struct sock *sk, int all)
{
        tcp_do_retransmit(sk, all);

        /*
         * Increase the timeout each time we retransmit.  Note that
         * we do not increase the rtt estimate.  rto is initialized
         * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
         * that doubling rto each time is the least we can get away with.
         * In KA9Q, Karn uses this for the first few times, and then
         * goes to quadratic.  netBSD doubles, but only goes up to *64,
         * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
         * defined in the protocol as the maximum possible RTT.  I guess
         * we'll have to use something other than TCP to talk to the
         * University of Mars.
         *
         * PAWS allows us longer timeouts and large windows, so once
         * implemented ftp to mars will work nicely. We will have to fix
         * the 120 second clamps though!
         */

        sk->retransmits++;      /* consecutive retransmits - checked by tcp_write_timeout() */
        sk->backoff++;
        sk->rto = min(sk->rto << 1, 120*HZ);    /* exponential backoff, clamped at 120 seconds */
        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
}
 562 
 563 
 564 /*
 565  *      A timer event has trigger a tcp retransmit timeout. The
 566  *      socket xmit queue is ready and set up to send. Because
 567  *      the ack receive code keeps the queue straight we do
 568  *      nothing clever here.
 569  */
 570 
 571 static void tcp_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 572 {
 573         if (all) 
 574         {
 575                 tcp_retransmit_time(sk, all);
 576                 return;
 577         }
 578 
 579         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 580         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 581         sk->cong_count = 0;
 582 
 583         sk->cong_window = 1;
 584 
 585         /* Do the actual retransmit. */
 586         tcp_retransmit_time(sk, all);
 587 }
 588 
/*
 *      A write timeout has occurred. Process the after effects.
 *
 *      Returns 1 if the caller should keep retrying, 0 when the
 *      retransmit limit (TCP_RETR2) has been exceeded and the socket
 *      has been moved to TCP_CLOSE.
 */

static int tcp_write_timeout(struct sock *sk)
{
        /*
         *      Look for a 'soft' timeout: every eighth consecutive
         *      retransmit on an established connection, or more than
         *      TCP_RETR1 attempts on a connection still being set up
         *      or torn down.
         */
        if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
                || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
        {
                /*
                 *      Attempt to recover if arp has changed (unlikely!) or
                 *      a route has shifted (not supported prior to 1.3).
                 */
                arp_destroy (sk->daddr, 0);
                ip_route_check (sk->daddr);
        }
        /*
         *      Has it gone just too far ?
         */
        if (sk->retransmits > TCP_RETR2) 
        {
                sk->err = ETIMEDOUT;
                sk->error_report(sk);
                del_timer(&sk->retransmit_timer);
                /*
                 *      Time wait the socket: a closing endpoint keeps the
                 *      MSL timer so late junk is still caught.
                 */
                if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) 
                {
                        tcp_set_state(sk,TCP_TIME_WAIT);
                        reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                }
                else
                {
                        /*
                         *      Clean up time: give up on this connection.
                         */
                        tcp_set_state(sk, TCP_CLOSE);
                        return 0;
                }
        }
        return 1;
}
 635 
/*
 *      The TCP retransmit timer. This lacks a few small details.
 *
 *      1.      An initial rtt timeout on the probe0 should cause what we can
 *              of the first write queue buffer to be split and sent.
 *      2.      On a 'major timeout' as defined by RFC1122 we shouldn't report
 *              ETIMEDOUT if we know an additional 'soft' error caused this.
 *              tcp_err should save a 'soft error' for us.
 *
 *      'data' is the socket the timer was armed on; sk->ip_xmit_timeout
 *      records why it was armed (TIME_PROBE0 / TIME_WRITE / TIME_KEEPOPEN).
 */

static void retransmit_timer(unsigned long data)
{
        struct sock *sk = (struct sock*)data;
        int why = sk->ip_xmit_timeout;  /* reason the timer was set */

        /* 
         * only process if socket is not in use
         */

        cli();
        if (sk->inuse || in_bh) 
        {
                /* Try again in 1 second */
                /* NOTE(review): expires looks relative here (HZ, not
                 * jiffies+HZ) - presumably this kernel's add_timer takes a
                 * relative expiry; confirm against the timer code. */
                sk->retransmit_timer.expires = HZ;
                add_timer(&sk->retransmit_timer);
                sti();
                return;
        }

        /* Lock the socket against the bottom half and re-enable irqs. */
        sk->inuse = 1;
        sti();

        /* Always see if we need to send an ack. */

        if (sk->ack_backlog && !sk->zapped) 
        {
                sk->prot->read_wakeup (sk);
                if (! sk->dead)
                        sk->data_ready(sk,0);
        }

        /* Now we need to figure out why the socket was on the timer. */

        switch (why) 
        {
                /* Window probing */
                case TIME_PROBE0:
                        tcp_send_probe0(sk);
                        tcp_write_timeout(sk);
                        break;
                /* Retransmitting */
                case TIME_WRITE:
                        /* It could be we got here because we needed to send an ack.
                         * So we need to check for that.
                         */
                {
                        struct sk_buff *skb;
                        unsigned long flags;

                        /* send_head is shared with interrupt context. */
                        save_flags(flags);
                        cli();
                        skb = sk->send_head;
                        if (!skb) 
                        {
                                /* Nothing outstanding - nothing to resend. */
                                restore_flags(flags);
                        } 
                        else 
                        {
                                /*
                                 *      Kicked by a delayed ack. Reset timer
                                 *      correctly now
                                 */
                                if (jiffies < skb->when + sk->rto) 
                                {
                                        reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
                                        restore_flags(flags);
                                        break;
                                }
                                restore_flags(flags);
                                /*
                                 *      Retransmission
                                 */
                                sk->prot->retransmit (sk, 0);
                                tcp_write_timeout(sk);
                        }
                        break;
                }
                /* Sending Keepalives */
                case TIME_KEEPOPEN:
                        /* 
                         * this reset_timer() call is a hack, this is not
                         * how KEEPOPEN is supposed to work.
                         */
                        reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

                        /* Send something to keep the connection open. */
                        if (sk->prot->write_wakeup)
                                  sk->prot->write_wakeup (sk);
                        sk->retransmits++;
                        tcp_write_timeout(sk);
                        break;
                default:
                        printk ("rexmit_timer: timer expired - reason unknown\n");
                        break;
        }
        release_sock(sk);
}
 743 
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 */

void tcp_err(int err, unsigned char *header, unsigned long daddr,
        unsigned long saddr, struct inet_protocol *protocol)
{
        struct tcphdr *th;
        struct sock *sk;
        struct iphdr *iph=(struct iphdr *)header;
  
        /* Step over the variable-length IP header to the TCP header. */
        header+=4*iph->ihl;
   

        th =(struct tcphdr *)header;
        sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

        /* No local socket matches this address/port pair - drop it. */
        if (sk == NULL) 
                return;
  
        /* err < 0: fatal condition, hand -err to the user as errno. */
        if(err<0)
        {
                sk->err = -err;
                sk->error_report(sk);
                return;
        }

        if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8)) 
        {
                /*
                 * FIXME:
                 * For now we will just trigger a linear backoff.
                 * The slow start code should cause a real backoff here.
                 */
                if (sk->cong_window > 4)
                        sk->cong_window--;
                return;
        }

/*      sk->err = icmp_err_convert[err & 0xff].errno;  -- moved as TCP should hide non fatals internally (and does) */

        /*
         * If we've already connected we will keep trying
         * until we time out, or the user gives up.
         */

        if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT) 
        {
                if (sk->state == TCP_SYN_SENT) 
                {
                        tcp_statistics.TcpAttemptFails++;
                        tcp_set_state(sk,TCP_CLOSE);
                        /* NOTE(review): error_report runs before sk->err is
                         * assigned below; the woken sleeper presumably reads
                         * sk->err only after rescheduling - confirm the
                         * ordering is intended. */
                        sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
                }
                sk->err = icmp_err_convert[err & 0xff].errno;           
        }
        return;
}
 807 
 808 
/*
 *      Walk down the receive queue counting readable data until we hit the end or we find a gap
 *      in the received data queue (ie a frame missing that needs sending to us). Not
 *      sorting using two queues as data arrives makes life so much harder.
 *
 *      Returns the number of bytes the user could read right now, starting
 *      from sk->copied_seq.  SYN and URG octets are excluded from the count.
 */

static int tcp_readable(struct sock *sk)
{
        unsigned long counted;  /* sequence number we have counted up to */
        unsigned long amount;   /* readable bytes accumulated so far */
        struct sk_buff *skb;
        int sum;
        unsigned long flags;

        if(sk && sk->debug)
                printk("tcp_readable: %p - ",sk);

        /* The receive queue is also touched at interrupt time. */
        save_flags(flags);
        cli();
        if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
        {
                restore_flags(flags);
                if(sk && sk->debug) 
                        printk("empty\n");
                return(0);
        }
  
        counted = sk->copied_seq;       /* Where we are at the moment */
        amount = 0;
  
        /* 
         *      Do until a push or until we are out of data. 
         */
         
        do 
        {
                if (before(counted, skb->h.th->seq))    /* Found a hole so stops here */
                        break;
                sum = skb->len -(counted - skb->h.th->seq);     /* Length - header but start from where we are up to (avoid overlaps) */
                if (skb->h.th->syn)
                        sum++;          /* SYN occupies a sequence number but carries no data */
                if (sum > 0) 
                {                                       /* Add it up, move on */
                        amount += sum;
                        if (skb->h.th->syn) 
                                amount--;
                        counted += sum;
                }
                /*
                 * Don't count urg data ... but do it in the right place!
                 * Consider: "old_data (ptr is here) URG PUSH data"
                 * The old code would stop at the first push because
                 * it counted the urg (amount==1) and then does amount--
                 * *after* the loop.  This means tcp_readable() always
                 * returned zero if any URG PUSH was in the queue, even
                 * though there was normal data available. If we subtract
                 * the urg data right here, we even get it to work for more
                 * than one URG PUSH skb without normal data.
                 * This means that select() finally works now with urg data
                 * in the queue.  Note that rlogin was never affected
                 * because it doesn't use select(); it uses two processes
                 * and a blocking read().  And the queue scan in tcp_read()
                 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
                 */
                if (skb->h.th->urg)
                        amount--;       /* don't count urg data */
                if (amount && skb->h.th->psh) break;
                skb = skb->next;
        }
        while(skb != (struct sk_buff *)&sk->receive_queue);

        restore_flags(flags);
        if(sk->debug)
                printk("got %lu bytes.\n",amount);
        return(amount);
}
 885 
 886 /*
 887  * LISTEN is a special case for select..
 888  */
 889 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
 890 {
 891         if (sel_type == SEL_IN) {
 892                 int retval;
 893 
 894                 sk->inuse = 1;
 895                 retval = (tcp_find_established(sk) != NULL);
 896                 release_sock(sk);
 897                 if (!retval)
 898                         select_wait(&master_select_wakeup,wait);
 899                 return retval;
 900         }
 901         return 0;
 902 }
 903 
 904 
 905 /*
 906  *      Wait for a TCP event.
 907  *
 908  *      Note that we don't need to set "sk->inuse", as the upper select layers
 909  *      take care of normal races (between the test and the event) and we don't
 910  *      go look at any of the socket buffers directly.
 911  */
 912 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
 913 {
 914         if (sk->state == TCP_LISTEN)
 915                 return tcp_listen_select(sk, sel_type, wait);
 916 
 917         switch(sel_type) {
 918         case SEL_IN:
 919                 if (sk->err)
 920                         return 1;
 921                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
 922                         break;
 923 
 924                 if (sk->shutdown & RCV_SHUTDOWN)
 925                         return 1;
 926                         
 927                 if (sk->acked_seq == sk->copied_seq)
 928                         break;
 929 
 930                 if (sk->urg_seq != sk->copied_seq ||
 931                     sk->acked_seq != sk->copied_seq+1 ||
 932                     sk->urginline || !sk->urg_data)
 933                         return 1;
 934                 break;
 935 
 936         case SEL_OUT:
 937                 if (sk->err)
 938                         return 1;
 939                 if (sk->shutdown & SEND_SHUTDOWN) 
 940                         return 0;
 941                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
 942                         break;
 943                 /*
 944                  * This is now right thanks to a small fix
 945                  * by Matt Dillon.
 946                  */
 947 
 948                 if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
 949                         break;
 950                 return 1;
 951 
 952         case SEL_EX:
 953                 if (sk->urg_data)
 954                         return 1;
 955                 break;
 956         }
 957         select_wait(sk->sleep, wait);
 958         return 0;
 959 }
 960 
/*
 *      TCP ioctl handler.  Supports TIOCINQ (bytes readable), SIOCATMARK
 *      (is the read pointer at the urgent mark?) and TIOCOUTQ (write
 *      space available).  Results are copied out to the user pointer in
 *      'arg' after a verify_area() check.
 */
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
        int err;
        switch(cmd) 
        {

                case TIOCINQ:
#ifdef FIXME    /* FIXME: */
                case FIONREAD:
#endif
                {
                        unsigned long amount;

                        /* A listening socket has no data stream to count. */
                        if (sk->state == TCP_LISTEN) 
                                return(-EINVAL);

                        sk->inuse = 1;
                        amount = tcp_readable(sk);
                        release_sock(sk);
                        err=verify_area(VERIFY_WRITE,(void *)arg,
                                                   sizeof(unsigned long));
                        if(err)
                                return err;
                        put_fs_long(amount,(unsigned long *)arg);
                        return(0);
                }
                case SIOCATMARK:
                {
                        /* Non-zero when the next byte to read is the urgent mark. */
                        int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

                        /* NOTE(review): verifies sizeof(unsigned long) but
                         * stores through an (int *) with put_fs_long - on
                         * i386 long and int are both 32 bits so this works;
                         * confirm before porting. */
                        err = verify_area(VERIFY_WRITE,(void *) arg,
                                                  sizeof(unsigned long));
                        if (err)
                                return err;
                        put_fs_long(answ,(int *) arg);
                        return(0);
                }
                case TIOCOUTQ:
                {
                        unsigned long amount;

                        if (sk->state == TCP_LISTEN) return(-EINVAL);
                        amount = sk->prot->wspace(sk);
                        err=verify_area(VERIFY_WRITE,(void *)arg,
                                                   sizeof(unsigned long));
                        if(err)
                                return err;
                        put_fs_long(amount,(unsigned long *)arg);
                        return(0);
                }
                default:
                        return(-EINVAL);
        }
}
1015 
1016 
/*
 *      This routine computes a TCP checksum. 
 *
 *      Standard Internet one's-complement sum over the pseudo-header
 *      (source address, destination address, protocol, TCP length) and
 *      the TCP header plus data ('len' bytes starting at 'th').
 *      i386-specific: the summing is done in inline assembly.
 */
 
unsigned short tcp_check(struct tcphdr *th, int len,
          unsigned long saddr, unsigned long daddr)
{     
        unsigned long sum;
   
        if (saddr == 0) saddr = ip_my_addr();

/*
 * stupid, gcc complains when I use just one __asm__ block,
 * something about too many reloads, but this is just two
 * instructions longer than what I want
 */
        /*
         *      First block: fold the pseudo-header into 'sum'
         *      (daddr + saddr + the length/protocol word, with carry).
         */
        __asm__("
            addl %%ecx, %%ebx
            adcl %%edx, %%ebx
            adcl $0, %%ebx
            "
        : "=b"(sum)
        : "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
        : "bx", "cx", "dx" );
        /*
         *      Second block: add the segment itself into 'sum' -
         *      32-byte unrolled dword loop, then remaining dwords,
         *      then a trailing word and/or byte, and finally fold the
         *      32-bit sum into 16 bits.
         */
        __asm__("
            movl %%ecx, %%edx
            cld
            cmpl $32, %%ecx
            jb 2f
            shrl $5, %%ecx
            clc
1:          lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            loop 1b
            adcl $0, %%ebx
            movl %%edx, %%ecx
2:          andl $28, %%ecx
            je 4f
            shrl $2, %%ecx
            clc
3:          lodsl
            adcl %%eax, %%ebx
            loop 3b
            adcl $0, %%ebx
4:          movl $0, %%eax
            testw $2, %%dx
            je 5f
            lodsw
            addl %%eax, %%ebx
            adcl $0, %%ebx
            movw $0, %%ax
5:          test $1, %%edx
            je 6f
            lodsb
            addl %%eax, %%ebx
            adcl $0, %%ebx
6:          movl %%ebx, %%eax
            shrl $16, %%eax
            addw %%ax, %%bx
            adcw $0, %%bx
            "
        : "=b"(sum)
        : "0"(sum), "c"(len), "S"(th)
        : "ax", "bx", "cx", "dx", "si" );

        /* We only want the bottom 16 bits, but we never cleared the top 16. */
  
        return((~sum) & 0xffff);
}
1100 
1101 
1102 
1103 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
     /* [previous][next][first][last][top][bottom][index][help] */
1104                 unsigned long daddr, int len, struct sock *sk)
1105 {
1106         th->check = 0;
1107         th->check = tcp_check(th, len, saddr, daddr);
1108         return;
1109 }
1110 
/*
 *      This is the main buffer sending routine. We queue the buffer
 *      having checked it is sane seeming.
 *
 *      The frame either goes straight out (checksummed and handed to the
 *      IP layer) or, when the window/congestion rules forbid it, onto
 *      sk->write_queue to be pushed later by tcp_write_xmit.
 */
 
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
        int size;
        struct tcphdr * th = skb->h.th;

        /*
         *      length of packet (not counting length of pre-tcp headers) 
         */
         
        size = skb->len - ((unsigned char *) th - skb->data);

        /*
         *      Sanity check it.. 
         */
         
        if (size < sizeof(struct tcphdr) || size > skb->len) 
        {
                printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
                        skb, skb->data, th, skb->len);
                kfree_skb(skb, FREE_WRITE);
                return;
        }

        /*
         *      If we have queued a header size packet.. (these crash a few
         *      tcp stacks if ack is not set)
         */
         
        if (size == sizeof(struct tcphdr)) 
        {
                /* If it's got a syn or fin it's notionally included in the size..*/
                if(!th->syn && !th->fin) 
                {
                        printk("tcp_send_skb: attempt to queue a bogon.\n");
                        kfree_skb(skb,FREE_WRITE);
                        return;
                }
        }

        /*
         *      Actual processing.
         */
         
        tcp_statistics.TcpOutSegs++;  
        /* Sequence number of the byte just past this frame's data. */
        skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
        
        /*
         *      We must queue if
         *
         *      a) The right edge of this frame exceeds the window
         *      b) We are retransmitting (Nagle's rule)
         *      c) We have too many packets 'in flight'
         */
         
        if (after(skb->h.seq, sk->window_seq) ||
            (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
             sk->packets_out >= sk->cong_window) 
        {
                /* checksum will be supplied by tcp_write_xmit.  So
                 * we shouldn't need to set it at all.  I'm being paranoid */
                th->check = 0;
                if (skb->next != NULL) 
                {
                        printk("tcp_send_partial: next != NULL\n");
                        skb_unlink(skb);
                }
                skb_queue_tail(&sk->write_queue, skb);
                
                /*
                 *      If we don't fit we have to start the zero window
                 *      probes. This is broken - we really need to do a partial
                 *      send _first_ (This is what causes the Cisco and PC/TCP
                 *      grief).
                 */
                 
                if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
                    sk->send_head == NULL && sk->ack_backlog == 0)
                        reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
        } 
        else 
        {
                /*
                 *      This is going straight out
                 */
                 
                /* Piggy-back the latest ack and window before sending. */
                th->ack_seq = ntohl(sk->acked_seq);
                th->window = ntohs(tcp_select_window(sk));

                tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

                sk->sent_seq = sk->write_seq;
                
                /*
                 *      This is mad. The tcp retransmit queue is put together
                 *      by the ip layer. This causes half the problems with
                 *      unroutable FIN's and other things.
                 */
                 
                sk->prot->queue_xmit(sk, skb->dev, skb, 0);
                
                /*
                 *      Set for next retransmit based on expected ACK time.
                 *      FIXME: We set this every time which means our 
                 *      retransmits are really about a window behind.
                 */

                reset_xmit_timer(sk, TIME_WRITE, sk->rto);
        }
}
1225 
1226 /*
1227  *      Locking problems lead us to a messy situation where we can have
1228  *      multiple partially complete buffers queued up. This is really bad
1229  *      as we don't want to be sending partial buffers. Fix this with
1230  *      a semaphore or similar to lock tcp_write per socket.
1231  *
1232  *      These routines are pretty self descriptive.
1233  */
1234  
1235 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1236 {
1237         struct sk_buff * skb;
1238         unsigned long flags;
1239 
1240         save_flags(flags);
1241         cli();
1242         skb = sk->partial;
1243         if (skb) {
1244                 sk->partial = NULL;
1245                 del_timer(&sk->partial_timer);
1246         }
1247         restore_flags(flags);
1248         return skb;
1249 }
1250 
1251 /*
1252  *      Empty the partial queue
1253  */
1254  
1255 static void tcp_send_partial(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1256 {
1257         struct sk_buff *skb;
1258 
1259         if (sk == NULL)
1260                 return;
1261         while ((skb = tcp_dequeue_partial(sk)) != NULL)
1262                 tcp_send_skb(sk, skb);
1263 }
1264 
1265 /*
1266  *      Queue a partial frame
1267  */
1268  
1269 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1270 {
1271         struct sk_buff * tmp;
1272         unsigned long flags;
1273 
1274         save_flags(flags);
1275         cli();
1276         tmp = sk->partial;
1277         if (tmp)
1278                 del_timer(&sk->partial_timer);
1279         sk->partial = skb;
1280         init_timer(&sk->partial_timer);
1281         /*
1282          *      Wait up to 1 second for the buffer to fill.
1283          */
1284         sk->partial_timer.expires = HZ;
1285         sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
1286         sk->partial_timer.data = (unsigned long) sk;
1287         add_timer(&sk->partial_timer);
1288         restore_flags(flags);
1289         if (tmp)
1290                 tcp_send_skb(sk, tmp);
1291 }
1292 
1293 
/*
 *      This routine sends an ack and also updates the window. 
 *
 *      'sequence' is our send sequence, 'ack' the sequence being
 *      acknowledged, 'th' the header of the frame being answered and
 *      'daddr' its source address.  Under memory pressure no frame is
 *      built; the ack is left pending via sk->ack_backlog instead.
 */
 
static void tcp_send_ack(unsigned long sequence, unsigned long ack,
             struct sock *sk,
             struct tcphdr *th, unsigned long daddr)
{
        struct sk_buff *buff;
        struct tcphdr *t1;
        struct device *dev = NULL;
        int tmp;

        if(sk->zapped)
                return;         /* We have been reset, we may not send again */
                
        /*
         * We need to grab some memory, and put together an ack,
         * and then put it into the queue to be sent.
         */

        buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
        if (buff == NULL) 
        {
                /* 
                 *      Force it to send an ack. We don't have to do this
                 *      (ACK is unreliable) but it's much better use of 
                 *      bandwidth on slow links to send a spare ack than
                 *      resend packets. 
                 */
                 
                sk->ack_backlog++;
                if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) 
                {
                        reset_xmit_timer(sk, TIME_WRITE, HZ);
                }
                return;
        }

        /*
         *      Assemble a suitable TCP frame
         */
         
        buff->len = sizeof(struct tcphdr);
        buff->sk = sk;
        buff->localroute = sk->localroute;
        t1 =(struct tcphdr *) buff->data;

        /* 
         *      Put in the IP header and routing stuff. 
         */
         
        tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                /* No route - give the buffer back. */
                buff->free = 1;
                sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
                return;
        }
        buff->len += tmp;
        t1 =(struct tcphdr *)((char *)t1 +tmp);

        /* Start from the incoming header, then overwrite the fields below. */
        memcpy(t1, th, sizeof(*t1));

        /*
         *      Swap the send and the receive. 
         */
         
        t1->dest = th->source;
        t1->source = th->dest;
        t1->seq = ntohl(sequence);
        t1->ack = 1;
        sk->window = tcp_select_window(sk);
        t1->window = ntohs(sk->window);
        t1->res1 = 0;
        t1->res2 = 0;
        t1->rst = 0;
        t1->urg = 0;
        t1->syn = 0;
        t1->psh = 0;
        t1->fin = 0;
        
        /*
         *      If we have nothing queued for transmit and the transmit timer
         *      is on we are just doing an ACK timeout and need to switch
         *      to a keepalive.
         */
         
        if (ack == sk->acked_seq) 
        {
                sk->ack_backlog = 0;
                sk->bytes_rcv = 0;
                sk->ack_timed = 0;
                if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
                                  && sk->ip_xmit_timeout == TIME_WRITE) 
                {
                        if(sk->keepopen) {
                                reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
                        } else {
                                delete_timer(sk);
                        }
                }
        }
        
        /*
         *      Fill in the packet and send it
         */
         
        t1->ack_seq = ntohl(ack);
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
        if (sk->debug)
                 printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
        tcp_statistics.TcpOutSegs++;
        sk->prot->queue_xmit(sk, dev, buff, 1);
}
1411 
1412 
1413 /* 
1414  *      This routine builds a generic TCP header. 
1415  */
1416  
1417 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
     /* [previous][next][first][last][top][bottom][index][help] */
1418 {
1419 
1420         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1421         th->seq = htonl(sk->write_seq);
1422         th->psh =(push == 0) ? 1 : 0;
1423         th->doff = sizeof(*th)/4;
1424         th->ack = 1;
1425         th->fin = 0;
1426         sk->ack_backlog = 0;
1427         sk->bytes_rcv = 0;
1428         sk->ack_timed = 0;
1429         th->ack_seq = htonl(sk->acked_seq);
1430         sk->window = tcp_select_window(sk);
1431         th->window = htons(sk->window);
1432 
1433         return(sizeof(*th));
1434 }
1435 
1436 /*
1437  *      This routine copies from a user buffer into a socket,
1438  *      and starts the transmit system.
1439  */
1440 
/*
 *	Copy up to "len" bytes from the user buffer "from" into the socket's
 *	transmit path, building sk_buffs and starting transmission.
 *
 *	Returns the number of bytes copied, or a negative errno if nothing
 *	was copied before the error.  May sleep (waits for connection
 *	establishment and for write memory) unless "nonblock" is set.
 *	flags may carry MSG_OOB (urgent data) and MSG_DONTROUTE.
 */
static int tcp_write(struct sock *sk, unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;			/* bytes handed off so far */
	int copy;			/* bytes to take this iteration */
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	unsigned char *buff;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;			/* lock the socket */
	prot = sk->prot;
	while(len > 0) 
	{
		if (sk->err) 
		{			/* Stop on an error */
			release_sock(sk);
			if (copied) 
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established. 
		 */
	
		if (sk->shutdown & SEND_SHUTDOWN) 
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied) 
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/* 
		 *	Wait for a connection to finish.
		 */
	
		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
		{
			if (sk->err) 
			{
				release_sock(sk);
				if (copied) 
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/* Not connecting either: the connection is gone. */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
			{
				release_sock(sk);
				if (copied) 
					return(copied);

				if (sk->err) 
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen) 
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied) 
			{
				release_sock(sk);
				if (copied) 
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	Sleep until the handshake completes.  The state
			 *	is re-tested with interrupts off to close the
			 *	window between release_sock() and sleeping.
			 */
			release_sock(sk);
			cli();
		
			if (sk->state != TCP_ESTABLISHED &&
				sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked) 
				{
					sti();
					if (copied) 
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

	/*
	 * The following code can result in copy <= if sk->mss is ever
	 * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
	 * sk->mtu is constant once SYN processing is finished.  I.e. we
	 * had better not get here until we've seen his SYN and at least one
	 * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
	 * But ESTABLISHED should guarantee that.  sk->max_window is by definition
	 * non-decreasing.  Note that any ioctl to set user_mss must be done
	 * before the exchange of SYN's.  If the initial ack from the other
	 * end has a window of 0, max_window and thus mss will both be 0.
	 */

	/* 
	 *	Now we need to check if we have a half built packet. 
	 *	If so, top it up towards a full MSS before sending.
	 */

		if ((skb = tcp_dequeue_partial(sk)) != NULL) 
		{
			int hdrlen;

			 /* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				 + sizeof(struct tcphdr);
	
			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB)) 
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0) 
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}
	  
				memcpy_fromfs(skb->data + skb->len, from, copy);
				skb->len += copy;
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/*
			 *	Send the partial frame now if it is full, if this
			 *	is OOB data, or if nothing is in flight (Nagle);
			 *	otherwise re-queue it for further filling.
			 */
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

	/*
	 * We also need to worry about the window.
	 * If window < 1/2 the maximum window we've seen from this
	 *   host, don't use it.  This is sender side
	 *   silly window prevention, as specified in RFC1122.
	 *   (Note that this is different than earlier versions of
	 *   SWS prevention, e.g. RFC813.).  What we actually do is 
	 *   use the whole MSS.  Since the results in the right
	 *   edge of the packet being outside the window, it will
	 *   be queued for later rather than sent.
	 */

		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

	/*
	 *	We should really check the window here also. 
	 */
	 
		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB)) 
		{
			/*
			 *	Sub-MSS chunk: allocate a full-sized buffer so it
			 *	can later be topped up as a partial packet.
			 *	We will release the socket in case we sleep here. 
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu 
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		} 
		else 
		{
			/*
			 *	We will release the socket in case we sleep here. 
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep. 
		 */

		if (skb == NULL) 
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock) 
			{
				release_sock(sk);
				if (copied) 
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition. 
			 */

			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it: only sleep if no
			 *	write memory was freed since the snapshot above
			 *	and the connection is still healthy.
			 */
			if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0) 
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked) 
				{
					sti();
					if (copied) 
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->len = 0;
		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
	
		buff = skb->data;
	
		/*
		 * FIXME: we need to optimize this.
		 * Perhaps some hints here would be good.
		 */
		
		/* Build the IP (and link-level) headers first. */
		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 ) 
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied) 
				return(copied);
			return(tmp);
		}
		skb->len += tmp;
		skb->dev = dev;
		buff += tmp;
		skb->h.th =(struct tcphdr *) buff;
		/* PSH is set when this chunk drains the user buffer (len==copy). */
		tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
		if (tmp < 0) 
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied) 
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB) 
		{
			/* Urgent pointer counts from the segment sequence.
			   ntohs here is functionally identical to htons. */
			((struct tcphdr *)buff)->urg = 1;
			((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
		}
		skb->len += tmp;
		memcpy_fromfs(buff+tmp, from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->len += copy;
		skb->free = 0;
		sk->write_seq += copy;
	
		/* Nagle: with data in flight, hold a sub-MSS frame as partial. */
		if (send_tmp != NULL && sk->packets_out) 
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

/*
 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *	interactive fast network servers. It's meant to be on and
 *	it really improves the throughput though not the echo time
 *	on my slow slip link - Alan
 */

/*
 *	Avoid possible race on send_tmp - c/o Johannes Stille 
 */
 
	if(sk->partial && ((!sk->packets_out) 
     /* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1765 
1766 /*
1767  *      This is just a wrapper. 
1768  */
1769 
1770 static int tcp_sendto(struct sock *sk, unsigned char *from,
     /* [previous][next][first][last][top][bottom][index][help] */
1771            int len, int nonblock, unsigned flags,
1772            struct sockaddr_in *addr, int addr_len)
1773 {
1774         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1775                 return -EINVAL;
1776         if (sk->state == TCP_CLOSE)
1777                 return -ENOTCONN;
1778         if (addr_len < sizeof(*addr))
1779                 return -EINVAL;
1780         if (addr->sin_family && addr->sin_family != AF_INET) 
1781                 return -EINVAL;
1782         if (addr->sin_port != sk->dummy_th.dest) 
1783                 return -EISCONN;
1784         if (addr->sin_addr.s_addr != sk->daddr) 
1785                 return -EISCONN;
1786         return tcp_write(sk, from, len, nonblock, flags);
1787 }
1788 
1789 
1790 /*
1791  *      Send an ack if one is backlogged at this point. Ought to merge
1792  *      this with tcp_send_ack().
1793  */
1794  
/*
 *	Send a bare ACK if one is backlogged for this socket.  Builds a
 *	data-less TCP segment with the current ack/window and queues it
 *	for transmission.  Uses an atomic allocation; on failure the
 *	write timer is re-armed to retry in one second.
 */
static void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog) 
		return;

	/*
	 * FIXME: we need to put code here to prevent this routine from
	 * being called.  Being called once in a while is ok, so only check
	 * if this is the second time in a row.
	 */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL) 
	{
		/* Try again real soon. */
		reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;
	
	/*
	 *	Put in the IP header and routing stuff. 
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0) 
	{
		/* Header build failed: drop the buffer (ack stays backlogged). */
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)(buff->data +tmp);

	/* Start from the template header, then set ACK-only fields. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	/* ntohs/ntohl used where htons/htonl is meant; the functions are
	   identical byte-swaps, so the wire format is still correct. */
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
1863 
1864 
1865 /*
1866  *      FIXME:
1867  *      This routine frees used buffers.
1868  *      It should consider sending an ACK to let the
1869  *      other end know we now have a bigger window.
1870  */
1871 
/*
 *	Free fully-consumed buffers on the receive queue, and decide
 *	whether the freed space warrants telling the peer about our
 *	larger window: either by sending an ACK immediately or by
 *	arming a short delayed-ack timer.
 */
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;	/* receive space before freeing */
	struct sk_buff *skb;
	unsigned long rspace;	/* receive space after freeing */

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);
  
	save_flags(flags);
	cli();
  
	left = sk->prot->rspace(sk);
 
	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.  Stop at the
	 *	first buffer still unread (!used) or still referenced
	 *	by a sleeping reader (users).
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL) 
	{
		if (!skb->used || skb->users) 
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
					    left);
	if ((rspace=sk->prot->rspace(sk)) != left) 
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
	/*
	 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
	 * if the other end is offering a window smaller than the agreed on MSS
	 * (called sk->mtu here).  In theory there's no connection between send
	 * and receive, and so no reason to think that they're going to send
	 * small packets.  For the moment I'm using the hack of reducing the mss
	 * only on the send side, so I'm putting mtu here.
	 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		} 
		else 
		{
			/* Force it to send an ack soon. */
			/* NOTE(review): del_timer is applied to
			   sk->retransmit_timer but the expiry tested is
			   sk->timer.expires, and TCP_ACK_TIME (a delay)
			   is compared against that expiry value — verify
			   the timer identity and the units match. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || TCP_ACK_TIME < sk->timer.expires) 
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			} 
			else
				add_timer(&sk->retransmit_timer);
		}
	}
} 
1953 
1954 
1955 /*
1956  *      Handle reading urgent data. BSD has very simple semantics for
1957  *      this, no blocking and very strange errors 8)
1958  */
1959  
/*
 *	Handle reading urgent (out-of-band) data.  BSD semantics: never
 *	blocks regardless of the socket's blocking state, and returns
 *	rather strange errors.  Returns 1 and stores the single urgent
 *	byte in "to" on success; -EINVAL when there is no OOB byte (or
 *	it was already read, or OOB data is delivered inline); 0 at
 *	end-of-connection; -EAGAIN when the byte has not arrived yet.
 */
static int tcp_read_urg(struct sock * sk, int nonblock,
	     unsigned char *to, int len, unsigned flags)
{
	/*
	 *	No URG data to read
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */
		
	if (sk->err) 
	{
		/* Report and clear any pending socket error. */
		int tmp = -sk->err;
		sk->err = 0;
		return tmp;
	}

	if (sk->state == TCP_CLOSE || sk->done) 
	{
		/* First read after close returns 0 (EOF), later -ENOTCONN. */
		if (!sk->done) {
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN) 
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;		/* lock the socket while touching urg_data */
	if (sk->urg_data & URG_VALID) 
	{
		/* Low byte of urg_data holds the urgent byte itself. */
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;	/* consume it */
		put_fs_byte(c, to);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);
	
	/*
	 * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
2011 
2012 
2013 /*
2014  *      This routine copies from a sock struct into the user buffer. 
2015  */
2016  
/*
 *	Copy received data from the socket into the user buffer "to".
 *	Returns the number of bytes copied, or a negative errno if
 *	nothing was copied first.  Sleeps for data unless "nonblock".
 *	MSG_OOB is diverted to tcp_read_urg(); MSG_PEEK reads without
 *	consuming (it advances a private copy of the sequence counter).
 */
static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	unsigned long peek_seq;
	volatile unsigned long *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/* 
	 *	This error should be checked. 
	 */
	 
	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially. 
	 */
	 
	if (flags & MSG_OOB)
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be 
	 *	inline and thus not flush cached variables otherwise).
	 *	PEEK uses the private copy so nothing is consumed.
	 */
	 
	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0) 
	{
		struct sk_buff * skb;
		unsigned long offset;
	
		/*
		 * Are we at urgent data? Stop if we have read anything.
		 */
		 
		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.  Walk the receive queue for the
		 *	skb containing sequence *seq; set TASK_INTERRUPTIBLE
		 *	first so a wakeup between the scan and schedule()
		 *	is not lost.
		 */
		 
		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do 
		{
			if (!skb)
				break;
			if (before(*seq, skb->h.th->seq))
				break;		/* gap: data not yet arrived */
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies one sequence number */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* fully consumed: free later */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		if (copied)
			break;

		if (sk->err) 
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE) 
		{
			/* First read after close is EOF (0), then -ENOTCONN. */
			if (!sk->done) 
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN) 
		{
			sk->done = 1;
			break;
		}
			
		if (nonblock) 
		{
			copied = -EAGAIN;
			break;
		}

		/* Ack consumed data, drop the lock and sleep for more. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked) 
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are 
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */
		
		skb->users++;
		
		/*
		 *	Ok so how much can we use ? 
		 */
		 
		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?  Stop short of the
		 *	urgent byte, and skip over it unless urginline.
		 */
		
		if (sk->urg_data) 
		{
			unsigned long urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used) 
			{
				if (!urg_offset) 
				{
					if (!sk->urginline) 
					{
						++*seq;		/* skip the urgent byte */
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;	/* read up to it */
			}
		}
		
		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */
		 
		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */
		 
		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;
		
		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */
		 
		skb->users --;
		
		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;	/* urgent byte now consumed */
		if (used + offset < skb->len)
			continue;		/* more data left in this skb */
		
		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;			/* mark consumed for cleanup_rbuf */
		continue;

	found_fin_ok:
		++*seq;				/* FIN occupies one sequence number */
		if (flags & MSG_PEEK)
			break;
			
		/*
		 *	All is done
		 */
		 
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2243 
2244 /*
2245  *      State processing on a close. This implements the state shift for
2246  *      sending our FIN frame. Note that we only send a FIN for some 
2247  *      states. A shutdown() may have already sent the FIN, or we may be
2248  *      closed.
2249  */
2250  
2251 static int tcp_close_state(struct sock *sk, int dead)
     /* [previous][next][first][last][top][bottom][index][help] */
2252 {
2253         int ns=TCP_CLOSE;
2254         int send_fin=0;
2255         switch(sk->state)
2256         {
2257                 case TCP_SYN_SENT:      /* No SYN back, no FIN needed */
2258                         break;
2259                 case TCP_SYN_RECV:
2260                 case TCP_ESTABLISHED:   /* Closedown begin */
2261                         ns=TCP_FIN_WAIT1;
2262                         send_fin=1;
2263                         break;
2264                 case TCP_FIN_WAIT1:     /* Already closing, or FIN sent: no change */
2265                 case TCP_FIN_WAIT2:
2266                 case TCP_CLOSING:
2267                         ns=sk->state;
2268                         break;
2269                 case TCP_CLOSE:
2270                 case TCP_LISTEN:
2271                         break;
2272                 case TCP_CLOSE_WAIT:    /* They have FIN'd us. We send our FIN and
2273                                            wait only for the ACK */
2274                         ns=TCP_LAST_ACK;
2275                         send_fin=1;
2276         }
2277         
2278         tcp_set_state(sk,ns);
2279                 
2280         /*
2281          *      This is a (useful) BSD violating of the RFC. There is a
2282          *      problem with TCP as specified in that the other end could
2283          *      keep a socket open forever with no application left this end.
2284          *      We use a 3 minute timeout (about the same as BSD) then kill
2285          *      our end. If they send after that then tough - BUT: long enough
2286          *      that we won't make the old 4*rto = almost no time - whoops
2287          *      reset mistake.
2288          */
2289         if(dead && ns==TCP_FIN_WAIT2)
2290         {
2291                 int timer_active=del_timer(&sk->timer);
2292                 if(timer_active)
2293                         add_timer(&sk->timer);
2294                 else
2295                         reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2296         }
2297         
2298         return send_fin;
2299 }
2300 
2301 /*
2302  *      Send a fin.
2303  */
2304 
2305 static void tcp_send_fin(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
2306 {
2307         struct proto *prot =(struct proto *)sk->prot;
2308         struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
2309         struct tcphdr *t1;
2310         struct sk_buff *buff;
2311         struct device *dev=NULL;
2312         int tmp;
2313                 
2314         release_sock(sk); /* in case the malloc sleeps. */
2315         
2316         buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2317         sk->inuse = 1;
2318 
2319         if (buff == NULL)
2320         {
2321                 /* This is a disaster if it occurs */
2322                 printk("tcp_send_fin: Impossible malloc failure");
2323                 return;
2324         }
2325 
2326         /*
2327          *      Administrivia
2328          */
2329          
2330         buff->sk = sk;
2331         buff->len = sizeof(*t1);
2332         buff->localroute = sk->localroute;
2333         t1 =(struct tcphdr *) buff->data;
2334 
2335         /*
2336          *      Put in the IP header and routing stuff. 
2337          */
2338 
2339         tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2340                            IPPROTO_TCP, sk->opt,
2341                            sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
2342         if (tmp < 0) 
2343         {
2344                 int t;
2345                 /*
2346                  *      Finish anyway, treat this as a send that got lost. 
2347                  *      (Not good).
2348                  */
2349                  
2350                 buff->free = 1;
2351                 prot->wfree(sk,buff->mem_addr, buff->mem_len);
2352                 sk->write_seq++;
2353                 t=del_timer(&sk->timer);
2354                 if(t)
2355                         add_timer(&sk->timer);
2356                 else
2357                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2358                 return;
2359         }
2360         
2361         /*
2362          *      We ought to check if the end of the queue is a buffer and
2363          *      if so simply add the fin to that buffer, not send it ahead.
2364          */
2365 
2366         t1 =(struct tcphdr *)((char *)t1 +tmp);
2367         buff->len += tmp;
2368         buff->dev = dev;
2369         memcpy(t1, th, sizeof(*t1));
2370         t1->seq = ntohl(sk->write_seq);
2371         sk->write_seq++;
2372         buff->h.seq = sk->write_seq;
2373         t1->ack = 1;
2374         t1->ack_seq = ntohl(sk->acked_seq);
2375         t1->window = ntohs(sk->window=tcp_select_window(sk));
2376         t1->fin = 1;
2377         t1->rst = 0;
2378         t1->doff = sizeof(*t1)/4;
2379         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2380 
2381         /*
2382          * If there is data in the write queue, the fin must be appended to
2383          * the write queue.
2384          */
2385         
2386         if (skb_peek(&sk->write_queue) != NULL) 
2387         {
2388                 buff->free = 0;
2389                 if (buff->next != NULL) 
2390                 {
2391                         printk("tcp_send_fin: next != NULL\n");
2392                         skb_unlink(buff);
2393                 }
2394                 skb_queue_tail(&sk->write_queue, buff);
2395         } 
2396         else 
2397         {
2398                 sk->sent_seq = sk->write_seq;
2399                 sk->prot->queue_xmit(sk, dev, buff, 0);
2400                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2401         }
2402 }
2403 
2404 /*
2405  *      Shutdown the sending side of a connection. Much like close except
2406  *      that we don't receive shut down or set sk->dead=1.
2407  */
2408 
2409 void tcp_shutdown(struct sock *sk, int how)
     /* [previous][next][first][last][top][bottom][index][help] */
2410 {
2411         /*
2412          *      We need to grab some memory, and put together a FIN,
2413          *      and then put it into the queue to be sent.
2414          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2415          */
2416 
2417         if (!(how & SEND_SHUTDOWN)) 
2418                 return;
2419          
2420         /*
2421          *      If we've already sent a FIN, or it's a closed state
2422          */
2423          
2424         if (sk->state == TCP_FIN_WAIT1 ||
2425             sk->state == TCP_FIN_WAIT2 ||
2426             sk->state == TCP_CLOSING ||
2427             sk->state == TCP_LAST_ACK ||
2428             sk->state == TCP_TIME_WAIT || 
2429             sk->state == TCP_CLOSE ||
2430             sk->state == TCP_LISTEN
2431           )
2432         {
2433                 return;
2434         }
2435         sk->inuse = 1;
2436 
2437         /*
2438          * flag that the sender has shutdown
2439          */
2440 
2441         sk->shutdown |= SEND_SHUTDOWN;
2442 
2443         /*
2444          *  Clear out any half completed packets. 
2445          */
2446 
2447         if (sk->partial)
2448                 tcp_send_partial(sk);
2449                 
2450         /*
2451          *      FIN if needed
2452          */
2453          
2454         if(tcp_close_state(sk,0))
2455                 tcp_send_fin(sk);
2456                 
2457         release_sock(sk);
2458 }
2459 
2460 
2461 static int
2462 tcp_recvfrom(struct sock *sk, unsigned char *to,
     /* [previous][next][first][last][top][bottom][index][help] */
2463              int to_len, int nonblock, unsigned flags,
2464              struct sockaddr_in *addr, int *addr_len)
2465 {
2466         int result;
2467   
2468         /* 
2469          *      Have to check these first unlike the old code. If 
2470          *      we check them after we lose data on an error
2471          *      which is wrong 
2472          */
2473 
2474         if(addr_len)
2475                 *addr_len = sizeof(*addr);
2476         result=tcp_read(sk, to, to_len, nonblock, flags);
2477 
2478         if (result < 0) 
2479                 return(result);
2480   
2481         if(addr)
2482         {
2483                 addr->sin_family = AF_INET;
2484                 addr->sin_port = sk->dummy_th.dest;
2485                 addr->sin_addr.s_addr = sk->daddr;
2486         }
2487         return(result);
2488 }
2489 
2490 
2491 /*
2492  *      This routine will send an RST to the other tcp. 
2493  */
2494  
static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (Think about it).
	 */

	if(th->rst)
		return;

	/*
	 * We need to grab some memory, and put together an RST,
	 * and then put it into the queue to be sent.
	 */

	buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
		return;		/* Out of memory: drop silently, the peer will retry */

	buff->len = sizeof(*t1);
	buff->sk = NULL;	/* Not charged to any socket */
	buff->dev = dev;
	buff->localroute = 0;

	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl);
	if (tmp < 0)
	{
		/* No route to the sender: free the buffer and give up */
		buff->free = 1;
		prot->wfree(NULL, buff->mem_addr, buff->mem_len);
		return;
	}

	t1 =(struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	/* Start from a copy of the offending header, then override fields */
	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;
	t1->window = 0;

	if(th->ack)
	{
		/*
		 *	Segment carried an ACK: per RFC 793 reset generation,
		 *	the RST takes its sequence number from that ACK and
		 *	carries no ACK of its own.
		 */
		t1->ack = 0;
		t1->seq = th->ack_seq;
		t1->ack_seq = 0;
	}
	else
	{
		/*
		 *	No ACK present: send seq 0 and acknowledge everything
		 *	the offending segment occupied (+1 if it was a SYN,
		 *	which consumes a sequence number).
		 */
		t1->ack = 1;
		if(!th->syn)
			t1->ack_seq=htonl(th->seq);
		else
			t1->ack_seq=htonl(th->seq+1);
		t1->seq=0;
	}

	/* Clear every other flag so only RST (and possibly ACK) is set */
	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2577 
2578 
2579 /*
2580  *      Look for tcp options. Parses everything but only knows about MSS.
2581  *      This routine is always called with the packet containing the SYN.
2582  *      However it may also be called with the ack to the SYN.  So you
2583  *      can't assume this is always the SYN.  It's always called after
2584  *      we have set up sk->mtu to our own MTU.
2585  *
2586  *      We need at minimum to add PAWS support here. Possibly large windows
2587  *      as Linux gets deployed on 100Mb/sec networks.
2588  */
2589  
2590 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
2591 {
2592         unsigned char *ptr;
2593         int length=(th->doff*4)-sizeof(struct tcphdr);
2594         int mss_seen = 0;
2595     
2596         ptr = (unsigned char *)(th + 1);
2597   
2598         while(length>0)
2599         {
2600                 int opcode=*ptr++;
2601                 int opsize=*ptr++;
2602                 switch(opcode)
2603                 {
2604                         case TCPOPT_EOL:
2605                                 return;
2606                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
2607                                 length--;
2608                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
2609                                 continue;
2610                         
2611                         default:
2612                                 if(opsize<=2)   /* Avoid silly options looping forever */
2613                                         return;
2614                                 switch(opcode)
2615                                 {
2616                                         case TCPOPT_MSS:
2617                                                 if(opsize==4 && th->syn)
2618                                                 {
2619                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2620                                                         mss_seen = 1;
2621                                                 }
2622                                                 break;
2623                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2624                                 }
2625                                 ptr+=opsize-2;
2626                                 length-=opsize;
2627                 }
2628         }
2629         if (th->syn) 
2630         {
2631                 if (! mss_seen)
2632                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2633         }
2634 #ifdef CONFIG_INET_PCTCP
2635         sk->mss = min(sk->max_window >> 1, sk->mtu);
2636 #else    
2637         sk->mss = min(sk->max_window, sk->mtu);
2638 #endif  
2639 }
2640 
static inline unsigned long default_mask(unsigned long dst)
{
	/*
	 *	Classful netmask for an address in network byte order:
	 *	class A -> /8, class B -> /16, everything else -> /24.
	 */
	unsigned long host = ntohl(dst);

	if (IN_CLASSA(host))
		return htonl(IN_CLASSA_NET);

	return IN_CLASSB(host) ? htonl(IN_CLASSB_NET) : htonl(IN_CLASSC_NET);
}
2650 
2651 /*
2652  *      Default sequence number picking algorithm.
2653  *      As close as possible to RFC 793, which
2654  *      suggests using a 250kHz clock.
2655  *      Further reading shows this assumes 2MB/s networks.
2656  *      For 10MB/s ethernet, a 1MHz clock is appropriate.
2657  *      That's funny, Linux has one built in!  Use it!
2658  */
2659 
2660 extern inline unsigned long tcp_init_seq(void)
     /* [previous][next][first][last][top][bottom][index][help] */
2661 {
2662         struct timeval tv;
2663         do_gettimeofday(&tv);
2664         return tv.tv_usec+tv.tv_sec*1000000;
2665 }
2666 
2667 /*
2668  *      This routine handles a connection request.
2669  *      It should make sure we haven't already responded.
2670  *      Because of the way BSD works, we have to send a syn/ack now.
2671  *      This also means it will be harder to close a socket which is
2672  *      listening.
2673  */
2674  
static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev, unsigned long seq)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	struct device *ndev=NULL;
	int tmp;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead)
	{
		sk->data_ready(sk,0);
	}
	else
	{
		/* Dead listener: refuse with an RST and count the failure */
		if(sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * Make sure we can accept more.  This will prevent a
	 * flurry of syns from eating up all our memory.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * We need to build a new sock struct.
	 * It is sort of bad to have a socket without an inode attached
	 * to it, but the wake_up's will just wake up the listening socket,
	 * and if the listening socket is destroyed before this is taken
	 * off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn.  It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/* Clone the listener, then reset every per-connection field */
	memcpy(newsk, sk, sizeof(*newsk));
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;	/* slow start: one segment */
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	/* Their SYN consumed one sequence number */
	newsk->acked_seq = skb->h.th->seq+1;
	newsk->copied_seq = skb->h.th->seq+1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;		/* our freshly chosen ISS */
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger=0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function=&retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */

	newsk->daddr = saddr;
	newsk->saddr = daddr;

	put_sock(newsk->num,newsk);
	newsk->dummy_th.res1 = 0;
	newsk->dummy_th.doff = 6;
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	/* NOTE(review): acked_seq/copied_seq were already set identically
	   above — this repetition looks redundant; confirm before removing */
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them
	 */

	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk
	 */

	rt=ip_rt_route(saddr, NULL,NULL);

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	/* MTU preference order: user setting, route MSS, locality heuristic */
	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
		newsk->mtu = rt->rt_mss - HEADER_SIZE;
	else
	{
#ifdef CONFIG_INET_SNARL	/* Sub Nets Are Local */
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			newsk->mtu = 576 - HEADER_SIZE;	/* off-net: conservative */
		else
			newsk->mtu = MAX_WINDOW;	/* local: be generous */
	}

	/*
	 *	But not bigger than device MTU
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	This will min with what arrived in the packet
	 */

	tcp_options(newsk,skb->h.th);

	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		sk->err = ENOMEM;
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		/* And this will destroy it */
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len = sizeof(struct tcphdr)+4;	/* +4 for the MSS option */
	buff->sk = newsk;
	buff->localroute = newsk->localroute;

	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

	/*
	 *	Something went wrong.
	 */

	if (tmp < 0)
	{
		sk->err = tmp;
		buff->free = 1;
		kfree_skb(buff,FREE_WRITE);
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		release_sock(newsk);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);

	/* Build the SYN/ACK from a copy of their header, then fix fields */
	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->write_seq;
	/*
	 *	Swap the send and the receive.
	 */
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->write_seq++);	/* our SYN consumes a sequence number */
	t1->ack = 1;
	newsk->window = tcp_select_window(newsk);
	newsk->sent_seq = newsk->write_seq;
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->ack_seq = ntohl(skb->h.th->seq+1);
	t1->doff = sizeof(*t1)/4+1;		/* header + 4 option bytes */
	/* Append the MSS option: kind 2, length 4, 16-bit MSS */
	ptr =(unsigned char *)(t1+1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] =(newsk->mtu) & 0xff;

	tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
	reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
	skb->sk = newsk;

	/*
	 *	Charge the sock_buff to newsk.
	 */

	sk->rmem_alloc -= skb->mem_len;
	newsk->rmem_alloc += skb->mem_len;

	/* The SYN skb stays on the listener's queue until accept() */
	skb_queue_tail(&sk->receive_queue,skb);
	sk->ack_backlog++;
	release_sock(newsk);
	tcp_statistics.TcpOutSegs++;
}
2946 
2947 
static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 * We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */

	sk->inuse = 1;

	if(sk->state == TCP_LISTEN)
	{
		/* Special case: a listener just drops to CLOSE and reaps
		   any embryonic connections; no FIN is involved */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;	/* both directions shut */

	if (!sk->dead)
		sk->state_change(sk);

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 *  We need to flush the recv. buffs.  We do this only on the
		 *  descriptor close, not protocol-sourced closes, because the
		 *  reader process may not have drained the data yet!
		 */

		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *	Get rid off any half-completed packets.
		 */

		if (sk->partial)
			tcp_send_partial(sk);
	}


	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */

	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* Run the close state machine; it returns 1 when a FIN
		   must be transmitted from this end */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
3011 
3012 
3013 /*
3014  *      This routine takes stuff off of the write queue,
3015  *      and puts it in the xmit queue. This happens as incoming acks
3016  *      open up the remote window for us.
3017  */
3018  
static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->h.seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it. This
			 *	cannot currently occur.
			 */

			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
/*
 * put in the ack seq and window at this point rather than earlier,
 * in order to keep them monotonic.  We really want to avoid taking
 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
 * Ack and window will in general have changed since this packet was put
 * on the write queue.
 */
			/* Locate the TCP header inside the already-built frame */
			iph = (struct iphdr *)(skb->data +
					       skb->dev->hard_header_len);
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Checksum must be redone after patching ack/window */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3103 
3104 
3105 /*
3106  *      This routine deals with incoming acks, but not outgoing ones.
3107  */
3108 
3109 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
     /* [previous][next][first][last][top][bottom][index][help] */
3110 {
3111         unsigned long ack;
3112         int flag = 0;
3113 
3114         /* 
3115          * 1 - there was data in packet as well as ack or new data is sent or 
3116          *     in shutdown state
3117          * 2 - data from retransmit queue was acked and removed
3118          * 4 - window shrunk or data from retransmit queue was acked and removed
3119          */
3120 
3121         if(sk->zapped)
3122                 return(1);      /* Dead, cant ack any more so why bother */
3123 
3124         /*
3125          *      Have we discovered a larger window
3126          */
3127          
3128         ack = ntohl(th->ack_seq);
3129 
3130         if (ntohs(th->window) > sk->max_window) 
3131         {
3132                 sk->max_window = ntohs(th->window);
3133 #ifdef CONFIG_INET_PCTCP
3134                 /* Hack because we don't send partial packets to non SWS
3135                    handling hosts */
3136                 sk->mss = min(sk->max_window>>1, sk->mtu);
3137 #else
3138                 sk->mss = min(sk->max_window, sk->mtu);
3139 #endif  
3140         }
3141 
3142         /*
3143          *      We have dropped back to keepalive timeouts. Thus we have
3144          *      no retransmits pending.
3145          */
3146          
3147         if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3148                 sk->retransmits = 0;
3149 
3150         /*
3151          *      If the ack is newer than sent or older than previous acks
3152          *      then we can probably ignore it.
3153          */
3154          
3155         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
3156         {
3157                 if(sk->debug)
3158                         printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);
3159                         
3160                 /*
3161                  *      Keepalive processing.
3162                  */
3163                  
3164                 if (after(ack, sk->sent_seq)) 
3165                 {
3166                         return(0);
3167                 }
3168                 
3169                 /*
3170                  *      Restart the keepalive timer.
3171                  */
3172                  
3173                 if (sk->keepopen) 
3174                 {
3175                         if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3176                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3177                 }
3178                 return(1);
3179         }
3180 
3181         /*
3182          *      If there is data set flag 1
3183          */
3184          
3185         if (len != th->doff*4) 
3186                 flag |= 1;
3187 
3188         /*
3189          *      See if our window has been shrunk. 
3190          */
3191 
3192         if (after(sk->window_seq, ack+ntohs(th->window))) 
3193         {
3194                 /*
3195                  * We may need to move packets from the send queue
3196                  * to the write queue, if the window has been shrunk on us.
3197                  * The RFC says you are not allowed to shrink your window
3198                  * like this, but if the other end does, you must be able
3199                  * to deal with it.
3200                  */
3201                 struct sk_buff *skb;
3202                 struct sk_buff *skb2;
3203                 struct sk_buff *wskb = NULL;
3204         
3205                 skb2 = sk->send_head;
3206                 sk->send_head = NULL;
3207                 sk->send_tail = NULL;
3208         
3209                 /*
3210                  *      This is an artifact of a flawed concept. We want one
3211                  *      queue and a smarter send routine when we send all.
3212                  */
3213         
3214                 flag |= 4;      /* Window changed */
3215         
3216                 sk->window_seq = ack + ntohs(th->window);
3217                 cli();
3218                 while (skb2 != NULL) 
3219                 {
3220                         skb = skb2;
3221                         skb2 = skb->link3;
3222                         skb->link3 = NULL;
3223                         if (after(skb->h.seq, sk->window_seq)) 
3224                         {
3225                                 if (sk->packets_out > 0) 
3226                                         sk->packets_out--;
3227                                 /* We may need to remove this from the dev send list. */
3228                                 if (skb->next != NULL) 
3229                                 {
3230                                         skb_unlink(skb);                                
3231                                 }
3232                                 /* Now add it to the write_queue. */
3233                                 if (wskb == NULL)
3234                                         skb_queue_head(&sk->write_queue,skb);
3235                                 else
3236                                         skb_append(wskb,skb);
3237                                 wskb = skb;
3238                         } 
3239                         else 
3240                         {
3241                                 if (sk->send_head == NULL) 
3242                                 {
3243                                         sk->send_head = skb;
3244                                         sk->send_tail = skb;
3245                                 }
3246                                 else
3247                                 {
3248                                         sk->send_tail->link3 = skb;
3249                                         sk->send_tail = skb;
3250                                 }
3251                                 skb->link3 = NULL;
3252                         }
3253                 }
3254                 sti();
3255         }
3256 
3257         /*
3258          *      Pipe has emptied
3259          */
3260          
3261         if (sk->send_tail == NULL || sk->send_head == NULL) 
3262         {
3263                 sk->send_head = NULL;
3264                 sk->send_tail = NULL;
3265                 sk->packets_out= 0;
3266         }
3267 
3268         /*
3269          *      Update the right hand window edge of the host
3270          */
3271          
3272         sk->window_seq = ack + ntohs(th->window);
3273 
3274         /*
3275          *      We don't want too many packets out there. 
3276          */
3277          
3278         if (sk->ip_xmit_timeout == TIME_WRITE && 
3279                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
3280         {
3281                 /* 
3282                  * This is Jacobson's slow start and congestion avoidance. 
3283                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
3284                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
3285                  * counter and increment it once every cwnd times.  It's possible
3286                  * that this should be done only if sk->retransmits == 0.  I'm
3287                  * interpreting "new data is acked" as including data that has
3288                  * been retransmitted but is just now being acked.
3289                  */
3290                 if (sk->cong_window < sk->ssthresh)  
3291                         /* 
3292                          *      In "safe" area, increase
3293                          */
3294                         sk->cong_window++;
3295                 else 
3296                 {
3297                         /*
3298                          *      In dangerous area, increase slowly.  In theory this is
3299                          *      sk->cong_window += 1 / sk->cong_window
3300                          */
3301                         if (sk->cong_count >= sk->cong_window) 
3302                         {
3303                                 sk->cong_window++;
3304                                 sk->cong_count = 0;
3305                         }
3306                         else 
3307                                 sk->cong_count++;
3308                 }
3309         }
3310 
3311         /*
3312          *      Remember the highest ack received.
3313          */
3314          
3315         sk->rcv_ack_seq = ack;
3316 
3317         /*
3318          *      If this ack opens up a zero window, clear backoff.  It was
3319          *      being used to time the probes, and is probably far higher than
3320          *      it needs to be for normal retransmission.
3321          */
3322 
3323         if (sk->ip_xmit_timeout == TIME_PROBE0) 
3324         {
3325                 sk->retransmits = 0;    /* Our probe was answered */
3326                 
3327                 /*
3328                  *      Was it a usable window open ?
3329                  */
3330                  
3331                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
3332                     ! before (sk->window_seq, sk->write_queue.next->h.seq)) 
3333                 {
3334                         sk->backoff = 0;
3335                         
3336                         /*
3337                          *      Recompute rto from rtt.  this eliminates any backoff.
3338                          */
3339 
3340                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3341                         if (sk->rto > 120*HZ)
3342                                 sk->rto = 120*HZ;
3343                         if (sk->rto < 20)       /* Was 1*HZ, then 1 - turns out we must allow about
3344                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3345                                                    .2 of a second is going to need huge windows (SIGH) */
3346                         sk->rto = 20;
3347                 }
3348         }
3349 
3350         /* 
3351          *      See if we can take anything off of the retransmit queue.
3352          */
3353    
3354         while(sk->send_head != NULL) 
3355         {
3356                 /* Check for a bug. */
3357                 if (sk->send_head->link3 &&
3358                     after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) 
3359                         printk("INET: tcp.c: *** bug send_list out of order.\n");
3360                         
3361                 /*
3362                  *      If our packet is before the ack sequence we can
3363                  *      discard it as it's confirmed to have arrived the other end.
3364                  */
3365                  
3366                 if (before(sk->send_head->h.seq, ack+1)) 
3367                 {
3368                         struct sk_buff *oskb;   
3369                         if (sk->retransmits) 
3370                         {       
3371                                 /*
3372                                  *      We were retransmitting.  don't count this in RTT est 
3373                                  */
3374                                 flag |= 2;
3375 
3376                                 /*
3377                                  * even though we've gotten an ack, we're still
3378                                  * retransmitting as long as we're sending from
3379                                  * the retransmit queue.  Keeping retransmits non-zero
3380                                  * prevents us from getting new data interspersed with
3381                                  * retransmissions.
3382                                  */
3383 
3384                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
3385                                         sk->retransmits = 1;
3386                                 else
3387                                         sk->retransmits = 0;
3388                         }
3389                         /*
3390                          * Note that we only reset backoff and rto in the
3391                          * rtt recomputation code.  And that doesn't happen
3392                          * if there were retransmissions in effect.  So the
3393                          * first new packet after the retransmissions is
3394                          * sent with the backoff still in effect.  Not until
3395                          * we get an ack from a non-retransmitted packet do
3396                          * we reset the backoff and rto.  This allows us to deal
3397                          * with a situation where the network delay has increased
3398                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3399                          */
3400 
3401                         /*
3402                          *      We have one less packet out there. 
3403                          */
3404                          
3405                         if (sk->packets_out > 0) 
3406                                 sk->packets_out --;
3407                         /* 
3408                          *      Wake up the process, it can probably write more. 
3409                          */
3410                         if (!sk->dead) 
3411                                 sk->write_space(sk);
3412                         oskb = sk->send_head;
3413 
3414                         if (!(flag&2))  /* Not retransmitting */
3415                         {
3416                                 long m;
3417         
3418                                 /*
3419                                  *      The following amusing code comes from Jacobson's
3420                                  *      article in SIGCOMM '88.  Note that rtt and mdev
3421                                  *      are scaled versions of rtt and mean deviation.
3422                                  *      This is designed to be as fast as possible 
3423                                  *      m stands for "measurement".
3424                                  */
3425         
3426                                 m = jiffies - oskb->when;  /* RTT */
3427                                 if(m<=0)
3428                                         m=1;            /* IS THIS RIGHT FOR <0 ??? */
3429                                 m -= (sk->rtt >> 3);    /* m is now error in rtt est */
3430                                 sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
3431                                 if (m < 0)
3432                                         m = -m;         /* m is now abs(error) */
3433                                 m -= (sk->mdev >> 2);   /* similar update on mdev */
3434                                 sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
3435         
3436                                 /*
3437                                  *      Now update timeout.  Note that this removes any backoff.
3438                                  */
3439                          
3440                                 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3441                                 if (sk->rto > 120*HZ)
3442                                         sk->rto = 120*HZ;
3443                                 if (sk->rto < 20)       /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3444                                         sk->rto = 20;
3445                                 sk->backoff = 0;
3446                         }
3447                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
3448                                            In this case as we just set it up */
3449                         cli();
3450                         oskb = sk->send_head;
3451                         IS_SKB(oskb);
3452                         sk->send_head = oskb->link3;
3453                         if (sk->send_head == NULL) 
3454                         {
3455                                 sk->send_tail = NULL;
3456                         }
3457 
3458                 /*
3459                  *      We may need to remove this from the dev send list. 
3460                  */
3461 
3462                         if (oskb->next)
3463                                 skb_unlink(oskb);
3464                         sti();
3465                         kfree_skb(oskb, FREE_WRITE); /* write. */
3466                         if (!sk->dead) 
3467                                 sk->write_space(sk);
3468                 }
3469                 else
3470                 {
3471                         break;
3472                 }
3473         }
3474 
3475         /*
3476          * XXX someone ought to look at this too.. at the moment, if skb_peek()
3477          * returns non-NULL, we complete ignore the timer stuff in the else
3478          * clause.  We ought to organize the code so that else clause can
3479          * (should) be executed regardless, possibly moving the PROBE timer
3480          * reset over.  The skb_peek() thing should only move stuff to the
3481          * write queue, NOT also manage the timer functions.
3482          */
3483 
3484         /*
3485          * Maybe we can take some stuff off of the write queue,
3486          * and put it onto the xmit queue.
3487          */
3488         if (skb_peek(&sk->write_queue) != NULL) 
3489         {
3490                 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3491                         (sk->retransmits == 0 || 
3492                          sk->ip_xmit_timeout != TIME_WRITE ||
3493                          before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3494                         && sk->packets_out < sk->cong_window) 
3495                 {
3496                         /*
3497                          *      Add more data to the send queue.
3498                          */
3499                         flag |= 1;
3500                         tcp_write_xmit(sk);
3501                 }
3502                 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3503                         sk->send_head == NULL &&
3504                         sk->ack_backlog == 0 &&
3505                         sk->state != TCP_TIME_WAIT) 
3506                 {
3507                         /*
3508                          *      Data to queue but no room.
3509                          */
3510                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3511                 }               
3512         }
3513         else
3514         {
3515                 /*
3516                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3517                  * from TCP_CLOSE we don't do anything
3518                  *
3519                  * from anything else, if there is write data (or fin) pending,
3520                  * we use a TIME_WRITE timeout, else if keepalive we reset to
3521                  * a KEEPALIVE timeout, else we delete the timer.
3522                  *
3523                  * We do not set flag for nominal write data, otherwise we may
3524                  * force a state where we start to write itsy bitsy tidbits
3525                  * of data.
3526                  */
3527 
3528                 switch(sk->state) {
3529                 case TCP_TIME_WAIT:
3530                         /*
3531                          * keep us in TIME_WAIT until we stop getting packets,
3532                          * reset the timeout.
3533                          */
3534                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3535                         break;
3536                 case TCP_CLOSE:
3537                         /*
3538                          * don't touch the timer.
3539                          */
3540                         break;
3541                 default:
3542                         /*
3543                          *      Must check send_head, write_queue, and ack_backlog
3544                          *      to determine which timeout to use.
3545                          */
3546                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3547                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3548                         } else if (sk->keepopen) {
3549                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3550                         } else {
3551                                 del_timer(&sk->retransmit_timer);
3552                                 sk->ip_xmit_timeout = 0;
3553                         }
3554                         break;
3555                 }
3556         }
3557 
3558         /*
3559          *      We have nothing queued but space to send. Send any partial
3560          *      packets immediately (end of Nagle rule application).
3561          */
3562          
3563         if (sk->packets_out == 0 && sk->partial != NULL &&
3564                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
3565         {
3566                 flag |= 1;
3567                 tcp_send_partial(sk);
3568         }
3569 
3570         /*
3571          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
3572          * we are now waiting for an acknowledge to our FIN.  The other end is
3573          * already in TIME_WAIT.
3574          *
3575          * Move to TCP_CLOSE on success.
3576          */
3577 
3578         if (sk->state == TCP_LAST_ACK) 
3579         {
3580                 if (!sk->dead)
3581                         sk->state_change(sk);
3582                 if(sk->debug)
3583                         printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
3584                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3585                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
3586                 {
3587                         flag |= 1;
3588                         tcp_set_state(sk,TCP_CLOSE);
3589                         sk->shutdown = SHUTDOWN_MASK;
3590                 }
3591         }
3592 
3593         /*
3594          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
3595          *
3596          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
3597          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3598          */
3599 
3600         if (sk->state == TCP_FIN_WAIT1) 
3601         {
3602 
3603                 if (!sk->dead) 
3604                         sk->state_change(sk);
3605                 if (sk->rcv_ack_seq == sk->write_seq) 
3606                 {
3607                         flag |= 1;
3608                         sk->shutdown |= SEND_SHUTDOWN;
3609                         tcp_set_state(sk, TCP_FIN_WAIT2);
3610                 }
3611         }
3612 
3613         /*
3614          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
3615          *
3616          *      Move to TIME_WAIT
3617          */
3618 
3619         if (sk->state == TCP_CLOSING) 
3620         {
3621 
3622                 if (!sk->dead) 
3623                         sk->state_change(sk);
3624                 if (sk->rcv_ack_seq == sk->write_seq) 
3625                 {
3626                         flag |= 1;
3627                         tcp_time_wait(sk);
3628                 }
3629         }
3630         
3631         /*
3632          *      Final ack of a three way shake 
3633          */
3634          
3635         if(sk->state==TCP_SYN_RECV)
3636         {
3637                 tcp_set_state(sk, TCP_ESTABLISHED);
3638                 tcp_options(sk,th);
3639                 sk->dummy_th.dest=th->source;
3640                 sk->copied_seq = sk->acked_seq;
3641                 if(!sk->dead)
3642                         sk->state_change(sk);
3643                 if(sk->max_window==0)
3644                 {
3645                         sk->max_window=32;      /* Sanity check */
3646                         sk->mss=min(sk->max_window,sk->mtu);
3647                 }
3648         }
3649         
3650         /*
3651          * I make no guarantees about the first clause in the following
3652          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
3653          * what conditions "!flag" would be true.  However I think the rest
3654          * of the conditions would prevent that from causing any
3655          * unnecessary retransmission. 
3656          *   Clearly if the first packet has expired it should be 
3657          * retransmitted.  The other alternative, "flag&2 && retransmits", is
3658          * harder to explain:  You have to look carefully at how and when the
3659          * timer is set and with what timeout.  The most recent transmission always
3660          * sets the timer.  So in general if the most recent thing has timed
3661          * out, everything before it has as well.  So we want to go ahead and
3662          * retransmit some more.  If we didn't explicitly test for this
3663          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3664          * would not be true.  If you look at the pattern of timing, you can
3665          * show that rto is increased fast enough that the next packet would
3666          * almost never be retransmitted immediately.  Then you'd end up
3667          * waiting for a timeout to send each packet on the retransmission
3668          * queue.  With my implementation of the Karn sampling algorithm,
3669          * the timeout would double each time.  The net result is that it would
3670          * take a hideous amount of time to recover from a single dropped packet.
3671          * It's possible that there should also be a test for TIME_WRITE, but
3672          * I think as long as "send_head != NULL" and "retransmit" is on, we've
3673          * got to be in real retransmission mode.
3674          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
3675          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3676          * As long as no further losses occur, this seems reasonable.
3677          */
3678         
3679         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3680                (((flag&2) && sk->retransmits) ||
3681                (sk->send_head->when + sk->rto < jiffies))) 
3682         {
3683                 if(sk->send_head->when + sk->rto < jiffies)
3684                         tcp_retransmit(sk,0);   
3685                 else
3686                 {
3687                         tcp_do_retransmit(sk, 1);
3688                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3689                 }
3690         }
3691 
3692         return(1);
3693 }
3694 
3695 
3696 /*
3697  *      Process the FIN bit. This now behaves as it is supposed to work
3698  *      and the FIN takes effect when it is validly part of sequence
3699  *      space. Not before when we get holes.
3700  *
3701  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3702  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
3703  *      TIME-WAIT)
3704  *
3705  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
3706  *      close and we go into CLOSING (and later onto TIME-WAIT)
3707  *
3708  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3709  *
3710  */
3711  
3712 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
3713 {
3714         sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3715 
3716         if (!sk->dead) 
3717         {
3718                 sk->state_change(sk);
3719                 sock_wake_async(sk->socket, 1);
3720         }
3721 
3722         switch(sk->state) 
3723         {
3724                 case TCP_SYN_RECV:
3725                 case TCP_SYN_SENT:
3726                 case TCP_ESTABLISHED:
3727                         /*
3728                          * move to CLOSE_WAIT, tcp_data() already handled
3729                          * sending the ack.
3730                          */
3731                         tcp_set_state(sk,TCP_CLOSE_WAIT);
3732                         if (th->rst)
3733                                 sk->shutdown = SHUTDOWN_MASK;
3734                         break;
3735 
3736                 case TCP_CLOSE_WAIT:
3737                 case TCP_CLOSING:
3738                         /*
3739                          * received a retransmission of the FIN, do
3740                          * nothing.
3741                          */
3742                         break;
3743                 case TCP_TIME_WAIT:
3744                         /*
3745                          * received a retransmission of the FIN,
3746                          * restart the TIME_WAIT timer.
3747                          */
3748                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3749                         return(0);
3750                 case TCP_FIN_WAIT1:
3751                         /*
3752                          * This case occurs when a simultaneous close
3753                          * happens, we must ack the received FIN and
3754                          * enter the CLOSING state.
3755                          *
3756                          * This causes a WRITE timeout, which will either
3757                          * move on to TIME_WAIT when we timeout, or resend
3758                          * the FIN properly (maybe we get rid of that annoying
3759                          * FIN lost hang). The TIME_WRITE code is already correct
3760                          * for handling this timeout.
3761                          */
3762 
3763                         if(sk->ip_xmit_timeout != TIME_WRITE)
3764                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3765                         tcp_set_state(sk,TCP_CLOSING);
3766                         break;
3767                 case TCP_FIN_WAIT2:
3768                         /*
3769                          * received a FIN -- send ACK and enter TIME_WAIT
3770                          */
3771                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3772                         sk->shutdown|=SHUTDOWN_MASK;
3773                         tcp_set_state(sk,TCP_TIME_WAIT);
3774                         break;
3775                 case TCP_CLOSE:
3776                         /*
3777                          * already in CLOSE
3778                          */
3779                         break;
3780                 default:
3781                         tcp_set_state(sk,TCP_LAST_ACK);
3782         
3783                         /* Start the timers. */
3784                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3785                         return(0);
3786         }
3787 
3788         return(0);
3789 }
3790 
3791 
3792 
3793 /*
3794  *      This routine handles the data.  If there is room in the buffer,
3795  *      it will be have already been moved into it.  If there is no
3796  *      room, then we will just have to discard the packet.
3797  */
3798 
/*
 *	Queue an incoming data segment on sk->receive_queue in sequence
 *	order, mark everything that is now contiguous as acked (a FIN
 *	takes effect when it is acked), and send or schedule an ack.
 *	Always returns 0; the skb is either queued here or freed here.
 */
extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
	 unsigned long saddr, unsigned short len)
{
	struct sk_buff *skb1, *skb2;
	struct tcphdr *th;
	int dup_dumped=0;
	unsigned long new_seq;
	unsigned long shut_seq;

	th = skb->h.th;
	/* Trim skb->len down to the TCP payload only (len covers the TCP
	   header as well on entry). */
	skb->len = len -(th->doff*4);

	/*
	 *	The bytes in the receive read/assembly queue has increased. Needed for the
	 *	low memory discard algorithm 
	 */
	   
	sk->bytes_rcv += skb->len;
	
	if (skb->len == 0 && !th->fin) 
	{
		/* 
		 *	Don't want to keep passing ack's back and forth. 
		 *	(someone sent us dataless, boring frame)
		 */
		if (!th->ack)
			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
		kfree_skb(skb, FREE_READ);
		return(0);
	}
	
	/*
	 *	We no longer have anyone receiving data on this connection.
	 */

#ifndef TCP_DONT_RST_SHUTDOWN

	if(sk->shutdown & RCV_SHUTDOWN)
	{
		/*
		 *	FIXME: BSD has some magic to avoid sending resets to
		 *	broken 4.2 BSD keepalives. Much to my surprise a few non
		 *	BSD stacks still have broken keepalives so we want to
		 *	cope with it.
		 */

		if(skb->len)	/* We don't care if it's just an ack or
				   a keepalive/window probe */
		{
			new_seq= th->seq + skb->len + th->syn;	/* Right edge of _data_ part of frame */
			
			/* Do this the way 4.4BSD treats it. Not what I'd
			   regard as the meaning of the spec but it's what BSD
			   does and clearly they know everything 8) */

			/*
			 *	This is valid because of two things
			 *
			 *	a) The way tcp_data behaves at the bottom.
			 *	b) A fin takes effect when read not when received.
			 */
			 
			shut_seq=sk->acked_seq+1;	/* Last byte */
			
			if(after(new_seq,shut_seq))
			{
				if(sk->debug)
					printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
						sk, new_seq, shut_seq, sk->blog);
				if(sk->dead)
				{
					/* New data after the receiver shut down and
					   nobody can read it: kill the connection
					   with a RST and mark the socket closed. */
					sk->acked_seq = new_seq + th->fin;
					tcp_reset(sk->saddr, sk->daddr, skb->h.th,
						sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
					tcp_statistics.TcpEstabResets++;
					tcp_set_state(sk,TCP_CLOSE);
					sk->err = EPIPE;
					sk->shutdown = SHUTDOWN_MASK;
					kfree_skb(skb, FREE_READ);
					return 0;
				}
			}
		}
	}

#endif

	/*
	 *	Now we have to walk the chain, and figure out where this one
	 *	goes into it.  This is set up so that the last packet we received
	 *	will be the first one we look at, that way if everything comes
	 *	in order, there will be no performance loss, and if they come
	 *	out of order we will be able to fit things in nicely.
	 *
	 *	[AC: This is wrong. We should assume in order first and then walk
	 *	 forwards from the first hole based upon real traffic patterns.]
	 *	
	 */

	if (skb_peek(&sk->receive_queue) == NULL)	/* Empty queue is easy case */
	{
		skb_queue_head(&sk->receive_queue,skb);
		skb1= NULL;
	} 
	else
	{
		/* Walk backwards from the newest queued frame looking for
		   the insertion point. */
		for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
		{
			if(sk->debug)
			{
				printk("skb1=%p :", skb1);
				printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
				printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
				printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
						sk->acked_seq);
			}
			
			/*
			 *	Optimisation: Duplicate frame or extension of previous frame from
			 *	same sequence point (lost ack case).
			 *	The frame contains duplicate data or replaces a previous frame
			 *	discard the previous frame (safe as sk->inuse is set) and put
			 *	the new one in its place.
			 */
			 
			if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
			{
				skb_append(skb1,skb);
				skb_unlink(skb1);
				kfree_skb(skb1,FREE_READ);
				dup_dumped=1;
				skb1=NULL;
				break;
			}
			
			/*
			 *	Found where it fits
			 */
			 
			if (after(th->seq+1, skb1->h.th->seq))
			{
				skb_append(skb1,skb);
				break;
			}
			
			/*
			 *	See if we've hit the start. If so insert.
			 */
			if (skb1 == skb_peek(&sk->receive_queue))
			{
				skb_queue_head(&sk->receive_queue, skb);
				break;
			}
		}
	}

	/*
	 *	Figure out what the ack value for this frame is
	 */
	 
	th->ack_seq = th->seq + skb->len;
	if (th->syn) 
		th->ack_seq++;	/* SYN occupies one sequence number */
	if (th->fin)
		th->ack_seq++;	/* FIN occupies one sequence number */

	if (before(sk->acked_seq, sk->copied_seq)) 
	{
		printk("*** tcp.c:tcp_data bug acked < copied\n");
		sk->acked_seq = sk->copied_seq;
	}

	/*
	 *	Now figure out if we can ack anything. This is very messy because we really want two
	 *	receive queues, a completed and an assembly queue. We also want only one transmit
	 *	queue.
	 */

	/* The frame can advance acked_seq if it landed at the head of the
	   queue (skb1 == NULL), extends an already-acked predecessor, or
	   begins at/before the current ack point. */
	if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) 
	{
		if (before(th->seq, sk->acked_seq+1)) 
		{
			int newwindow;

			if (after(th->ack_seq, sk->acked_seq)) 
			{
				/* Shrink the advertised window by the newly
				   acked bytes, clamping at zero. */
				newwindow = sk->window-(th->ack_seq - sk->acked_seq);
				if (newwindow < 0)
					newwindow = 0;	
				sk->window = newwindow;
				sk->acked_seq = th->ack_seq;
			}
			skb->acked = 1;

			/*
			 *	When we ack the fin, we do the FIN 
			 *	processing.
			 */

			if (skb->h.th->fin) 
			{
				tcp_fin(skb,sk,skb->h.th);
			}
	  
			/* Sweep forward over queued out-of-order frames that
			   have now become contiguous and ack them too. */
			for(skb2 = skb->next;
			    skb2 != (struct sk_buff *)&sk->receive_queue;
			    skb2 = skb2->next) 
			{
				if (before(skb2->h.th->seq, sk->acked_seq+1)) 
				{
					if (after(skb2->h.th->ack_seq, sk->acked_seq))
					{
						newwindow = sk->window -
						 (skb2->h.th->ack_seq - sk->acked_seq);
						if (newwindow < 0)
							newwindow = 0;	
						sk->window = newwindow;
						sk->acked_seq = skb2->h.th->ack_seq;
					}
					skb2->acked = 1;
					/*
					 *	When we ack the fin, we do
					 *	the fin handling.
					 */
					if (skb2->h.th->fin) 
					{
						/* NOTE(review): this passes skb/skb->h.th
						   although it is skb2's FIN being acked -
						   looks like it should be skb2/skb2->h.th.
						   Confirm against tcp_fin() before changing. */
						tcp_fin(skb,sk,skb->h.th);
					}

					/*
					 *	Force an immediate ack.
					 */
					 
					sk->ack_backlog = sk->max_ack_backlog;
				}
				else
				{
					break;	/* Hit the next hole - stop */
				}
			}

			/*
			 *	This also takes care of updating the window.
			 *	This if statement needs to be simplified.
			 */
			if (!sk->delay_acks ||
			    sk->ack_backlog >= sk->max_ack_backlog || 
			    sk->bytes_rcv > sk->max_unacked || th->fin) {
	/*			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
			}
			else 
			{
				/* Delay the ack: bump the backlog and arm a
				   timer so it goes out within TCP_ACK_TIME. */
				sk->ack_backlog++;
				if(sk->debug)
					printk("Ack queued.\n");
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
		}
	}

	/*
	 *	If we've missed a packet, send an ack.
	 *	Also start a timer to send another.
	 */
	 
	if (!skb->acked) 
	{
	
	/*
	 *	This is important.  If we don't have much room left,
	 *	we need to throw out a few packets so we have a good
	 *	window.  Note that mtu is used, not mss, because mss is really
	 *	for the send side.  He could be sending us stuff as large as mtu.
	 */
		 
		while (sk->prot->rspace(sk) < sk->mtu) 
		{
			skb1 = skb_peek(&sk->receive_queue);
			if (skb1 == NULL) 
			{
				printk("INET: tcp.c:tcp_data memory leak detected.\n");
				break;
			}

			/*
			 *	Don't throw out something that has been acked. 
			 */
		 
			if (skb1->acked) 
			{
				break;
			}
		
			skb_unlink(skb1);
			kfree_skb(skb1, FREE_READ);
		}
		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
		sk->ack_backlog++;
		reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
	}
	else
	{
		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
	}

	/*
	 *	Now tell the user we may have some data. 
	 */
	 
	if (!sk->dead) 
	{
		if(sk->debug)
			printk("Data wakeup.\n");
		sk->data_ready(sk,0);
	} 
	return(0);
}
4116 
4117 
4118 /*
4119  *      This routine is only called when we have urgent data
4120  *      signalled. Its the 'slow' part of tcp_urg. It could be
4121  *      moved inline now as tcp_urg is only called from one
4122  *      place. We handle URGent data wrong. We have to - as
4123  *      BSD still doesn't use the correction from RFC961.
4124  */
4125  
4126 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* [previous][next][first][last][top][bottom][index][help] */
4127 {
4128         unsigned long ptr = ntohs(th->urg_ptr);
4129 
4130         if (ptr)
4131                 ptr--;
4132         ptr += th->seq;
4133 
4134         /* ignore urgent data that we've already seen and read */
4135         if (after(sk->copied_seq, ptr))
4136                 return;
4137 
4138         /* do we already have a newer (or duplicate) urgent pointer? */
4139         if (sk->urg_data && !after(ptr, sk->urg_seq))
4140                 return;
4141 
4142         /* tell the world about our new urgent pointer */
4143         if (sk->proc != 0) {
4144                 if (sk->proc > 0) {
4145                         kill_proc(sk->proc, SIGURG, 1);
4146                 } else {
4147                         kill_pg(-sk->proc, SIGURG, 1);
4148                 }
4149         }
4150         sk->urg_data = URG_NOTYET;
4151         sk->urg_seq = ptr;
4152 }
4153 
4154 /*
4155  *      This is the 'fast' part of urgent handling.
4156  */
4157  
4158 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
     /* [previous][next][first][last][top][bottom][index][help] */
4159         unsigned long saddr, unsigned long len)
4160 {
4161         unsigned long ptr;
4162 
4163         /*
4164          *      Check if we get a new urgent pointer - normally not 
4165          */
4166          
4167         if (th->urg)
4168                 tcp_check_urg(sk,th);
4169 
4170         /*
4171          *      Do we wait for any urgent data? - normally not
4172          */
4173          
4174         if (sk->urg_data != URG_NOTYET)
4175                 return 0;
4176 
4177         /*
4178          *      Is the urgent pointer pointing into this packet? 
4179          */
4180          
4181         ptr = sk->urg_seq - th->seq + th->doff*4;
4182         if (ptr >= len)
4183                 return 0;
4184 
4185         /*
4186          *      Ok, got the correct packet, update info 
4187          */
4188          
4189         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4190         if (!sk->dead)
4191                 sk->data_ready(sk,0);
4192         return 0;
4193 }
4194 
4195 /*
4196  *      This will accept the next outstanding connection. 
4197  */
4198  
/*
 *	Accept the next established connection queued on a listening
 *	socket.  Blocks unless O_NONBLOCK is set; on failure returns
 *	NULL with the reason in sk->err (EINVAL, EAGAIN or ERESTARTSYS).
 *	On success returns the new connection's sock.
 */
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;
  
  /*
   * We need to make sure that this socket is listening,
   * and that it has something pending.
   */

	if (sk->state != TCP_LISTEN) 
	{
		sk->err = EINVAL;
		return(NULL); 
	}

	/* Avoid the race. */
	/* Interrupts stay off until we either hold the socket with a
	   pending connection or decide to sleep, so an arriving
	   connection cannot slip between the dequeue and the sleep. */
	cli();
	sk->inuse = 1;

	while((skb = tcp_dequeue_established(sk)) == NULL) 
	{
		if (flags & O_NONBLOCK) 
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Let the bottom half run, then wait for a connection. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked) 
		{
			/* Interrupted by a signal - let the caller restart. */
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		/* Re-grab the socket before retrying the dequeue. */
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk. 
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
4252 
4253 
4254 /*
4255  *      This will initiate an outgoing connection. 
4256  */
4257  
/*
 *	Initiate an active open: validate the destination, pick an initial
 *	sequence number, build and transmit the SYN (with an MSS option),
 *	move the socket to SYN_SENT and arm the retransmit timer.
 *	Returns 0 or a negative errno (-EISCONN, -EINVAL, -EAFNOSUPPORT,
 *	-ENETUNREACH, -ENOMEM).
 */
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE) 
	{
		return(-EISCONN);
	}
	
	if (addr_len < 8) 
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET) 
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */
	
	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();
		  
	/*
	 *	Don't want a TCP connection going to a broadcast address 
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
		return -ENETUNREACH;
  
	/* Initialise the connection's sequence-number state. */
	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	/* Release before a possibly-sleeping allocation. */
	release_sock(sk);

	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL) 
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->len = 24;		/* TCP header (20) + 4 bytes of MSS option */
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;
	
	t1 = (struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff. 
	 */
	 
	rt=ip_rt_route(sk->daddr, NULL, NULL);
	

	/*
	 *	We need to build the routing stuff from the things saved in skb. 
	 */

	/* On success this fills in dev with the output device. */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0) 
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);

	/* Start from the socket's template header, then fill in the
	   SYN-specific fields. */
	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	/* ntohl here performs the same byte swap as htonl would;
	   NOTE(review): htonl is the conventional direction. */
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;		/* Tiny initial window until the handshake completes */
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;		/* 6 words = 20-byte header + 4 bytes of options */
	/* use 512 or whatever user asked for */
	
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* Choose the MSS: explicit user setting, then route metric, then
	   a guess based on whether the destination is on our subnet. */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
		sk->mtu = rt->rt_mss;
	else 
	{
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - HEADER_SIZE;	/* Off-net: classic default */
		else
			sk->mtu = MAX_WINDOW;
	}
	/*
	 *	but not bigger than device MTU 
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */
		
	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
	
	/*
	 *	Put in the TCP options to say MTU. 
	 */

	/* Option kind 2 (MSS), length 4, value in network byte order. */
	ptr = (unsigned char *)(t1+1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will get reset. 
	 */

	tcp_set_state(sk,TCP_SYN_SENT);
	sk->rto = TCP_TIMEOUT_INIT;
#if 0 /* we already did this */
	init_timer(&sk->retransmit_timer); 
#endif
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = TCP_SYN_RETRIES;

	sk->prot->queue_xmit(sk, dev, buff, 0);  
	/* NOTE(review): the timer was already armed just above; this
	   second reset appears redundant but is kept as-is. */
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;
  
	release_sock(sk);
	return(0);
}
4417 
4418 
4419 /* This functions checks to see if the tcp header is actually acceptable. */
4420 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /* [previous][next][first][last][top][bottom][index][help] */
4421              struct options *opt, unsigned long saddr, struct device *dev)
4422 {
4423         unsigned long next_seq;
4424 
4425         next_seq = len - 4*th->doff;
4426         if (th->fin)
4427                 next_seq++;
4428         /* if we have a zero window, we can't have any data in the packet.. */
4429         if (next_seq && !sk->window)
4430                 goto ignore_it;
4431         next_seq += th->seq;
4432 
4433         /*
4434          * This isn't quite right.  sk->acked_seq could be more recent
4435          * than sk->window.  This is however close enough.  We will accept
4436          * slightly more packets than we should, but it should not cause
4437          * problems unless someone is trying to forge packets.
4438          */
4439 
4440         /* have we already seen all of this packet? */
4441         if (!after(next_seq+1, sk->acked_seq))
4442                 goto ignore_it;
4443         /* or does it start beyond the window? */
4444         if (!before(th->seq, sk->acked_seq + sk->window + 1))
4445                 goto ignore_it;
4446 
4447         /* ok, at least part of this packet would seem interesting.. */
4448         return 1;
4449 
4450 ignore_it:
4451         if (th->rst)
4452                 return 0;
4453 
4454         /*
4455          *      Send a reset if we get something not ours and we are
4456          *      unsynchronized. Note: We don't do anything to our end. We
4457          *      are just killing the bogus remote connection then we will
4458          *      connect again and it will work (with luck).
4459          */
4460          
4461         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
4462         {
4463                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4464                 return 1;
4465         }
4466 
4467         /* Try to resync things. */
4468         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4469         return 0;
4470 }
4471 
4472 /*
4473  *      When we get a reset we do this.
4474  */
4475 
/*
 *	Standard handling for a received RST: record the appropriate error
 *	on the socket, move it to CLOSE (subject to the RFC1337 TIME-WAIT
 *	protection when configured), wake the owner, and free the frame.
 *	Always returns 0.
 */
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;		/* Connection has been reset - no more output */
	sk->err = ECONNRESET;
	/* A RST in SYN_SENT means our connection attempt was refused. */
	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	/* A RST in CLOSE_WAIT means the peer gave up on our pending data. */
	if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
#ifdef TCP_DO_RFC1337		
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{	
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else	
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif	
	if (!sk->dead) 
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}
4503 
4504 /*
4505  *      A TCP packet has arrived.
4506  */
4507  
4508 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /* [previous][next][first][last][top][bottom][index][help] */
4509         unsigned long daddr, unsigned short len,
4510         unsigned long saddr, int redo, struct inet_protocol * protocol)
4511 {
4512         struct tcphdr *th;
4513         struct sock *sk;
4514         int syn_ok=0;
4515         
4516         if (!skb) 
4517         {
4518                 printk("IMPOSSIBLE 1\n");
4519                 return(0);
4520         }
4521 
4522         if (!dev) 
4523         {
4524                 printk("IMPOSSIBLE 2\n");
4525                 return(0);
4526         }
4527   
4528         tcp_statistics.TcpInSegs++;
4529   
4530         if(skb->pkt_type!=PACKET_HOST)
4531         {
4532                 kfree_skb(skb,FREE_READ);
4533                 return(0);
4534         }
4535   
4536         th = skb->h.th;
4537 
4538         /*
4539          *      Find the socket.
4540          */
4541 
4542         sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4543 
4544         /*
4545          *      If this socket has got a reset it's to all intents and purposes 
4546          *      really dead. Count closed sockets as dead.
4547          *
4548          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4549          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
4550          *      exist so should cause resets as if the port was unreachable.
4551          */
4552          
4553         if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4554                 sk=NULL;
4555 
4556         if (!redo) 
4557         {
4558                 if (tcp_check(th, len, saddr, daddr )) 
4559                 {
4560                         skb->sk = NULL;
4561                         kfree_skb(skb,FREE_READ);
4562                         /*
4563                          *      We don't release the socket because it was
4564                          *      never marked in use.
4565                          */
4566                         return(0);
4567                 }
4568                 th->seq = ntohl(th->seq);
4569 
4570                 /* See if we know about the socket. */
4571                 if (sk == NULL) 
4572                 {
4573                         /*
4574                          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4575                          */
4576                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4577                         skb->sk = NULL;
4578                         /*
4579                          *      Discard frame
4580                          */
4581                         kfree_skb(skb, FREE_READ);
4582                         return(0);
4583                 }
4584 
4585                 skb->len = len;
4586                 skb->acked = 0;
4587                 skb->used = 0;
4588                 skb->free = 0;
4589                 skb->saddr = daddr;
4590                 skb->daddr = saddr;
4591         
4592                 /* We may need to add it to the backlog here. */
4593                 cli();
4594                 if (sk->inuse) 
4595                 {
4596                         skb_queue_tail(&sk->back_log, skb);
4597                         sti();
4598                         return(0);
4599                 }
4600                 sk->inuse = 1;
4601                 sti();
4602         }
4603         else
4604         {
4605                 if (sk==NULL) 
4606                 {
4607                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4608                         skb->sk = NULL;
4609                         kfree_skb(skb, FREE_READ);
4610                         return(0);
4611                 }
4612         }
4613 
4614 
4615         if (!sk->prot) 
4616         {
4617                 printk("IMPOSSIBLE 3\n");
4618                 return(0);
4619         }
4620 
4621 
4622         /*
4623          *      Charge the memory to the socket. 
4624          */
4625          
4626         if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf) 
4627         {
4628                 kfree_skb(skb, FREE_READ);
4629                 release_sock(sk);
4630                 return(0);
4631         }
4632 
4633         skb->sk=sk;
4634         sk->rmem_alloc += skb->mem_len;
4635 
4636         /*
4637          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4638          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4639          *      compatibility. We also set up variables more thoroughly [Karn notes in the
4640          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4641          */
4642 
4643         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
4644         {
4645         
4646                 /*
4647                  *      Now deal with unusual cases.
4648                  */
4649          
4650                 if(sk->state==TCP_LISTEN)
4651                 {
4652                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
4653                                 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4654 
4655                         /*
4656                          *      We don't care for RST, and non SYN are absorbed (old segments)
4657                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
4658                          *      netmask on a running connection it can go broadcast. Even Sun's have
4659                          *      this problem so I'm ignoring it 
4660                          */
4661                            
4662                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4663                         {
4664                                 kfree_skb(skb, FREE_READ);
4665                                 release_sock(sk);
4666                                 return 0;
4667                         }
4668                 
4669                         /*      
4670                          *      Guess we need to make a new socket up 
4671                          */
4672                 
4673                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4674                 
4675                         /*
4676                          *      Now we have several options: In theory there is nothing else
4677                          *      in the frame. KA9Q has an option to send data with the syn,
4678                          *      BSD accepts data with the syn up to the [to be] advertised window
4679                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
4680                          *      it, that fits the spec precisely and avoids incompatibilities. It
4681                          *      would be nice in future to drop through and process the data.
4682                          */
4683                          
4684                         release_sock(sk);
4685                         return 0;
4686                 }
4687         
4688                 /* retransmitted SYN? */
4689                 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4690                 {
4691                         kfree_skb(skb, FREE_READ);
4692                         release_sock(sk);
4693                         return 0;
4694                 }
4695                 
4696                 /*
4697                  *      SYN sent means we have to look for a suitable ack and either reset
4698                  *      for bad matches or go to connected 
4699                  */
4700            
4701                 if(sk->state==TCP_SYN_SENT)
4702                 {
4703                         /* Crossed SYN or previous junk segment */
4704                         if(th->ack)
4705                         {
4706                                 /* We got an ack, but it's not a good ack */
4707                                 if(!tcp_ack(sk,th,saddr,len))
4708                                 {
4709                                         /* Reset the ack - its an ack from a 
4710                                            different connection  [ th->rst is checked in tcp_reset()] */
4711                                         tcp_statistics.TcpAttemptFails++;
4712                                         tcp_reset(daddr, saddr, th,
4713                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4714                                         kfree_skb(skb, FREE_READ);
4715                                         release_sock(sk);
4716                                         return(0);
4717                                 }
4718                                 if(th->rst)
4719                                         return tcp_std_reset(sk,skb);
4720                                 if(!th->syn)
4721                                 {
4722                                         /* A valid ack from a different connection
4723                                            start. Shouldn't happen but cover it */
4724                                         kfree_skb(skb, FREE_READ);
4725                                         release_sock(sk);
4726                                         return 0;
4727                                 }
4728                                 /*
4729                                  *      Ok.. it's good. Set up sequence numbers and
4730                                  *      move to established.
4731                                  */
4732                                 syn_ok=1;       /* Don't reset this connection for the syn */
4733                                 sk->acked_seq=th->seq+1;
4734                                 sk->fin_seq=th->seq;
4735                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4736                                 tcp_set_state(sk, TCP_ESTABLISHED);
4737                                 tcp_options(sk,th);
4738                                 sk->dummy_th.dest=th->source;
4739                                 sk->copied_seq = sk->acked_seq;
4740                                 if(!sk->dead)
4741                                 {
4742                                         sk->state_change(sk);
4743                                         sock_wake_async(sk->socket, 0);
4744                                 }
4745                                 if(sk->max_window==0)
4746                                 {
4747                                         sk->max_window = 32;
4748                                         sk->mss = min(sk->max_window, sk->mtu);
4749                                 }
4750                         }
4751                         else
4752                         {
4753                                 /* See if SYN's cross. Drop if boring */
4754                                 if(th->syn && !th->rst)
4755                                 {
4756                                         /* Crossed SYN's are fine - but talking to
4757                                            yourself is right out... */
4758                                         if(sk->saddr==saddr && sk->daddr==daddr &&
4759                                                 sk->dummy_th.source==th->source &&
4760                                                 sk->dummy_th.dest==th->dest)
4761                                         {
4762                                                 tcp_statistics.TcpAttemptFails++;
4763                                                 return tcp_std_reset(sk,skb);
4764                                         }
4765                                         tcp_set_state(sk,TCP_SYN_RECV);
4766                                         
4767                                         /*
4768                                          *      FIXME:
4769                                          *      Must send SYN|ACK here
4770                                          */
4771                                 }               
4772                                 /* Discard junk segment */
4773                                 kfree_skb(skb, FREE_READ);
4774                                 release_sock(sk);
4775                                 return 0;
4776                         }
4777                         /*
4778                          *      SYN_RECV with data maybe.. drop through
4779                          */
4780                         goto rfc_step6;
4781                 }
4782 
4783         /*
4784          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
4785          *      a more complex suggestion for fixing these reuse issues in RFC1644
4786          *      but not yet ready for general use. Also see RFC1379.
4787          */
4788         
4789 #define BSD_TIME_WAIT
4790 #ifdef BSD_TIME_WAIT
4791                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
4792                         after(th->seq, sk->acked_seq) && !th->rst)
4793                 {
4794                         long seq=sk->write_seq;
4795                         if(sk->debug)
4796                                 printk("Doing a BSD time wait\n");
4797                         tcp_statistics.TcpEstabResets++;           
4798                         sk->rmem_alloc -= skb->mem_len;
4799                         skb->sk = NULL;
4800                         sk->err=ECONNRESET;
4801                         tcp_set_state(sk, TCP_CLOSE);
4802                         sk->shutdown = SHUTDOWN_MASK;
4803                         release_sock(sk);
4804                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4805                         if (sk && sk->state==TCP_LISTEN)
4806                         {
4807                                 sk->inuse=1;
4808                                 skb->sk = sk;
4809                                 sk->rmem_alloc += skb->mem_len;
4810                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
4811                                 release_sock(sk);
4812                                 return 0;
4813                         }
4814                         kfree_skb(skb, FREE_READ);
4815                         return 0;
4816                 }
4817 #endif  
4818         }
4819 
4820         /*
4821          *      We are now in normal data flow (see the step list in the RFC)
4822          *      Note most of these are inline now. I'll inline the lot when
4823          *      I have time to test it hard and look at what gcc outputs 
4824          */
4825         
4826         if(!tcp_sequence(sk,th,len,opt,saddr,dev))
4827         {
4828                 kfree_skb(skb, FREE_READ);
4829                 release_sock(sk);
4830                 return 0;
4831         }
4832 
4833         if(th->rst)
4834                 return tcp_std_reset(sk,skb);
4835         
4836         /*
4837          *      !syn_ok is effectively the state test in RFC793.
4838          */
4839          
4840         if(th->syn && !syn_ok)
4841         {
4842                 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
4843                 return tcp_std_reset(sk,skb);   
4844         }
4845 
4846         /*
4847          *      Process the ACK
4848          */
4849          
4850 
4851         if(th->ack && !tcp_ack(sk,th,saddr,len))
4852         {
4853                 /*
4854                  *      Our three way handshake failed.
4855                  */
4856                  
4857                 if(sk->state==TCP_SYN_RECV)
4858                 {
4859                         tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
4860                 }
4861                 kfree_skb(skb, FREE_READ);
4862                 release_sock(sk);
4863                 return 0;
4864         }
4865         
4866 rfc_step6:              /* I'll clean this up later */
4867 
4868         /*
4869          *      Process urgent data
4870          */
4871                 
4872         if(tcp_urg(sk, th, saddr, len))
4873         {
4874                 kfree_skb(skb, FREE_READ);
4875                 release_sock(sk);
4876                 return 0;
4877         }
4878         
4879         
4880         /*
4881          *      Process the encapsulated data
4882          */
4883         
4884         if(tcp_data(skb,sk, saddr, len))
4885         {
4886                 kfree_skb(skb, FREE_READ);
4887                 release_sock(sk);
4888                 return 0;
4889         }
4890 
4891         /*
4892          *      And done
4893          */     
4894         
4895         release_sock(sk);
4896         return 0;
4897 }
4898 
4899 /*
4900  *      This routine sends a packet with an out of date sequence
4901  *      number. It assumes the other end will try to ack it.
4902  */
4903 
4904 static void tcp_write_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
4905 {
4906         struct sk_buff *buff;
4907         struct tcphdr *t1;
4908         struct device *dev=NULL;
4909         int tmp;
4910 
4911         if (sk->zapped)
4912                 return; /* After a valid reset we can send no more */
4913 
4914         /*
4915          *      Write data can still be transmitted/retransmitted in the
4916          *      following states.  If any other state is encountered, return.
4917          *      [listen/close will never occur here anyway]
4918          */
4919 
4920         if (sk->state != TCP_ESTABLISHED && 
4921             sk->state != TCP_CLOSE_WAIT &&
4922             sk->state != TCP_FIN_WAIT1 && 
4923             sk->state != TCP_LAST_ACK &&
4924             sk->state != TCP_CLOSING
4925         ) 
4926         {
4927                 return;
4928         }
4929 
4930         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
4931         if (buff == NULL) 
4932                 return;
4933 
4934         buff->len = sizeof(struct tcphdr);
4935         buff->free = 1;
4936         buff->sk = sk;
4937         buff->localroute = sk->localroute;
4938 
4939         t1 = (struct tcphdr *) buff->data;
4940 
4941         /* Put in the IP header and routing stuff. */
4942         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4943                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
4944         if (tmp < 0) 
4945         {
4946                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4947                 return;
4948         }
4949 
4950         buff->len += tmp;
4951         t1 = (struct tcphdr *)((char *)t1 +tmp);
4952 
4953         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
4954 
4955         /*
4956          *      Use a previous sequence.
4957          *      This should cause the other end to send an ack.
4958          */
4959          
4960         t1->seq = htonl(sk->sent_seq-1);
4961         t1->ack = 1; 
4962         t1->res1= 0;
4963         t1->res2= 0;
4964         t1->rst = 0;
4965         t1->urg = 0;
4966         t1->psh = 0;
4967         t1->fin = 0;    /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
4968         t1->syn = 0;
4969         t1->ack_seq = ntohl(sk->acked_seq);
4970         t1->window = ntohs(tcp_select_window(sk));
4971         t1->doff = sizeof(*t1)/4;
4972         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
4973          /*
4974           *     Send it and free it.
4975           *     This will prevent the timer from automatically being restarted.
4976           */
4977         sk->prot->queue_xmit(sk, dev, buff, 1);
4978         tcp_statistics.TcpOutSegs++;
4979 }
4980 
4981 /*
4982  *      A window probe timeout has occurred.
4983  */
4984 
/*
 *	A window probe timeout has occurred: emit a window probe and
 *	back off the probe timer exponentially (capped at 120 seconds).
 */
void tcp_send_probe0(struct sock *sk)
{
	if (sk->zapped)
		return;		/* After a valid reset we can send no more */

	/* Send the stale-sequence probe segment. */
	tcp_write_wakeup(sk);

	/* Exponential backoff: double the RTO, but never beyond 2 minutes. */
	sk->backoff++;
	sk->rto = min(sk->rto << 1, 120*HZ);
	/* Re-arm the probe timer with the new (backed-off) interval. */
	reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
	/* Account the probe as a retransmission, per-socket and per-protocol. */
	sk->retransmits++;
	sk->prot->retransmits ++;
}
4998 
4999 /*
5000  *      Socket option code for TCP. 
5001  */
5002   
5003 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5004 {
5005         int val,err;
5006 
5007         if(level!=SOL_TCP)
5008                 return ip_setsockopt(sk,level,optname,optval,optlen);
5009 
5010         if (optval == NULL) 
5011                 return(-EINVAL);
5012 
5013         err=verify_area(VERIFY_READ, optval, sizeof(int));
5014         if(err)
5015                 return err;
5016         
5017         val = get_fs_long((unsigned long *)optval);
5018 
5019         switch(optname)
5020         {
5021                 case TCP_MAXSEG:
5022 /*
5023  * values greater than interface MTU won't take effect.  however at
5024  * the point when this call is done we typically don't yet know
5025  * which interface is going to be used
5026  */
5027                         if(val<1||val>MAX_WINDOW)
5028                                 return -EINVAL;
5029                         sk->user_mss=val;
5030                         return 0;
5031                 case TCP_NODELAY:
5032                         sk->nonagle=(val==0)?0:1;
5033                         return 0;
5034                 default:
5035                         return(-ENOPROTOOPT);
5036         }
5037 }
5038 
5039 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5040 {
5041         int val,err;
5042 
5043         if(level!=SOL_TCP)
5044                 return ip_getsockopt(sk,level,optname,optval,optlen);
5045                         
5046         switch(optname)
5047         {
5048                 case TCP_MAXSEG:
5049                         val=sk->user_mss;
5050                         break;
5051                 case TCP_NODELAY:
5052                         val=sk->nonagle;
5053                         break;
5054                 default:
5055                         return(-ENOPROTOOPT);
5056         }
5057         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5058         if(err)
5059                 return err;
5060         put_fs_long(sizeof(int),(unsigned long *) optlen);
5061 
5062         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5063         if(err)
5064                 return err;
5065         put_fs_long(val,(unsigned long *)optval);
5066 
5067         return(0);
5068 }       
5069 
5070 
/*
 *	The TCP protocol operations vector.  This is a positional
 *	initializer, so the order here must exactly match the member
 *	order of struct proto.  The labels below are the presumed
 *	member names -- NOTE(review): verify against struct proto's
 *	declaration in the sock header before relying on them.
 */
struct proto tcp_prot = {
	sock_wmalloc,		/* wmalloc: write-buffer allocation */
	sock_rmalloc,		/* rmalloc: read-buffer allocation */
	sock_wfree,		/* wfree */
	sock_rfree,		/* rfree */
	sock_rspace,		/* rspace: free receive space */
	sock_wspace,		/* wspace: free send space */
	tcp_close,
	tcp_read,
	tcp_write,
	tcp_sendto,
	tcp_recvfrom,
	ip_build_header,	/* build_header: IP layer does the framing */
	tcp_connect,
	tcp_accept,
	ip_queue_xmit,		/* queue_xmit */
	tcp_retransmit,
	tcp_write_wakeup,	/* used for zero-window probing, see above */
	tcp_read_wakeup,
	tcp_rcv,		/* rcv: per-segment input entry point */
	tcp_select,
	tcp_ioctl,
	NULL,			/* init: TCP needs no per-socket init hook */
	tcp_shutdown,
	tcp_setsockopt,
	tcp_getsockopt,
	128,			/* max_header -- presumably bytes reserved for headers */
	0,			/* retransmits counter */
	{NULL,},		/* sock_array: port hash, starts empty */
	"TCP",			/* name */
	0, 0			/* inuse, highestinuse -- TODO confirm member names */
};

/* [previous][next][first][last][top][bottom][index][help] */