root/net/inet/tcp.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. min
  2. tcp_set_state
  3. tcp_select_window
  4. tcp_find_established
  5. tcp_dequeue_established
  6. tcp_close_pending
  7. tcp_time_wait
  8. tcp_do_retransmit
  9. reset_xmit_timer
  10. tcp_retransmit_time
  11. tcp_retransmit
  12. tcp_write_timeout
  13. retransmit_timer
  14. tcp_err
  15. tcp_readable
  16. tcp_listen_select
  17. tcp_select
  18. tcp_ioctl
  19. tcp_check
  20. tcp_send_check
  21. tcp_send_skb
  22. tcp_dequeue_partial
  23. tcp_send_partial
  24. tcp_enqueue_partial
  25. tcp_send_ack
  26. tcp_build_header
  27. tcp_write
  28. tcp_sendto
  29. tcp_read_wakeup
  30. cleanup_rbuf
  31. tcp_read_urg
  32. tcp_read
  33. tcp_close_state
  34. tcp_send_fin
  35. tcp_shutdown
  36. tcp_recvfrom
  37. tcp_reset
  38. tcp_options
  39. default_mask
  40. tcp_init_seq
  41. tcp_conn_request
  42. tcp_close
  43. tcp_write_xmit
  44. tcp_ack
  45. tcp_fin
  46. tcp_data
  47. tcp_check_urg
  48. tcp_urg
  49. tcp_accept
  50. tcp_connect
  51. tcp_sequence
  52. tcp_std_reset
  53. tcp_rcv
  54. tcp_write_wakeup
  55. tcp_send_probe0
  56. tcp_setsockopt
  57. tcp_getsockopt

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@no.unit.nvg>
  20  *
  21  * Fixes:       
  22  *              Alan Cox        :       Numerous verify_area() calls
  23  *              Alan Cox        :       Set the ACK bit on a reset
  24  *              Alan Cox        :       Stopped it crashing if it closed while sk->inuse=1
  25  *                                      and was trying to connect (tcp_err()).
  26  *              Alan Cox        :       All icmp error handling was broken
  27  *                                      pointers passed where wrong and the
  28  *                                      socket was looked up backwards. Nobody
  29  *                                      tested any icmp error code obviously.
  30  *              Alan Cox        :       tcp_err() now handled properly. It wakes people
  31  *                                      on errors. select behaves and the icmp error race
  32  *                                      has gone by moving it into sock.c
  33  *              Alan Cox        :       tcp_reset() fixed to work for everything not just
  34  *                                      packets for unknown sockets.
  35  *              Alan Cox        :       tcp option processing.
  36  *              Alan Cox        :       Reset tweaked (still not 100%) [Had syn rule wrong]
  37  *              Herp Rosmanith  :       More reset fixes
  38  *              Alan Cox        :       No longer acks invalid rst frames. Acking
  39  *                                      any kind of RST is right out.
  40  *              Alan Cox        :       Sets an ignore me flag on an rst receive
  41  *                                      otherwise odd bits of prattle escape still
  42  *              Alan Cox        :       Fixed another acking RST frame bug. Should stop
  43  *                                      LAN workplace lockups.
  44  *              Alan Cox        :       Some tidyups using the new skb list facilities
  45  *              Alan Cox        :       sk->keepopen now seems to work
  46  *              Alan Cox        :       Pulls options out correctly on accepts
  47  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  48  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a bit to skb ops.
  49  *              Alan Cox        :       Tidied tcp_data to avoid a potential nasty.
  50  *              Alan Cox        :       Added some better commenting, as the tcp is hard to follow
  51  *              Alan Cox        :       Removed incorrect check for 20 * psh
  52  *      Michael O'Reilly        :       ack < copied bug fix.
  53  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  54  *              Alan Cox        :       FIN with no memory -> CRASH
  55  *              Alan Cox        :       Added socket option proto entries. Also added awareness of them to accept.
  56  *              Alan Cox        :       Added TCP options (SOL_TCP)
  57  *              Alan Cox        :       Switched wakeup calls to callbacks, so the kernel can layer network sockets.
  58  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  59  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  60  *              Alan Cox        :       RST frames sent on unsynchronised state ack error/
  61  *              Alan Cox        :       Put in missing check for SYN bit.
  62  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  63  *                                      window non shrink trick.
  64  *              Alan Cox        :       Added a couple of small NET2E timer fixes
  65  *              Charles Hedrick :       TCP fixes
  66  *              Toomas Tamm     :       TCP window fixes
  67  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  68  *              Charles Hedrick :       Rewrote most of it to actually work
  69  *              Linus           :       Rewrote tcp_read() and URG handling
  70  *                                      completely
  71  *              Gerhard Koerting:       Fixed some missing timer handling
  72  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  73  *              Gerhard Koerting:       PC/TCP workarounds
  74  *              Adam Caldwell   :       Assorted timer/timing errors
  75  *              Matthew Dillon  :       Fixed another RST bug
  76  *              Alan Cox        :       Move to kernel side addressing changes.
  77  *              Alan Cox        :       Beginning work on TCP fastpathing (not yet usable)
  78  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  79  *              Alan Cox        :       TCP fast path debugging
  80  *              Alan Cox        :       Window clamping
  81  *              Michael Riepe   :       Bug in tcp_check()
  82  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  83  *              Matt Dillon     :       Yet more small nasties remove from the TCP code
  84  *                                      (Be very nice to this man if tcp finally works 100%) 8)
  85  *              Alan Cox        :       BSD accept semantics. 
  86  *              Alan Cox        :       Reset on closedown bug.
  87  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
  88  *              Michael Pall    :       Handle select() after URG properly in all cases.
  89  *              Michael Pall    :       Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin).
  90  *              Michael Pall    :       Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now.
  91  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the BSD api.
  92  *              Alan Cox        :       Changed the semantics of sk->socket to 
  93  *                                      fix a race and a signal problem with
  94  *                                      accept() and async I/O.
  95  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
  96  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
  97  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
  98  *                                      clients/servers which listen in on
  99  *                                      fixed ports.
 100  *              Alan Cox        :       Cleaned the above up and shrank it to
 101  *                                      a sensible code size.
 102  *              Alan Cox        :       Self connect lockup fix.
 103  *              Alan Cox        :       No connect to multicast.
 104  *              Ross Biro       :       Close unaccepted children on master
 105  *                                      socket close.
 106  *              Alan Cox        :       Reset tracing code.
 107  *              Alan Cox        :       Spurious resets on shutdown.
 108  *              Alan Cox        :       Giant 15 minute/60 second timer error
 109  *              Alan Cox        :       Small whoops in selecting before an accept.
 110  *              Alan Cox        :       Kept the state trace facility since it's
 111  *                                      handy for debugging.
 112  *              Alan Cox        :       More reset handler fixes.
 113  *              Alan Cox        :       Started rewriting the code based on the RFC's
 114  *                                      for other useful protocol references see:  
 115  *                                      Comer, KA9Q NOS, and for a reference on the
 116  *                                      difference between specifications and how BSD
 117  *                                      works see the 4.4lite source.
 118  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 119  *                                      close.
 120  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 121  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 122  *              Alan Cox        :       Reimplemented timers as per the RFC and using multiple
 123  *                                      timers for sanity. 
 124  *              Alan Cox        :       Small bug fixes, and a lot of new
 125  *                                      comments.
 126  *              Alan Cox        :       Fixed dual reader crash by locking
 127  *                                      the buffers (much like datagram.c)
 128  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 129  *                                      now gets fed up of retrying without
 130  *                                      (even a no space) answer.
 131  *              Alan Cox        :       Extracted closing code better
 132  *              Alan Cox        :       Fixed the closing state machine to
 133  *                                      resemble the RFC.
 134  *              Alan Cox        :       More 'per spec' fixes.
 135  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 136  *                                      only frames. At least one pc tcp stack
 137  *                                      generates them.
 138  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
  139  *                                      ack if state is TCP_CLOSED.
 140  *
 141  *
 142  * To Fix:
 143  *              Fast path the code. Two things here - fix the window calculation
 144  *              so it doesn't iterate over the queue, also spot packets with no funny
 145  *              options arriving in order and process directly.
 146  *
 147  *              Implement RFC 1191 [Path MTU discovery]
 148  *              Look at the effect of implementing RFC 1337 suggestions and their impact.
 149  *              Rewrite output state machine to use a single queue and do low window
 150  *              situations as per the spec (RFC 1122)
 151  *              Speed up input assembly algorithm.
 152  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 153  *              could do with it working on IPv4
 154  *              User settable/learned rtt/max window/mtu
 155  *              Cope with MTU/device switches when retransmitting in tcp.
 156  *              Fix the window handling to use PR's new code.
 157  *
 158  *              Change the fundamental structure to a single send queue maintained
 159  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 160  *              active routes too]). Cut the queue off in tcp_retransmit/
 161  *              tcp_transmit.
 162  *              Change the receive queue to assemble as it goes. This lets us
 163  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 164  *              tcp_data/tcp_read as well as the window shrink crud.
 165  *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 166  *              tcp_queue_skb seem obvious routines to extract.
 167  *      
 168  *              This program is free software; you can redistribute it and/or
 169  *              modify it under the terms of the GNU General Public License
 170  *              as published by the Free Software Foundation; either version
 171  *              2 of the License, or(at your option) any later version.
 172  *
 173  * Description of States:
 174  *
 175  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 176  *
 177  *      TCP_SYN_RECV            received a connection request, sent ack,
 178  *                              waiting for final ack in three-way handshake.
 179  *
 180  *      TCP_ESTABLISHED         connection established
 181  *
 182  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 183  *                              transmission of remaining buffered data
 184  *
 185  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 186  *                              to shutdown
 187  *
 188  *      TCP_CLOSING             both sides have shutdown but we still have
 189  *                              data we have to finish sending
 190  *
 191  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 192  *                              closed, can only be entered from FIN_WAIT2
 193  *                              or CLOSING.  Required because the other end
 194  *                              may not have gotten our last ACK causing it
 195  *                              to retransmit the data packet (which we ignore)
 196  *
 197  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 198  *                              us to finish writing our data and to shutdown
 199  *                              (we have to close() to move on to LAST_ACK)
 200  *
  201  *      TCP_LAST_ACK            our side has shutdown after remote has
 202  *                              shutdown.  There may still be data in our
 203  *                              buffer that we have to finish sending
 204  *              
 205  *      TCP_CLOSE               socket is finished
 206  */
 207 
 208 #include <linux/types.h>
 209 #include <linux/sched.h>
 210 #include <linux/mm.h>
 211 #include <linux/time.h>
 212 #include <linux/string.h>
 213 #include <linux/config.h>
 214 #include <linux/socket.h>
 215 #include <linux/sockios.h>
 216 #include <linux/termios.h>
 217 #include <linux/in.h>
 218 #include <linux/fcntl.h>
 219 #include <linux/inet.h>
 220 #include <linux/netdevice.h>
 221 #include "snmp.h"
 222 #include "ip.h"
 223 #include "protocol.h"
 224 #include "icmp.h"
 225 #include "tcp.h"
 226 #include "arp.h"
 227 #include <linux/skbuff.h>
 228 #include "sock.h"
 229 #include "route.h"
 230 #include <linux/errno.h>
 231 #include <linux/timer.h>
 232 #include <asm/system.h>
 233 #include <asm/segment.h>
 234 #include <linux/mm.h>
 235 
 236 /*
 237  *      The MSL timer is the 'normal' timer.
 238  */
 239  
 240 #define reset_msl_timer(x,y,z)  reset_timer(x,y,z)
 241 
 242 #define SEQ_TICK 3
 243 unsigned long seq_offset;
 244 struct tcp_mib  tcp_statistics;
 245 
 246 static void tcp_close(struct sock *sk, int timeout);
 247 
 248 
 249 /*
 250  *      The less said about this the better, but it works and will do for 1.2 
 251  */
 252 
 253 static struct wait_queue *master_select_wakeup;
 254 
 255 static __inline__ int min(unsigned int a, unsigned int b)
     /* [previous][next][first][last][top][bottom][index][help] */
 256 {
 257         if (a < b) 
 258                 return(a);
 259         return(b);
 260 }
 261 
 262 #undef STATE_TRACE
 263 
 264 #ifdef STATE_TRACE
 265 static char *statename[]={
 266         "Unused","Established","Syn Sent","Syn Recv",
 267         "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
 268         "Close Wait","Last ACK","Listen","Closing"
 269 };
 270 #endif
 271 
/*
 *	Move a socket into a new TCP state, keeping the SNMP count of
 *	established connections (TcpCurrEstab) accurate.  When a child
 *	socket completes the three-way handshake (SYN_RECV -> ESTABLISHED)
 *	we also wake anyone select()ing on the listening master socket.
 */
  272 static __inline__ void tcp_set_state(struct sock *sk, int state)
      /* [previous][next][first][last][top][bottom][index][help] */
  273 {
  274         if(sk->state==TCP_ESTABLISHED)
  275                 tcp_statistics.TcpCurrEstab--;
  276 #ifdef STATE_TRACE
  277         if(sk->debug)
  278                 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
  279 #endif  
  280         /* This is a hack but it doesn't occur often and it's going to
  281            be a real pain to fix nicely */
  282            
  283         if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
  284         {
  285                 wake_up_interruptible(&master_select_wakeup);
  286         }
  287         sk->state=state;
  288         if(state==TCP_ESTABLISHED)
  289                 tcp_statistics.TcpCurrEstab++;
  290 }
 291 
 292 /*
 293  *      This routine picks a TCP windows for a socket based on
 294  *      the following constraints
 295  *  
 296  *      1. The window can never be shrunk once it is offered (RFC 793)
 297  *      2. We limit memory per socket
 298  *   
 299  *      For now we use NET2E3's heuristic of offering half the memory
 300  *      we have handy. All is not as bad as this seems however because
 301  *      of two things. Firstly we will bin packets even within the window
 302  *      in order to get the data we are waiting for into the memory limit.
 303  *      Secondly we bin common duplicate forms at receive time
 304  *      Better heuristics welcome
 305  */
 306    
/*
 *	Return the receive window we should advertise for this socket.
 *	Based on free receive buffer space (prot->rspace), optionally
 *	clamped by sk->window_clamp, and never smaller than the window
 *	previously offered (RFC 793 forbids shrinking an offered window).
 */
  307 int tcp_select_window(struct sock *sk)
      /* [previous][next][first][last][top][bottom][index][help] */
  308 {
  309         int new_window = sk->prot->rspace(sk);
  310         
  311         if(sk->window_clamp)
  312                 new_window=min(sk->window_clamp,new_window);
  313         /*
  314          *      Two things are going on here.  First, we don't ever offer a
  315          *      window less than min(sk->mss, MAX_WINDOW/2).  This is the
  316          *      receiver side of SWS as specified in RFC1122.
  317          *      Second, we always give them at least the window they
  318          *      had before, in order to avoid retracting window.  This
  319          *      is technically allowed, but RFC1122 advises against it and
  320          *      in practice it causes trouble.
  321          *
  322          *      Fixme: This doesn't correctly handle the case where
  323          *      new_window > sk->window but not by enough to allow for the
  324          *      shift in sequence space. 
  325          */
  326         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
  327                 return(sk->window);
  328         return(new_window);
  329 }
 330 
 331 /*
 332  *      Find someone to 'accept'. Must be called with
 333  *      sk->inuse=1 or cli()
 334  */ 
 335 
/*
 *	Scan the listening socket's receive queue (which for a listener
 *	holds connection-request skbs, each carrying a child socket in
 *	p->sk) and return the first whose child is ESTABLISHED or beyond
 *	(>= FIN_WAIT1), i.e. ready to be accept()ed.  NULL if none.
 *	Caller must hold sk->inuse or have interrupts off (see above).
 */
  336 static struct sk_buff *tcp_find_established(struct sock *s)
      /* [previous][next][first][last][top][bottom][index][help] */
  337 {
  338         struct sk_buff *p=skb_peek(&s->receive_queue);
  339         if(p==NULL)
  340                 return NULL;
  341         do
  342         {
  343                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
  344                         return p;
  345                 p=p->next;
  346         }
  347         while(p!=(struct sk_buff *)&s->receive_queue);
        /* Wrapped all the way round the circular list: nothing ready. */
  348         return NULL;
  349 }
 350 
 351 /*
 352  *      Remove a completed connection and return it. This is used by
 353  *      tcp_accept() to get connections from the queue.
 354  */
 355 
/*
 *	Atomically (interrupts disabled) find a completed connection on the
 *	listening socket's queue and unlink it.  Returns the skb carrying
 *	the accepted child socket, or NULL if none is ready.  Ownership of
 *	the skb passes to the caller (tcp_accept frees it).
 */
  356 static struct sk_buff *tcp_dequeue_established(struct sock *s)
      /* [previous][next][first][last][top][bottom][index][help] */
  357 {
  358         struct sk_buff *skb;
  359         unsigned long flags;
  360         save_flags(flags);
  361         cli(); 
  362         skb=tcp_find_established(s);
  363         if(skb!=NULL)
  364                 skb_unlink(skb);        /* Take it off the queue */
  365         restore_flags(flags);
  366         return skb;
  367 }
 368 
 369 /* 
 370  *      This routine closes sockets which have been at least partially
 371  *      opened, but not yet accepted. Currently it is only called by
 372  *      tcp_close, and timeout mirrors the value there. 
 373  */
 374 
/*
 *	Drain a listening socket's queue of half-open/unaccepted
 *	connections: mark each embryonic child dead, close it, and free
 *	the queued request skb.  Called from tcp_close() when the master
 *	socket goes away before the children were accept()ed.
 */
  375 static void tcp_close_pending (struct sock *sk) 
      /* [previous][next][first][last][top][bottom][index][help] */
  376 {
  377         struct sk_buff *skb;
  378 
  379         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) 
  380         {
  381                 skb->sk->dead=1;
  382                 tcp_close(skb->sk, 0);
  383                 kfree_skb(skb, FREE_READ);
  384         }
  385         return;
  386 }
 387 
 388 /*
 389  *      Enter the time wait state. 
 390  */
 391 
/*
 *	Move the socket to TIME_WAIT: shut down both directions, notify
 *	any sleeper via state_change(), and start the 2MSL timer so the
 *	socket is finally destroyed after TCP_TIMEWAIT_LEN.
 */
  392 static void tcp_time_wait(struct sock *sk)
      /* [previous][next][first][last][top][bottom][index][help] */
  393 {
  394         tcp_set_state(sk,TCP_TIME_WAIT);
  395         sk->shutdown = SHUTDOWN_MASK;
  396         if (!sk->dead)
  397                 sk->state_change(sk);
  398         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
  399 }
 400 
 401 /*
 402  *      A socket has timed out on its send queue and wants to do a
 403  *      little retransmitting. Currently this means TCP.
 404  */
 405 
/*
 *	Resend packets from the socket's retransmit queue (sk->send_head,
 *	chained via skb->link3).  Each resent packet gets a fresh IP id,
 *	an up-to-date ack/window pair and a recomputed TCP checksum.
 *	If 'all' is zero only the head of the queue is resent; otherwise
 *	we resend up to sk->cong_window packets.  Does NOT touch the
 *	backoff/rto state - that is tcp_retransmit_time()'s job.
 */
  406 void tcp_do_retransmit(struct sock *sk, int all)
      /* [previous][next][first][last][top][bottom][index][help] */
  407 {
  408         struct sk_buff * skb;
  409         struct proto *prot;
  410         struct device *dev;
  411         int ct=0;
  412 
  413         prot = sk->prot;
  414         skb = sk->send_head;
  415 
  416         while (skb != NULL)
  417         {
  418                 struct tcphdr *th;
  419                 struct iphdr *iph;
  420                 int size;
  421 
  422                 dev = skb->dev;
  423                 IS_SKB(skb);
  424                 skb->when = jiffies;
  425 
  426                 /*
  427                  * In general it's OK just to use the old packet.  However we
  428                  * need to use the current ack and window fields.  Urg and
  429                  * urg_ptr could possibly stand to be updated as well, but we
  430                  * don't keep the necessary data.  That shouldn't be a problem,
  431                  * if the other end is doing the right thing.  Since we're
  432                  * changing the packet, we have to issue a new IP identifier.
  433                  */
  434 
  435                 iph = (struct iphdr *)(skb->data + dev->hard_header_len);
  436                 th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
  437                 size = skb->len - (((unsigned char *) th) - skb->data);
  438                 
  439                 /*
  440                  *      Note: We ought to check for window limits here but
  441                  *      currently this is done (less efficiently) elsewhere.
  442                  *      We do need to check for a route change but can't handle
  443                  *      that until we have the new 1.3.x buffers in.
  444                  *
  445                  */
  446 
  447                 iph->id = htons(ip_id_count++);
  448                 ip_send_check(iph);
  449 
  450                 /*
  451                  *      This is not the right way to handle this. We have to
  452                  *      issue an up to date window and ack report with this 
  453                  *      retransmit to keep the odd buggy tcp that relies on 
  454                  *      the fact BSD does this happy. 
  455                  *      We don't however need to recalculate the entire 
  456                  *      checksum, so someone wanting a small problem to play
  457                  *      with might like to implement RFC1141/RFC1624 and speed
  458                  *      this up by avoiding a full checksum.
  459                  */
  460                  
                /* NOTE(review): htonl is the intended conversion here;
                   ntohl happens to be identical on every architecture
                   this kernel supports, so behaviour is unaffected. */
  461                 th->ack_seq = ntohl(sk->acked_seq);
  462                 th->window = ntohs(tcp_select_window(sk));
  463                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
  464                 
  465                 /*
  466                  *      If the interface is (still) up and running, kick it.
  467                  */
  468 
  469                 if (dev->flags & IFF_UP)
  470                 {
  471                         /*
  472                          *      If the packet is still being sent by the device/protocol
  473                          *      below then don't retransmit. This is both needed, and good -
  474                          *      especially with connected mode AX.25 where it stops resends
  475                          *      occurring of an as yet unsent anyway frame!
  476                          *      We still add up the counts as the round trip time wants
  477                          *      adjusting.
  478                          */
  479                         if (sk && !skb_device_locked(skb))
  480                         {
  481                                 /* Remove it from any existing driver queue first! */
  482                                 skb_unlink(skb);
  483                                 /* Now queue it */
  484                                 ip_statistics.IpOutRequests++;
  485                                 dev_queue_xmit(skb, dev, sk->priority);
  486                         }
  487                 }
  488 
  489                 /*
  490                  *      Count retransmissions
  491                  */
  492                  
  493                 ct++;
  494                 sk->prot->retransmits ++;
  495 
  496                 /*
  497                  *      Only one retransmit requested.
  498                  */
  499         
  500                 if (!all)
  501                         break;
  502 
  503                 /*
  504                  *      This should cut it off before we send too many packets.
  505                  */
  506 
  507                 if (ct >= sk->cong_window)
  508                         break;
  509                 skb = skb->link3;
  510         }
  511 }
 512 
 513 /*
 514  *      Reset the retransmission timer
 515  */
 516  
/*
 *	(Re)arm the socket's single transmit timer.  'why' records which
 *	TIME_* event the timer is for (saved in sk->ip_xmit_timeout so the
 *	handler can dispatch on it); 'when' is the delay in jiffies.
 *	NOTE(review): expires is set to the relative delay, matching the
 *	"expires = HZ" one-second retry in retransmit_timer() below -
 *	confirm against this kernel's add_timer() semantics.
 */
  517 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
      /* [previous][next][first][last][top][bottom][index][help] */
  518 {
  519         del_timer(&sk->retransmit_timer);
  520         sk->ip_xmit_timeout = why;
  521         if((int)when < 0)
  522         {
  523                 when=3;
  524                 printk("Error: Negative timer in xmit_timer\n");
  525         }
  526         sk->retransmit_timer.expires=when;
  527         add_timer(&sk->retransmit_timer);
  528 }
 529 
 530 /*
 531  *      This is the normal code called for timeouts.  It does the retransmission
 532  *      and then does backoff.  tcp_do_retransmit is separated out because
 533  *      tcp_ack needs to send stuff from the retransmit queue without
 534  *      initiating a backoff.
 535  */
 536 
 537 
/*
 *	Retransmit and then apply exponential backoff: bump the retransmit
 *	and backoff counters, double rto (clamped to 120 seconds, the
 *	protocol's maximum RTT) and rearm the write timer.
 */
  538 void tcp_retransmit_time(struct sock *sk, int all)
      /* [previous][next][first][last][top][bottom][index][help] */
  539 {
  540         tcp_do_retransmit(sk, all);
  541 
  542         /*
  543          * Increase the timeout each time we retransmit.  Note that
  544          * we do not increase the rtt estimate.  rto is initialized
  545          * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
  546          * that doubling rto each time is the least we can get away with.
  547          * In KA9Q, Karn uses this for the first few times, and then
  548          * goes to quadratic.  netBSD doubles, but only goes up to *64,
  549          * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
  550          * defined in the protocol as the maximum possible RTT.  I guess
  551          * we'll have to use something other than TCP to talk to the
  552          * University of Mars.
  553          *
  554          * PAWS allows us longer timeouts and large windows, so once
  555          * implemented ftp to mars will work nicely. We will have to fix
  556          * the 120 second clamps though!
  557          */
  558 
  559         sk->retransmits++;
  560         sk->backoff++;
  561         sk->rto = min(sk->rto << 1, 120*HZ);
  562         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
  563 }
 564 
 565 
 566 /*
 567  *      A timer event has trigger a tcp retransmit timeout. The
 568  *      socket xmit queue is ready and set up to send. Because
 569  *      the ack receive code keeps the queue straight we do
 570  *      nothing clever here.
 571  */
 572 
/*
 *	Handle a retransmit timeout.  Unless 'all' is set (in which case we
 *	just retransmit the whole queue), treat it as a congestion signal:
 *	remember half the current congestion window in ssthresh and drop
 *	the congestion window back to one segment (slow start) before
 *	retransmitting.
 */
  573 static void tcp_retransmit(struct sock *sk, int all)
      /* [previous][next][first][last][top][bottom][index][help] */
  574 {
  575         if (all) 
  576         {
  577                 tcp_retransmit_time(sk, all);
  578                 return;
  579         }
  580 
  581         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
  582         /* sk->ssthresh in theory can be zero.  I guess that's OK */
  583         sk->cong_count = 0;
  584 
  585         sk->cong_window = 1;
  586 
  587         /* Do the actual retransmit. */
  588         tcp_retransmit_time(sk, all);
  589 }
 590 
 591 /*
 592  *      A write timeout has occurred. Process the after effects.
 593  */
 594 
/*
 *	Process the after-effects of a write (retransmit) timeout.
 *	Returns 0 if the socket has been moved to TCP_CLOSE (the caller
 *	must not keep using it), 1 if the connection is still alive.
 */
  595 static int tcp_write_timeout(struct sock *sk)
      /* [previous][next][first][last][top][bottom][index][help] */
  596 {
  597         /*
  598          *      Look for a 'soft' timeout.
  599          */
  600         if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
  601                 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
  602         {
  603                 /*
  604                  *      Attempt to recover if arp has changed (unlikely!) or
  605                  *      a route has shifted (not supported prior to 1.3).
  606                  */
  607                 arp_destroy (sk->daddr, 0);
  608                 ip_route_check (sk->daddr);
  609         }
  610         /*
  611          *      Has it gone just too far ?
  612          */
  613         if (sk->retransmits > TCP_RETR2) 
  614         {
  615                 sk->err = ETIMEDOUT;
  616                 sk->error_report(sk);
  617                 del_timer(&sk->retransmit_timer);
  618                 /*
  619                  *      Time wait the socket 
  620                  */
  621                 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) 
  622                 {
  623                         tcp_set_state(sk,TCP_TIME_WAIT);
  624                         reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
  625                 }
  626                 else
  627                 {
  628                         /*
  629                          *      Clean up time.
  630                          */
  631                         tcp_set_state(sk, TCP_CLOSE);
  632                         return 0;
  633                 }
  634         }
  635         return 1;
  636 }
 637 
 638 /*
 639  *      The TCP retransmit timer. This lacks a few small details.
 640  *
 641  *      1.      An initial rtt timeout on the probe0 should cause what we can
 642  *              of the first write queue buffer to be split and sent.
 643  *      2.      On a 'major timeout' as defined by RFC1122 we shouldn't report
 644  *              ETIMEDOUT if we know an additional 'soft' error caused this.
 645  *              tcp_err should save a 'soft error' for us.
 646  */
 647 
static void retransmit_timer(unsigned long data)
/* [previous][next][first][last][top][bottom][index][help] */
{
        struct sock *sk = (struct sock*)data;
        int why = sk->ip_xmit_timeout;  /* TIME_* reason the timer was armed */

        /* 
         * only process if socket is not in use
         */

        cli();
        if (sk->inuse || in_bh) 
        {
                /* Try again in 1 second */
                sk->retransmit_timer.expires = HZ;
                add_timer(&sk->retransmit_timer);
                sti();
                return;
        }

        /* Lock the socket against the bottom half, then re-enable interrupts */
        sk->inuse = 1;
        sti();

        /* Always see if we need to send an ack. */

        if (sk->ack_backlog && !sk->zapped) 
        {
                sk->prot->read_wakeup (sk);
                if (! sk->dead)
                        sk->data_ready(sk,0);
        }

        /* Now we need to figure out why the socket was on the timer. */

        switch (why) 
        {
                /* Window probing */
                case TIME_PROBE0:
                        tcp_send_probe0(sk);
                        tcp_write_timeout(sk);
                        break;
                /* Retransmitting */
                case TIME_WRITE:
                        /* It could be we got here because we needed to send an ack.
                         * So we need to check for that.
                         */
                {
                        struct sk_buff *skb;
                        unsigned long flags;

                        /* send_head is shared with interrupt context: sample it atomically */
                        save_flags(flags);
                        cli();
                        skb = sk->send_head;
                        if (!skb) 
                        {
                                /* Nothing outstanding: an ack kicked us, nothing to resend */
                                restore_flags(flags);
                        } 
                        else 
                        {
                                /*
                                 *      Kicked by a delayed ack. Reset timer
                                 *      correctly now
                                 */
                                if (jiffies < skb->when + sk->rto) 
                                {
                                        reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
                                        restore_flags(flags);
                                        break;
                                }
                                restore_flags(flags);
                                /*
                                 *      Retransmission
                                 */
                                sk->prot->retransmit (sk, 0);
                                tcp_write_timeout(sk);
                        }
                        break;
                }
                /* Sending Keepalives */
                case TIME_KEEPOPEN:
                        /* 
                         * this reset_timer() call is a hack, this is not
                         * how KEEPOPEN is supposed to work.
                         */
                        reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

                        /* Send something to keep the connection open. */
                        if (sk->prot->write_wakeup)
                                  sk->prot->write_wakeup (sk);
                        sk->retransmits++;
                        tcp_write_timeout(sk);
                        break;
                default:
                        printk ("rexmit_timer: timer expired - reason unknown\n");
                        break;
        }
        release_sock(sk);
}
 745 
 746 /*
 747  * This routine is called by the ICMP module when it gets some
 748  * sort of error condition.  If err < 0 then the socket should
 749  * be closed and the error returned to the user.  If err > 0
 750  * it's just the icmp type << 8 | icmp code.  After adjustment
 751  * header points to the first 8 bytes of the tcp header.  We need
 752  * to find the appropriate port.
 753  */
 754 
 755 void tcp_err(int err, unsigned char *header, unsigned long daddr,
     /* [previous][next][first][last][top][bottom][index][help] */
 756         unsigned long saddr, struct inet_protocol *protocol)
 757 {
 758         struct tcphdr *th;
 759         struct sock *sk;
 760         struct iphdr *iph=(struct iphdr *)header;
 761   
 762         header+=4*iph->ihl;
 763    
 764 
 765         th =(struct tcphdr *)header;
 766         sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
 767 
 768         if (sk == NULL) 
 769                 return;
 770   
 771         if(err<0)
 772         {
 773                 sk->err = -err;
 774                 sk->error_report(sk);
 775                 return;
 776         }
 777 
 778         if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8)) 
 779         {
 780                 /*
 781                  * FIXME:
 782                  * For now we will just trigger a linear backoff.
 783                  * The slow start code should cause a real backoff here.
 784                  */
 785                 if (sk->cong_window > 4)
 786                         sk->cong_window--;
 787                 return;
 788         }
 789 
 790 /*      sk->err = icmp_err_convert[err & 0xff].errno;  -- moved as TCP should hide non fatals internally (and does) */
 791 
 792         /*
 793          * If we've already connected we will keep trying
 794          * until we time out, or the user gives up.
 795          */
 796 
 797         if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT) 
 798         {
 799                 if (sk->state == TCP_SYN_SENT) 
 800                 {
 801                         tcp_statistics.TcpAttemptFails++;
 802                         tcp_set_state(sk,TCP_CLOSE);
 803                         sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
 804                 }
 805                 sk->err = icmp_err_convert[err & 0xff].errno;           
 806         }
 807         return;
 808 }
 809 
 810 
 811 /*
 812  *      Walk down the receive queue counting readable data until we hit the end or we find a gap
 813  *      in the received data queue (ie a frame missing that needs sending to us). Not
 814  *      sorting using two queues as data arrives makes life so much harder.
 815  */
 816 
static int tcp_readable(struct sock *sk)
/* [previous][next][first][last][top][bottom][index][help] */
{
        unsigned long counted;  /* sequence number we have counted up to */
        unsigned long amount;   /* readable byte total accumulated so far */
        struct sk_buff *skb;
        int sum;
        unsigned long flags;

        if(sk && sk->debug)
                printk("tcp_readable: %p - ",sk);

        /* Walk the receive queue with interrupts off so it cannot change under us */
        save_flags(flags);
        cli();
        if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
        {
                restore_flags(flags);
                if(sk && sk->debug) 
                        printk("empty\n");
                return(0);
        }
  
        counted = sk->copied_seq;       /* Where we are at the moment */
        amount = 0;
  
        /* 
         *      Do until a push or until we are out of data. 
         */
         
        do 
        {
                if (before(counted, skb->h.th->seq))    /* Found a hole so stops here */
                        break;
                sum = skb->len -(counted - skb->h.th->seq);     /* Length - header but start from where we are up to (avoid overlaps) */
                if (skb->h.th->syn)
                        sum++;  /* SYN occupies one sequence number but carries no data */
                if (sum > 0) 
                {                                       /* Add it up, move on */
                        amount += sum;
                        if (skb->h.th->syn) 
                                amount--;       /* undo the SYN's sequence slot: it is not readable data */
                        counted += sum;
                }
                /*
                 * Don't count urg data ... but do it in the right place!
                 * Consider: "old_data (ptr is here) URG PUSH data"
                 * The old code would stop at the first push because
                 * it counted the urg (amount==1) and then does amount--
                 * *after* the loop.  This means tcp_readable() always
                 * returned zero if any URG PUSH was in the queue, even
                 * though there was normal data available. If we subtract
                 * the urg data right here, we even get it to work for more
                 * than one URG PUSH skb without normal data.
                 * This means that select() finally works now with urg data
                 * in the queue.  Note that rlogin was never affected
                 * because it doesn't use select(); it uses two processes
                 * and a blocking read().  And the queue scan in tcp_read()
                 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
                 */
                if (skb->h.th->urg)
                        amount--;       /* don't count urg data */
                if (amount && skb->h.th->psh) break;
                skb = skb->next;
        }
        while(skb != (struct sk_buff *)&sk->receive_queue);

        restore_flags(flags);
        if(sk->debug)
                printk("got %lu bytes.\n",amount);
        return(amount);
}
 887 
 888 /*
 889  * LISTEN is a special case for select..
 890  */
 891 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
 892 {
 893         if (sel_type == SEL_IN) {
 894                 int retval;
 895 
 896                 sk->inuse = 1;
 897                 retval = (tcp_find_established(sk) != NULL);
 898                 release_sock(sk);
 899                 if (!retval)
 900                         select_wait(&master_select_wakeup,wait);
 901                 return retval;
 902         }
 903         return 0;
 904 }
 905 
 906 
 907 /*
 908  *      Wait for a TCP event.
 909  *
 910  *      Note that we don't need to set "sk->inuse", as the upper select layers
 911  *      take care of normal races (between the test and the event) and we don't
 912  *      go look at any of the socket buffers directly.
 913  */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
/* [previous][next][first][last][top][bottom][index][help] */
{
        if (sk->state == TCP_LISTEN)
                return tcp_listen_select(sk, sel_type, wait);

        switch(sel_type) {
        case SEL_IN:
                /* Pending error is always "readable" (the read will report it) */
                if (sk->err)
                        return 1;
                /* Not connected yet: nothing can be read */
                if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
                        break;

                /* A shut-down receive side reads EOF immediately */
                if (sk->shutdown & RCV_SHUTDOWN)
                        return 1;
                        
                /* No acknowledged data beyond what the user already copied */
                if (sk->acked_seq == sk->copied_seq)
                        break;

                /*
                 * Data is available unless the ONLY unread byte is a single
                 * out-of-band urgent byte (and urgent data is not inline) -
                 * that byte is signalled via SEL_EX, not SEL_IN.
                 */
                if (sk->urg_seq != sk->copied_seq ||
                    sk->acked_seq != sk->copied_seq+1 ||
                    sk->urginline || !sk->urg_data)
                        return 1;
                break;

        case SEL_OUT:
                if (sk->err)
                        return 1;
                if (sk->shutdown & SEND_SHUTDOWN) 
                        return 0;
                /* Connection still being set up: cannot write yet */
                if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
                        break;
                /*
                 * This is now right thanks to a small fix
                 * by Matt Dillon.
                 */

                /* Writable only if a full MTU-sized frame (plus slack) fits */
                if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
                        break;
                return 1;

        case SEL_EX:
                /* Exceptional condition: urgent data pending */
                if (sk->urg_data)
                        return 1;
                break;
        }
        /* Not ready: register on the socket's wait queue and report 0 */
        select_wait(sk->sleep, wait);
        return 0;
}
 962 
 963 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
     /* [previous][next][first][last][top][bottom][index][help] */
 964 {
 965         int err;
 966         switch(cmd) 
 967         {
 968 
 969                 case TIOCINQ:
 970 #ifdef FIXME    /* FIXME: */
 971                 case FIONREAD:
 972 #endif
 973                 {
 974                         unsigned long amount;
 975 
 976                         if (sk->state == TCP_LISTEN) 
 977                                 return(-EINVAL);
 978 
 979                         sk->inuse = 1;
 980                         amount = tcp_readable(sk);
 981                         release_sock(sk);
 982                         err=verify_area(VERIFY_WRITE,(void *)arg,
 983                                                    sizeof(unsigned long));
 984                         if(err)
 985                                 return err;
 986                         put_fs_long(amount,(unsigned long *)arg);
 987                         return(0);
 988                 }
 989                 case SIOCATMARK:
 990                 {
 991                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
 992 
 993                         err = verify_area(VERIFY_WRITE,(void *) arg,
 994                                                   sizeof(unsigned long));
 995                         if (err)
 996                                 return err;
 997                         put_fs_long(answ,(int *) arg);
 998                         return(0);
 999                 }
1000                 case TIOCOUTQ:
1001                 {
1002                         unsigned long amount;
1003 
1004                         if (sk->state == TCP_LISTEN) return(-EINVAL);
1005                         amount = sk->prot->wspace(sk);
1006                         err=verify_area(VERIFY_WRITE,(void *)arg,
1007                                                    sizeof(unsigned long));
1008                         if(err)
1009                                 return err;
1010                         put_fs_long(amount,(unsigned long *)arg);
1011                         return(0);
1012                 }
1013                 default:
1014                         return(-EINVAL);
1015         }
1016 }
1017 
1018 
1019 /*
1020  *      This routine computes a TCP checksum. 
1021  */
1022  
unsigned short tcp_check(struct tcphdr *th, int len,
/* [previous][next][first][last][top][bottom][index][help] */
          unsigned long saddr, unsigned long daddr)
{     
        unsigned long sum;
   
        if (saddr == 0) saddr = ip_my_addr();

/*
 * stupid, gcc complains when I use just one __asm__ block,
 * something about too many reloads, but this is just two
 * instructions longer than what I want
 */
        /*
         * First block: fold the TCP pseudo-header (source address,
         * destination address, protocol and length) into the running sum.
         */
        __asm__("
            addl %%ecx, %%ebx
            adcl %%edx, %%ebx
            adcl $0, %%ebx
            "
        : "=b"(sum)
        : "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
        : "bx", "cx", "dx" );
        /*
         * Second block: sum the segment itself - 32 bytes at a time,
         * then remaining 4-byte words, then a trailing halfword and/or
         * byte - and finally fold the 32-bit sum down to 16 bits.
         */
        __asm__("
            movl %%ecx, %%edx
            cld
            cmpl $32, %%ecx
            jb 2f
            shrl $5, %%ecx
            clc
1:          lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            loop 1b
            adcl $0, %%ebx
            movl %%edx, %%ecx
2:          andl $28, %%ecx
            je 4f
            shrl $2, %%ecx
            clc
3:          lodsl
            adcl %%eax, %%ebx
            loop 3b
            adcl $0, %%ebx
4:          movl $0, %%eax
            testw $2, %%dx
            je 5f
            lodsw
            addl %%eax, %%ebx
            adcl $0, %%ebx
            movw $0, %%ax
5:          test $1, %%edx
            je 6f
            lodsb
            addl %%eax, %%ebx
            adcl $0, %%ebx
6:          movl %%ebx, %%eax
            shrl $16, %%eax
            addw %%ax, %%bx
            adcw $0, %%bx
            "
        : "=b"(sum)
        : "0"(sum), "c"(len), "S"(th)
        : "ax", "bx", "cx", "dx", "si" );

        /* We only want the bottom 16 bits, but we never cleared the top 16. */
  
        return((~sum) & 0xffff);
}
1102 
1103 
1104 
1105 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
     /* [previous][next][first][last][top][bottom][index][help] */
1106                 unsigned long daddr, int len, struct sock *sk)
1107 {
1108         th->check = 0;
1109         th->check = tcp_check(th, len, saddr, daddr);
1110         return;
1111 }
1112 
1113 /*
1114  *      This is the main buffer sending routine. We queue the buffer
1115  *      having checked it is sane seeming.
1116  */
1117  
1118 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
     /* [previous][next][first][last][top][bottom][index][help] */
1119 {
1120         int size;
1121         struct tcphdr * th = skb->h.th;
1122 
1123         /*
1124          *      length of packet (not counting length of pre-tcp headers) 
1125          */
1126          
1127         size = skb->len - ((unsigned char *) th - skb->data);
1128 
1129         /*
1130          *      Sanity check it.. 
1131          */
1132          
1133         if (size < sizeof(struct tcphdr) || size > skb->len) 
1134         {
1135                 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1136                         skb, skb->data, th, skb->len);
1137                 kfree_skb(skb, FREE_WRITE);
1138                 return;
1139         }
1140 
1141         /*
1142          *      If we have queued a header size packet.. (these crash a few
1143          *      tcp stacks if ack is not set)
1144          */
1145          
1146         if (size == sizeof(struct tcphdr)) 
1147         {
1148                 /* If it's got a syn or fin it's notionally included in the size..*/
1149                 if(!th->syn && !th->fin) 
1150                 {
1151                         printk("tcp_send_skb: attempt to queue a bogon.\n");
1152                         kfree_skb(skb,FREE_WRITE);
1153                         return;
1154                 }
1155         }
1156 
1157         /*
1158          *      Actual processing.
1159          */
1160          
1161         tcp_statistics.TcpOutSegs++;  
1162         skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1163         
1164         /*
1165          *      We must queue if
1166          *
1167          *      a) The right edge of this frame exceeds the window
1168          *      b) We are retransmitting (Nagle's rule)
1169          *      c) We have too many packets 'in flight'
1170          */
1171          
1172         if (after(skb->h.seq, sk->window_seq) ||
1173             (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1174              sk->packets_out >= sk->cong_window) 
1175         {
1176                 /* checksum will be supplied by tcp_write_xmit.  So
1177                  * we shouldn't need to set it at all.  I'm being paranoid */
1178                 th->check = 0;
1179                 if (skb->next != NULL) 
1180                 {
1181                         printk("tcp_send_partial: next != NULL\n");
1182                         skb_unlink(skb);
1183                 }
1184                 skb_queue_tail(&sk->write_queue, skb);
1185                 
1186                 /*
1187                  *      If we don't fit we have to start the zero window
1188                  *      probes. This is broken - we really need to do a partial
1189                  *      send _first_ (This is what causes the Cisco and PC/TCP
1190                  *      grief).
1191                  */
1192                  
1193                 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1194                     sk->send_head == NULL && sk->ack_backlog == 0)
1195                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1196         } 
1197         else 
1198         {
1199                 /*
1200                  *      This is going straight out
1201                  */
1202                  
1203                 th->ack_seq = ntohl(sk->acked_seq);
1204                 th->window = ntohs(tcp_select_window(sk));
1205 
1206                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1207 
1208                 sk->sent_seq = sk->write_seq;
1209                 
1210                 /*
1211                  *      This is mad. The tcp retransmit queue is put together
1212                  *      by the ip layer. This causes half the problems with
1213                  *      unroutable FIN's and other things.
1214                  */
1215                  
1216                 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1217                 
1218                 /*
1219                  *      Set for next retransmit based on expected ACK time.
1220                  *      FIXME: We set this every time which means our 
1221                  *      retransmits are really about a window behind.
1222                  */
1223 
1224                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1225         }
1226 }
1227 
1228 /*
1229  *      Locking problems lead us to a messy situation where we can have
1230  *      multiple partially complete buffers queued up. This is really bad
1231  *      as we don't want to be sending partial buffers. Fix this with
1232  *      a semaphore or similar to lock tcp_write per socket.
1233  *
1234  *      These routines are pretty self descriptive.
1235  */
1236  
1237 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1238 {
1239         struct sk_buff * skb;
1240         unsigned long flags;
1241 
1242         save_flags(flags);
1243         cli();
1244         skb = sk->partial;
1245         if (skb) {
1246                 sk->partial = NULL;
1247                 del_timer(&sk->partial_timer);
1248         }
1249         restore_flags(flags);
1250         return skb;
1251 }
1252 
1253 /*
1254  *      Empty the partial queue
1255  */
1256  
1257 static void tcp_send_partial(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1258 {
1259         struct sk_buff *skb;
1260 
1261         if (sk == NULL)
1262                 return;
1263         while ((skb = tcp_dequeue_partial(sk)) != NULL)
1264                 tcp_send_skb(sk, skb);
1265 }
1266 
1267 /*
1268  *      Queue a partial frame
1269  */
1270  
1271 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1272 {
1273         struct sk_buff * tmp;
1274         unsigned long flags;
1275 
1276         save_flags(flags);
1277         cli();
1278         tmp = sk->partial;
1279         if (tmp)
1280                 del_timer(&sk->partial_timer);
1281         sk->partial = skb;
1282         init_timer(&sk->partial_timer);
1283         /*
1284          *      Wait up to 1 second for the buffer to fill.
1285          */
1286         sk->partial_timer.expires = HZ;
1287         sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
1288         sk->partial_timer.data = (unsigned long) sk;
1289         add_timer(&sk->partial_timer);
1290         restore_flags(flags);
1291         if (tmp)
1292                 tcp_send_skb(sk, tmp);
1293 }
1294 
1295 
1296 /*
1297  *      This routine sends an ack and also updates the window. 
1298  */
1299  
static void tcp_send_ack(unsigned long sequence, unsigned long ack,
/* [previous][next][first][last][top][bottom][index][help] */
             struct sock *sk,
             struct tcphdr *th, unsigned long daddr)
{
        struct sk_buff *buff;
        struct tcphdr *t1;
        struct device *dev = NULL;
        int tmp;

        if(sk->zapped)
                return;         /* We have been reset, we may not send again */
                
        /*
         * We need to grab some memory, and put together an ack,
         * and then put it into the queue to be sent.
         */

        buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
        if (buff == NULL) 
        {
                /* 
                 *      Force it to send an ack. We don't have to do this
                 *      (ACK is unreliable) but it's much better use of 
                 *      bandwidth on slow links to send a spare ack than
                 *      resend packets. 
                 */
                 
                sk->ack_backlog++;
                if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) 
                {
                        reset_xmit_timer(sk, TIME_WRITE, HZ);
                }
                return;
        }

        /*
         *      Assemble a suitable TCP frame
         */
         
        buff->len = sizeof(struct tcphdr);
        buff->sk = sk;
        buff->localroute = sk->localroute;
        t1 =(struct tcphdr *) buff->data;

        /* 
         *      Put in the IP header and routing stuff. 
         */
         
        tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                /* Header build failed: throw the buffer away */
                buff->free = 1;
                sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
                return;
        }
        buff->len += tmp;
        t1 =(struct tcphdr *)((char *)t1 +tmp);

        /* Start from the header of the frame we are replying to */
        memcpy(t1, th, sizeof(*t1));

        /*
         *      Swap the send and the receive. 
         */
         
        t1->dest = th->source;
        t1->source = th->dest;
        t1->seq = ntohl(sequence);
        t1->ack = 1;
        sk->window = tcp_select_window(sk);
        t1->window = ntohs(sk->window);
        t1->res1 = 0;
        t1->res2 = 0;
        t1->rst = 0;
        t1->urg = 0;
        t1->syn = 0;
        t1->psh = 0;
        t1->fin = 0;
        
        /*
         *      If we have nothing queued for transmit and the transmit timer
         *      is on we are just doing an ACK timeout and need to switch
         *      to a keepalive.
         */
         
        if (ack == sk->acked_seq) 
        {
                /* This ack covers everything received: clear the ack debt */
                sk->ack_backlog = 0;
                sk->bytes_rcv = 0;
                sk->ack_timed = 0;
                if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
                                  && sk->ip_xmit_timeout == TIME_WRITE) 
                {
                        if(sk->keepopen) {
                                reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
                        } else {
                                delete_timer(sk);
                        }
                }
        }
        
        /*
         *      Fill in the packet and send it
         */
         
        t1->ack_seq = ntohl(ack);
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
        if (sk->debug)
                 printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
        tcp_statistics.TcpOutSegs++;
        sk->prot->queue_xmit(sk, dev, buff, 1);
}
1413 
1414 
1415 /* 
1416  *      This routine builds a generic TCP header. 
1417  */
1418  
1419 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
     /* [previous][next][first][last][top][bottom][index][help] */
1420 {
1421 
1422         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1423         th->seq = htonl(sk->write_seq);
1424         th->psh =(push == 0) ? 1 : 0;
1425         th->doff = sizeof(*th)/4;
1426         th->ack = 1;
1427         th->fin = 0;
1428         sk->ack_backlog = 0;
1429         sk->bytes_rcv = 0;
1430         sk->ack_timed = 0;
1431         th->ack_seq = htonl(sk->acked_seq);
1432         sk->window = tcp_select_window(sk);
1433         th->window = htons(sk->window);
1434 
1435         return(sizeof(*th));
1436 }
1437 
1438 /*
1439  *      This routine copies from a user buffer into a socket,
1440  *      and starts the transmit system.
1441  */
1442 
1443 static int tcp_write(struct sock *sk, unsigned char *from,
     /* [previous][next][first][last][top][bottom][index][help] */
1444           int len, int nonblock, unsigned flags)
1445 {
1446         int copied = 0;
1447         int copy;
1448         int tmp;
1449         struct sk_buff *skb;
1450         struct sk_buff *send_tmp;
1451         unsigned char *buff;
1452         struct proto *prot;
1453         struct device *dev = NULL;
1454 
1455         sk->inuse=1;
1456         prot = sk->prot;
1457         while(len > 0) 
1458         {
1459                 if (sk->err) 
1460                 {                       /* Stop on an error */
1461                         release_sock(sk);
1462                         if (copied) 
1463                                 return(copied);
1464                         tmp = -sk->err;
1465                         sk->err = 0;
1466                         return(tmp);
1467                 }
1468 
1469                 /*
1470                  *      First thing we do is make sure that we are established. 
1471                  */
1472         
1473                 if (sk->shutdown & SEND_SHUTDOWN) 
1474                 {
1475                         release_sock(sk);
1476                         sk->err = EPIPE;
1477                         if (copied) 
1478                                 return(copied);
1479                         sk->err = 0;
1480                         return(-EPIPE);
1481                 }
1482 
1483                 /* 
1484                  *      Wait for a connection to finish.
1485                  */
1486         
1487                 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
1488                 {
1489                         if (sk->err) 
1490                         {
1491                                 release_sock(sk);
1492                                 if (copied) 
1493                                         return(copied);
1494                                 tmp = -sk->err;
1495                                 sk->err = 0;
1496                                 return(tmp);
1497                         }
1498 
1499                         if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
1500                         {
1501                                 release_sock(sk);
1502                                 if (copied) 
1503                                         return(copied);
1504 
1505                                 if (sk->err) 
1506                                 {
1507                                         tmp = -sk->err;
1508                                         sk->err = 0;
1509                                         return(tmp);
1510                                 }
1511 
1512                                 if (sk->keepopen) 
1513                                 {
1514                                         send_sig(SIGPIPE, current, 0);
1515                                 }
1516                                 return(-EPIPE);
1517                         }
1518 
1519                         if (nonblock || copied) 
1520                         {
1521                                 release_sock(sk);
1522                                 if (copied) 
1523                                         return(copied);
1524                                 return(-EAGAIN);
1525                         }
1526 
1527                         release_sock(sk);
1528                         cli();
1529                 
1530                         if (sk->state != TCP_ESTABLISHED &&
1531                                 sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
1532                         {
1533                                 interruptible_sleep_on(sk->sleep);
1534                                 if (current->signal & ~current->blocked) 
1535                                 {
1536                                         sti();
1537                                         if (copied) 
1538                                                 return(copied);
1539                                         return(-ERESTARTSYS);
1540                                 }
1541                         }
1542                         sk->inuse = 1;
1543                         sti();
1544                 }
1545 
1546         /*
1547          * The following code can result in copy <= if sk->mss is ever
1548          * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
1549          * sk->mtu is constant once SYN processing is finished.  I.e. we
1550          * had better not get here until we've seen his SYN and at least one
1551          * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
1552          * But ESTABLISHED should guarantee that.  sk->max_window is by definition
1553          * non-decreasing.  Note that any ioctl to set user_mss must be done
1554          * before the exchange of SYN's.  If the initial ack from the other
1555          * end has a window of 0, max_window and thus mss will both be 0.
1556          */
1557 
1558         /* 
1559          *      Now we need to check if we have a half built packet. 
1560          */
1561 
1562                 if ((skb = tcp_dequeue_partial(sk)) != NULL) 
1563                 {
1564                         int hdrlen;
1565 
1566                          /* IP header + TCP header */
1567                         hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1568                                  + sizeof(struct tcphdr);
1569         
1570                         /* Add more stuff to the end of skb->len */
1571                         if (!(flags & MSG_OOB)) 
1572                         {
1573                                 copy = min(sk->mss - (skb->len - hdrlen), len);
1574                                 /* FIXME: this is really a bug. */
1575                                 if (copy <= 0) 
1576                                 {
1577                                         printk("TCP: **bug**: \"copy\" <= 0!!\n");
1578                                         copy = 0;
1579                                 }
1580           
1581                                 memcpy_fromfs(skb->data + skb->len, from, copy);
1582                                 skb->len += copy;
1583                                 from += copy;
1584                                 copied += copy;
1585                                 len -= copy;
1586                                 sk->write_seq += copy;
1587                         }
1588                         if ((skb->len - hdrlen) >= sk->mss ||
1589                                 (flags & MSG_OOB) || !sk->packets_out)
1590                                 tcp_send_skb(sk, skb);
1591                         else
1592                                 tcp_enqueue_partial(skb, sk);
1593                         continue;
1594                 }
1595 
1596         /*
1597          * We also need to worry about the window.
1598          * If window < 1/2 the maximum window we've seen from this
1599          *   host, don't use it.  This is sender side
1600          *   silly window prevention, as specified in RFC1122.
1601          *   (Note that this is different than earlier versions of
1602          *   SWS prevention, e.g. RFC813.).  What we actually do is 
1603          *   use the whole MSS.  Since the results in the right
1604          *   edge of the packet being outside the window, it will
1605          *   be queued for later rather than sent.
1606          */
1607 
1608                 copy = sk->window_seq - sk->write_seq;
1609                 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1610                         copy = sk->mss;
1611                 if (copy > len)
1612                         copy = len;
1613 
1614         /*
1615          *      We should really check the window here also. 
1616          */
1617          
1618                 send_tmp = NULL;
1619                 if (copy < sk->mss && !(flags & MSG_OOB)) 
1620                 {
1621                         /*
1622                          *      We will release the socket in case we sleep here. 
1623                          */
1624                         release_sock(sk);
1625                         /*
1626                          *      NB: following must be mtu, because mss can be increased.
1627                          *      mss is always <= mtu 
1628                          */
1629                         skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
1630                         sk->inuse = 1;
1631                         send_tmp = skb;
1632                 } 
1633                 else 
1634                 {
1635                         /*
1636                          *      We will release the socket in case we sleep here. 
1637                          */
1638                         release_sock(sk);
1639                         skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
1640                         sk->inuse = 1;
1641                 }
1642 
1643                 /*
1644                  *      If we didn't get any memory, we need to sleep. 
1645                  */
1646 
1647                 if (skb == NULL) 
1648                 {
1649                         sk->socket->flags |= SO_NOSPACE;
1650                         if (nonblock) 
1651                         {
1652                                 release_sock(sk);
1653                                 if (copied) 
1654                                         return(copied);
1655                                 return(-EAGAIN);
1656                         }
1657 
1658                         /*
1659                          *      FIXME: here is another race condition. 
1660                          */
1661 
1662                         tmp = sk->wmem_alloc;
1663                         release_sock(sk);
1664                         cli();
1665                         /*
1666                          *      Again we will try to avoid it. 
1667                          */
1668                         if (tmp <= sk->wmem_alloc &&
1669                                   (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1670                                 && sk->err == 0) 
1671                         {
1672                                 sk->socket->flags &= ~SO_NOSPACE;
1673                                 interruptible_sleep_on(sk->sleep);
1674                                 if (current->signal & ~current->blocked) 
1675                                 {
1676                                         sti();
1677                                         if (copied) 
1678                                                 return(copied);
1679                                         return(-ERESTARTSYS);
1680                                 }
1681                         }
1682                         sk->inuse = 1;
1683                         sti();
1684                         continue;
1685                 }
1686 
1687                 skb->len = 0;
1688                 skb->sk = sk;
1689                 skb->free = 0;
1690                 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1691         
1692                 buff = skb->data;
1693         
1694                 /*
1695                  * FIXME: we need to optimize this.
1696                  * Perhaps some hints here would be good.
1697                  */
1698                 
1699                 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1700                                  IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
1701                 if (tmp < 0 ) 
1702                 {
1703                         prot->wfree(sk, skb->mem_addr, skb->mem_len);
1704                         release_sock(sk);
1705                         if (copied) 
1706                                 return(copied);
1707                         return(tmp);
1708                 }
1709                 skb->len += tmp;
1710                 skb->dev = dev;
1711                 buff += tmp;
1712                 skb->h.th =(struct tcphdr *) buff;
1713                 tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
1714                 if (tmp < 0) 
1715                 {
1716                         prot->wfree(sk, skb->mem_addr, skb->mem_len);
1717                         release_sock(sk);
1718                         if (copied) 
1719                                 return(copied);
1720                         return(tmp);
1721                 }
1722 
1723                 if (flags & MSG_OOB) 
1724                 {
1725                         ((struct tcphdr *)buff)->urg = 1;
1726                         ((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
1727                 }
1728                 skb->len += tmp;
1729                 memcpy_fromfs(buff+tmp, from, copy);
1730 
1731                 from += copy;
1732                 copied += copy;
1733                 len -= copy;
1734                 skb->len += copy;
1735                 skb->free = 0;
1736                 sk->write_seq += copy;
1737         
1738                 if (send_tmp != NULL && sk->packets_out) 
1739                 {
1740                         tcp_enqueue_partial(send_tmp, sk);
1741                         continue;
1742                 }
1743                 tcp_send_skb(sk, skb);
1744         }
1745         sk->err = 0;
1746 
1747 /*
1748  *      Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
1749  *      interactive fast network servers. It's meant to be on and
1750  *      it really improves the throughput though not the echo time
1751  *      on my slow slip link - Alan
1752  */
1753 
1754 /*
1755  *      Avoid possible race on send_tmp - c/o Johannes Stille 
1756  */
1757  
1758         if(sk->partial && ((!sk->packets_out) 
1759      /* If not nagling we can send on the before case too.. */
1760               || (sk->nonagle && before(sk->write_seq , sk->window_seq))
1761         ))
1762                 tcp_send_partial(sk);
1763 
1764         release_sock(sk);
1765         return(copied);
1766 }
1767 
1768 /*
1769  *      This is just a wrapper. 
1770  */
1771 
1772 static int tcp_sendto(struct sock *sk, unsigned char *from,
     /* [previous][next][first][last][top][bottom][index][help] */
1773            int len, int nonblock, unsigned flags,
1774            struct sockaddr_in *addr, int addr_len)
1775 {
1776         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1777                 return -EINVAL;
1778         if (sk->state == TCP_CLOSE)
1779                 return -ENOTCONN;
1780         if (addr_len < sizeof(*addr))
1781                 return -EINVAL;
1782         if (addr->sin_family && addr->sin_family != AF_INET) 
1783                 return -EINVAL;
1784         if (addr->sin_port != sk->dummy_th.dest) 
1785                 return -EISCONN;
1786         if (addr->sin_addr.s_addr != sk->daddr) 
1787                 return -EISCONN;
1788         return tcp_write(sk, from, len, nonblock, flags);
1789 }
1790 
1791 
1792 /*
1793  *      Send an ack if one is backlogged at this point. Ought to merge
1794  *      this with tcp_send_ack().
1795  */
1796  
1797 static void tcp_read_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1798 {
1799         int tmp;
1800         struct device *dev = NULL;
1801         struct tcphdr *t1;
1802         struct sk_buff *buff;
1803 
1804         if (!sk->ack_backlog) 
1805                 return;
1806 
1807         /*
1808          * If we're closed, don't send an ack, or we'll get a RST
1809          * from the closed destination.
1810          */
1811         if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
1812                 return; 
1813 
1814         /*
1815          * FIXME: we need to put code here to prevent this routine from
1816          * being called.  Being called once in a while is ok, so only check
1817          * if this is the second time in a row.
1818          */
1819 
1820         /*
1821          * We need to grab some memory, and put together an ack,
1822          * and then put it into the queue to be sent.
1823          */
1824 
1825         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
1826         if (buff == NULL) 
1827         {
1828                 /* Try again real soon. */
1829                 reset_xmit_timer(sk, TIME_WRITE, HZ);
1830                 return;
1831         }
1832 
1833         buff->len = sizeof(struct tcphdr);
1834         buff->sk = sk;
1835         buff->localroute = sk->localroute;
1836         
1837         /*
1838          *      Put in the IP header and routing stuff. 
1839          */
1840 
1841         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1842                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1843         if (tmp < 0) 
1844         {
1845                 buff->free = 1;
1846                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1847                 return;
1848         }
1849 
1850         buff->len += tmp;
1851         t1 =(struct tcphdr *)(buff->data +tmp);
1852 
1853         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
1854         t1->seq = htonl(sk->sent_seq);
1855         t1->ack = 1;
1856         t1->res1 = 0;
1857         t1->res2 = 0;
1858         t1->rst = 0;
1859         t1->urg = 0;
1860         t1->syn = 0;
1861         t1->psh = 0;
1862         sk->ack_backlog = 0;
1863         sk->bytes_rcv = 0;
1864         sk->window = tcp_select_window(sk);
1865         t1->window = ntohs(sk->window);
1866         t1->ack_seq = ntohl(sk->acked_seq);
1867         t1->doff = sizeof(*t1)/4;
1868         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1869         sk->prot->queue_xmit(sk, dev, buff, 1);
1870         tcp_statistics.TcpOutSegs++;
1871 }
1872 
1873 
1874 /*
1875  *      FIXME:
1876  *      This routine frees used buffers.
1877  *      It should consider sending an ACK to let the
1878  *      other end know we now have a bigger window.
1879  */
1880 
1881 static void cleanup_rbuf(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1882 {
1883         unsigned long flags;
1884         unsigned long left;
1885         struct sk_buff *skb;
1886         unsigned long rspace;
1887 
1888         if(sk->debug)
1889                 printk("cleaning rbuf for sk=%p\n", sk);
1890   
1891         save_flags(flags);
1892         cli();
1893   
1894         left = sk->prot->rspace(sk);
1895  
1896         /*
1897          *      We have to loop through all the buffer headers,
1898          *      and try to free up all the space we can.
1899          */
1900 
1901         while((skb=skb_peek(&sk->receive_queue)) != NULL) 
1902         {
1903                 if (!skb->used || skb->users) 
1904                         break;
1905                 skb_unlink(skb);
1906                 skb->sk = sk;
1907                 kfree_skb(skb, FREE_READ);
1908         }
1909 
1910         restore_flags(flags);
1911 
1912         /*
1913          *      FIXME:
1914          *      At this point we should send an ack if the difference
1915          *      in the window, and the amount of space is bigger than
1916          *      TCP_WINDOW_DIFF.
1917          */
1918 
1919         if(sk->debug)
1920                 printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
1921                                             left);
1922         if ((rspace=sk->prot->rspace(sk)) != left) 
1923         {
1924                 /*
1925                  * This area has caused the most trouble.  The current strategy
1926                  * is to simply do nothing if the other end has room to send at
1927                  * least 3 full packets, because the ack from those will auto-
1928                  * matically update the window.  If the other end doesn't think
1929                  * we have much space left, but we have room for at least 1 more
1930                  * complete packet than it thinks we do, we will send an ack
1931                  * immediately.  Otherwise we will wait up to .5 seconds in case
1932                  * the user reads some more.
1933                  */
1934                 sk->ack_backlog++;
1935         /*
1936          * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
1937          * if the other end is offering a window smaller than the agreed on MSS
1938          * (called sk->mtu here).  In theory there's no connection between send
1939          * and receive, and so no reason to think that they're going to send
1940          * small packets.  For the moment I'm using the hack of reducing the mss
1941          * only on the send side, so I'm putting mtu here.
1942          */
1943 
1944                 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
1945                 {
1946                         /* Send an ack right now. */
1947                         tcp_read_wakeup(sk);
1948                 } 
1949                 else 
1950                 {
1951                         /* Force it to send an ack soon. */
1952                         int was_active = del_timer(&sk->retransmit_timer);
1953                         if (!was_active || TCP_ACK_TIME < sk->timer.expires) 
1954                         {
1955                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
1956                         } 
1957                         else
1958                                 add_timer(&sk->retransmit_timer);
1959                 }
1960         }
1961 } 
1962 
1963 
1964 /*
1965  *      Handle reading urgent data. BSD has very simple semantics for
1966  *      this, no blocking and very strange errors 8)
1967  */
1968  
1969 static int tcp_read_urg(struct sock * sk, int nonblock,
     /* [previous][next][first][last][top][bottom][index][help] */
1970              unsigned char *to, int len, unsigned flags)
1971 {
1972         /*
1973          *      No URG data to read
1974          */
1975         if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
1976                 return -EINVAL; /* Yes this is right ! */
1977                 
1978         if (sk->err) 
1979         {
1980                 int tmp = -sk->err;
1981                 sk->err = 0;
1982                 return tmp;
1983         }
1984 
1985         if (sk->state == TCP_CLOSE || sk->done) 
1986         {
1987                 if (!sk->done) {
1988                         sk->done = 1;
1989                         return 0;
1990                 }
1991                 return -ENOTCONN;
1992         }
1993 
1994         if (sk->shutdown & RCV_SHUTDOWN) 
1995         {
1996                 sk->done = 1;
1997                 return 0;
1998         }
1999         sk->inuse = 1;
2000         if (sk->urg_data & URG_VALID) 
2001         {
2002                 char c = sk->urg_data;
2003                 if (!(flags & MSG_PEEK))
2004                         sk->urg_data = URG_READ;
2005                 put_fs_byte(c, to);
2006                 release_sock(sk);
2007                 return 1;
2008         }
2009         release_sock(sk);
2010         
2011         /*
2012          * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
2013          * the available implementations agree in this case:
2014          * this call should never block, independent of the
2015          * blocking state of the socket.
2016          * Mike <pall@rz.uni-karlsruhe.de>
2017          */
2018         return -EAGAIN;
2019 }
2020 
2021 
2022 /*
2023  *      This routine copies from a sock struct into the user buffer. 
2024  */
2025  
2026 static int tcp_read(struct sock *sk, unsigned char *to,
     /* [previous][next][first][last][top][bottom][index][help] */
2027         int len, int nonblock, unsigned flags)
2028 {
2029         struct wait_queue wait = { current, NULL };
2030         int copied = 0;
2031         unsigned long peek_seq;
2032         volatile unsigned long *seq;    /* So gcc doesn't overoptimise */
2033         unsigned long used;
2034 
2035         /* 
2036          *      This error should be checked. 
2037          */
2038          
2039         if (sk->state == TCP_LISTEN)
2040                 return -ENOTCONN;
2041 
2042         /*
2043          *      Urgent data needs to be handled specially. 
2044          */
2045          
2046         if (flags & MSG_OOB)
2047                 return tcp_read_urg(sk, nonblock, to, len, flags);
2048 
2049         /*
2050          *      Copying sequence to update. This is volatile to handle
2051          *      the multi-reader case neatly (memcpy_to/fromfs might be 
2052          *      inline and thus not flush cached variables otherwise).
2053          */
2054          
2055         peek_seq = sk->copied_seq;
2056         seq = &sk->copied_seq;
2057         if (flags & MSG_PEEK)
2058                 seq = &peek_seq;
2059 
2060         add_wait_queue(sk->sleep, &wait);
2061         sk->inuse = 1;
2062         while (len > 0) 
2063         {
2064                 struct sk_buff * skb;
2065                 unsigned long offset;
2066         
2067                 /*
2068                  * Are we at urgent data? Stop if we have read anything.
2069                  */
2070                  
2071                 if (copied && sk->urg_data && sk->urg_seq == *seq)
2072                         break;
2073 
2074                 /*
2075                  *      Next get a buffer.
2076                  */
2077                  
2078                 current->state = TASK_INTERRUPTIBLE;
2079 
2080                 skb = skb_peek(&sk->receive_queue);
2081                 do 
2082                 {
2083                         if (!skb)
2084                                 break;
2085                         if (before(*seq, skb->h.th->seq))
2086                                 break;
2087                         offset = *seq - skb->h.th->seq;
2088                         if (skb->h.th->syn)
2089                                 offset--;
2090                         if (offset < skb->len)
2091                                 goto found_ok_skb;
2092                         if (skb->h.th->fin)
2093                                 goto found_fin_ok;
2094                         if (!(flags & MSG_PEEK))
2095                                 skb->used = 1;
2096                         skb = skb->next;
2097                 }
2098                 while (skb != (struct sk_buff *)&sk->receive_queue);
2099 
2100                 if (copied)
2101                         break;
2102 
2103                 if (sk->err) 
2104                 {
2105                         copied = -sk->err;
2106                         sk->err = 0;
2107                         break;
2108                 }
2109 
2110                 if (sk->state == TCP_CLOSE) 
2111                 {
2112                         if (!sk->done) 
2113                         {
2114                                 sk->done = 1;
2115                                 break;
2116                         }
2117                         copied = -ENOTCONN;
2118                         break;
2119                 }
2120 
2121                 if (sk->shutdown & RCV_SHUTDOWN) 
2122                 {
2123                         sk->done = 1;
2124                         break;
2125                 }
2126                         
2127                 if (nonblock) 
2128                 {
2129                         copied = -EAGAIN;
2130                         break;
2131                 }
2132 
2133                 cleanup_rbuf(sk);
2134                 release_sock(sk);
2135                 sk->socket->flags |= SO_WAITDATA;
2136                 schedule();
2137                 sk->socket->flags &= ~SO_WAITDATA;
2138                 sk->inuse = 1;
2139 
2140                 if (current->signal & ~current->blocked) 
2141                 {
2142                         copied = -ERESTARTSYS;
2143                         break;
2144                 }
2145                 continue;
2146 
2147         found_ok_skb:
2148                 /*
2149                  *      Lock the buffer. We can be fairly relaxed as
2150                  *      an interrupt will never steal a buffer we are 
2151                  *      using unless I've missed something serious in
2152                  *      tcp_data.
2153                  */
2154                 
2155                 skb->users++;
2156                 
2157                 /*
2158                  *      Ok so how much can we use ? 
2159                  */
2160                  
2161                 used = skb->len - offset;
2162                 if (len < used)
2163                         used = len;
2164                 /*
2165                  *      Do we have urgent data here? 
2166                  */
2167                 
2168                 if (sk->urg_data) 
2169                 {
2170                         unsigned long urg_offset = sk->urg_seq - *seq;
2171                         if (urg_offset < used) 
2172                         {
2173                                 if (!urg_offset) 
2174                                 {
2175                                         if (!sk->urginline) 
2176                                         {
2177                                                 ++*seq;
2178                                                 offset++;
2179                                                 used--;
2180                                         }
2181                                 }
2182                                 else
2183                                         used = urg_offset;
2184                         }
2185                 }
2186                 
2187                 /*
2188                  *      Copy it - We _MUST_ update *seq first so that we
2189                  *      don't ever double read when we have dual readers
2190                  */
2191                  
2192                 *seq += used;
2193 
2194                 /*
2195                  *      This memcpy_tofs can sleep. If it sleeps and we
2196                  *      do a second read it relies on the skb->users to avoid
2197                  *      a crash when cleanup_rbuf() gets called.
2198                  */
2199                  
2200                 memcpy_tofs(to,((unsigned char *)skb->h.th) +
2201                         skb->h.th->doff*4 + offset, used);
2202                 copied += used;
2203                 len -= used;
2204                 to += used;
2205                 
2206                 /*
2207                  *      We now will not sleep again until we are finished
2208                  *      with skb. Sorry if you are doing the SMP port
2209                  *      but you'll just have to fix it neatly ;)
2210                  */
2211                  
2212                 skb->users --;
2213                 
2214                 if (after(sk->copied_seq,sk->urg_seq))
2215                         sk->urg_data = 0;
2216                 if (used + offset < skb->len)
2217                         continue;
2218                 
2219                 /*
2220                  *      Process the FIN.
2221                  */
2222 
2223                 if (skb->h.th->fin)
2224                         goto found_fin_ok;
2225                 if (flags & MSG_PEEK)
2226                         continue;
2227                 skb->used = 1;
2228                 continue;
2229 
2230         found_fin_ok:
2231                 ++*seq;
2232                 if (flags & MSG_PEEK)
2233                         break;
2234                         
2235                 /*
2236                  *      All is done
2237                  */
2238                  
2239                 skb->used = 1;
2240                 sk->shutdown |= RCV_SHUTDOWN;
2241                 break;
2242 
2243         }
2244         remove_wait_queue(sk->sleep, &wait);
2245         current->state = TASK_RUNNING;
2246 
2247         /* Clean up data we have read: This will do ACK frames */
2248         cleanup_rbuf(sk);
2249         release_sock(sk);
2250         return copied;
2251 }
2252 
2253 /*
2254  *      State processing on a close. This implements the state shift for
2255  *      sending our FIN frame. Note that we only send a FIN for some 
2256  *      states. A shutdown() may have already sent the FIN, or we may be
2257  *      closed.
2258  */
2259  
2260 static int tcp_close_state(struct sock *sk, int dead)
     /* [previous][next][first][last][top][bottom][index][help] */
2261 {
2262         int ns=TCP_CLOSE;
2263         int send_fin=0;
2264         switch(sk->state)
2265         {
2266                 case TCP_SYN_SENT:      /* No SYN back, no FIN needed */
2267                         break;
2268                 case TCP_SYN_RECV:
2269                 case TCP_ESTABLISHED:   /* Closedown begin */
2270                         ns=TCP_FIN_WAIT1;
2271                         send_fin=1;
2272                         break;
2273                 case TCP_FIN_WAIT1:     /* Already closing, or FIN sent: no change */
2274                 case TCP_FIN_WAIT2:
2275                 case TCP_CLOSING:
2276                         ns=sk->state;
2277                         break;
2278                 case TCP_CLOSE:
2279                 case TCP_LISTEN:
2280                         break;
2281                 case TCP_CLOSE_WAIT:    /* They have FIN'd us. We send our FIN and
2282                                            wait only for the ACK */
2283                         ns=TCP_LAST_ACK;
2284                         send_fin=1;
2285         }
2286         
2287         tcp_set_state(sk,ns);
2288                 
2289         /*
2290          *      This is a (useful) BSD violating of the RFC. There is a
2291          *      problem with TCP as specified in that the other end could
2292          *      keep a socket open forever with no application left this end.
2293          *      We use a 3 minute timeout (about the same as BSD) then kill
2294          *      our end. If they send after that then tough - BUT: long enough
2295          *      that we won't make the old 4*rto = almost no time - whoops
2296          *      reset mistake.
2297          */
2298         if(dead && ns==TCP_FIN_WAIT2)
2299         {
2300                 int timer_active=del_timer(&sk->timer);
2301                 if(timer_active)
2302                         add_timer(&sk->timer);
2303                 else
2304                         reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2305         }
2306         
2307         return send_fin;
2308 }
2309 
2310 /*
2311  *      Send a fin.
2312  */
2313 
2314 static void tcp_send_fin(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
2315 {
2316         struct proto *prot =(struct proto *)sk->prot;
2317         struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
2318         struct tcphdr *t1;
2319         struct sk_buff *buff;
2320         struct device *dev=NULL;
2321         int tmp;
2322                 
2323         release_sock(sk); /* in case the malloc sleeps. */
2324         
2325         buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2326         sk->inuse = 1;
2327 
2328         if (buff == NULL)
2329         {
2330                 /* This is a disaster if it occurs */
2331                 printk("tcp_send_fin: Impossible malloc failure");
2332                 return;
2333         }
2334 
2335         /*
2336          *      Administrivia
2337          */
2338          
2339         buff->sk = sk;
2340         buff->len = sizeof(*t1);
2341         buff->localroute = sk->localroute;
2342         t1 =(struct tcphdr *) buff->data;
2343 
2344         /*
2345          *      Put in the IP header and routing stuff. 
2346          */
2347 
2348         tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2349                            IPPROTO_TCP, sk->opt,
2350                            sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
2351         if (tmp < 0) 
2352         {
2353                 int t;
2354                 /*
2355                  *      Finish anyway, treat this as a send that got lost. 
2356                  *      (Not good).
2357                  */
2358                  
2359                 buff->free = 1;
2360                 prot->wfree(sk,buff->mem_addr, buff->mem_len);
2361                 sk->write_seq++;
2362                 t=del_timer(&sk->timer);
2363                 if(t)
2364                         add_timer(&sk->timer);
2365                 else
2366                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2367                 return;
2368         }
2369         
2370         /*
2371          *      We ought to check if the end of the queue is a buffer and
2372          *      if so simply add the fin to that buffer, not send it ahead.
2373          */
2374 
2375         t1 =(struct tcphdr *)((char *)t1 +tmp);
2376         buff->len += tmp;
2377         buff->dev = dev;
2378         memcpy(t1, th, sizeof(*t1));
2379         t1->seq = ntohl(sk->write_seq);
2380         sk->write_seq++;
2381         buff->h.seq = sk->write_seq;
2382         t1->ack = 1;
2383         t1->ack_seq = ntohl(sk->acked_seq);
2384         t1->window = ntohs(sk->window=tcp_select_window(sk));
2385         t1->fin = 1;
2386         t1->rst = 0;
2387         t1->doff = sizeof(*t1)/4;
2388         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2389 
2390         /*
2391          * If there is data in the write queue, the fin must be appended to
2392          * the write queue.
2393          */
2394         
2395         if (skb_peek(&sk->write_queue) != NULL) 
2396         {
2397                 buff->free = 0;
2398                 if (buff->next != NULL) 
2399                 {
2400                         printk("tcp_send_fin: next != NULL\n");
2401                         skb_unlink(buff);
2402                 }
2403                 skb_queue_tail(&sk->write_queue, buff);
2404         } 
2405         else 
2406         {
2407                 sk->sent_seq = sk->write_seq;
2408                 sk->prot->queue_xmit(sk, dev, buff, 0);
2409                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2410         }
2411 }
2412 
2413 /*
2414  *      Shutdown the sending side of a connection. Much like close except
2415  *      that we don't receive shut down or set sk->dead=1.
2416  */
2417 
2418 void tcp_shutdown(struct sock *sk, int how)
     /* [previous][next][first][last][top][bottom][index][help] */
2419 {
2420         /*
2421          *      We need to grab some memory, and put together a FIN,
2422          *      and then put it into the queue to be sent.
2423          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2424          */
2425 
2426         if (!(how & SEND_SHUTDOWN)) 
2427                 return;
2428          
2429         /*
2430          *      If we've already sent a FIN, or it's a closed state
2431          */
2432          
2433         if (sk->state == TCP_FIN_WAIT1 ||
2434             sk->state == TCP_FIN_WAIT2 ||
2435             sk->state == TCP_CLOSING ||
2436             sk->state == TCP_LAST_ACK ||
2437             sk->state == TCP_TIME_WAIT || 
2438             sk->state == TCP_CLOSE ||
2439             sk->state == TCP_LISTEN
2440           )
2441         {
2442                 return;
2443         }
2444         sk->inuse = 1;
2445 
2446         /*
2447          * flag that the sender has shutdown
2448          */
2449 
2450         sk->shutdown |= SEND_SHUTDOWN;
2451 
2452         /*
2453          *  Clear out any half completed packets. 
2454          */
2455 
2456         if (sk->partial)
2457                 tcp_send_partial(sk);
2458                 
2459         /*
2460          *      FIN if needed
2461          */
2462          
2463         if(tcp_close_state(sk,0))
2464                 tcp_send_fin(sk);
2465                 
2466         release_sock(sk);
2467 }
2468 
2469 
2470 static int
2471 tcp_recvfrom(struct sock *sk, unsigned char *to,
     /* [previous][next][first][last][top][bottom][index][help] */
2472              int to_len, int nonblock, unsigned flags,
2473              struct sockaddr_in *addr, int *addr_len)
2474 {
2475         int result;
2476   
2477         /* 
2478          *      Have to check these first unlike the old code. If 
2479          *      we check them after we lose data on an error
2480          *      which is wrong 
2481          */
2482 
2483         if(addr_len)
2484                 *addr_len = sizeof(*addr);
2485         result=tcp_read(sk, to, to_len, nonblock, flags);
2486 
2487         if (result < 0) 
2488                 return(result);
2489   
2490         if(addr)
2491         {
2492                 addr->sin_family = AF_INET;
2493                 addr->sin_port = sk->dummy_th.dest;
2494                 addr->sin_addr.s_addr = sk->daddr;
2495         }
2496         return(result);
2497 }
2498 
2499 
2500 /*
2501  *      This routine will send an RST to the other tcp. 
2502  */
2503  
2504 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
     /* [previous][next][first][last][top][bottom][index][help] */
2505           struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2506 {
2507         struct sk_buff *buff;
2508         struct tcphdr *t1;
2509         int tmp;
2510         struct device *ndev=NULL;
2511 
2512         /*
2513          *      Cannot reset a reset (Think about it).
2514          */
2515          
2516         if(th->rst)
2517                 return;
2518   
2519         /*
2520          * We need to grab some memory, and put together an RST,
2521          * and then put it into the queue to be sent.
2522          */
2523 
2524         buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2525         if (buff == NULL) 
2526                 return;
2527 
2528         buff->len = sizeof(*t1);
2529         buff->sk = NULL;
2530         buff->dev = dev;
2531         buff->localroute = 0;
2532 
2533         t1 =(struct tcphdr *) buff->data;
2534 
2535         /*
2536          *      Put in the IP header and routing stuff. 
2537          */
2538 
2539         tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2540                            sizeof(struct tcphdr),tos,ttl);
2541         if (tmp < 0) 
2542         {
2543                 buff->free = 1;
2544                 prot->wfree(NULL, buff->mem_addr, buff->mem_len);
2545                 return;
2546         }
2547 
2548         t1 =(struct tcphdr *)((char *)t1 +tmp);
2549         buff->len += tmp;
2550         memcpy(t1, th, sizeof(*t1));
2551 
2552         /*
2553          *      Swap the send and the receive. 
2554          */
2555 
2556         t1->dest = th->source;
2557         t1->source = th->dest;
2558         t1->rst = 1;  
2559         t1->window = 0;
2560   
2561         if(th->ack)
2562         {
2563                 t1->ack = 0;
2564                 t1->seq = th->ack_seq;
2565                 t1->ack_seq = 0;
2566         }
2567         else
2568         {
2569                 t1->ack = 1;
2570                 if(!th->syn)
2571                         t1->ack_seq=htonl(th->seq);
2572                 else
2573                         t1->ack_seq=htonl(th->seq+1);
2574                 t1->seq=0;
2575         }
2576 
2577         t1->syn = 0;
2578         t1->urg = 0;
2579         t1->fin = 0;
2580         t1->psh = 0;
2581         t1->doff = sizeof(*t1)/4;
2582         tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2583         prot->queue_xmit(NULL, ndev, buff, 1);
2584         tcp_statistics.TcpOutSegs++;
2585 }
2586 
2587 
2588 /*
2589  *      Look for tcp options. Parses everything but only knows about MSS.
2590  *      This routine is always called with the packet containing the SYN.
2591  *      However it may also be called with the ack to the SYN.  So you
2592  *      can't assume this is always the SYN.  It's always called after
2593  *      we have set up sk->mtu to our own MTU.
2594  *
2595  *      We need at minimum to add PAWS support here. Possibly large windows
2596  *      as Linux gets deployed on 100Mb/sec networks.
2597  */
2598  
2599 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
2600 {
2601         unsigned char *ptr;
2602         int length=(th->doff*4)-sizeof(struct tcphdr);
2603         int mss_seen = 0;
2604     
2605         ptr = (unsigned char *)(th + 1);
2606   
2607         while(length>0)
2608         {
2609                 int opcode=*ptr++;
2610                 int opsize=*ptr++;
2611                 switch(opcode)
2612                 {
2613                         case TCPOPT_EOL:
2614                                 return;
2615                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
2616                                 length--;
2617                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
2618                                 continue;
2619                         
2620                         default:
2621                                 if(opsize<=2)   /* Avoid silly options looping forever */
2622                                         return;
2623                                 switch(opcode)
2624                                 {
2625                                         case TCPOPT_MSS:
2626                                                 if(opsize==4 && th->syn)
2627                                                 {
2628                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2629                                                         mss_seen = 1;
2630                                                 }
2631                                                 break;
2632                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2633                                 }
2634                                 ptr+=opsize-2;
2635                                 length-=opsize;
2636                 }
2637         }
2638         if (th->syn) 
2639         {
2640                 if (! mss_seen)
2641                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2642         }
2643 #ifdef CONFIG_INET_PCTCP
2644         sk->mss = min(sk->max_window >> 1, sk->mtu);
2645 #else    
2646         sk->mss = min(sk->max_window, sk->mtu);
2647 #endif  
2648 }
2649 
/*
 *      Classful netmask for an address given in network byte order:
 *      255.0.0.0 / 255.255.0.0 / 255.255.255.0 for class A / B / C.
 *      The mask is returned in network byte order as well.
 */
static inline unsigned long default_mask(unsigned long dst)
{
        unsigned long host = ntohl(dst);

        if (IN_CLASSA(host))
                return htonl(IN_CLASSA_NET);
        return IN_CLASSB(host) ? htonl(IN_CLASSB_NET) : htonl(IN_CLASSC_NET);
}
2659 
2660 /*
2661  *      Default sequence number picking algorithm.
2662  *      As close as possible to RFC 793, which
2663  *      suggests using a 250kHz clock.
2664  *      Further reading shows this assumes 2MB/s networks.
2665  *      For 10MB/s ethernet, a 1MHz clock is appropriate.
2666  *      That's funny, Linux has one built in!  Use it!
2667  */
2668 
/*
 *      Initial sequence number: microseconds of current time of day,
 *      i.e. the 1MHz clock RFC 793's 250kHz suggestion scales to for
 *      10Mb/s ethernet.
 */
extern inline unsigned long tcp_init_seq(void)
{
        struct timeval tv;

        do_gettimeofday(&tv);
        return tv.tv_sec * 1000000 + tv.tv_usec;
}
2675 
2676 /*
2677  *      This routine handles a connection request.
2678  *      It should make sure we haven't already responded.
2679  *      Because of the way BSD works, we have to send a syn/ack now.
2680  *      This also means it will be harder to close a socket which is
2681  *      listening.
2682  */
2683  
static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
                 unsigned long daddr, unsigned long saddr,
                 struct options *opt, struct device *dev, unsigned long seq)
{
        /*
         * Handle an incoming SYN on listening socket sk: clone the listener
         * into a new embryonic socket in SYN_RECV, pick its MSS/window from
         * routing info and the SYN's options, and send the SYN/ACK.
         * The SYN skb itself is queued on the listener's receive queue so
         * accept() can find the new connection.
         */
        struct sk_buff *buff;
        struct tcphdr *t1;
        unsigned char *ptr;
        struct sock *newsk;
        struct tcphdr *th;
        struct device *ndev=NULL;
        int tmp;
        struct rtable *rt;
  
        th = skb->h.th;

        /* If the socket is dead, don't accept the connection. */
        if (!sk->dead) 
        {
                sk->data_ready(sk,0);
        }
        else 
        {
                /* Nobody is listening any more: refuse with a RST */
                if(sk->debug)
                        printk("Reset on %p: Connect on dead socket.\n",sk);
                tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
                tcp_statistics.TcpAttemptFails++;
                kfree_skb(skb, FREE_READ);
                return;
        }

        /*
         * Make sure we can accept more.  This will prevent a
         * flurry of syns from eating up all our memory.
         */

        if (sk->ack_backlog >= sk->max_ack_backlog) 
        {
                /* Drop silently; the peer will retransmit its SYN */
                tcp_statistics.TcpAttemptFails++;
                kfree_skb(skb, FREE_READ);
                return;
        }

        /*
         * We need to build a new sock struct.
         * It is sort of bad to have a socket without an inode attached
         * to it, but the wake_up's will just wake up the listening socket,
         * and if the listening socket is destroyed before this is taken
         * off of the queue, this will take care of it.
         */

        newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
        if (newsk == NULL) 
        {
                /* just ignore the syn.  It will get retransmitted. */
                tcp_statistics.TcpAttemptFails++;
                kfree_skb(skb, FREE_READ);
                return;
        }

        /* Clone the listener wholesale, then re-initialize every field
         * that must NOT be shared with the parent socket. */
        memcpy(newsk, sk, sizeof(*newsk));
        skb_queue_head_init(&newsk->write_queue);
        skb_queue_head_init(&newsk->receive_queue);
        newsk->send_head = NULL;
        newsk->send_tail = NULL;
        skb_queue_head_init(&newsk->back_log);
        newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
        newsk->rto = TCP_TIMEOUT_INIT;
        newsk->mdev = 0;
        newsk->max_window = 0;
        newsk->cong_window = 1;         /* slow start from one segment */
        newsk->cong_count = 0;
        newsk->ssthresh = 0;
        newsk->backoff = 0;
        newsk->blog = 0;
        newsk->intr = 0;
        newsk->proc = 0;
        newsk->done = 0;
        newsk->partial = NULL;
        newsk->pair = NULL;
        newsk->wmem_alloc = 0;
        newsk->rmem_alloc = 0;
        newsk->localroute = sk->localroute;

        newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

        newsk->err = 0;
        newsk->shutdown = 0;
        newsk->ack_backlog = 0;
        /* Their SYN consumes one sequence number */
        newsk->acked_seq = skb->h.th->seq+1;
        newsk->copied_seq = skb->h.th->seq+1;
        newsk->fin_seq = skb->h.th->seq;
        newsk->state = TCP_SYN_RECV;
        newsk->timeout = 0;
        newsk->ip_xmit_timeout = 0;
        newsk->write_seq = seq;         /* our ISN, chosen by the caller */
        newsk->window_seq = newsk->write_seq;
        newsk->rcv_ack_seq = newsk->write_seq;
        newsk->urg_data = 0;
        newsk->retransmits = 0;
        newsk->linger=0;
        newsk->destroy = 0;
        /* Fresh timers: the memcpy above copied the parent's timer state */
        init_timer(&newsk->timer);
        newsk->timer.data = (unsigned long)newsk;
        newsk->timer.function = &net_timer;
        init_timer(&newsk->retransmit_timer);
        newsk->retransmit_timer.data = (unsigned long)newsk;
        newsk->retransmit_timer.function=&retransmit_timer;
        newsk->dummy_th.source = skb->h.th->dest;
        newsk->dummy_th.dest = skb->h.th->source;
        
        /*
         *      Swap these two, they are from our point of view. 
         */
         
        newsk->daddr = saddr;
        newsk->saddr = daddr;

        put_sock(newsk->num,newsk);
        newsk->dummy_th.res1 = 0;
        newsk->dummy_th.doff = 6;
        newsk->dummy_th.fin = 0;
        newsk->dummy_th.syn = 0;
        newsk->dummy_th.rst = 0;        
        newsk->dummy_th.psh = 0;
        newsk->dummy_th.ack = 0;
        newsk->dummy_th.urg = 0;
        newsk->dummy_th.res2 = 0;
        /* NOTE(review): acked_seq/copied_seq were already set to these
         * values above — redundant but harmless. */
        newsk->acked_seq = skb->h.th->seq + 1;
        newsk->copied_seq = skb->h.th->seq + 1;
        newsk->socket = NULL;

        /*
         *      Grab the ttl and tos values and use them 
         */

        newsk->ip_ttl=sk->ip_ttl;
        newsk->ip_tos=skb->ip_hdr->tos;

        /*
         *      Use 512 or whatever user asked for 
         */

        /*
         *      Note use of sk->user_mss, since user has no direct access to newsk 
         */

        rt=ip_rt_route(saddr, NULL,NULL);
        
        if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
                newsk->window_clamp = rt->rt_window;
        else
                newsk->window_clamp = 0;
                
        /* MTU preference: explicit user MSS, then route MSS, then a
         * local/non-local heuristic on the address bits. */
        if (sk->user_mss)
                newsk->mtu = sk->user_mss;
        else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
                newsk->mtu = rt->rt_mss - HEADER_SIZE;
        else 
        {
#ifdef CONFIG_INET_SNARL        /* Sub Nets Are Local */
                if ((saddr ^ daddr) & default_mask(saddr))
#else
                if ((saddr ^ daddr) & dev->pa_mask)
#endif
                        newsk->mtu = 576 - HEADER_SIZE;
                else
                        newsk->mtu = MAX_WINDOW;
        }

        /*
         *      But not bigger than device MTU 
         */

        newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

        /*
         *      This will min with what arrived in the packet 
         */

        tcp_options(newsk,skb->h.th);

        /* Build and send the SYN/ACK reply */
        buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
        if (buff == NULL) 
        {
                sk->err = ENOMEM;
                newsk->dead = 1;
                newsk->state = TCP_CLOSE;
                /* And this will destroy it */
                release_sock(newsk);
                kfree_skb(skb, FREE_READ);
                tcp_statistics.TcpAttemptFails++;
                return;
        }
  
        buff->len = sizeof(struct tcphdr)+4;    /* +4 for the MSS option */
        buff->sk = newsk;
        buff->localroute = newsk->localroute;

        t1 =(struct tcphdr *) buff->data;

        /*
         *      Put in the IP header and routing stuff. 
         */

        tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
                               IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

        /*
         *      Something went wrong. 
         */

        if (tmp < 0) 
        {
                sk->err = tmp;
                buff->free = 1;
                kfree_skb(buff,FREE_WRITE);
                newsk->dead = 1;
                newsk->state = TCP_CLOSE;
                release_sock(newsk);
                skb->sk = sk;
                kfree_skb(skb, FREE_READ);
                tcp_statistics.TcpAttemptFails++;
                return;
        }

        buff->len += tmp;
        t1 =(struct tcphdr *)((char *)t1 +tmp);
  
        memcpy(t1, skb->h.th, sizeof(*t1));
        buff->h.seq = newsk->write_seq;
        /*
         *      Swap the send and the receive. 
         */
        t1->dest = skb->h.th->source;
        t1->source = newsk->dummy_th.source;
        t1->seq = ntohl(newsk->write_seq++);    /* host->net; SYN takes a seq */
        t1->ack = 1;
        newsk->window = tcp_select_window(newsk);
        newsk->sent_seq = newsk->write_seq;
        t1->window = ntohs(newsk->window);
        t1->res1 = 0;
        t1->res2 = 0;
        t1->rst = 0;
        t1->urg = 0;
        t1->psh = 0;
        t1->syn = 1;
        t1->ack_seq = ntohl(skb->h.th->seq+1);
        t1->doff = sizeof(*t1)/4+1;             /* header + one option word */
        /* Append our MSS option: kind 2, length 4, 16-bit MSS */
        ptr =(unsigned char *)(t1+1);
        ptr[0] = 2;
        ptr[1] = 4;
        ptr[2] = ((newsk->mtu) >> 8) & 0xff;
        ptr[3] =(newsk->mtu) & 0xff;

        tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
        newsk->prot->queue_xmit(newsk, ndev, buff, 0);
        reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
        skb->sk = newsk;

        /*
         *      Charge the sock_buff to newsk. 
         */
         
        sk->rmem_alloc -= skb->mem_len;
        newsk->rmem_alloc += skb->mem_len;
        
        /* Leave the SYN on the listener's queue for accept() to reap */
        skb_queue_tail(&sk->receive_queue,skb);
        sk->ack_backlog++;
        release_sock(newsk);
        tcp_statistics.TcpOutSegs++;
}
2955 
2956 
2957 static void tcp_close(struct sock *sk, int timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
2958 {
2959         /*
2960          * We need to grab some memory, and put together a FIN, 
2961          * and then put it into the queue to be sent.
2962          */
2963         
2964         sk->inuse = 1;
2965         
2966         if(sk->state == TCP_LISTEN)
2967         {
2968                 /* Special case */
2969                 tcp_set_state(sk, TCP_CLOSE);
2970                 tcp_close_pending(sk);
2971                 release_sock(sk);
2972                 return;
2973         }
2974         
2975         sk->keepopen = 1;
2976         sk->shutdown = SHUTDOWN_MASK;
2977 
2978         if (!sk->dead) 
2979                 sk->state_change(sk);
2980 
2981         if (timeout == 0) 
2982         {
2983                 struct sk_buff *skb;
2984                 
2985                 /*
2986                  *  We need to flush the recv. buffs.  We do this only on the
2987                  *  descriptor close, not protocol-sourced closes, because the
2988                  *  reader process may not have drained the data yet!
2989                  */
2990                  
2991                 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
2992                         kfree_skb(skb, FREE_READ);
2993                 /*
2994                  *      Get rid off any half-completed packets. 
2995                  */
2996 
2997                 if (sk->partial) 
2998                         tcp_send_partial(sk);
2999         }
3000 
3001                 
3002         /*
3003          *      Timeout is not the same thing - however the code likes
3004          *      to send both the same way (sigh).
3005          */
3006          
3007         if(timeout)
3008         {
3009                 tcp_set_state(sk, TCP_CLOSE);   /* Dead */
3010         }
3011         else
3012         {
3013                 if(tcp_close_state(sk,1)==1)
3014                 {
3015                         tcp_send_fin(sk);
3016                 }
3017         }
3018         release_sock(sk);
3019 }
3020 
3021 
3022 /*
3023  *      This routine takes stuff off of the write queue,
3024  *      and puts it in the xmit queue. This happens as incoming acks
3025  *      open up the remote window for us.
3026  */
3027  
static void tcp_write_xmit(struct sock *sk)
{
        /*
         * Move segments from the write queue onto the wire as the peer's
         * advertised window and our congestion window allow.  Called when
         * incoming acks open up the remote window.
         */
        struct sk_buff *skb;

        /*
         *      The bytes will have to remain here. In time closedown will
         *      empty the write queue and all will be happy 
         */

        if(sk->zapped)
                return;

        /*
         *      Anything on the transmit queue that fits the window can
         *      be added providing we are not
         *
         *      a) retransmitting (Nagle's rule)
         *      b) exceeding our congestion window.
         */
         
        while((skb = skb_peek(&sk->write_queue)) != NULL &&
                before(skb->h.seq, sk->window_seq + 1) &&
                (sk->retransmits == 0 ||
                 sk->ip_xmit_timeout != TIME_WRITE ||
                 before(skb->h.seq, sk->rcv_ack_seq + 1))
                && sk->packets_out < sk->cong_window) 
        {
                IS_SKB(skb);
                skb_unlink(skb);
                
                /*
                 *      See if we really need to send the packet.  (it may
                 *      have been acked while sitting on the queue)
                 */
                 
                if (before(skb->h.seq, sk->rcv_ack_seq +1)) 
                {
                        /*
                         *      This is acked data. We can discard it. This 
                         *      cannot currently occur.
                         */
                         
                        sk->retransmits = 0;
                        kfree_skb(skb, FREE_WRITE);
                        if (!sk->dead) 
                                sk->write_space(sk);
                } 
                else
                {
                        struct tcphdr *th;
                        struct iphdr *iph;
                        int size;
/*
 * put in the ack seq and window at this point rather than earlier,
 * in order to keep them monotonic.  We really want to avoid taking
 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
 * Ack and window will in general have changed since this packet was put
 * on the write queue.
 */
                        /* Locate the TCP header inside the already-built frame */
                        iph = (struct iphdr *)(skb->data +
                                               skb->dev->hard_header_len);
                        th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
                        size = skb->len - (((unsigned char *) th) - skb->data);
                        
                        th->ack_seq = ntohl(sk->acked_seq);
                        th->window = ntohs(tcp_select_window(sk));

                        /* Header fields changed, so the checksum must be redone */
                        tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

                        sk->sent_seq = skb->h.seq;
                        
                        /*
                         *      IP manages our queue for some crazy reason
                         */
                         
                        sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
                        
                        /*
                         *      Again we slide the timer wrongly
                         */
                         
                        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
                }
        }
}
3112 
3113 
3114 /*
3115  *      This routine deals with incoming acks, but not outgoing ones.
     *
     *      Handles, in order: max_window/mss update, stale-ack filtering,
     *      window shrink recovery, congestion window growth (Jacobson
     *      slow start / congestion avoidance), RTT estimation with Karn's
     *      rule, trimming acked data off the retransmit queue, xmit/probe/
     *      keepalive timer management and the ACK-driven state transitions
     *      (LAST_ACK, FIN_WAIT1, CLOSING, SYN_RECV).
     *
     *      Returns 0 only when the ack is for data we have never sent
     *      (after(ack, sk->sent_seq)); every other path returns 1.
3116  */
3117 
3118 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
     /* [previous][next][first][last][top][bottom][index][help] */
3119 {
3120         unsigned long ack;
3121         int flag = 0;
3122 
3123         /* 
3124          * 1 - there was data in packet as well as ack or new data is sent or 
3125          *     in shutdown state
3126          * 2 - data from retransmit queue was acked and removed
3127          * 4 - window shrunk or data from retransmit queue was acked and removed
3128          */
3129 
3130         if(sk->zapped)
3131                 return(1);      /* Dead, cant ack any more so why bother */
3132 
3133         /*
3134          *      Have we discovered a larger window
3135          */
3136          
3137         ack = ntohl(th->ack_seq);       /* acked sequence, host byte order */
3138 
3139         if (ntohs(th->window) > sk->max_window) 
3140         {
3141                 sk->max_window = ntohs(th->window);
3142 #ifdef CONFIG_INET_PCTCP
3143                 /* Hack because we don't send partial packets to non SWS
3144                    handling hosts */
3145                 sk->mss = min(sk->max_window>>1, sk->mtu);
3146 #else
3147                 sk->mss = min(sk->max_window, sk->mtu);
3148 #endif  
3149         }
3150 
3151         /*
3152          *      We have dropped back to keepalive timeouts. Thus we have
3153          *      no retransmits pending.
3154          */
3155          
3156         if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3157                 sk->retransmits = 0;
3158 
3159         /*
3160          *      If the ack is newer than sent or older than previous acks
3161          *      then we can probably ignore it.
3162          */
3163          
3164         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
3165         {
3166                 if(sk->debug)
3167                         printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);
3168 
3169                 /*
3170                  *      Keepalive processing.
3171                  */
3172                  
3173                 if (after(ack, sk->sent_seq)) 
3174                 {
3175                         return(0);
3176                 }
3177 
3178                 /*
3179                  *      Restart the keepalive timer.
3180                  */
3181                  
3182                 if (sk->keepopen) 
3183                 {
3184                         if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3185                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3186                 }
3187                 return(1);
3188         }
3189 
3190         /*
3191          *      If there is data set flag 1
3192          */
3193          
3194         if (len != th->doff*4) 
3195                 flag |= 1;
3196 
3197         /*
3198          *      See if our window has been shrunk. 
3199          */
3200 
3201         if (after(sk->window_seq, ack+ntohs(th->window))) 
3202         {
3203                 /*
3204                  * We may need to move packets from the send queue
3205                  * to the write queue, if the window has been shrunk on us.
3206                  * The RFC says you are not allowed to shrink your window
3207                  * like this, but if the other end does, you must be able
3208                  * to deal with it.
3209                  */
3210                 struct sk_buff *skb;
3211                 struct sk_buff *skb2;
3212                 struct sk_buff *wskb = NULL;
3213         
3214                 skb2 = sk->send_head;
3215                 sk->send_head = NULL;
3216                 sk->send_tail = NULL;
3217         
3218                 /*
3219                  *      This is an artifact of a flawed concept. We want one
3220                  *      queue and a smarter send routine when we send all.
3221                  */
3222         
3223                 flag |= 4;      /* Window changed */
3224         
3225                 sk->window_seq = ack + ntohs(th->window);
3226                 cli();
3227                 while (skb2 != NULL) 
3228                 {
3229                         skb = skb2;
3230                         skb2 = skb->link3;
3231                         skb->link3 = NULL;
3232                         if (after(skb->h.seq, sk->window_seq)) 
3233                         {
3234                                 if (sk->packets_out > 0) 
3235                                         sk->packets_out--;
3236                                 /* We may need to remove this from the dev send list. */
3237                                 if (skb->next != NULL) 
3238                                 {
3239                                         skb_unlink(skb);                                
3240                                 }
3241                                 /* Now add it to the write_queue. */
3242                                 if (wskb == NULL)
3243                                         skb_queue_head(&sk->write_queue,skb);
3244                                 else
3245                                         skb_append(wskb,skb);
3246                                 wskb = skb;
3247                         } 
3248                         else 
3249                         {
3250                                 if (sk->send_head == NULL) 
3251                                 {
3252                                         sk->send_head = skb;
3253                                         sk->send_tail = skb;
3254                                 }
3255                                 else
3256                                 {
3257                                         sk->send_tail->link3 = skb;
3258                                         sk->send_tail = skb;
3259                                 }
3260                                 skb->link3 = NULL;
3261                         }
3262                 }
3263                 sti();
3264         }
3265 
3266         /*
3267          *      Pipe has emptied
3268          */
3269          
3270         if (sk->send_tail == NULL || sk->send_head == NULL) 
3271         {
3272                 sk->send_head = NULL;
3273                 sk->send_tail = NULL;
3274                 sk->packets_out= 0;
3275         }
3276 
3277         /*
3278          *      Update the right hand window edge of the host
3279          */
3280          
3281         sk->window_seq = ack + ntohs(th->window);
3282 
3283         /*
3284          *      We don't want too many packets out there. 
3285          */
3286          
3287         if (sk->ip_xmit_timeout == TIME_WRITE && 
3288                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
3289         {
3290                 /* 
3291                  * This is Jacobson's slow start and congestion avoidance. 
3292                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
3293                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
3294                  * counter and increment it once every cwnd times.  It's possible
3295                  * that this should be done only if sk->retransmits == 0.  I'm
3296                  * interpreting "new data is acked" as including data that has
3297                  * been retransmitted but is just now being acked.
3298                  */
3299                 if (sk->cong_window < sk->ssthresh)  
3300                         /* 
3301                          *      In "safe" area, increase
3302                          */
3303                         sk->cong_window++;
3304                 else 
3305                 {
3306                         /*
3307                          *      In dangerous area, increase slowly.  In theory this is
3308                          *      sk->cong_window += 1 / sk->cong_window
3309                          */
3310                         if (sk->cong_count >= sk->cong_window) 
3311                         {
3312                                 sk->cong_window++;
3313                                 sk->cong_count = 0;
3314                         }
3315                         else 
3316                                 sk->cong_count++;
3317                 }
3318         }
3319 
3320         /*
3321          *      Remember the highest ack received.
3322          */
3323          
3324         sk->rcv_ack_seq = ack;
3325 
3326         /*
3327          *      If this ack opens up a zero window, clear backoff.  It was
3328          *      being used to time the probes, and is probably far higher than
3329          *      it needs to be for normal retransmission.
3330          */
3331 
3332         if (sk->ip_xmit_timeout == TIME_PROBE0) 
3333         {
3334                 sk->retransmits = 0;    /* Our probe was answered */
3335                 
3336                 /*
3337                  *      Was it a usable window open ?
3338                  */
3339                  
3340                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
3341                     ! before (sk->window_seq, sk->write_queue.next->h.seq)) 
3342                 {
3343                         sk->backoff = 0;
3344                         
3345                         /*
3346                          *      Recompute rto from rtt.  this eliminates any backoff.
3347                          */
3348 
3349                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3350                         if (sk->rto > 120*HZ)
3351                                 sk->rto = 120*HZ;
3352                         if (sk->rto < 20)       /* Was 1*HZ, then 1 - turns out we must allow about
3353                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3354                                                    .2 of a second is going to need huge windows (SIGH) */
3355                         sk->rto = 20;
3356                 }
3357         }
3358 
3359         /* 
3360          *      See if we can take anything off of the retransmit queue.
3361          */
3362    
3363         while(sk->send_head != NULL) 
3364         {
3365                 /* Check for a bug. */
3366                 if (sk->send_head->link3 &&
3367                     after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) 
3368                         printk("INET: tcp.c: *** bug send_list out of order.\n");
3369 
3370                 /*
3371                  *      If our packet is before the ack sequence we can
3372                  *      discard it as it's confirmed to have arrived the other end.
3373                  */
3374                  
3375                 if (before(sk->send_head->h.seq, ack+1)) 
3376                 {
3377                         struct sk_buff *oskb;   
3378                         if (sk->retransmits) 
3379                         {       
3380                                 /*
3381                                  *      We were retransmitting.  don't count this in RTT est 
3382                                  */
3383                                 flag |= 2;
3384 
3385                                 /*
3386                                  * even though we've gotten an ack, we're still
3387                                  * retransmitting as long as we're sending from
3388                                  * the retransmit queue.  Keeping retransmits non-zero
3389                                  * prevents us from getting new data interspersed with
3390                                  * retransmissions.
3391                                  */
3392 
3393                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
3394                                         sk->retransmits = 1;
3395                                 else
3396                                         sk->retransmits = 0;
3397                         }
3398                         /*
3399                          * Note that we only reset backoff and rto in the
3400                          * rtt recomputation code.  And that doesn't happen
3401                          * if there were retransmissions in effect.  So the
3402                          * first new packet after the retransmissions is
3403                          * sent with the backoff still in effect.  Not until
3404                          * we get an ack from a non-retransmitted packet do
3405                          * we reset the backoff and rto.  This allows us to deal
3406                          * with a situation where the network delay has increased
3407                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3408                          */
3409 
3410                         /*
3411                          *      We have one less packet out there. 
3412                          */
3413                          
3414                         if (sk->packets_out > 0) 
3415                                 sk->packets_out --;
3416                         /* 
3417                          *      Wake up the process, it can probably write more. 
3418                          */
3419                         if (!sk->dead) 
3420                                 sk->write_space(sk);
3421                         oskb = sk->send_head;
3422 
3423                         if (!(flag&2))  /* Not retransmitting */
3424                         {
3425                                 long m;
3426         
3427                                 /*
3428                                  *      The following amusing code comes from Jacobson's
3429                                  *      article in SIGCOMM '88.  Note that rtt and mdev
3430                                  *      are scaled versions of rtt and mean deviation.
3431                                  *      This is designed to be as fast as possible 
3432                                  *      m stands for "measurement".
3433                                  */
3434         
3435                                 m = jiffies - oskb->when;  /* RTT */
3436                                 if(m<=0)
3437                                         m=1;            /* IS THIS RIGHT FOR <0 ??? */
3438                                 m -= (sk->rtt >> 3);    /* m is now error in rtt est */
3439                                 sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
3440                                 if (m < 0)
3441                                         m = -m;         /* m is now abs(error) */
3442                                 m -= (sk->mdev >> 2);   /* similar update on mdev */
3443                                 sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
3444         
3445                                 /*
3446                                  *      Now update timeout.  Note that this removes any backoff.
3447                                  */
3448                          
3449                                 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3450                                 if (sk->rto > 120*HZ)
3451                                         sk->rto = 120*HZ;
3452                                 if (sk->rto < 20)       /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3453                                         sk->rto = 20;
3454                                 sk->backoff = 0;
3455                         }
3456                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
3457                                            In this case as we just set it up */
3458                         cli();
3459                         oskb = sk->send_head;   /* re-read under cli() before unlinking */
3460                         IS_SKB(oskb);
3461                         sk->send_head = oskb->link3;
3462                         if (sk->send_head == NULL) 
3463                         {
3464                                 sk->send_tail = NULL;
3465                         }
3466 
3467                 /*
3468                  *      We may need to remove this from the dev send list. 
3469                  */
3470 
3471                         if (oskb->next)
3472                                 skb_unlink(oskb);
3473                         sti();
3474                         kfree_skb(oskb, FREE_WRITE); /* write. */
3475                         if (!sk->dead) 
3476                                 sk->write_space(sk);
3477                 }
3478                 else
3479                 {
3480                         break;
3481                 }
3482         }
3483 
3484         /*
3485          * XXX someone ought to look at this too.. at the moment, if skb_peek()
3486          * returns non-NULL, we complete ignore the timer stuff in the else
3487          * clause.  We ought to organize the code so that else clause can
3488          * (should) be executed regardless, possibly moving the PROBE timer
3489          * reset over.  The skb_peek() thing should only move stuff to the
3490          * write queue, NOT also manage the timer functions.
3491          */
3492 
3493         /*
3494          * Maybe we can take some stuff off of the write queue,
3495          * and put it onto the xmit queue.
3496          */
3497         if (skb_peek(&sk->write_queue) != NULL) 
3498         {
3499                 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3500                         (sk->retransmits == 0 || 
3501                          sk->ip_xmit_timeout != TIME_WRITE ||
3502                          before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3503                         && sk->packets_out < sk->cong_window) 
3504                 {
3505                         /*
3506                          *      Add more data to the send queue.
3507                          */
3508                         flag |= 1;
3509                         tcp_write_xmit(sk);
3510                 }
3511                 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3512                         sk->send_head == NULL &&
3513                         sk->ack_backlog == 0 &&
3514                         sk->state != TCP_TIME_WAIT) 
3515                 {
3516                         /*
3517                          *      Data to queue but no room.
3518                          */
3519                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3520                 }               
3521         }
3522         else
3523         {
3524                 /*
3525                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3526                  * from TCP_CLOSE we don't do anything
3527                  *
3528                  * from anything else, if there is write data (or fin) pending,
3529                  * we use a TIME_WRITE timeout, else if keepalive we reset to
3530                  * a KEEPALIVE timeout, else we delete the timer.
3531                  *
3532                  * We do not set flag for nominal write data, otherwise we may
3533                  * force a state where we start to write itsy bitsy tidbits
3534                  * of data.
3535                  */
3536 
3537                 switch(sk->state) {
3538                 case TCP_TIME_WAIT:
3539                         /*
3540                          * keep us in TIME_WAIT until we stop getting packets,
3541                          * reset the timeout.
3542                          */
3543                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3544                         break;
3545                 case TCP_CLOSE:
3546                         /*
3547                          * don't touch the timer.
3548                          */
3549                         break;
3550                 default:
3551                         /*
3552                          *      Must check send_head, write_queue, and ack_backlog
3553                          *      to determine which timeout to use.
3554                          */
3555                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3556                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3557                         } else if (sk->keepopen) {
3558                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3559                         } else {
3560                                 del_timer(&sk->retransmit_timer);
3561                                 sk->ip_xmit_timeout = 0;
3562                         }
3563                         break;
3564                 }
3565         }
3566 
3567         /*
3568          *      We have nothing queued but space to send. Send any partial
3569          *      packets immediately (end of Nagle rule application).
3570          */
3571          
3572         if (sk->packets_out == 0 && sk->partial != NULL &&
3573                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
3574         {
3575                 flag |= 1;
3576                 tcp_send_partial(sk);
3577         }
3578 
3579         /*
3580          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
3581          * we are now waiting for an acknowledge to our FIN.  The other end is
3582          * already in TIME_WAIT.
3583          *
3584          * Move to TCP_CLOSE on success.
3585          */
3586 
3587         if (sk->state == TCP_LAST_ACK) 
3588         {
3589                 if (!sk->dead)
3590                         sk->state_change(sk);
3591                 if(sk->debug)
3592                         printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
3593                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3594                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
3595                 {
3596                         flag |= 1;
3597                         tcp_set_state(sk,TCP_CLOSE);
3598                         sk->shutdown = SHUTDOWN_MASK;
3599                 }
3600         }
3601 
3602         /*
3603          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
3604          *
3605          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
3606          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3607          */
3608 
3609         if (sk->state == TCP_FIN_WAIT1) 
3610         {
3611 
3612                 if (!sk->dead) 
3613                         sk->state_change(sk);
3614                 if (sk->rcv_ack_seq == sk->write_seq) 
3615                 {
3616                         flag |= 1;
3617                         sk->shutdown |= SEND_SHUTDOWN;
3618                         tcp_set_state(sk, TCP_FIN_WAIT2);
3619                 }
3620         }
3621 
3622         /*
3623          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
3624          *
3625          *      Move to TIME_WAIT
3626          */
3627 
3628         if (sk->state == TCP_CLOSING) 
3629         {
3630 
3631                 if (!sk->dead) 
3632                         sk->state_change(sk);
3633                 if (sk->rcv_ack_seq == sk->write_seq) 
3634                 {
3635                         flag |= 1;
3636                         tcp_time_wait(sk);
3637                 }
3638         }
3639 
3640         /*
3641          *      Final ack of a three way shake 
     *
     *      (Completes a passive open: we move to ESTABLISHED, latch the
     *      peer's source port and options, and sanity-check max_window.)
3642          */
3643          
3644         if(sk->state==TCP_SYN_RECV)
3645         {
3646                 tcp_set_state(sk, TCP_ESTABLISHED);
3647                 tcp_options(sk,th);
3648                 sk->dummy_th.dest=th->source;
3649                 sk->copied_seq = sk->acked_seq;
3650                 if(!sk->dead)
3651                         sk->state_change(sk);
3652                 if(sk->max_window==0)
3653                 {
3654                         sk->max_window=32;      /* Sanity check */
3655                         sk->mss=min(sk->max_window,sk->mtu);
3656                 }
3657         }
3658 
3659         /*
3660          * I make no guarantees about the first clause in the following
3661          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
3662          * what conditions "!flag" would be true.  However I think the rest
3663          * of the conditions would prevent that from causing any
3664          * unnecessary retransmission. 
3665          *   Clearly if the first packet has expired it should be 
3666          * retransmitted.  The other alternative, "flag&2 && retransmits", is
3667          * harder to explain:  You have to look carefully at how and when the
3668          * timer is set and with what timeout.  The most recent transmission always
3669          * sets the timer.  So in general if the most recent thing has timed
3670          * out, everything before it has as well.  So we want to go ahead and
3671          * retransmit some more.  If we didn't explicitly test for this
3672          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3673          * would not be true.  If you look at the pattern of timing, you can
3674          * show that rto is increased fast enough that the next packet would
3675          * almost never be retransmitted immediately.  Then you'd end up
3676          * waiting for a timeout to send each packet on the retransmission
3677          * queue.  With my implementation of the Karn sampling algorithm,
3678          * the timeout would double each time.  The net result is that it would
3679          * take a hideous amount of time to recover from a single dropped packet.
3680          * It's possible that there should also be a test for TIME_WRITE, but
3681          * I think as long as "send_head != NULL" and "retransmit" is on, we've
3682          * got to be in real retransmission mode.
3683          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
3684          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3685          * As long as no further losses occur, this seems reasonable.
3686          */
3687         
3688         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3689                (((flag&2) && sk->retransmits) ||
3690                (sk->send_head->when + sk->rto < jiffies))) 
3691         {
3692                 if(sk->send_head->when + sk->rto < jiffies)
3693                         tcp_retransmit(sk,0);   
3694                 else
3695                 {
3696                         tcp_do_retransmit(sk, 1);
3697                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3698                 }
3699         }
3700 
3701         return(1);
3702 }
3703 
3704 
3705 /*
3706  *      Process the FIN bit. This now behaves as it is supposed to work
3707  *      and the FIN takes effect when it is validly part of sequence
3708  *      space. Not before when we get holes.
3709  *
3710  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3711  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
3712  *      TIME-WAIT)
3713  *
3714  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
3715  *      close and we go into CLOSING (and later onto TIME-WAIT)
3716  *
3717  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3718  *
3719  */
3720  
3721 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
3722 {
3723         sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3724 
3725         if (!sk->dead) 
3726         {
3727                 sk->state_change(sk);
3728                 sock_wake_async(sk->socket, 1);
3729         }
3730 
3731         switch(sk->state) 
3732         {
3733                 case TCP_SYN_RECV:
3734                 case TCP_SYN_SENT:
3735                 case TCP_ESTABLISHED:
3736                         /*
3737                          * move to CLOSE_WAIT, tcp_data() already handled
3738                          * sending the ack.
3739                          */
3740                         tcp_set_state(sk,TCP_CLOSE_WAIT);
3741                         if (th->rst)
3742                                 sk->shutdown = SHUTDOWN_MASK;
3743                         break;
3744 
3745                 case TCP_CLOSE_WAIT:
3746                 case TCP_CLOSING:
3747                         /*
3748                          * received a retransmission of the FIN, do
3749                          * nothing.
3750                          */
3751                         break;
3752                 case TCP_TIME_WAIT:
3753                         /*
3754                          * received a retransmission of the FIN,
3755                          * restart the TIME_WAIT timer.
3756                          */
3757                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3758                         return(0);
3759                 case TCP_FIN_WAIT1:
3760                         /*
3761                          * This case occurs when a simultaneous close
3762                          * happens, we must ack the received FIN and
3763                          * enter the CLOSING state.
3764                          *
3765                          * This causes a WRITE timeout, which will either
3766                          * move on to TIME_WAIT when we timeout, or resend
3767                          * the FIN properly (maybe we get rid of that annoying
3768                          * FIN lost hang). The TIME_WRITE code is already correct
3769                          * for handling this timeout.
3770                          */
3771 
3772                         if(sk->ip_xmit_timeout != TIME_WRITE)
3773                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3774                         tcp_set_state(sk,TCP_CLOSING);
3775                         break;
3776                 case TCP_FIN_WAIT2:
3777                         /*
3778                          * received a FIN -- send ACK and enter TIME_WAIT
3779                          */
3780                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3781                         sk->shutdown|=SHUTDOWN_MASK;
3782                         tcp_set_state(sk,TCP_TIME_WAIT);
3783                         break;
3784                 case TCP_CLOSE:
3785                         /*
3786                          * already in CLOSE
3787                          */
3788                         break;
3789                 default:
3790                         tcp_set_state(sk,TCP_LAST_ACK);
3791         
3792                         /* Start the timers. */
3793                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3794                         return(0);
3795         }
3796 
3797         return(0);
3798 }
3799 
3800 
3801 
/*
 *      This routine handles the data.  If there is room in the buffer,
 *      it will be have already been moved into it.  If there is no
 *      room, then we will just have to discard the packet.
 *
 *      Called with sk->inuse set (socket locked).  Returns 0 always;
 *      the skb is either queued on sk->receive_queue or freed here.
 */

extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
         unsigned long saddr, unsigned short len)
{
        struct sk_buff *skb1, *skb2;
        struct tcphdr *th;
        int dup_dumped=0;
        unsigned long new_seq;
        unsigned long shut_seq;

        th = skb->h.th;
        /* Strip the TCP header: from here on skb->len is payload bytes only. */
        skb->len = len -(th->doff*4);

        /*
         *      The bytes in the receive read/assembly queue has increased. Needed for the
         *      low memory discard algorithm 
         */
           
        sk->bytes_rcv += skb->len;
        
        if (skb->len == 0 && !th->fin) 
        {
                /* 
                 *      Don't want to keep passing ack's back and forth. 
                 *      (someone sent us dataless, boring frame)
                 */
                if (!th->ack)
                        tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
                kfree_skb(skb, FREE_READ);
                return(0);
        }
        
        /*
         *      We no longer have anyone receiving data on this connection.
         */

#ifndef TCP_DONT_RST_SHUTDOWN            

        if(sk->shutdown & RCV_SHUTDOWN)
        {
                /*
                 *      FIXME: BSD has some magic to avoid sending resets to
                 *      broken 4.2 BSD keepalives. Much to my surprise a few non
                 *      BSD stacks still have broken keepalives so we want to
                 *      cope with it.
                 */

                if(skb->len)    /* We don't care if it's just an ack or
                                   a keepalive/window probe */
                {
                        new_seq= th->seq + skb->len + th->syn;  /* Right edge of _data_ part of frame */
                        
                        /* Do this the way 4.4BSD treats it. Not what I'd
                           regard as the meaning of the spec but it's what BSD
                           does and clearly they know everything 8) */

                        /*
                         *      This is valid because of two things
                         *
                         *      a) The way tcp_data behaves at the bottom.
                         *      b) A fin takes effect when read not when received.
                         */
                         
                        shut_seq=sk->acked_seq+1;       /* Last byte */
                        
                        if(after(new_seq,shut_seq))
                        {
                                if(sk->debug)
                                        printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
                                                sk, new_seq, shut_seq, sk->blog);
                                if(sk->dead)
                                {
                                        /* Data after close on a dead socket: abort the
                                           connection with a reset, per BSD practice. */
                                        sk->acked_seq = new_seq + th->fin;
                                        tcp_reset(sk->saddr, sk->daddr, skb->h.th,
                                                sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
                                        tcp_statistics.TcpEstabResets++;
                                        tcp_set_state(sk,TCP_CLOSE);
                                        sk->err = EPIPE;
                                        sk->shutdown = SHUTDOWN_MASK;
                                        kfree_skb(skb, FREE_READ);
                                        return 0;
                                }
                        }
                }
        }

#endif

        /*
         *      Now we have to walk the chain, and figure out where this one
         *      goes into it.  This is set up so that the last packet we received
         *      will be the first one we look at, that way if everything comes
         *      in order, there will be no performance loss, and if they come
         *      out of order we will be able to fit things in nicely.
         *
         *      [AC: This is wrong. We should assume in order first and then walk
         *       forwards from the first hole based upon real traffic patterns.]
         *      
         */

        if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
        {
                skb_queue_head(&sk->receive_queue,skb);
                skb1= NULL;
        } 
        else
        {
                /* Walk backwards from the queue tail looking for the insertion point. */
                for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
                {
                        if(sk->debug)
                        {
                                printk("skb1=%p :", skb1);
                                printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
                                printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
                                printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
                                                sk->acked_seq);
                        }
                        
                        /*
                         *      Optimisation: Duplicate frame or extension of previous frame from
                         *      same sequence point (lost ack case).
                         *      The frame contains duplicate data or replaces a previous frame
                         *      discard the previous frame (safe as sk->inuse is set) and put
                         *      the new one in its place.
                         */
                         
                        if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
                        {
                                skb_append(skb1,skb);
                                skb_unlink(skb1);
                                kfree_skb(skb1,FREE_READ);
                                dup_dumped=1;
                                skb1=NULL;
                                break;
                        }
                        
                        /*
                         *      Found where it fits
                         */
                         
                        if (after(th->seq+1, skb1->h.th->seq))
                        {
                                skb_append(skb1,skb);
                                break;
                        }
                        
                        /*
                         *      See if we've hit the start. If so insert.
                         */
                        if (skb1 == skb_peek(&sk->receive_queue))
                        {
                                skb_queue_head(&sk->receive_queue, skb);
                                break;
                        }
                }
        }

        /*
         *      Figure out what the ack value for this frame is
         *      (right edge of the segment; SYN and FIN each occupy
         *      one sequence number).
         */
         
        th->ack_seq = th->seq + skb->len;
        if (th->syn) 
                th->ack_seq++;
        if (th->fin)
                th->ack_seq++;

        if (before(sk->acked_seq, sk->copied_seq)) 
        {
                printk("*** tcp.c:tcp_data bug acked < copied\n");
                sk->acked_seq = sk->copied_seq;
        }

        /*
         *      Now figure out if we can ack anything. This is very messy because we really want two
         *      receive queues, a completed and an assembly queue. We also want only one transmit
         *      queue.
         */

        if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) 
        {
                if (before(th->seq, sk->acked_seq+1)) 
                {
                        int newwindow;

                        /* This segment advances acked_seq: shrink the offered
                           window by the number of newly acknowledged bytes. */
                        if (after(th->ack_seq, sk->acked_seq)) 
                        {
                                newwindow = sk->window-(th->ack_seq - sk->acked_seq);
                                if (newwindow < 0)
                                        newwindow = 0;  
                                sk->window = newwindow;
                                sk->acked_seq = th->ack_seq;
                        }
                        skb->acked = 1;

                        /*
                         *      When we ack the fin, we do the FIN 
                         *      processing.
                         */

                        if (skb->h.th->fin) 
                        {
                                tcp_fin(skb,sk,skb->h.th);
                        }
          
                        /* Any queued out-of-order segments may now be contiguous:
                           sweep forward and ack everything that joins up. */
                        for(skb2 = skb->next;
                            skb2 != (struct sk_buff *)&sk->receive_queue;
                            skb2 = skb2->next) 
                        {
                                if (before(skb2->h.th->seq, sk->acked_seq+1)) 
                                {
                                        if (after(skb2->h.th->ack_seq, sk->acked_seq))
                                        {
                                                newwindow = sk->window -
                                                 (skb2->h.th->ack_seq - sk->acked_seq);
                                                if (newwindow < 0)
                                                        newwindow = 0;  
                                                sk->window = newwindow;
                                                sk->acked_seq = skb2->h.th->ack_seq;
                                        }
                                        skb2->acked = 1;
                                        /*
                                         *      When we ack the fin, we do
                                         *      the fin handling.
                                         *
                                         *      NOTE(review): this passes skb/skb->h.th
                                         *      rather than skb2/skb2->h.th even though
                                         *      the FIN was seen on skb2 — looks
                                         *      suspicious; verify intended.
                                         */
                                        if (skb2->h.th->fin) 
                                        {
                                                tcp_fin(skb,sk,skb->h.th);
                                        }

                                        /*
                                         *      Force an immediate ack.
                                         */
                                         
                                        sk->ack_backlog = sk->max_ack_backlog;
                                }
                                else
                                {
                                        break;
                                }
                        }

                        /*
                         *      This also takes care of updating the window.
                         *      This if statement needs to be simplified.
                         *
                         *      The then-branch is deliberately empty (the
                         *      immediate ack below covers it); the else-branch
                         *      delays the ack via the retransmit timer.
                         */
                        if (!sk->delay_acks ||
                            sk->ack_backlog >= sk->max_ack_backlog || 
                            sk->bytes_rcv > sk->max_unacked || th->fin) {
        /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
                        }
                        else 
                        {
                                sk->ack_backlog++;
                                if(sk->debug)
                                        printk("Ack queued.\n");
                                reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
                        }
                }
        }

        /*
         *      If we've missed a packet, send an ack.
         *      Also start a timer to send another.
         */
         
        if (!skb->acked) 
        {
        
        /*
         *      This is important.  If we don't have much room left,
         *      we need to throw out a few packets so we have a good
         *      window.  Note that mtu is used, not mss, because mss is really
         *      for the send side.  He could be sending us stuff as large as mtu.
         */
                 
                while (sk->prot->rspace(sk) < sk->mtu) 
                {
                        skb1 = skb_peek(&sk->receive_queue);
                        if (skb1 == NULL) 
                        {
                                printk("INET: tcp.c:tcp_data memory leak detected.\n");
                                break;
                        }

                        /*
                         *      Don't throw out something that has been acked. 
                         */
                 
                        if (skb1->acked) 
                        {
                                break;
                        }
                
                        skb_unlink(skb1);
                        kfree_skb(skb1, FREE_READ);
                }
                tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
                sk->ack_backlog++;
                reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
        }
        else
        {
                tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
        }

        /*
         *      Now tell the user we may have some data. 
         */
         
        if (!sk->dead) 
        {
                if(sk->debug)
                        printk("Data wakeup.\n");
                sk->data_ready(sk,0);
        } 
        return(0);
}
4125 
4126 
4127 /*
4128  *      This routine is only called when we have urgent data
4129  *      signalled. Its the 'slow' part of tcp_urg. It could be
4130  *      moved inline now as tcp_urg is only called from one
4131  *      place. We handle URGent data wrong. We have to - as
4132  *      BSD still doesn't use the correction from RFC961.
4133  */
4134  
4135 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* [previous][next][first][last][top][bottom][index][help] */
4136 {
4137         unsigned long ptr = ntohs(th->urg_ptr);
4138 
4139         if (ptr)
4140                 ptr--;
4141         ptr += th->seq;
4142 
4143         /* ignore urgent data that we've already seen and read */
4144         if (after(sk->copied_seq, ptr))
4145                 return;
4146 
4147         /* do we already have a newer (or duplicate) urgent pointer? */
4148         if (sk->urg_data && !after(ptr, sk->urg_seq))
4149                 return;
4150 
4151         /* tell the world about our new urgent pointer */
4152         if (sk->proc != 0) {
4153                 if (sk->proc > 0) {
4154                         kill_proc(sk->proc, SIGURG, 1);
4155                 } else {
4156                         kill_pg(-sk->proc, SIGURG, 1);
4157                 }
4158         }
4159         sk->urg_data = URG_NOTYET;
4160         sk->urg_seq = ptr;
4161 }
4162 
4163 /*
4164  *      This is the 'fast' part of urgent handling.
4165  */
4166  
4167 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
     /* [previous][next][first][last][top][bottom][index][help] */
4168         unsigned long saddr, unsigned long len)
4169 {
4170         unsigned long ptr;
4171 
4172         /*
4173          *      Check if we get a new urgent pointer - normally not 
4174          */
4175          
4176         if (th->urg)
4177                 tcp_check_urg(sk,th);
4178 
4179         /*
4180          *      Do we wait for any urgent data? - normally not
4181          */
4182          
4183         if (sk->urg_data != URG_NOTYET)
4184                 return 0;
4185 
4186         /*
4187          *      Is the urgent pointer pointing into this packet? 
4188          */
4189          
4190         ptr = sk->urg_seq - th->seq + th->doff*4;
4191         if (ptr >= len)
4192                 return 0;
4193 
4194         /*
4195          *      Ok, got the correct packet, update info 
4196          */
4197          
4198         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4199         if (!sk->dead)
4200                 sk->data_ready(sk,0);
4201         return 0;
4202 }
4203 
/*
 *      This will accept the next outstanding connection. 
 *
 *      Returns the newly established child socket, or NULL with
 *      sk->err set (EINVAL, EAGAIN or ERESTARTSYS).
 */
 
static struct sock *tcp_accept(struct sock *sk, int flags)
{
        struct sock *newsk;
        struct sk_buff *skb;
  
  /*
   * We need to make sure that this socket is listening,
   * and that it has something pending.
   */

        if (sk->state != TCP_LISTEN) 
        {
                sk->err = EINVAL;
                return(NULL); 
        }

        /* Avoid the race. */
        /* Interrupts stay off while we claim the socket, so the
           bottom half cannot slip a frame past the dequeue check. */
        cli();
        sk->inuse = 1;

        while((skb = tcp_dequeue_established(sk)) == NULL) 
        {
                /* Non-blocking caller: give up immediately. */
                if (flags & O_NONBLOCK) 
                {
                        sti();
                        release_sock(sk);
                        sk->err = EAGAIN;
                        return(NULL);
                }

                /* Drop the lock and sleep until a connection completes.
                   release_sock also runs any backlogged frames. */
                release_sock(sk);
                interruptible_sleep_on(sk->sleep);
                /* Woken by a signal: the socket was already released above,
                   so we can just bail out. */
                if (current->signal & ~current->blocked) 
                {
                        sti();
                        sk->err = ERESTARTSYS;
                        return(NULL);
                }
                /* Re-claim the socket before re-testing the queue. */
                sk->inuse = 1;
        }
        sti();

        /*
         *      Now all we need to do is return skb->sk. 
         */

        newsk = skb->sk;

        kfree_skb(skb, FREE_READ);
        sk->ack_backlog--;
        release_sock(sk);
        return(newsk);
}
4261 
4262 
/*
 *      This will initiate an outgoing connection. 
 *
 *      Validates the address, picks sequence numbers and MSS, builds
 *      and transmits the SYN (with an MSS option), moves the socket to
 *      SYN_SENT and arms the retransmit timer.  Returns 0 or a
 *      negative errno.
 */
 
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
        struct sk_buff *buff;
        struct device *dev=NULL;
        unsigned char *ptr;
        int tmp;
        int atype;
        struct tcphdr *t1;
        struct rtable *rt;

        /* Only a fully closed socket may start a connection. */
        if (sk->state != TCP_CLOSE) 
        {
                return(-EISCONN);
        }
        
        /* Need at least family + port + address worth of sockaddr. */
        if (addr_len < 8) 
                return(-EINVAL);

        if (usin->sin_family && usin->sin_family != AF_INET) 
                return(-EAFNOSUPPORT);

        /*
         *      connect() to INADDR_ANY means loopback (BSD'ism).
         */
        
        if(usin->sin_addr.s_addr==INADDR_ANY)
                usin->sin_addr.s_addr=ip_my_addr();
                  
        /*
         *      Don't want a TCP connection going to a broadcast address 
         */

        if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
                return -ENETUNREACH;
  
        /* Initialise the connection identity and sequence space. */
        sk->inuse = 1;
        sk->daddr = usin->sin_addr.s_addr;
        sk->write_seq = tcp_init_seq();
        sk->window_seq = sk->write_seq;
        sk->rcv_ack_seq = sk->write_seq -1;
        sk->err = 0;
        sk->dummy_th.dest = usin->sin_port;
        /* Release before a GFP_KERNEL allocation that may sleep. */
        release_sock(sk);

        buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
        if (buff == NULL) 
        {
                return(-ENOMEM);
        }
        /* NOTE(review): inuse re-grabbed here without cli(); presumably safe
           at this point in the connect path — verify against back_log handling. */
        sk->inuse = 1;
        buff->len = 24;         /* 20 byte TCP header + 4 byte MSS option */
        buff->sk = sk;
        buff->free = 0;
        buff->localroute = sk->localroute;
        
        t1 = (struct tcphdr *) buff->data;

        /*
         *      Put in the IP header and routing stuff. 
         */
         
        rt=ip_rt_route(sk->daddr, NULL, NULL);
        

        /*
         *      We need to build the routing stuff from the things saved in skb. 
         */

        tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
                                        IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
                release_sock(sk);
                return(-ENETUNREACH);
        }

        /* build_header succeeded: dev now points at the output device
           and t1 must be moved past the IP/link headers it prepended. */
        buff->len += tmp;
        t1 = (struct tcphdr *)((char *)t1 +tmp);

        memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
        /* ntohl == htonl for 32-bit swaps; this is the on-wire conversion. */
        t1->seq = ntohl(sk->write_seq++);
        sk->sent_seq = sk->write_seq;
        buff->h.seq = sk->write_seq;
        t1->ack = 0;
        t1->window = 2;
        t1->res1=0;
        t1->res2=0;
        t1->rst = 0;
        t1->urg = 0;
        t1->psh = 0;
        t1->syn = 1;
        t1->urg_ptr = 0;
        t1->doff = 6;           /* 24 bytes of header = 6 * 4 */
        /* use 512 or whatever user asked for */
        
        if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
                sk->window_clamp=rt->rt_window;
        else
                sk->window_clamp=0;

        /* MSS selection: explicit user setting, then route metric, then
           576 for off-net destinations / MAX_WINDOW for the local net. */
        if (sk->user_mss)
                sk->mtu = sk->user_mss;
        else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
                sk->mtu = rt->rt_mss;
        else 
        {
#ifdef CONFIG_INET_SNARL
                if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
                if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
                        sk->mtu = 576 - HEADER_SIZE;
                else
                        sk->mtu = MAX_WINDOW;
        }
        /*
         *      but not bigger than device MTU 
         */

        if(sk->mtu <32)
                sk->mtu = 32;   /* Sanity limit */
                
        sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
        
        /*
         *      Put in the TCP options to say MTU. 
         */

        ptr = (unsigned char *)(t1+1);
        ptr[0] = 2;                     /* option kind: MSS */
        ptr[1] = 4;                     /* option length */
        ptr[2] = (sk->mtu) >> 8;
        ptr[3] = (sk->mtu) & 0xff;
        tcp_send_check(t1, sk->saddr, sk->daddr,
                  sizeof(struct tcphdr) + 4, sk);

        /*
         *      This must go first otherwise a really quick response will get reset. 
         */

        tcp_set_state(sk,TCP_SYN_SENT);
        sk->rto = TCP_TIMEOUT_INIT;
#if 0 /* we already did this */
        init_timer(&sk->retransmit_timer); 
#endif
        sk->retransmit_timer.function=&retransmit_timer;
        sk->retransmit_timer.data = (unsigned long)sk;
        reset_xmit_timer(sk, TIME_WRITE, sk->rto);      /* Timer for repeating the SYN until an answer */
        sk->retransmits = TCP_SYN_RETRIES;

        sk->prot->queue_xmit(sk, dev, buff, 0);  
        /* NOTE(review): second reset_xmit_timer with identical arguments —
           appears redundant with the call above; confirm before removing. */
        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
        tcp_statistics.TcpActiveOpens++;
        tcp_statistics.TcpOutSegs++;
  
        release_sock(sk);
        return(0);
}
4426 
4427 
4428 /* This functions checks to see if the tcp header is actually acceptable. */
4429 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /* [previous][next][first][last][top][bottom][index][help] */
4430              struct options *opt, unsigned long saddr, struct device *dev)
4431 {
4432         unsigned long next_seq;
4433 
4434         next_seq = len - 4*th->doff;
4435         if (th->fin)
4436                 next_seq++;
4437         /* if we have a zero window, we can't have any data in the packet.. */
4438         if (next_seq && !sk->window)
4439                 goto ignore_it;
4440         next_seq += th->seq;
4441 
4442         /*
4443          * This isn't quite right.  sk->acked_seq could be more recent
4444          * than sk->window.  This is however close enough.  We will accept
4445          * slightly more packets than we should, but it should not cause
4446          * problems unless someone is trying to forge packets.
4447          */
4448 
4449         /* have we already seen all of this packet? */
4450         if (!after(next_seq+1, sk->acked_seq))
4451                 goto ignore_it;
4452         /* or does it start beyond the window? */
4453         if (!before(th->seq, sk->acked_seq + sk->window + 1))
4454                 goto ignore_it;
4455 
4456         /* ok, at least part of this packet would seem interesting.. */
4457         return 1;
4458 
4459 ignore_it:
4460         if (th->rst)
4461                 return 0;
4462 
4463         /*
4464          *      Send a reset if we get something not ours and we are
4465          *      unsynchronized. Note: We don't do anything to our end. We
4466          *      are just killing the bogus remote connection then we will
4467          *      connect again and it will work (with luck).
4468          */
4469          
4470         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
4471         {
4472                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4473                 return 1;
4474         }
4475 
4476         /* Try to resync things. */
4477         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4478         return 0;
4479 }
4480 
/*
 *      Standard processing for an incoming RST: pick the user-visible
 *      error, kill the connection and wake anyone waiting on it.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
        sk->zapped = 1;

        /* Map the state at reset time onto the errno the user sees. */
        switch (sk->state)
        {
                case TCP_SYN_SENT:
                        sk->err = ECONNREFUSED;
                        break;
                case TCP_CLOSE_WAIT:
                        sk->err = EPIPE;
                        break;
                default:
                        sk->err = ECONNRESET;
                        break;
        }
#ifdef TCP_DO_RFC1337           
        /*
         *      Time wait assassination protection [RFC1337]
         */
        if(sk->state!=TCP_TIME_WAIT)
        {       
                tcp_set_state(sk,TCP_CLOSE);
                sk->shutdown = SHUTDOWN_MASK;
        }
#else   
        tcp_set_state(sk,TCP_CLOSE);
        sk->shutdown = SHUTDOWN_MASK;
#endif  
        if (!sk->dead) 
                sk->state_change(sk);
        kfree_skb(skb, FREE_READ);
        release_sock(sk);
        return(0);
}
4512 
4513 /*
4514  *      A TCP packet has arrived.
4515  */
4516  
4517 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /* [previous][next][first][last][top][bottom][index][help] */
4518         unsigned long daddr, unsigned short len,
4519         unsigned long saddr, int redo, struct inet_protocol * protocol)
4520 {
4521         struct tcphdr *th;
4522         struct sock *sk;
4523         int syn_ok=0;
4524         
4525         if (!skb) 
4526         {
4527                 printk("IMPOSSIBLE 1\n");
4528                 return(0);
4529         }
4530 
4531         if (!dev) 
4532         {
4533                 printk("IMPOSSIBLE 2\n");
4534                 return(0);
4535         }
4536   
4537         tcp_statistics.TcpInSegs++;
4538   
4539         if(skb->pkt_type!=PACKET_HOST)
4540         {
4541                 kfree_skb(skb,FREE_READ);
4542                 return(0);
4543         }
4544   
4545         th = skb->h.th;
4546 
4547         /*
4548          *      Find the socket.
4549          */
4550 
4551         sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4552 
4553         /*
4554          *      If this socket has got a reset it's to all intents and purposes 
4555          *      really dead. Count closed sockets as dead.
4556          *
4557          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4558          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
4559          *      exist so should cause resets as if the port was unreachable.
4560          */
4561          
4562         if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4563                 sk=NULL;
4564 
4565         if (!redo) 
4566         {
4567                 if (tcp_check(th, len, saddr, daddr )) 
4568                 {
4569                         skb->sk = NULL;
4570                         kfree_skb(skb,FREE_READ);
4571                         /*
4572                          *      We don't release the socket because it was
4573                          *      never marked in use.
4574                          */
4575                         return(0);
4576                 }
4577                 th->seq = ntohl(th->seq);
4578 
4579                 /* See if we know about the socket. */
4580                 if (sk == NULL) 
4581                 {
4582                         /*
4583                          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4584                          */
4585                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4586                         skb->sk = NULL;
4587                         /*
4588                          *      Discard frame
4589                          */
4590                         kfree_skb(skb, FREE_READ);
4591                         return(0);
4592                 }
4593 
4594                 skb->len = len;
4595                 skb->acked = 0;
4596                 skb->used = 0;
4597                 skb->free = 0;
4598                 skb->saddr = daddr;
4599                 skb->daddr = saddr;
4600         
4601                 /* We may need to add it to the backlog here. */
4602                 cli();
4603                 if (sk->inuse) 
4604                 {
4605                         skb_queue_tail(&sk->back_log, skb);
4606                         sti();
4607                         return(0);
4608                 }
4609                 sk->inuse = 1;
4610                 sti();
4611         }
4612         else
4613         {
4614                 if (sk==NULL) 
4615                 {
4616                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4617                         skb->sk = NULL;
4618                         kfree_skb(skb, FREE_READ);
4619                         return(0);
4620                 }
4621         }
4622 
4623 
4624         if (!sk->prot) 
4625         {
4626                 printk("IMPOSSIBLE 3\n");
4627                 return(0);
4628         }
4629 
4630 
4631         /*
4632          *      Charge the memory to the socket. 
4633          */
4634          
4635         if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf) 
4636         {
4637                 kfree_skb(skb, FREE_READ);
4638                 release_sock(sk);
4639                 return(0);
4640         }
4641 
4642         skb->sk=sk;
4643         sk->rmem_alloc += skb->mem_len;
4644 
4645         /*
4646          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4647          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4648          *      compatibility. We also set up variables more thoroughly [Karn notes in the
4649          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4650          */
4651 
4652         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
4653         {
4654         
4655                 /*
4656                  *      Now deal with unusual cases.
4657                  */
4658          
4659                 if(sk->state==TCP_LISTEN)
4660                 {
4661                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
4662                                 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4663 
4664                         /*
4665                          *      We don't care for RST, and non SYN are absorbed (old segments)
4666                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
4667                          *      netmask on a running connection it can go broadcast. Even Sun's have
4668                          *      this problem so I'm ignoring it 
4669                          */
4670                            
4671                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4672                         {
4673                                 kfree_skb(skb, FREE_READ);
4674                                 release_sock(sk);
4675                                 return 0;
4676                         }
4677                 
4678                         /*      
4679                          *      Guess we need to make a new socket up 
4680                          */
4681                 
4682                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4683                 
4684                         /*
4685                          *      Now we have several options: In theory there is nothing else
4686                          *      in the frame. KA9Q has an option to send data with the syn,
4687                          *      BSD accepts data with the syn up to the [to be] advertised window
4688                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
4689                          *      it, that fits the spec precisely and avoids incompatibilities. It
4690                          *      would be nice in future to drop through and process the data.
4691                          */
4692                          
4693                         release_sock(sk);
4694                         return 0;
4695                 }
4696         
4697                 /* retransmitted SYN? */
4698                 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4699                 {
4700                         kfree_skb(skb, FREE_READ);
4701                         release_sock(sk);
4702                         return 0;
4703                 }
4704                 
4705                 /*
4706                  *      SYN sent means we have to look for a suitable ack and either reset
4707                  *      for bad matches or go to connected 
4708                  */
4709            
4710                 if(sk->state==TCP_SYN_SENT)
4711                 {
4712                         /* Crossed SYN or previous junk segment */
4713                         if(th->ack)
4714                         {
4715                                 /* We got an ack, but it's not a good ack */
4716                                 if(!tcp_ack(sk,th,saddr,len))
4717                                 {
4718                                         /* Reset the ack - its an ack from a 
4719                                            different connection  [ th->rst is checked in tcp_reset()] */
4720                                         tcp_statistics.TcpAttemptFails++;
4721                                         tcp_reset(daddr, saddr, th,
4722                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4723                                         kfree_skb(skb, FREE_READ);
4724                                         release_sock(sk);
4725                                         return(0);
4726                                 }
4727                                 if(th->rst)
4728                                         return tcp_std_reset(sk,skb);
4729                                 if(!th->syn)
4730                                 {
4731                                         /* A valid ack from a different connection
4732                                            start. Shouldn't happen but cover it */
4733                                         kfree_skb(skb, FREE_READ);
4734                                         release_sock(sk);
4735                                         return 0;
4736                                 }
4737                                 /*
4738                                  *      Ok.. it's good. Set up sequence numbers and
4739                                  *      move to established.
4740                                  */
4741                                 syn_ok=1;       /* Don't reset this connection for the syn */
4742                                 sk->acked_seq=th->seq+1;
4743                                 sk->fin_seq=th->seq;
4744                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4745                                 tcp_set_state(sk, TCP_ESTABLISHED);
4746                                 tcp_options(sk,th);
4747                                 sk->dummy_th.dest=th->source;
4748                                 sk->copied_seq = sk->acked_seq;
4749                                 if(!sk->dead)
4750                                 {
4751                                         sk->state_change(sk);
4752                                         sock_wake_async(sk->socket, 0);
4753                                 }
4754                                 if(sk->max_window==0)
4755                                 {
4756                                         sk->max_window = 32;
4757                                         sk->mss = min(sk->max_window, sk->mtu);
4758                                 }
4759                         }
4760                         else
4761                         {
4762                                 /* See if SYN's cross. Drop if boring */
4763                                 if(th->syn && !th->rst)
4764                                 {
4765                                         /* Crossed SYN's are fine - but talking to
4766                                            yourself is right out... */
4767                                         if(sk->saddr==saddr && sk->daddr==daddr &&
4768                                                 sk->dummy_th.source==th->source &&
4769                                                 sk->dummy_th.dest==th->dest)
4770                                         {
4771                                                 tcp_statistics.TcpAttemptFails++;
4772                                                 return tcp_std_reset(sk,skb);
4773                                         }
4774                                         tcp_set_state(sk,TCP_SYN_RECV);
4775                                         
4776                                         /*
4777                                          *      FIXME:
4778                                          *      Must send SYN|ACK here
4779                                          */
4780                                 }               
4781                                 /* Discard junk segment */
4782                                 kfree_skb(skb, FREE_READ);
4783                                 release_sock(sk);
4784                                 return 0;
4785                         }
4786                         /*
4787                          *      SYN_RECV with data maybe.. drop through
4788                          */
4789                         goto rfc_step6;
4790                 }
4791 
4792         /*
4793          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
4794          *      a more complex suggestion for fixing these reuse issues in RFC1644
4795          *      but not yet ready for general use. Also see RFC1379.
4796          */
4797         
4798 #define BSD_TIME_WAIT
4799 #ifdef BSD_TIME_WAIT
4800                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
4801                         after(th->seq, sk->acked_seq) && !th->rst)
4802                 {
4803                         long seq=sk->write_seq;
4804                         if(sk->debug)
4805                                 printk("Doing a BSD time wait\n");
4806                         tcp_statistics.TcpEstabResets++;           
4807                         sk->rmem_alloc -= skb->mem_len;
4808                         skb->sk = NULL;
4809                         sk->err=ECONNRESET;
4810                         tcp_set_state(sk, TCP_CLOSE);
4811                         sk->shutdown = SHUTDOWN_MASK;
4812                         release_sock(sk);
4813                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4814                         if (sk && sk->state==TCP_LISTEN)
4815                         {
4816                                 sk->inuse=1;
4817                                 skb->sk = sk;
4818                                 sk->rmem_alloc += skb->mem_len;
4819                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
4820                                 release_sock(sk);
4821                                 return 0;
4822                         }
4823                         kfree_skb(skb, FREE_READ);
4824                         return 0;
4825                 }
4826 #endif  
4827         }
4828 
4829         /*
4830          *      We are now in normal data flow (see the step list in the RFC)
4831          *      Note most of these are inline now. I'll inline the lot when
4832          *      I have time to test it hard and look at what gcc outputs 
4833          */
4834         
4835         if(!tcp_sequence(sk,th,len,opt,saddr,dev))
4836         {
4837                 kfree_skb(skb, FREE_READ);
4838                 release_sock(sk);
4839                 return 0;
4840         }
4841 
4842         if(th->rst)
4843                 return tcp_std_reset(sk,skb);
4844         
4845         /*
4846          *      !syn_ok is effectively the state test in RFC793.
4847          */
4848          
4849         if(th->syn && !syn_ok)
4850         {
4851                 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
4852                 return tcp_std_reset(sk,skb);   
4853         }
4854 
4855         /*
4856          *      Process the ACK
4857          */
4858          
4859 
4860         if(th->ack && !tcp_ack(sk,th,saddr,len))
4861         {
4862                 /*
4863                  *      Our three way handshake failed.
4864                  */
4865                  
4866                 if(sk->state==TCP_SYN_RECV)
4867                 {
4868                         tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
4869                 }
4870                 kfree_skb(skb, FREE_READ);
4871                 release_sock(sk);
4872                 return 0;
4873         }
4874         
4875 rfc_step6:              /* I'll clean this up later */
4876 
4877         /*
4878          *      Process urgent data
4879          */
4880                 
4881         if(tcp_urg(sk, th, saddr, len))
4882         {
4883                 kfree_skb(skb, FREE_READ);
4884                 release_sock(sk);
4885                 return 0;
4886         }
4887         
4888         
4889         /*
4890          *      Process the encapsulated data
4891          */
4892         
4893         if(tcp_data(skb,sk, saddr, len))
4894         {
4895                 kfree_skb(skb, FREE_READ);
4896                 release_sock(sk);
4897                 return 0;
4898         }
4899 
4900         /*
4901          *      And done
4902          */     
4903         
4904         release_sock(sk);
4905         return 0;
4906 }
4907 
4908 /*
4909  *      This routine sends a packet with an out of date sequence
4910  *      number. It assumes the other end will try to ack it.
4911  */
4912 
4913 static void tcp_write_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
4914 {
4915         struct sk_buff *buff;
4916         struct tcphdr *t1;
4917         struct device *dev=NULL;
4918         int tmp;
4919 
4920         if (sk->zapped)
4921                 return; /* After a valid reset we can send no more */
4922 
4923         /*
4924          *      Write data can still be transmitted/retransmitted in the
4925          *      following states.  If any other state is encountered, return.
4926          *      [listen/close will never occur here anyway]
4927          */
4928 
4929         if (sk->state != TCP_ESTABLISHED && 
4930             sk->state != TCP_CLOSE_WAIT &&
4931             sk->state != TCP_FIN_WAIT1 && 
4932             sk->state != TCP_LAST_ACK &&
4933             sk->state != TCP_CLOSING
4934         ) 
4935         {
4936                 return;
4937         }
4938 
4939         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
4940         if (buff == NULL) 
4941                 return;
4942 
4943         buff->len = sizeof(struct tcphdr);
4944         buff->free = 1;
4945         buff->sk = sk;
4946         buff->localroute = sk->localroute;
4947 
4948         t1 = (struct tcphdr *) buff->data;
4949 
4950         /* Put in the IP header and routing stuff. */
4951         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4952                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
4953         if (tmp < 0) 
4954         {
4955                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4956                 return;
4957         }
4958 
4959         buff->len += tmp;
4960         t1 = (struct tcphdr *)((char *)t1 +tmp);
4961 
4962         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
4963 
4964         /*
4965          *      Use a previous sequence.
4966          *      This should cause the other end to send an ack.
4967          */
4968          
4969         t1->seq = htonl(sk->sent_seq-1);
4970         t1->ack = 1; 
4971         t1->res1= 0;
4972         t1->res2= 0;
4973         t1->rst = 0;
4974         t1->urg = 0;
4975         t1->psh = 0;
4976         t1->fin = 0;    /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
4977         t1->syn = 0;
4978         t1->ack_seq = ntohl(sk->acked_seq);
4979         t1->window = ntohs(tcp_select_window(sk));
4980         t1->doff = sizeof(*t1)/4;
4981         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
4982          /*
4983           *     Send it and free it.
4984           *     This will prevent the timer from automatically being restarted.
4985           */
4986         sk->prot->queue_xmit(sk, dev, buff, 1);
4987         tcp_statistics.TcpOutSegs++;
4988 }
4989 
4990 /*
4991  *      A window probe timeout has occurred.
4992  */
4993 
4994 void tcp_send_probe0(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
4995 {
4996         if (sk->zapped)
4997                 return;         /* After a valid reset we can send no more */
4998 
4999         tcp_write_wakeup(sk);
5000 
5001         sk->backoff++;
5002         sk->rto = min(sk->rto << 1, 120*HZ);
5003         reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5004         sk->retransmits++;
5005         sk->prot->retransmits ++;
5006 }
5007 
5008 /*
5009  *      Socket option code for TCP. 
5010  */
5011   
5012 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5013 {
5014         int val,err;
5015 
5016         if(level!=SOL_TCP)
5017                 return ip_setsockopt(sk,level,optname,optval,optlen);
5018 
5019         if (optval == NULL) 
5020                 return(-EINVAL);
5021 
5022         err=verify_area(VERIFY_READ, optval, sizeof(int));
5023         if(err)
5024                 return err;
5025         
5026         val = get_fs_long((unsigned long *)optval);
5027 
5028         switch(optname)
5029         {
5030                 case TCP_MAXSEG:
5031 /*
5032  * values greater than interface MTU won't take effect.  however at
5033  * the point when this call is done we typically don't yet know
5034  * which interface is going to be used
5035  */
5036                         if(val<1||val>MAX_WINDOW)
5037                                 return -EINVAL;
5038                         sk->user_mss=val;
5039                         return 0;
5040                 case TCP_NODELAY:
5041                         sk->nonagle=(val==0)?0:1;
5042                         return 0;
5043                 default:
5044                         return(-ENOPROTOOPT);
5045         }
5046 }
5047 
5048 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5049 {
5050         int val,err;
5051 
5052         if(level!=SOL_TCP)
5053                 return ip_getsockopt(sk,level,optname,optval,optlen);
5054                         
5055         switch(optname)
5056         {
5057                 case TCP_MAXSEG:
5058                         val=sk->user_mss;
5059                         break;
5060                 case TCP_NODELAY:
5061                         val=sk->nonagle;
5062                         break;
5063                 default:
5064                         return(-ENOPROTOOPT);
5065         }
5066         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5067         if(err)
5068                 return err;
5069         put_fs_long(sizeof(int),(unsigned long *) optlen);
5070 
5071         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5072         if(err)
5073                 return err;
5074         put_fs_long(val,(unsigned long *)optval);
5075 
5076         return(0);
5077 }       
5078 
5079 
/*
 *      The protocol operations table for TCP, wired into the inet layer.
 *      NOTE(review): this is a positional initializer - every entry must
 *      stay in the exact member order of struct proto (declared elsewhere);
 *      verify against that declaration before adding or reordering entries.
 */
struct proto tcp_prot = {
        sock_wmalloc,                   /* generic send-buffer allocator */
        sock_rmalloc,                   /* generic receive-buffer allocator */
        sock_wfree,                     /* free a send buffer */
        sock_rfree,                     /* free a receive buffer */
        sock_rspace,                    /* receive-space accounting */
        sock_wspace,                    /* send-space accounting */
        tcp_close,
        tcp_read,
        tcp_write,
        tcp_sendto,
        tcp_recvfrom,
        ip_build_header,                /* TCP uses the IP layer's header builder */
        tcp_connect,
        tcp_accept,
        ip_queue_xmit,                  /* ...and the IP layer's transmit queue */
        tcp_retransmit,
        tcp_write_wakeup,
        tcp_read_wakeup,
        tcp_rcv,                        /* inbound segment entry point */
        tcp_select,
        tcp_ioctl,
        NULL,                           /* no init operation for TCP */
        tcp_shutdown,
        tcp_setsockopt,
        tcp_getsockopt,
        128,                            /* presumably max_header - confirm against struct proto */
        0,                              /* presumably retransmit counter, starts at zero - confirm */
        {NULL,},                        /* socket hash array, initially empty */
        "TCP",                          /* protocol name reported to the user */
        0, 0                            /* presumably inuse/highestinuse counters - confirm */
};

/* [previous][next][first][last][top][bottom][index][help] */