root/net/ipv4/tcp.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. tcp_cache_zap
  2. min
  3. tcp_set_state
  4. tcp_select_window
  5. tcp_find_established
  6. tcp_dequeue_established
  7. tcp_close_pending
  8. tcp_time_wait
  9. tcp_do_retransmit
  10. reset_xmit_timer
  11. tcp_retransmit_time
  12. tcp_retransmit
  13. tcp_write_timeout
  14. retransmit_timer
  15. tcp_err
  16. tcp_readable
  17. tcp_listen_select
  18. tcp_select
  19. tcp_ioctl
  20. tcp_check
  21. tcp_send_check
  22. tcp_send_skb
  23. tcp_dequeue_partial
  24. tcp_send_partial
  25. tcp_enqueue_partial
  26. tcp_send_ack
  27. tcp_build_header
  28. tcp_write
  29. tcp_sendto
  30. tcp_read_wakeup
  31. cleanup_rbuf
  32. tcp_read_urg
  33. tcp_read
  34. tcp_close_state
  35. tcp_send_fin
  36. tcp_shutdown
  37. tcp_recvfrom
  38. tcp_reset
  39. tcp_options
  40. default_mask
  41. tcp_init_seq
  42. tcp_conn_request
  43. tcp_close
  44. tcp_write_xmit
  45. tcp_ack
  46. tcp_fin
  47. tcp_data
  48. tcp_check_urg
  49. tcp_urg
  50. tcp_accept
  51. tcp_connect
  52. tcp_sequence
  53. tcp_std_reset
  54. tcp_rcv
  55. tcp_write_wakeup
  56. tcp_send_probe0
  57. tcp_setsockopt
  58. tcp_getsockopt

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:       
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while sk->inuse=1
  26  *                                      and was trying to connect (tcp_err()).
  27  *              Alan Cox        :       All icmp error handling was broken
  28  *                                      pointers passed where wrong and the
  29  *                                      socket was looked up backwards. Nobody
  30  *                                      tested any icmp error code obviously.
  31  *              Alan Cox        :       tcp_err() now handled properly. It wakes people
  32  *                                      on errors. select behaves and the icmp error race
  33  *                                      has gone by moving it into sock.c
  34  *              Alan Cox        :       tcp_reset() fixed to work for everything not just
  35  *                                      packets for unknown sockets.
  36  *              Alan Cox        :       tcp option processing.
  37  *              Alan Cox        :       Reset tweaked (still not 100%) [Had syn rule wrong]
  38  *              Herp Rosmanith  :       More reset fixes
  39  *              Alan Cox        :       No longer acks invalid rst frames. Acking
  40  *                                      any kind of RST is right out.
  41  *              Alan Cox        :       Sets an ignore me flag on an rst receive
  42  *                                      otherwise odd bits of prattle escape still
  43  *              Alan Cox        :       Fixed another acking RST frame bug. Should stop
  44  *                                      LAN workplace lockups.
  45  *              Alan Cox        :       Some tidyups using the new skb list facilities
  46  *              Alan Cox        :       sk->keepopen now seems to work
  47  *              Alan Cox        :       Pulls options out correctly on accepts
  48  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  49  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a bit to skb ops.
  50  *              Alan Cox        :       Tidied tcp_data to avoid a potential nasty.
  51  *              Alan Cox        :       Added some better commenting, as the tcp is hard to follow
  52  *              Alan Cox        :       Removed incorrect check for 20 * psh
  53  *      Michael O'Reilly        :       ack < copied bug fix.
  54  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  55  *              Alan Cox        :       FIN with no memory -> CRASH
  56  *              Alan Cox        :       Added socket option proto entries. Also added awareness of them to accept.
  57  *              Alan Cox        :       Added TCP options (SOL_TCP)
  58  *              Alan Cox        :       Switched wakeup calls to callbacks, so the kernel can layer network sockets.
  59  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  60  *              Alan Cox        :       Handle FIN (more) properly (we hope).
   61  *              Alan Cox        :       RST frames sent on unsynchronised state ack errors.
  62  *              Alan Cox        :       Put in missing check for SYN bit.
  63  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  64  *                                      window non shrink trick.
  65  *              Alan Cox        :       Added a couple of small NET2E timer fixes
  66  *              Charles Hedrick :       TCP fixes
  67  *              Toomas Tamm     :       TCP window fixes
  68  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  69  *              Charles Hedrick :       Rewrote most of it to actually work
  70  *              Linus           :       Rewrote tcp_read() and URG handling
  71  *                                      completely
  72  *              Gerhard Koerting:       Fixed some missing timer handling
  73  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  74  *              Gerhard Koerting:       PC/TCP workarounds
  75  *              Adam Caldwell   :       Assorted timer/timing errors
  76  *              Matthew Dillon  :       Fixed another RST bug
  77  *              Alan Cox        :       Move to kernel side addressing changes.
  78  *              Alan Cox        :       Beginning work on TCP fastpathing (not yet usable)
  79  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  80  *              Alan Cox        :       TCP fast path debugging
  81  *              Alan Cox        :       Window clamping
  82  *              Michael Riepe   :       Bug in tcp_check()
  83  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  84  *              Matt Dillon     :       Yet more small nasties remove from the TCP code
  85  *                                      (Be very nice to this man if tcp finally works 100%) 8)
  86  *              Alan Cox        :       BSD accept semantics. 
  87  *              Alan Cox        :       Reset on closedown bug.
  88  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
  89  *              Michael Pall    :       Handle select() after URG properly in all cases.
  90  *              Michael Pall    :       Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin).
  91  *              Michael Pall    :       Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now.
  92  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the BSD api.
  93  *              Alan Cox        :       Changed the semantics of sk->socket to 
  94  *                                      fix a race and a signal problem with
  95  *                                      accept() and async I/O.
  96  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
  97  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
  98  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
  99  *                                      clients/servers which listen in on
 100  *                                      fixed ports.
 101  *              Alan Cox        :       Cleaned the above up and shrank it to
 102  *                                      a sensible code size.
 103  *              Alan Cox        :       Self connect lockup fix.
 104  *              Alan Cox        :       No connect to multicast.
 105  *              Ross Biro       :       Close unaccepted children on master
 106  *                                      socket close.
 107  *              Alan Cox        :       Reset tracing code.
 108  *              Alan Cox        :       Spurious resets on shutdown.
 109  *              Alan Cox        :       Giant 15 minute/60 second timer error
 110  *              Alan Cox        :       Small whoops in selecting before an accept.
 111  *              Alan Cox        :       Kept the state trace facility since it's
 112  *                                      handy for debugging.
 113  *              Alan Cox        :       More reset handler fixes.
 114  *              Alan Cox        :       Started rewriting the code based on the RFC's
 115  *                                      for other useful protocol references see:  
 116  *                                      Comer, KA9Q NOS, and for a reference on the
 117  *                                      difference between specifications and how BSD
 118  *                                      works see the 4.4lite source.
 119  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 120  *                                      close.
 121  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 122  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 123  *              Alan Cox        :       Reimplemented timers as per the RFC and using multiple
 124  *                                      timers for sanity. 
 125  *              Alan Cox        :       Small bug fixes, and a lot of new
 126  *                                      comments.
 127  *              Alan Cox        :       Fixed dual reader crash by locking
 128  *                                      the buffers (much like datagram.c)
 129  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 130  *                                      now gets fed up of retrying without
 131  *                                      (even a no space) answer.
 132  *              Alan Cox        :       Extracted closing code better
 133  *              Alan Cox        :       Fixed the closing state machine to
 134  *                                      resemble the RFC.
 135  *              Alan Cox        :       More 'per spec' fixes.
 136  *              Jorge Cwik      :       Even faster checksumming.
 137  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 138  *                                      only frames. At least one pc tcp stack
 139  *                                      generates them.
 140  *              Alan Cox        :       Cache last socket.
 141  *              Alan Cox        :       Per route irtt.
 142  *              Matt Day        :       Select() match BSD precisely on error
 143  *
 144  *
 145  * To Fix:
 146  *              Fast path the code. Two things here - fix the window calculation
 147  *              so it doesn't iterate over the queue, also spot packets with no funny
 148  *              options arriving in order and process directly.
 149  *
 150  *              Implement RFC 1191 [Path MTU discovery]
 151  *              Look at the effect of implementing RFC 1337 suggestions and their impact.
 152  *              Rewrite output state machine to use a single queue and do low window
 153  *              situations as per the spec (RFC 1122)
 154  *              Speed up input assembly algorithm.
 155  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 156  *              could do with it working on IPv4
 157  *              User settable/learned rtt/max window/mtu
 158  *              Cope with MTU/device switches when retransmitting in tcp.
 159  *              Fix the window handling to use PR's new code.
 160  *
 161  *              Change the fundamental structure to a single send queue maintained
 162  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 163  *              active routes too]). Cut the queue off in tcp_retransmit/
 164  *              tcp_transmit.
 165  *              Change the receive queue to assemble as it goes. This lets us
 166  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 167  *              tcp_data/tcp_read as well as the window shrink crud.
 168  *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 169  *              tcp_queue_skb seem obvious routines to extract.
 170  *      
 171  *              This program is free software; you can redistribute it and/or
 172  *              modify it under the terms of the GNU General Public License
 173  *              as published by the Free Software Foundation; either version
 174  *              2 of the License, or(at your option) any later version.
 175  *
 176  * Description of States:
 177  *
 178  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 179  *
 180  *      TCP_SYN_RECV            received a connection request, sent ack,
 181  *                              waiting for final ack in three-way handshake.
 182  *
 183  *      TCP_ESTABLISHED         connection established
 184  *
 185  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 186  *                              transmission of remaining buffered data
 187  *
 188  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 189  *                              to shutdown
 190  *
 191  *      TCP_CLOSING             both sides have shutdown but we still have
 192  *                              data we have to finish sending
 193  *
 194  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 195  *                              closed, can only be entered from FIN_WAIT2
 196  *                              or CLOSING.  Required because the other end
 197  *                              may not have gotten our last ACK causing it
 198  *                              to retransmit the data packet (which we ignore)
 199  *
 200  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 201  *                              us to finish writing our data and to shutdown
 202  *                              (we have to close() to move on to LAST_ACK)
 203  *
  204  *      TCP_LAST_ACK            our side has shutdown after remote has
 205  *                              shutdown.  There may still be data in our
 206  *                              buffer that we have to finish sending
 207  *              
 208  *      TCP_CLOSE               socket is finished
 209  */
 210 
 211 #include <linux/types.h>
 212 #include <linux/sched.h>
 213 #include <linux/mm.h>
 214 #include <linux/time.h>
 215 #include <linux/string.h>
 216 #include <linux/config.h>
 217 #include <linux/socket.h>
 218 #include <linux/sockios.h>
 219 #include <linux/termios.h>
 220 #include <linux/in.h>
 221 #include <linux/fcntl.h>
 222 #include <linux/inet.h>
 223 #include <linux/netdevice.h>
 224 #include <net/snmp.h>
 225 #include <net/ip.h>
 226 #include <net/protocol.h>
 227 #include <net/icmp.h>
 228 #include <net/tcp.h>
 229 #include <net/arp.h>
 230 #include <linux/skbuff.h>
 231 #include <net/sock.h>
 232 #include <net/route.h>
 233 #include <linux/errno.h>
 234 #include <linux/timer.h>
 235 #include <asm/system.h>
 236 #include <asm/segment.h>
 237 #include <linux/mm.h>
 238 #include <net/checksum.h>
 239 
 240 /*
 241  *      The MSL timer is the 'normal' timer.
 242  */
 243  
 244 #define reset_msl_timer(x,y,z)  reset_timer(x,y,z)
 245 
 246 #define SEQ_TICK 3
 247 unsigned long seq_offset;
 248 struct tcp_mib  tcp_statistics;
 249 
 250 /*
 251  *      Cached last hit socket
 252  */
 253  
 254 volatile unsigned long  th_cache_saddr,th_cache_daddr;
 255 volatile unsigned short  th_cache_dport, th_cache_sport;
 256 volatile struct sock *th_cache_sk;
 257 
 258 void tcp_cache_zap(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 259 {
 260         unsigned long flags;
 261         save_flags(flags);
 262         cli();
 263         th_cache_saddr=0;
 264         th_cache_daddr=0;
 265         th_cache_dport=0;
 266         th_cache_sport=0;
 267         th_cache_sk=NULL;
 268         restore_flags(flags);
 269 }
 270 
 271 static void tcp_close(struct sock *sk, int timeout);
 272 
 273 
 274 /*
 275  *      The less said about this the better, but it works and will do for 1.2 
 276  */
 277 
 278 static struct wait_queue *master_select_wakeup;
 279 
 280 static __inline__ int min(unsigned int a, unsigned int b)
     /* [previous][next][first][last][top][bottom][index][help] */
 281 {
 282         if (a < b) 
 283                 return(a);
 284         return(b);
 285 }
 286 
 287 #undef STATE_TRACE
 288 
 289 #ifdef STATE_TRACE
 290 static char *statename[]={
 291         "Unused","Established","Syn Sent","Syn Recv",
 292         "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
 293         "Close Wait","Last ACK","Listen","Closing"
 294 };
 295 #endif
 296 
 297 static __inline__ void tcp_set_state(struct sock *sk, int state)
     /* [previous][next][first][last][top][bottom][index][help] */
 298 {
 299         if(sk->state==TCP_ESTABLISHED)
 300                 tcp_statistics.TcpCurrEstab--;
 301 #ifdef STATE_TRACE
 302         if(sk->debug)
 303                 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
 304 #endif  
 305         /* This is a hack but it doesn't occur often and it's going to
 306            be a real        to fix nicely */
 307            
 308         if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
 309         {
 310                 wake_up_interruptible(&master_select_wakeup);
 311         }
 312         sk->state=state;
 313         if(state==TCP_ESTABLISHED)
 314                 tcp_statistics.TcpCurrEstab++;
 315 }
 316 
 317 /*
 318  *      This routine picks a TCP windows for a socket based on
 319  *      the following constraints
 320  *  
 321  *      1. The window can never be shrunk once it is offered (RFC 793)
 322  *      2. We limit memory per socket
 323  *   
 324  *      For now we use NET2E3's heuristic of offering half the memory
 325  *      we have handy. All is not as bad as this seems however because
 326  *      of two things. Firstly we will bin packets even within the window
 327  *      in order to get the data we are waiting for into the memory limit.
 328  *      Secondly we bin common duplicate forms at receive time
 329  *      Better heuristics welcome
 330  */
 331    
 332 int tcp_select_window(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 333 {
 334         int new_window = sk->prot->rspace(sk);
 335         
 336         if(sk->window_clamp)
 337                 new_window=min(sk->window_clamp,new_window);
 338         /*
 339          *      Two things are going on here.  First, we don't ever offer a
 340          *      window less than min(sk->mss, MAX_WINDOW/2).  This is the
 341          *      receiver side of SWS as specified in RFC1122.
 342          *      Second, we always give them at least the window they
 343          *      had before, in order to avoid retracting window.  This
 344          *      is technically allowed, but RFC1122 advises against it and
 345          *      in practice it causes trouble.
 346          *
 347          *      Fixme: This doesn't correctly handle the case where
 348          *      new_window > sk->window but not by enough to allow for the
 349          *      shift in sequence space. 
 350          */
 351         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
 352                 return(sk->window);
 353         return(new_window);
 354 }
 355 
 356 /*
 357  *      Find someone to 'accept'. Must be called with
 358  *      sk->inuse=1 or cli()
 359  */ 
 360 
 361 static struct sk_buff *tcp_find_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 362 {
 363         struct sk_buff *p=skb_peek(&s->receive_queue);
 364         if(p==NULL)
 365                 return NULL;
 366         do
 367         {
 368                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 369                         return p;
 370                 p=p->next;
 371         }
 372         while(p!=(struct sk_buff *)&s->receive_queue);
 373         return NULL;
 374 }
 375 
 376 /*
 377  *      Remove a completed connection and return it. This is used by
 378  *      tcp_accept() to get connections from the queue.
 379  */
 380 
 381 static struct sk_buff *tcp_dequeue_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 382 {
 383         struct sk_buff *skb;
 384         unsigned long flags;
 385         save_flags(flags);
 386         cli(); 
 387         skb=tcp_find_established(s);
 388         if(skb!=NULL)
 389                 skb_unlink(skb);        /* Take it off the queue */
 390         restore_flags(flags);
 391         return skb;
 392 }
 393 
 394 /* 
 395  *      This routine closes sockets which have been at least partially
 396  *      opened, but not yet accepted. Currently it is only called by
 397  *      tcp_close, and timeout mirrors the value there. 
 398  */
 399 
 400 static void tcp_close_pending (struct sock *sk) 
     /* [previous][next][first][last][top][bottom][index][help] */
 401 {
 402         struct sk_buff *skb;
 403 
 404         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) 
 405         {
 406                 skb->sk->dead=1;
 407                 tcp_close(skb->sk, 0);
 408                 kfree_skb(skb, FREE_READ);
 409         }
 410         return;
 411 }
 412 
 413 /*
 414  *      Enter the time wait state. 
 415  */
 416 
static void tcp_time_wait(struct sock *sk)
{
	/* Move to TIME_WAIT and mark both directions shut down */
	tcp_set_state(sk,TCP_TIME_WAIT);
	sk->shutdown = SHUTDOWN_MASK;
	/* Wake anyone sleeping on a state change, unless nobody cares */
	if (!sk->dead)
		sk->state_change(sk);
	/* Arm the MSL timer that will finally move the socket to CLOSED */
	reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}
 425 
 426 /*
 427  *      A socket has timed out on its send queue and wants to do a
 428  *      little retransmitting. Currently this means TCP.
 429  */
 430 
 431 void tcp_do_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 432 {
 433         struct sk_buff * skb;
 434         struct proto *prot;
 435         struct device *dev;
 436         int ct=0;
 437 
 438         prot = sk->prot;
 439         skb = sk->send_head;
 440 
 441         while (skb != NULL)
 442         {
 443                 struct tcphdr *th;
 444                 struct iphdr *iph;
 445                 int size;
 446 
 447                 dev = skb->dev;
 448                 IS_SKB(skb);
 449                 skb->when = jiffies;
 450 
 451                 /*
 452                  * In general it's OK just to use the old packet.  However we
 453                  * need to use the current ack and window fields.  Urg and
 454                  * urg_ptr could possibly stand to be updated as well, but we
 455                  * don't keep the necessary data.  That shouldn't be a problem,
 456                  * if the other end is doing the right thing.  Since we're
 457                  * changing the packet, we have to issue a new IP identifier.
 458                  */
 459 
 460                 iph = (struct iphdr *)(skb->data + dev->hard_header_len);
 461                 th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
 462                 size = skb->len - (((unsigned char *) th) - skb->data);
 463                 
 464                 /*
 465                  *      Note: We ought to check for window limits here but
 466                  *      currently this is done (less efficiently) elsewhere.
 467                  *      We do need to check for a route change but can't handle
 468                  *      that until we have the new 1.3.x buffers in.
 469                  *
 470                  */
 471 
 472                 iph->id = htons(ip_id_count++);
 473                 ip_send_check(iph);
 474 
 475                 /*
 476                  *      This is not the right way to handle this. We have to
 477                  *      issue an up to date window and ack report with this 
 478                  *      retransmit to keep the odd buggy tcp that relies on 
 479                  *      the fact BSD does this happy. 
 480                  *      We don't however need to recalculate the entire 
 481                  *      checksum, so someone wanting a small problem to play
 482                  *      with might like to implement RFC1141/RFC1624 and speed
 483                  *      this up by avoiding a full checksum.
 484                  */
 485                  
 486                 th->ack_seq = ntohl(sk->acked_seq);
 487                 th->window = ntohs(tcp_select_window(sk));
 488                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
 489                 
 490                 /*
 491                  *      If the interface is (still) up and running, kick it.
 492                  */
 493 
 494                 if (dev->flags & IFF_UP)
 495                 {
 496                         /*
 497                          *      If the packet is still being sent by the device/protocol
 498                          *      below then don't retransmit. This is both needed, and good -
 499                          *      especially with connected mode AX.25 where it stops resends
 500                          *      occurring of an as yet unsent anyway frame!
 501                          *      We still add up the counts as the round trip time wants
 502                          *      adjusting.
 503                          */
 504                         if (sk && !skb_device_locked(skb))
 505                         {
 506                                 /* Remove it from any existing driver queue first! */
 507                                 skb_unlink(skb);
 508                                 /* Now queue it */
 509                                 ip_statistics.IpOutRequests++;
 510                                 dev_queue_xmit(skb, dev, sk->priority);
 511                         }
 512                 }
 513 
 514                 /*
 515                  *      Count retransmissions
 516                  */
 517                  
 518                 ct++;
 519                 sk->prot->retransmits ++;
 520 
 521                 /*
 522                  *      Only one retransmit requested.
 523                  */
 524         
 525                 if (!all)
 526                         break;
 527 
 528                 /*
 529                  *      This should cut it off before we send too many packets.
 530                  */
 531 
 532                 if (ct >= sk->cong_window)
 533                         break;
 534                 skb = skb->link3;
 535         }
 536 }
 537 
 538 /*
 539  *      Reset the retransmission timer
 540  */
 541  
 542 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
     /* [previous][next][first][last][top][bottom][index][help] */
 543 {
 544         del_timer(&sk->retransmit_timer);
 545         sk->ip_xmit_timeout = why;
 546         if((int)when < 0)
 547         {
 548                 when=3;
 549                 printk("Error: Negative timer in xmit_timer\n");
 550         }
 551         sk->retransmit_timer.expires=when;
 552         add_timer(&sk->retransmit_timer);
 553 }
 554 
 555 /*
 556  *      This is the normal code called for timeouts.  It does the retransmission
 557  *      and then does backoff.  tcp_do_retransmit is separated out because
 558  *      tcp_ack needs to send stuff from the retransmit queue without
 559  *      initiating a backoff.
 560  */
 561 
 562 
 563 void tcp_retransmit_time(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 564 {
 565         tcp_do_retransmit(sk, all);
 566 
 567         /*
 568          * Increase the timeout each time we retransmit.  Note that
 569          * we do not increase the rtt estimate.  rto is initialized
 570          * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
 571          * that doubling rto each time is the least we can get away with.
 572          * In KA9Q, Karn uses this for the first few times, and then
 573          * goes to quadratic.  netBSD doubles, but only goes up to *64,
 574          * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
 575          * defined in the protocol as the maximum possible RTT.  I guess
 576          * we'll have to use something other than TCP to talk to the
 577          * University of Mars.
 578          *
 579          * PAWS allows us longer timeouts and large windows, so once
 580          * implemented ftp to mars will work nicely. We will have to fix
 581          * the 120 second clamps though!
 582          */
 583 
 584         sk->retransmits++;
 585         sk->backoff++;
 586         sk->rto = min(sk->rto << 1, 120*HZ);
 587         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
 588 }
 589 
 590 
 591 /*
 592  *      A timer event has trigger a tcp retransmit timeout. The
 593  *      socket xmit queue is ready and set up to send. Because
 594  *      the ack receive code keeps the queue straight we do
 595  *      nothing clever here.
 596  */
 597 
 598 static void tcp_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 599 {
 600         if (all) 
 601         {
 602                 tcp_retransmit_time(sk, all);
 603                 return;
 604         }
 605 
 606         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 607         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 608         sk->cong_count = 0;
 609 
 610         sk->cong_window = 1;
 611 
 612         /* Do the actual retransmit. */
 613         tcp_retransmit_time(sk, all);
 614 }
 615 
 616 /*
 617  *      A write timeout has occurred. Process the after effects.
 618  */
 619 
static int tcp_write_timeout(struct sock *sk)
/* [previous][next][first][last][top][bottom][index][help] */
{
        /*
         *      Handle the after effects of a retransmission timeout.
         *      Returns 0 if the socket has been moved to TCP_CLOSE (dead),
         *      1 if the caller should keep trying.
         */

        /*
         *      Look for a 'soft' timeout: every 8th retransmit while
         *      established, or more than TCP_RETR1 attempts in any
         *      other state.
         */
        if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
                || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
        {
                /*
                 *      Attempt to recover if arp has changed (unlikely!) or
                 *      a route has shifted (not supported prior to 1.3).
                 */
                arp_destroy (sk->daddr, 0);
                /*ip_route_check (sk->daddr);*/
        }
        /*
         *      Has it gone just too far ?  Past TCP_RETR2 attempts the
         *      connection is declared dead.
         */
        if (sk->retransmits > TCP_RETR2) 
        {
                sk->err = ETIMEDOUT;
                sk->error_report(sk);
                del_timer(&sk->retransmit_timer);
                /*
                 *      Time wait the socket: connections that were already
                 *      closing down go through TIME_WAIT so stray segments
                 *      are still handled for a while.
                 */
                if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) 
                {
                        tcp_set_state(sk,TCP_TIME_WAIT);
                        reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                }
                else
                {
                        /*
                         *      Clean up time.  Anything else is torn down
                         *      immediately; tell the caller to stop.
                         */
                        tcp_set_state(sk, TCP_CLOSE);
                        return 0;
                }
        }
        return 1;
}
 662 
 663 /*
 664  *      The TCP retransmit timer. This lacks a few small details.
 665  *
 666  *      1.      An initial rtt timeout on the probe0 should cause what we can
 667  *              of the first write queue buffer to be split and sent.
 668  *      2.      On a 'major timeout' as defined by RFC1122 we shouldn't report
 669  *              ETIMEDOUT if we know an additional 'soft' error caused this.
 670  *              tcp_err should save a 'soft error' for us.
 671  */
 672 
static void retransmit_timer(unsigned long data)
/* [previous][next][first][last][top][bottom][index][help] */
{
        /*
         *      Timer callback for all TCP transmit-side timers.  'data'
         *      is the socket; sk->ip_xmit_timeout says why the timer
         *      was armed (TIME_PROBE0, TIME_WRITE or TIME_KEEPOPEN).
         */
        struct sock *sk = (struct sock*)data;
        int why = sk->ip_xmit_timeout;

        /* 
         * only process if socket is not in use
         */

        cli();
        if (sk->inuse || in_bh) 
        {
                /* Try again in 1 second */
                sk->retransmit_timer.expires = HZ;
                add_timer(&sk->retransmit_timer);
                sti();
                return;
        }

        /* Claim the socket before re-enabling interrupts. */
        sk->inuse = 1;
        sti();

        /* Always see if we need to send an ack. */

        if (sk->ack_backlog && !sk->zapped) 
        {
                sk->prot->read_wakeup (sk);
                if (! sk->dead)
                        sk->data_ready(sk,0);
        }

        /* Now we need to figure out why the socket was on the timer. */

        switch (why) 
        {
                /* Window probing */
                case TIME_PROBE0:
                        tcp_send_probe0(sk);
                        tcp_write_timeout(sk);
                        break;
                /* Retransmitting */
                case TIME_WRITE:
                        /* It could be we got here because we needed to send an ack.
                         * So we need to check for that.
                         */
                {
                        struct sk_buff *skb;
                        unsigned long flags;

                        save_flags(flags);
                        cli();
                        skb = sk->send_head;
                        if (!skb) 
                        {
                                /* Nothing unacked: nothing to retransmit. */
                                restore_flags(flags);
                        } 
                        else 
                        {
                                /*
                                 *      Kicked by a delayed ack. Reset timer
                                 *      correctly now
                                 */
                                if (jiffies < skb->when + sk->rto) 
                                {
                                        reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
                                        restore_flags(flags);
                                        break;
                                }
                                restore_flags(flags);
                                /*
                                 *      Retransmission: resend the head and
                                 *      apply the timeout consequences.
                                 */
                                sk->prot->retransmit (sk, 0);
                                tcp_write_timeout(sk);
                        }
                        break;
                }
                /* Sending Keepalives */
                case TIME_KEEPOPEN:
                        /* 
                         * this reset_timer() call is a hack, this is not
                         * how KEEPOPEN is supposed to work.
                         */
                        reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

                        /* Send something to keep the connection open. */
                        if (sk->prot->write_wakeup)
                                  sk->prot->write_wakeup (sk);
                        sk->retransmits++;
                        tcp_write_timeout(sk);
                        break;
                default:
                        printk ("rexmit_timer: timer expired - reason unknown\n");
                        break;
        }
        release_sock(sk);
}
 770 
 771 /*
 772  * This routine is called by the ICMP module when it gets some
 773  * sort of error condition.  If err < 0 then the socket should
 774  * be closed and the error returned to the user.  If err > 0
 775  * it's just the icmp type << 8 | icmp code.  After adjustment
 776  * header points to the first 8 bytes of the tcp header.  We need
 777  * to find the appropriate port.
 778  */
 779 
void tcp_err(int err, unsigned char *header, unsigned long daddr,
/* [previous][next][first][last][top][bottom][index][help] */
        unsigned long saddr, struct inet_protocol *protocol)
{
        struct tcphdr *th;
        struct sock *sk;
        struct iphdr *iph=(struct iphdr *)header;
  
        /* Skip the variable length IP header to reach the quoted TCP header. */
        header+=4*iph->ihl;
   

        th =(struct tcphdr *)header;
        /*
         *      The quoted header is from a segment WE sent, so the lookup
         *      uses th->source as our local port with daddr/saddr reversed.
         */
        sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

        if (sk == NULL) 
                return;
  
        /*
         *      err < 0: hard locally detected error; report it and stop.
         */
        if(err<0)
        {
                sk->err = -err;
                sk->error_report(sk);
                return;
        }

        if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8)) 
        {
                /*
                 * FIXME:
                 * For now we will just trigger a linear backoff.
                 * The slow start code should cause a real backoff here.
                 */
                if (sk->cong_window > 4)
                        sk->cong_window--;
                return;
        }

/*      sk->err = icmp_err_convert[err & 0xff].errno;  -- moved as TCP should hide non fatals internally (and does) */

        /*
         * If we've already connected we will keep trying
         * until we time out, or the user gives up.
         */

        /*
         *      Fatal ICMP errors -- or any error while still in SYN_SENT --
         *      abort the connection attempt and surface the errno.
         */
        if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT) 
        {
                if (sk->state == TCP_SYN_SENT) 
                {
                        tcp_statistics.TcpAttemptFails++;
                        tcp_set_state(sk,TCP_CLOSE);
                        sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
                }
                sk->err = icmp_err_convert[err & 0xff].errno;           
        }
        return;
}
 834 
 835 
 836 /*
 837  *      Walk down the receive queue counting readable data until we hit the end or we find a gap
 838  *      in the received data queue (ie a frame missing that needs sending to us). Not
 839  *      sorting using two queues as data arrives makes life so much harder.
 840  */
 841 
static int tcp_readable(struct sock *sk)
/* [previous][next][first][last][top][bottom][index][help] */
{
        /*
         *      Count how many bytes a read() could currently return,
         *      starting from sk->copied_seq, stopping at the first hole
         *      (missing segment) or after a PSH.  Urgent bytes are not
         *      counted.  Returns 0 for a NULL socket or empty queue.
         */
        unsigned long counted;
        unsigned long amount;
        struct sk_buff *skb;
        int sum;
        unsigned long flags;

        if(sk && sk->debug)
                printk("tcp_readable: %p - ",sk);

        /* Walk the queue with interrupts off so it cannot change under us. */
        save_flags(flags);
        cli();
        if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
        {
                restore_flags(flags);
                if(sk && sk->debug) 
                        printk("empty\n");
                return(0);
        }
  
        counted = sk->copied_seq;       /* Where we are at the moment */
        amount = 0;
  
        /* 
         *      Do until a push or until we are out of data. 
         */
         
        do 
        {
                if (before(counted, skb->h.th->seq))    /* Found a hole so stops here */
                        break;
                sum = skb->len -(counted - skb->h.th->seq);     /* Length - header but start from where we are up to (avoid overlaps) */
                if (skb->h.th->syn)
                        sum++;
                if (sum > 0) 
                {                                       /* Add it up, move on */
                        amount += sum;
                        if (skb->h.th->syn) 
                                amount--;
                        counted += sum;
                }
                /*
                 * Don't count urg data ... but do it in the right place!
                 * Consider: "old_data (ptr is here) URG PUSH data"
                 * The old code would stop at the first push because
                 * it counted the urg (amount==1) and then does amount--
                 * *after* the loop.  This means tcp_readable() always
                 * returned zero if any URG PUSH was in the queue, even
                 * though there was normal data available. If we subtract
                 * the urg data right here, we even get it to work for more
                 * than one URG PUSH skb without normal data.
                 * This means that select() finally works now with urg data
                 * in the queue.  Note that rlogin was never affected
                 * because it doesn't use select(); it uses two processes
                 * and a blocking read().  And the queue scan in tcp_read()
                 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
                 */
                if (skb->h.th->urg)
                        amount--;       /* don't count urg data */
                if (amount && skb->h.th->psh) break;
                skb = skb->next;
        }
        while(skb != (struct sk_buff *)&sk->receive_queue);

        restore_flags(flags);
        if(sk->debug)
                printk("got %lu bytes.\n",amount);
        return(amount);
}
 912 
 913 /*
 914  * LISTEN is a special case for select..
 915  */
 916 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
 917 {
 918         if (sel_type == SEL_IN) {
 919                 int retval;
 920 
 921                 sk->inuse = 1;
 922                 retval = (tcp_find_established(sk) != NULL);
 923                 release_sock(sk);
 924                 if (!retval)
 925                         select_wait(&master_select_wakeup,wait);
 926                 return retval;
 927         }
 928         return 0;
 929 }
 930 
 931 
 932 /*
 933  *      Wait for a TCP event.
 934  *
 935  *      Note that we don't need to set "sk->inuse", as the upper select layers
 936  *      take care of normal races (between the test and the event) and we don't
 937  *      go look at any of the socket buffers directly.
 938  */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
/* [previous][next][first][last][top][bottom][index][help] */
{
        if (sk->state == TCP_LISTEN)
                return tcp_listen_select(sk, sel_type, wait);

        switch(sel_type) {
        case SEL_IN:
                /* Errors and receive shutdown always make the socket readable. */
                if (sk->err)
                        return 1;
                if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
                        break;

                if (sk->shutdown & RCV_SHUTDOWN)
                        return 1;
                        
                /* Nothing acked beyond what the user has already copied. */
                if (sk->acked_seq == sk->copied_seq)
                        break;

                /*
                 *      Data is pending.  The only case that is NOT readable
                 *      is when the single available byte is out-of-band data
                 *      that will not be delivered inline.
                 */
                if (sk->urg_seq != sk->copied_seq ||
                    sk->acked_seq != sk->copied_seq+1 ||
                    sk->urginline || !sk->urg_data)
                        return 1;
                break;

        case SEL_OUT:
                if (sk->err)
                        return 1;
                if (sk->shutdown & SEND_SHUTDOWN) 
                        return 0;
                if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
                        break;
                /*
                 * This is now right thanks to a small fix
                 * by Matt Dillon.
                 */

                /* Writable only if a full MTU frame plus slack would fit. */
                if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
                        break;
                return 1;

        case SEL_EX:
                /* Exceptional condition == urgent data pending. */
                if (sk->urg_data)
                        return 1;
                break;
        }
        /* Not ready: register on the socket's wait queue for a wakeup. */
        select_wait(sk->sleep, wait);
        return 0;
}
 987 
 988 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
     /* [previous][next][first][last][top][bottom][index][help] */
 989 {
 990         int err;
 991         switch(cmd) 
 992         {
 993 
 994                 case TIOCINQ:
 995 #ifdef FIXME    /* FIXME: */
 996                 case FIONREAD:
 997 #endif
 998                 {
 999                         unsigned long amount;
1000 
1001                         if (sk->state == TCP_LISTEN) 
1002                                 return(-EINVAL);
1003 
1004                         sk->inuse = 1;
1005                         amount = tcp_readable(sk);
1006                         release_sock(sk);
1007                         err=verify_area(VERIFY_WRITE,(void *)arg,
1008                                                    sizeof(unsigned long));
1009                         if(err)
1010                                 return err;
1011                         put_fs_long(amount,(unsigned long *)arg);
1012                         return(0);
1013                 }
1014                 case SIOCATMARK:
1015                 {
1016                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
1017 
1018                         err = verify_area(VERIFY_WRITE,(void *) arg,
1019                                                   sizeof(unsigned long));
1020                         if (err)
1021                                 return err;
1022                         put_fs_long(answ,(int *) arg);
1023                         return(0);
1024                 }
1025                 case TIOCOUTQ:
1026                 {
1027                         unsigned long amount;
1028 
1029                         if (sk->state == TCP_LISTEN) return(-EINVAL);
1030                         amount = sk->prot->wspace(sk);
1031                         err=verify_area(VERIFY_WRITE,(void *)arg,
1032                                                    sizeof(unsigned long));
1033                         if(err)
1034                                 return err;
1035                         put_fs_long(amount,(unsigned long *)arg);
1036                         return(0);
1037                 }
1038                 default:
1039                         return(-EINVAL);
1040         }
1041 }
1042 
1043 
1044 /*
1045  *      This routine computes a TCP checksum. 
1046  *
1047  *      Modified January 1995 from a go-faster DOS routine by
1048  *      Jorge Cwik <jorge@laser.satlink.net>
1049  */
1050  
1051 unsigned short tcp_check(struct tcphdr *th, int len,
     /* [previous][next][first][last][top][bottom][index][help] */
1052           unsigned long saddr, unsigned long daddr)
1053 {     
1054         return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,
1055                 csum_partial((char *)th,len,0));
1056 }
1057 
1058 
1059 
1060 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
     /* [previous][next][first][last][top][bottom][index][help] */
1061                 unsigned long daddr, int len, struct sock *sk)
1062 {
1063         th->check = 0;
1064         th->check = tcp_check(th, len, saddr, daddr);
1065         return;
1066 }
1067 
1068 /*
1069  *      This is the main buffer sending routine. We queue the buffer
1070  *      having checked it is sane seeming.
1071  */
1072  
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
/* [previous][next][first][last][top][bottom][index][help] */
{
        int size;
        struct tcphdr * th = skb->h.th;

        /*
         *      length of packet (not counting length of pre-tcp headers) 
         */
         
        size = skb->len - ((unsigned char *) th - skb->data);

        /*
         *      Sanity check it.. 
         */
         
        if (size < sizeof(struct tcphdr) || size > skb->len) 
        {
                printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
                        skb, skb->data, th, skb->len);
                kfree_skb(skb, FREE_WRITE);
                return;
        }

        /*
         *      If we have queued a header size packet.. (these crash a few
         *      tcp stacks if ack is not set)
         */
         
        if (size == sizeof(struct tcphdr)) 
        {
                /* If it's got a syn or fin it's notionally included in the size..*/
                if(!th->syn && !th->fin) 
                {
                        printk("tcp_send_skb: attempt to queue a bogon.\n");
                        kfree_skb(skb,FREE_WRITE);
                        return;
                }
        }

        /*
         *      Actual processing.
         */
         
        tcp_statistics.TcpOutSegs++;  
        /* End sequence of the frame: start seq plus payload (size minus header). */
        skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
        
        /*
         *      We must queue if
         *
         *      a) The right edge of this frame exceeds the window
         *      b) We are retransmitting (Nagle's rule)
         *      c) We have too many packets 'in flight'
         */
         
        if (after(skb->h.seq, sk->window_seq) ||
            (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
             sk->packets_out >= sk->cong_window) 
        {
                /* checksum will be supplied by tcp_write_xmit.  So
                 * we shouldn't need to set it at all.  I'm being paranoid */
                th->check = 0;
                if (skb->next != NULL) 
                {
                        printk("tcp_send_partial: next != NULL\n");
                        skb_unlink(skb);
                }
                skb_queue_tail(&sk->write_queue, skb);
                
                /*
                 *      If we don't fit we have to start the zero window
                 *      probes. This is broken - we really need to do a partial
                 *      send _first_ (This is what causes the Cisco and PC/TCP
                 *      grief).
                 */
                 
                if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
                    sk->send_head == NULL && sk->ack_backlog == 0)
                        reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
        } 
        else 
        {
                /*
                 *      This is going straight out: piggyback the latest
                 *      ack and window before checksumming.
                 */
                 
                th->ack_seq = ntohl(sk->acked_seq);
                th->window = ntohs(tcp_select_window(sk));

                tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

                sk->sent_seq = sk->write_seq;
                
                /*
                 *      This is mad. The tcp retransmit queue is put together
                 *      by the ip layer. This causes half the problems with
                 *      unroutable FIN's and other things.
                 */
                 
                sk->prot->queue_xmit(sk, skb->dev, skb, 0);
                
                /*
                 *      Set for next retransmit based on expected ACK time.
                 *      FIXME: We set this every time which means our 
                 *      retransmits are really about a window behind.
                 */

                reset_xmit_timer(sk, TIME_WRITE, sk->rto);
        }
}
1182 
1183 /*
1184  *      Locking problems lead us to a messy situation where we can have
1185  *      multiple partially complete buffers queued up. This is really bad
1186  *      as we don't want to be sending partial buffers. Fix this with
1187  *      a semaphore or similar to lock tcp_write per socket.
1188  *
1189  *      These routines are pretty self descriptive.
1190  */
1191  
1192 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1193 {
1194         struct sk_buff * skb;
1195         unsigned long flags;
1196 
1197         save_flags(flags);
1198         cli();
1199         skb = sk->partial;
1200         if (skb) {
1201                 sk->partial = NULL;
1202                 del_timer(&sk->partial_timer);
1203         }
1204         restore_flags(flags);
1205         return skb;
1206 }
1207 
1208 /*
1209  *      Empty the partial queue
1210  */
1211  
1212 static void tcp_send_partial(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1213 {
1214         struct sk_buff *skb;
1215 
1216         if (sk == NULL)
1217                 return;
1218         while ((skb = tcp_dequeue_partial(sk)) != NULL)
1219                 tcp_send_skb(sk, skb);
1220 }
1221 
1222 /*
1223  *      Queue a partial frame
1224  */
1225  
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
/* [previous][next][first][last][top][bottom][index][help] */
{
        struct sk_buff * tmp;
        unsigned long flags;

        save_flags(flags);
        cli();
        /* Swap in the new partial frame, remembering any displaced one. */
        tmp = sk->partial;
        if (tmp)
                del_timer(&sk->partial_timer);
        sk->partial = skb;
        init_timer(&sk->partial_timer);
        /*
         *      Wait up to 1 second for the buffer to fill.
         *      NOTE(review): expires is set to HZ, presumably relative to
         *      now in this timer API -- confirm against add_timer().
         */
        sk->partial_timer.expires = HZ;
        sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
        sk->partial_timer.data = (unsigned long) sk;
        add_timer(&sk->partial_timer);
        restore_flags(flags);
        /* Send the displaced frame (if any) outside the cli() region. */
        if (tmp)
                tcp_send_skb(sk, tmp);
}
1249 
1250 
1251 /*
1252  *      This routine sends an ack and also updates the window. 
1253  */
1254  
1255 static void tcp_send_ack(u32 sequence, u32 ack,
     /* [previous][next][first][last][top][bottom][index][help] */
1256              struct sock *sk,
1257              struct tcphdr *th, unsigned long daddr)
1258 {
1259         struct sk_buff *buff;
1260         struct tcphdr *t1;
1261         struct device *dev = NULL;
1262         int tmp;
1263 
1264         if(sk->zapped)
1265                 return;         /* We have been reset, we may not send again */
1266                 
1267         /*
1268          * We need to grab some memory, and put together an ack,
1269          * and then put it into the queue to be sent.
1270          */
1271 
1272         buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1273         if (buff == NULL) 
1274         {
1275                 /* 
1276                  *      Force it to send an ack. We don't have to do this
1277                  *      (ACK is unreliable) but it's much better use of 
1278                  *      bandwidth on slow links to send a spare ack than
1279                  *      resend packets. 
1280                  */
1281                  
1282                 sk->ack_backlog++;
1283                 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) 
1284                 {
1285                         reset_xmit_timer(sk, TIME_WRITE, HZ);
1286                 }
1287                 return;
1288         }
1289 
1290         /*
1291          *      Assemble a suitable TCP frame
1292          */
1293          
1294         buff->len = sizeof(struct tcphdr);
1295         buff->sk = sk;
1296         buff->localroute = sk->localroute;
1297         t1 =(struct tcphdr *) buff->data;
1298 
1299         /* 
1300          *      Put in the IP header and routing stuff. 
1301          */
1302          
1303         tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1304                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1305         if (tmp < 0) 
1306         {
1307                 buff->free = 1;
1308                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1309                 return;
1310         }
1311         buff->len += tmp;
1312         t1 =(struct tcphdr *)((char *)t1 +tmp);
1313 
1314         memcpy(t1, th, sizeof(*t1));
1315 
1316         /*
1317          *      Swap the send and the receive. 
1318          */
1319          
1320         t1->dest = th->source;
1321         t1->source = th->dest;
1322         t1->seq = ntohl(sequence);
1323         t1->ack = 1;
1324         sk->window = tcp_select_window(sk);
1325         t1->window = ntohs(sk->window);
1326         t1->res1 = 0;
1327         t1->res2 = 0;
1328         t1->rst = 0;
1329         t1->urg = 0;
1330         t1->syn = 0;
1331         t1->psh = 0;
1332         t1->fin = 0;
1333         
1334         /*
1335          *      If we have nothing queued for transmit and the transmit timer
1336          *      is on we are just doing an ACK timeout and need to switch
1337          *      to a keepalive.
1338          */
1339          
1340         if (ack == sk->acked_seq) 
1341         {
1342                 sk->ack_backlog = 0;
1343                 sk->bytes_rcv = 0;
1344                 sk->ack_timed = 0;
1345                 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
1346                                   && sk->ip_xmit_timeout == TIME_WRITE) 
1347                 {
1348                         if(sk->keepopen) {
1349                                 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1350                         } else {
1351                                 delete_timer(sk);
1352                         }
1353                 }
1354         }
1355         
1356         /*
1357          *      Fill in the packet and send it
1358          */
1359          
1360         t1->ack_seq = ntohl(ack);
1361         t1->doff = sizeof(*t1)/4;
1362         tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1363         if (sk->debug)
1364                  printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1365         tcp_statistics.TcpOutSegs++;
1366         sk->prot->queue_xmit(sk, dev, buff, 1);
1367 }
1368 
1369 
1370 /* 
1371  *      This routine builds a generic TCP header. 
1372  */
1373  
extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
/* [previous][next][first][last][top][bottom][index][help] */
{

        /* Start from the socket's template header (ports etc.). */
        memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
        th->seq = htonl(sk->write_seq);
        /* 'push' really means "more data follows": PSH is set only when it is 0. */
        th->psh =(push == 0) ? 1 : 0;
        th->doff = sizeof(*th)/4;
        th->ack = 1;
        th->fin = 0;
        /*
         *      This segment acknowledges everything received so far, so
         *      clear the delayed-ack bookkeeping.
         */
        sk->ack_backlog = 0;
        sk->bytes_rcv = 0;
        sk->ack_timed = 0;
        th->ack_seq = htonl(sk->acked_seq);
        sk->window = tcp_select_window(sk);
        th->window = htons(sk->window);

        /* Returns the number of header bytes written. */
        return(sizeof(*th));
}
1392 
1393 /*
1394  *      This routine copies from a user buffer into a socket,
1395  *      and starts the transmit system.
1396  */
1397 
1398 static int tcp_write(struct sock *sk, unsigned char *from,
     /* [previous][next][first][last][top][bottom][index][help] */
1399           int len, int nonblock, unsigned flags)
1400 {
1401         int copied = 0;
1402         int copy;
1403         int tmp;
1404         struct sk_buff *skb;
1405         struct sk_buff *send_tmp;
1406         unsigned char *buff;
1407         struct proto *prot;
1408         struct device *dev = NULL;
1409 
1410         sk->inuse=1;
1411         prot = sk->prot;
1412         while(len > 0) 
1413         {
1414                 if (sk->err) 
1415                 {                       /* Stop on an error */
1416                         release_sock(sk);
1417                         if (copied) 
1418                                 return(copied);
1419                         tmp = -sk->err;
1420                         sk->err = 0;
1421                         return(tmp);
1422                 }
1423 
1424                 /*
1425                  *      First thing we do is make sure that we are established. 
1426                  */
1427         
1428                 if (sk->shutdown & SEND_SHUTDOWN) 
1429                 {
1430                         release_sock(sk);
1431                         sk->err = EPIPE;
1432                         if (copied) 
1433                                 return(copied);
1434                         sk->err = 0;
1435                         return(-EPIPE);
1436                 }
1437 
1438                 /* 
1439                  *      Wait for a connection to finish.
1440                  */
1441         
1442                 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
1443                 {
1444                         if (sk->err) 
1445                         {
1446                                 release_sock(sk);
1447                                 if (copied) 
1448                                         return(copied);
1449                                 tmp = -sk->err;
1450                                 sk->err = 0;
1451                                 return(tmp);
1452                         }
1453 
1454                         if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
1455                         {
1456                                 release_sock(sk);
1457                                 if (copied) 
1458                                         return(copied);
1459 
1460                                 if (sk->err) 
1461                                 {
1462                                         tmp = -sk->err;
1463                                         sk->err = 0;
1464                                         return(tmp);
1465                                 }
1466 
1467                                 if (sk->keepopen) 
1468                                 {
1469                                         send_sig(SIGPIPE, current, 0);
1470                                 }
1471                                 return(-EPIPE);
1472                         }
1473 
1474                         if (nonblock || copied) 
1475                         {
1476                                 release_sock(sk);
1477                                 if (copied) 
1478                                         return(copied);
1479                                 return(-EAGAIN);
1480                         }
1481 
1482                         release_sock(sk);
1483                         cli();
1484                 
1485                         if (sk->state != TCP_ESTABLISHED &&
1486                                 sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
1487                         {
1488                                 interruptible_sleep_on(sk->sleep);
1489                                 if (current->signal & ~current->blocked) 
1490                                 {
1491                                         sti();
1492                                         if (copied) 
1493                                                 return(copied);
1494                                         return(-ERESTARTSYS);
1495                                 }
1496                         }
1497                         sk->inuse = 1;
1498                         sti();
1499                 }
1500 
1501         /*
1502          * The following code can result in copy <= if sk->mss is ever
1503          * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
1504          * sk->mtu is constant once SYN processing is finished.  I.e. we
1505          * had better not get here until we've seen his SYN and at least one
1506          * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
1507          * But ESTABLISHED should guarantee that.  sk->max_window is by definition
1508          * non-decreasing.  Note that any ioctl to set user_mss must be done
1509          * before the exchange of SYN's.  If the initial ack from the other
1510          * end has a window of 0, max_window and thus mss will both be 0.
1511          */
1512 
1513         /* 
1514          *      Now we need to check if we have a half built packet. 
1515          */
1516 
1517                 if ((skb = tcp_dequeue_partial(sk)) != NULL) 
1518                 {
1519                         int hdrlen;
1520 
1521                          /* IP header + TCP header */
1522                         hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1523                                  + sizeof(struct tcphdr);
1524         
1525                         /* Add more stuff to the end of skb->len */
1526                         if (!(flags & MSG_OOB)) 
1527                         {
1528                                 copy = min(sk->mss - (skb->len - hdrlen), len);
1529                                 /* FIXME: this is really a bug. */
1530                                 if (copy <= 0) 
1531                                 {
1532                                         printk("TCP: **bug**: \"copy\" <= 0!!\n");
1533                                         copy = 0;
1534                                 }
1535           
1536                                 memcpy_fromfs(skb->data + skb->len, from, copy);
1537                                 skb->len += copy;
1538                                 from += copy;
1539                                 copied += copy;
1540                                 len -= copy;
1541                                 sk->write_seq += copy;
1542                         }
1543                         if ((skb->len - hdrlen) >= sk->mss ||
1544                                 (flags & MSG_OOB) || !sk->packets_out)
1545                                 tcp_send_skb(sk, skb);
1546                         else
1547                                 tcp_enqueue_partial(skb, sk);
1548                         continue;
1549                 }
1550 
1551         /*
1552          * We also need to worry about the window.
1553          * If window < 1/2 the maximum window we've seen from this
1554          *   host, don't use it.  This is sender side
1555          *   silly window prevention, as specified in RFC1122.
1556          *   (Note that this is different than earlier versions of
1557          *   SWS prevention, e.g. RFC813.).  What we actually do is 
1558          *   use the whole MSS.  Since the results in the right
1559          *   edge of the packet being outside the window, it will
1560          *   be queued for later rather than sent.
1561          */
1562 
1563                 copy = sk->window_seq - sk->write_seq;
1564                 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1565                         copy = sk->mss;
1566                 if (copy > len)
1567                         copy = len;
1568 
1569         /*
1570          *      We should really check the window here also. 
1571          */
1572          
1573                 send_tmp = NULL;
1574                 if (copy < sk->mss && !(flags & MSG_OOB)) 
1575                 {
1576                         /*
1577                          *      We will release the socket in case we sleep here. 
1578                          */
1579                         release_sock(sk);
1580                         /*
1581                          *      NB: following must be mtu, because mss can be increased.
1582                          *      mss is always <= mtu 
1583                          */
1584                         skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
1585                         sk->inuse = 1;
1586                         send_tmp = skb;
1587                 } 
1588                 else 
1589                 {
1590                         /*
1591                          *      We will release the socket in case we sleep here. 
1592                          */
1593                         release_sock(sk);
1594                         skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
1595                         sk->inuse = 1;
1596                 }
1597 
1598                 /*
1599                  *      If we didn't get any memory, we need to sleep. 
1600                  */
1601 
1602                 if (skb == NULL) 
1603                 {
1604                         sk->socket->flags |= SO_NOSPACE;
1605                         if (nonblock) 
1606                         {
1607                                 release_sock(sk);
1608                                 if (copied) 
1609                                         return(copied);
1610                                 return(-EAGAIN);
1611                         }
1612 
1613                         /*
1614                          *      FIXME: here is another race condition. 
1615                          */
1616 
1617                         tmp = sk->wmem_alloc;
1618                         release_sock(sk);
1619                         cli();
1620                         /*
1621                          *      Again we will try to avoid it. 
1622                          */
1623                         if (tmp <= sk->wmem_alloc &&
1624                                   (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1625                                 && sk->err == 0) 
1626                         {
1627                                 sk->socket->flags &= ~SO_NOSPACE;
1628                                 interruptible_sleep_on(sk->sleep);
1629                                 if (current->signal & ~current->blocked) 
1630                                 {
1631                                         sti();
1632                                         if (copied) 
1633                                                 return(copied);
1634                                         return(-ERESTARTSYS);
1635                                 }
1636                         }
1637                         sk->inuse = 1;
1638                         sti();
1639                         continue;
1640                 }
1641 
1642                 skb->len = 0;
1643                 skb->sk = sk;
1644                 skb->free = 0;
1645                 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1646         
1647                 buff = skb->data;
1648         
1649                 /*
1650                  * FIXME: we need to optimize this.
1651                  * Perhaps some hints here would be good.
1652                  */
1653                 
1654                 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1655                                  IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
1656                 if (tmp < 0 ) 
1657                 {
1658                         prot->wfree(sk, skb->mem_addr, skb->mem_len);
1659                         release_sock(sk);
1660                         if (copied) 
1661                                 return(copied);
1662                         return(tmp);
1663                 }
1664                 skb->len += tmp;
1665                 skb->dev = dev;
1666                 buff += tmp;
1667                 skb->h.th =(struct tcphdr *) buff;
1668                 tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
1669                 if (tmp < 0) 
1670                 {
1671                         prot->wfree(sk, skb->mem_addr, skb->mem_len);
1672                         release_sock(sk);
1673                         if (copied) 
1674                                 return(copied);
1675                         return(tmp);
1676                 }
1677 
1678                 if (flags & MSG_OOB) 
1679                 {
1680                         ((struct tcphdr *)buff)->urg = 1;
1681                         ((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
1682                 }
1683                 skb->len += tmp;
1684                 memcpy_fromfs(buff+tmp, from, copy);
1685 
1686                 from += copy;
1687                 copied += copy;
1688                 len -= copy;
1689                 skb->len += copy;
1690                 skb->free = 0;
1691                 sk->write_seq += copy;
1692         
1693                 if (send_tmp != NULL && sk->packets_out) 
1694                 {
1695                         tcp_enqueue_partial(send_tmp, sk);
1696                         continue;
1697                 }
1698                 tcp_send_skb(sk, skb);
1699         }
1700         sk->err = 0;
1701 
1702 /*
1703  *      Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
1704  *      interactive fast network servers. It's meant to be on and
1705  *      it really improves the throughput though not the echo time
1706  *      on my slow slip link - Alan
1707  */
1708 
1709 /*
1710  *      Avoid possible race on send_tmp - c/o Johannes Stille 
1711  */
1712  
1713         if(sk->partial && ((!sk->packets_out) 
1714      /* If not nagling we can send on the before case too.. */
1715               || (sk->nonagle && before(sk->write_seq , sk->window_seq))
1716         ))
1717                 tcp_send_partial(sk);
1718 
1719         release_sock(sk);
1720         return(copied);
1721 }
1722 
1723 /*
1724  *      This is just a wrapper. 
1725  */
1726 
1727 static int tcp_sendto(struct sock *sk, unsigned char *from,
     /* [previous][next][first][last][top][bottom][index][help] */
1728            int len, int nonblock, unsigned flags,
1729            struct sockaddr_in *addr, int addr_len)
1730 {
1731         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1732                 return -EINVAL;
1733         if (sk->state == TCP_CLOSE)
1734                 return -ENOTCONN;
1735         if (addr_len < sizeof(*addr))
1736                 return -EINVAL;
1737         if (addr->sin_family && addr->sin_family != AF_INET) 
1738                 return -EINVAL;
1739         if (addr->sin_port != sk->dummy_th.dest) 
1740                 return -EISCONN;
1741         if (addr->sin_addr.s_addr != sk->daddr) 
1742                 return -EISCONN;
1743         return tcp_write(sk, from, len, nonblock, flags);
1744 }
1745 
1746 
1747 /*
1748  *      Send an ack if one is backlogged at this point. Ought to merge
1749  *      this with tcp_send_ack().
1750  */
1751  
1752 static void tcp_read_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1753 {
1754         int tmp;
1755         struct device *dev = NULL;
1756         struct tcphdr *t1;
1757         struct sk_buff *buff;
1758 
             /* No ack is backlogged: nothing to send. */
1759         if (!sk->ack_backlog) 
1760                 return;
1761 
1762         /*
1763          * FIXME: we need to put code here to prevent this routine from
1764          * being called.  Being called once in a while is ok, so only check
1765          * if this is the second time in a row.
1766          */
1767 
1768         /*
1769          * We need to grab some memory, and put together an ack,
1770          * and then put it into the queue to be sent.
1771          */
1772 
             /* GFP_ATOMIC: this can run from the timer/interrupt path,
                so we must not sleep waiting for buffer memory. */
1773         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
1774         if (buff == NULL) 
1775         {
1776                 /* Try again real soon. */
1777                 reset_xmit_timer(sk, TIME_WRITE, HZ);
1778                 return;
1779         }
1780 
1781         buff->len = sizeof(struct tcphdr);
1782         buff->sk = sk;
1783         buff->localroute = sk->localroute;
1784         
1785         /*
1786          *      Put in the IP header and routing stuff. 
1787          */
1788 
1789         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1790                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1791         if (tmp < 0) 
1792         {
                     /* No route or header room: drop the ack silently.
                        The peer will retransmit and provoke another. */
1793                 buff->free = 1;
1794                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1795                 return;
1796         }
1797 
1798         buff->len += tmp;
1799         t1 =(struct tcphdr *)(buff->data +tmp);
1800 
             /* Start from the socket's template header and turn it into
                a pure ack: ACK bit only, current send sequence, and a
                freshly selected receive window. */
1801         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
1802         t1->seq = htonl(sk->sent_seq);
1803         t1->ack = 1;
1804         t1->res1 = 0;
1805         t1->res2 = 0;
1806         t1->rst = 0;
1807         t1->urg = 0;
1808         t1->syn = 0;
1809         t1->psh = 0;
             /* This ack covers everything received so far, so clear the
                backlog bookkeeping. */
1810         sk->ack_backlog = 0;
1811         sk->bytes_rcv = 0;
1812         sk->window = tcp_select_window(sk);
             /* ntohs is its own inverse; this is the conventional htons. */
1813         t1->window = ntohs(sk->window);
1814         t1->ack_seq = ntohl(sk->acked_seq);
1815         t1->doff = sizeof(*t1)/4;
1816         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1817         sk->prot->queue_xmit(sk, dev, buff, 1);
1818         tcp_statistics.TcpOutSegs++;
1819 }
1820 
1821 
1822 /*
1823  *      FIXME:
1824  *      This routine frees used buffers.
1825  *      It should consider sending an ACK to let the
1826  *      other end know we now have a bigger window.
1827  */
1828 
1829 static void cleanup_rbuf(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1830 {
1831         unsigned long flags;
1832         unsigned long left;
1833         struct sk_buff *skb;
1834         unsigned long rspace;
1835 
1836         if(sk->debug)
1837                 printk("cleaning rbuf for sk=%p\n", sk);
1838   
             /* Interrupts off: tcp_data() appends to receive_queue at
                interrupt time, so the scan below must be atomic. */
1839         save_flags(flags);
1840         cli();
1841   
             /* Remember the receive space before freeing, so we can tell
                afterwards whether the window actually grew. */
1842         left = sk->prot->rspace(sk);
1843  
1844         /*
1845          *      We have to loop through all the buffer headers,
1846          *      and try to free up all the space we can.
1847          */
1848 
1849         while((skb=skb_peek(&sk->receive_queue)) != NULL) 
1850         {
                     /* Stop at the first skb that is unread, or that a
                        sleeping reader still holds (skb->users). */
1851                 if (!skb->used || skb->users) 
1852                         break;
1853                 skb_unlink(skb);
                     /* Make sure the free is charged back to this socket. */
1854                 skb->sk = sk;
1855                 kfree_skb(skb, FREE_READ);
1856         }
1857 
1858         restore_flags(flags);
1859 
1860         /*
1861          *      FIXME:
1862          *      At this point we should send an ack if the difference
1863          *      in the window, and the amount of space is bigger than
1864          *      TCP_WINDOW_DIFF.
1865          */
1866 
1867         if(sk->debug)
1868                 printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
1869                                             left);
1870         if ((rspace=sk->prot->rspace(sk)) != left) 
1871         {
1872                 /*
1873                  * This area has caused the most trouble.  The current strategy
1874                  * is to simply do nothing if the other end has room to send at
1875                  * least 3 full packets, because the ack from those will auto-
1876                  * matically update the window.  If the other end doesn't think
1877                  * we have much space left, but we have room for at least 1 more
1878                  * complete packet than it thinks we do, we will send an ack
1879                  * immediately.  Otherwise we will wait up to .5 seconds in case
1880                  * the user reads some more.
1881                  */
1882                 sk->ack_backlog++;
1883         /*
1884          * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
1885          * if the other end is offering a window smaller than the agreed on MSS
1886          * (called sk->mtu here).  In theory there's no connection between send
1887          * and receive, and so no reason to think that they're going to send
1888          * small packets.  For the moment I'm using the hack of reducing the mss
1889          * only on the send side, so I'm putting mtu here.
1890          */
1891 
1892                 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
1893                 {
1894                         /* Send an ack right now. */
1895                         tcp_read_wakeup(sk);
1896                 } 
1897                 else 
1898                 {
1899                         /* Force it to send an ack soon. */
                             /* NOTE(review): TCP_ACK_TIME (a relative delay)
                                is compared against sk->timer.expires here --
                                suspect if expires holds an absolute jiffies
                                value; confirm against reset_xmit_timer(). */
1900                         int was_active = del_timer(&sk->retransmit_timer);
1901                         if (!was_active || TCP_ACK_TIME < sk->timer.expires) 
1902                         {
1903                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
1904                         } 
1905                         else
                                     /* A sooner timer was already pending:
                                        put it back untouched. */
1906                                 add_timer(&sk->retransmit_timer);
1907                 }
1908         }
1909 } 
1910 
1911 
1912 /*
1913  *      Handle reading urgent data. BSD has very simple semantics for
1914  *      this, no blocking and very strange errors 8)
1915  */
1916  
1917 static int tcp_read_urg(struct sock * sk, int nonblock,
     /* [previous][next][first][last][top][bottom][index][help] */
1918              unsigned char *to, int len, unsigned flags)
1919 {
1920         /*
1921          *      No URG data to read
1922          */
1923         if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
1924                 return -EINVAL; /* Yes this is right ! */
1925                 
1926         if (sk->err) 
1927         {
1928                 int tmp = -sk->err;
1929                 sk->err = 0;
1930                 return tmp;
1931         }
1932 
1933         if (sk->state == TCP_CLOSE || sk->done) 
1934         {
1935                 if (!sk->done) {
1936                         sk->done = 1;
1937                         return 0;
1938                 }
1939                 return -ENOTCONN;
1940         }
1941 
1942         if (sk->shutdown & RCV_SHUTDOWN) 
1943         {
1944                 sk->done = 1;
1945                 return 0;
1946         }
1947         sk->inuse = 1;
1948         if (sk->urg_data & URG_VALID) 
1949         {
1950                 char c = sk->urg_data;
1951                 if (!(flags & MSG_PEEK))
1952                         sk->urg_data = URG_READ;
1953                 put_fs_byte(c, to);
1954                 release_sock(sk);
1955                 return 1;
1956         }
1957         release_sock(sk);
1958         
1959         /*
1960          * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1961          * the available implementations agree in this case:
1962          * this call should never block, independent of the
1963          * blocking state of the socket.
1964          * Mike <pall@rz.uni-karlsruhe.de>
1965          */
1966         return -EAGAIN;
1967 }
1968 
1969 
1970 /*
1971  *      This routine copies from a sock struct into the user buffer. 
1972  */
1973  
1974 static int tcp_read(struct sock *sk, unsigned char *to,
     /* [previous][next][first][last][top][bottom][index][help] */
1975         int len, int nonblock, unsigned flags)
1976 {
1977         struct wait_queue wait = { current, NULL };
1978         int copied = 0;
1979         u32 peek_seq;
1980         volatile u32 *seq;      /* So gcc doesn't overoptimise */
1981         unsigned long used;
1982 
1983         /* 
1984          *      This error should be checked. 
1985          */
1986          
1987         if (sk->state == TCP_LISTEN)
1988                 return -ENOTCONN;
1989 
1990         /*
1991          *      Urgent data needs to be handled specially. 
1992          */
1993          
1994         if (flags & MSG_OOB)
1995                 return tcp_read_urg(sk, nonblock, to, len, flags);
1996 
1997         /*
1998          *      Copying sequence to update. This is volatile to handle
1999          *      the multi-reader case neatly (memcpy_to/fromfs might be 
2000          *      inline and thus not flush cached variables otherwise).
2001          */
2002          
             /* MSG_PEEK advances only the local copy, so the data stays
                readable; a normal read advances sk->copied_seq itself. */
2003         peek_seq = sk->copied_seq;
2004         seq = &sk->copied_seq;
2005         if (flags & MSG_PEEK)
2006                 seq = &peek_seq;
2007 
             /* Join the wait queue before scanning, so a wakeup arriving
                between the scan and schedule() below is not lost. */
2008         add_wait_queue(sk->sleep, &wait);
2009         sk->inuse = 1;
2010         while (len > 0) 
2011         {
2012                 struct sk_buff * skb;
2013                 unsigned long offset;
2014         
2015                 /*
2016                  * Are we at urgent data? Stop if we have read anything.
2017                  */
2018                  
2019                 if (copied && sk->urg_data && sk->urg_seq == *seq)
2020                         break;
2021 
2022                 /*
2023                  *      Next get a buffer.
2024                  */
2025                  
2026                 current->state = TASK_INTERRUPTIBLE;
2027 
                     /* Walk the receive queue for the first skb that still
                        holds unread data at *seq; stop at a sequence gap
                        or when the whole (circular) queue is exhausted. */
2028                 skb = skb_peek(&sk->receive_queue);
2029                 do 
2030                 {
2031                         if (!skb)
2032                                 break;
2033                         if (before(*seq, skb->h.th->seq))
2034                                 break;
2035                         offset = *seq - skb->h.th->seq;
                             /* A SYN occupies a sequence number but
                                carries no data byte. */
2036                         if (skb->h.th->syn)
2037                                 offset--;
2038                         if (offset < skb->len)
2039                                 goto found_ok_skb;
2040                         if (skb->h.th->fin)
2041                                 goto found_fin_ok;
2042                         if (!(flags & MSG_PEEK))
2043                                 skb->used = 1;
2044                         skb = skb->next;
2045                 }
2046                 while (skb != (struct sk_buff *)&sk->receive_queue);
2047 
                     /* No more data right now: return what we copied, or
                        work out why the queue is empty. */
2048                 if (copied)
2049                         break;
2050 
2051                 if (sk->err) 
2052                 {
2053                         copied = -sk->err;
2054                         sk->err = 0;
2055                         break;
2056                 }
2057 
                     /* Closed connection: first read gives EOF (0),
                        subsequent reads give ENOTCONN. */
2058                 if (sk->state == TCP_CLOSE) 
2059                 {
2060                         if (!sk->done) 
2061                         {
2062                                 sk->done = 1;
2063                                 break;
2064                         }
2065                         copied = -ENOTCONN;
2066                         break;
2067                 }
2068 
2069                 if (sk->shutdown & RCV_SHUTDOWN) 
2070                 {
2071                         sk->done = 1;
2072                         break;
2073                 }
2074                         
2075                 if (nonblock) 
2076                 {
2077                         copied = -EAGAIN;
2078                         break;
2079                 }
2080 
                     /* Ack the data we have consumed (may reopen the
                        window for the sender), then sleep until more
                        data or a signal arrives. */
2081                 cleanup_rbuf(sk);
2082                 release_sock(sk);
2083                 sk->socket->flags |= SO_WAITDATA;
2084                 schedule();
2085                 sk->socket->flags &= ~SO_WAITDATA;
2086                 sk->inuse = 1;
2087 
2088                 if (current->signal & ~current->blocked) 
2089                 {
2090                         copied = -ERESTARTSYS;
2091                         break;
2092                 }
2093                 continue;
2094 
2095         found_ok_skb:
2096                 /*
2097                  *      Lock the buffer. We can be fairly relaxed as
2098                  *      an interrupt will never steal a buffer we are 
2099                  *      using unless I've missed something serious in
2100                  *      tcp_data.
2101                  */
2102                 
2103                 skb->users++;
2104                 
2105                 /*
2106                  *      Ok so how much can we use ? 
2107                  */
2108                  
2109                 used = skb->len - offset;
2110                 if (len < used)
2111                         used = len;
2112                 /*
2113                  *      Do we have urgent data here? 
2114                  */
2115                 
2116                 if (sk->urg_data) 
2117                 {
2118                         unsigned long urg_offset = sk->urg_seq - *seq;
2119                         if (urg_offset < used) 
2120                         {
                                     /* Urgent byte right at the front:
                                        unless it is delivered inline,
                                        step over it in the stream. */
2121                                 if (!urg_offset) 
2122                                 {
2123                                         if (!sk->urginline) 
2124                                         {
2125                                                 ++*seq;
2126                                                 offset++;
2127                                                 used--;
2128                                         }
2129                                 }
2130                                 else
                                             /* Copy only up to the mark. */
2131                                         used = urg_offset;
2132                         }
2133                 }
2134                 
2135                 /*
2136                  *      Copy it - We _MUST_ update *seq first so that we
2137                  *      don't ever double read when we have dual readers
2138                  */
2139                  
2140                 *seq += used;
2141 
2142                 /*
2143                  *      This memcpy_tofs can sleep. If it sleeps and we
2144                  *      do a second read it relies on the skb->users to avoid
2145                  *      a crash when cleanup_rbuf() gets called.
2146                  */
2147                  
2148                 memcpy_tofs(to,((unsigned char *)skb->h.th) +
2149                         skb->h.th->doff*4 + offset, used);
2150                 copied += used;
2151                 len -= used;
2152                 to += used;
2153                 
2154                 /*
2155                  *      We now will not sleep again until we are finished
2156                  *      with skb. Sorry if you are doing the SMP port
2157                  *      but you'll just have to fix it neatly ;)
2158                  */
2159                  
2160                 skb->users --;
2161                 
                     /* Once we have read past the urgent mark, drop the
                        urgent state. */
2162                 if (after(sk->copied_seq,sk->urg_seq))
2163                         sk->urg_data = 0;
2164                 if (used + offset < skb->len)
2165                         continue;
2166                 
2167                 /*
2168                  *      Process the FIN.
2169                  */
2170 
2171                 if (skb->h.th->fin)
2172                         goto found_fin_ok;
2173                 if (flags & MSG_PEEK)
2174                         continue;
2175                 skb->used = 1;
2176                 continue;
2177 
2178         found_fin_ok:
                     /* The FIN, like SYN, consumes one sequence number. */
2179                 ++*seq;
2180                 if (flags & MSG_PEEK)
2181                         break;
2182                         
2183                 /*
2184                  *      All is done
2185                  */
2186                  
2187                 skb->used = 1;
2188                 sk->shutdown |= RCV_SHUTDOWN;
2189                 break;
2190 
2191         }
2192         remove_wait_queue(sk->sleep, &wait);
2193         current->state = TASK_RUNNING;
2194 
2195         /* Clean up data we have read: This will do ACK frames */
2196         cleanup_rbuf(sk);
2197         release_sock(sk);
2198         return copied;
2199 }
2200 
2201 /*
2202  *      State processing on a close. This implements the state shift for
2203  *      sending our FIN frame. Note that we only send a FIN for some 
2204  *      states. A shutdown() may have already sent the FIN, or we may be
2205  *      closed.
2206  */
2207  
/*
 *	Work out the next TCP state on a close/shutdown and move the
 *	socket there.  Returns 1 when the caller must transmit a FIN,
 *	0 otherwise.  'dead' is non-zero for a full close() where no
 *	process remains to drain the socket.
 */
static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;	/* default next state: fully closed */
	int send_fin=0;		/* set when this transition emits a FIN */
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;	/* stay where we are */
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
	}

	tcp_set_state(sk,ns);

	/*
	 *	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 */
	if(dead && ns==TCP_FIN_WAIT2)
	{
		/*
		 *	If a timer is already pending keep it; otherwise arm
		 *	the FIN_WAIT2 reaper so an orphaned socket dies.
		 */
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}

	return send_fin;
}
2257 
2258 /*
2259  *      Send a fin.
2260  */
2261 
/*
 *	Build and queue/transmit the FIN segment for this socket.
 *	Advances sk->write_seq by one for the FIN sequence slot.
 */
static void tcp_send_fin(struct sock *sk)
{
	struct proto *prot =(struct proto *)sk->prot;
	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;	/* template header */
	struct tcphdr *t1;
	struct sk_buff *buff;
	struct device *dev=NULL;
	int tmp;

	release_sock(sk); /* in case the malloc sleeps. */

	buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
	sk->inuse = 1;

	if (buff == NULL)
	{
		/* This is a disaster if it occurs */
		printk("tcp_send_fin: Impossible malloc failure");
		return;
	}

	/*
	 *	Administrivia
	 */

	buff->sk = sk;
	buff->len = sizeof(*t1);
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff. 
	 */

	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
	if (tmp < 0) 
	{
		int t;
		/*
		 *	Finish anyway, treat this as a send that got lost. 
		 *	(Not good).
		 *	We still consume the FIN's sequence number and arm the
		 *	close timer so the close sequence completes eventually.
		 */

		buff->free = 1;
		prot->wfree(sk,buff->mem_addr, buff->mem_len);
		sk->write_seq++;
		t=del_timer(&sk->timer);
		if(t)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		return;
	}

	/*
	 *	We ought to check if the end of the queue is a buffer and
	 *	if so simply add the fin to that buffer, not send it ahead.
	 */

	t1 =(struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	buff->dev = dev;
	memcpy(t1, th, sizeof(*t1));
	t1->seq = ntohl(sk->write_seq);		/* FIN occupies this sequence slot */
	sk->write_seq++;
	buff->h.seq = sk->write_seq;
	t1->ack = 1;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(sk->window=tcp_select_window(sk));
	t1->fin = 1;
	t1->rst = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 * If there is data in the write queue, the fin must be appended to
	 * the write queue, so the FIN goes out after the unsent data.
	 */

	if (skb_peek(&sk->write_queue) != NULL) 
	{
		buff->free = 0;
		if (buff->next != NULL) 
		{
			/* Should never happen: buffer already on a list. */
			printk("tcp_send_fin: next != NULL\n");
			skb_unlink(buff);
		}
		skb_queue_tail(&sk->write_queue, buff);
	} 
	else 
	{
		/* Queue empty: transmit at once and start the retransmit timer. */
		sk->sent_seq = sk->write_seq;
		sk->prot->queue_xmit(sk, dev, buff, 0);
		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
2360 
2361 /*
2362  *      Shutdown the sending side of a connection. Much like close except
2363  *      that we don't receive shut down or set sk->dead=1.
2364  */
2365 
2366 void tcp_shutdown(struct sock *sk, int how)
     /* [previous][next][first][last][top][bottom][index][help] */
2367 {
2368         /*
2369          *      We need to grab some memory, and put together a FIN,
2370          *      and then put it into the queue to be sent.
2371          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2372          */
2373 
2374         if (!(how & SEND_SHUTDOWN)) 
2375                 return;
2376          
2377         /*
2378          *      If we've already sent a FIN, or it's a closed state
2379          */
2380          
2381         if (sk->state == TCP_FIN_WAIT1 ||
2382             sk->state == TCP_FIN_WAIT2 ||
2383             sk->state == TCP_CLOSING ||
2384             sk->state == TCP_LAST_ACK ||
2385             sk->state == TCP_TIME_WAIT || 
2386             sk->state == TCP_CLOSE ||
2387             sk->state == TCP_LISTEN
2388           )
2389         {
2390                 return;
2391         }
2392         sk->inuse = 1;
2393 
2394         /*
2395          * flag that the sender has shutdown
2396          */
2397 
2398         sk->shutdown |= SEND_SHUTDOWN;
2399 
2400         /*
2401          *  Clear out any half completed packets. 
2402          */
2403 
2404         if (sk->partial)
2405                 tcp_send_partial(sk);
2406                 
2407         /*
2408          *      FIN if needed
2409          */
2410          
2411         if(tcp_close_state(sk,0))
2412                 tcp_send_fin(sk);
2413                 
2414         release_sock(sk);
2415 }
2416 
2417 
2418 static int
2419 tcp_recvfrom(struct sock *sk, unsigned char *to,
     /* [previous][next][first][last][top][bottom][index][help] */
2420              int to_len, int nonblock, unsigned flags,
2421              struct sockaddr_in *addr, int *addr_len)
2422 {
2423         int result;
2424   
2425         /* 
2426          *      Have to check these first unlike the old code. If 
2427          *      we check them after we lose data on an error
2428          *      which is wrong 
2429          */
2430 
2431         if(addr_len)
2432                 *addr_len = sizeof(*addr);
2433         result=tcp_read(sk, to, to_len, nonblock, flags);
2434 
2435         if (result < 0) 
2436                 return(result);
2437   
2438         if(addr)
2439         {
2440                 addr->sin_family = AF_INET;
2441                 addr->sin_port = sk->dummy_th.dest;
2442                 addr->sin_addr.s_addr = sk->daddr;
2443         }
2444         return(result);
2445 }
2446 
2447 
2448 /*
2449  *      This routine will send an RST to the other tcp. 
2450  */
2451  
/*
 *	Build and transmit an RST in reply to the segment 'th' that
 *	arrived from saddr addressed to daddr.  Follows the RFC 793
 *	reset generation rules for choosing seq/ack.
 */
static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	int tmp;
	struct device *ndev=NULL;

	/*
	 *	Cannot reset a reset (Think about it).
	 */

	if(th->rst)
		return;

	/*
	 * We need to grab some memory, and put together an RST,
	 * and then put it into the queue to be sent.
	 */

	buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL) 
		return;		/* Out of memory: silently drop, peer will retry */

	buff->len = sizeof(*t1);
	buff->sk = NULL;	/* not charged to any socket */
	buff->dev = dev;
	buff->localroute = 0;

	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff. 
	 */

	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl);
	if (tmp < 0) 
	{
		buff->free = 1;
		prot->wfree(NULL, buff->mem_addr, buff->mem_len);
		return;
	}

	t1 =(struct tcphdr *)((char *)t1 +tmp);
	buff->len += tmp;
	memcpy(t1, th, sizeof(*t1));	/* start from the offender's header */

	/*
	 *	Swap the send and the receive. 
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->rst = 1;  
	t1->window = 0;

	if(th->ack)
	{
		/* Incoming segment had an ACK: our seq is its ack, no ACK bit. */
		t1->ack = 0;
		t1->seq = th->ack_seq;
		t1->ack_seq = 0;
	}
	else
	{
		/* No ACK on the offender: ack its sequence (+1 for a SYN). */
		t1->ack = 1;
		if(!th->syn)
			t1->ack_seq=htonl(th->seq);
		else
			t1->ack_seq=htonl(th->seq+1);
		t1->seq=0;
	}

	t1->syn = 0;
	t1->urg = 0;
	t1->fin = 0;
	t1->psh = 0;
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
	prot->queue_xmit(NULL, ndev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}
2534 
2535 
2536 /*
2537  *      Look for tcp options. Parses everything but only knows about MSS.
2538  *      This routine is always called with the packet containing the SYN.
2539  *      However it may also be called with the ack to the SYN.  So you
2540  *      can't assume this is always the SYN.  It's always called after
2541  *      we have set up sk->mtu to our own MTU.
2542  *
2543  *      We need at minimum to add PAWS support here. Possibly large windows
2544  *      as Linux gets deployed on 100Mb/sec networks.
2545  */
2546  
2547 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
2548 {
2549         unsigned char *ptr;
2550         int length=(th->doff*4)-sizeof(struct tcphdr);
2551         int mss_seen = 0;
2552     
2553         ptr = (unsigned char *)(th + 1);
2554   
2555         while(length>0)
2556         {
2557                 int opcode=*ptr++;
2558                 int opsize=*ptr++;
2559                 switch(opcode)
2560                 {
2561                         case TCPOPT_EOL:
2562                                 return;
2563                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
2564                                 length--;
2565                                 ptr--;          /* the opsize=*ptr++ above was a mistake */
2566                                 continue;
2567                         
2568                         default:
2569                                 if(opsize<=2)   /* Avoid silly options looping forever */
2570                                         return;
2571                                 switch(opcode)
2572                                 {
2573                                         case TCPOPT_MSS:
2574                                                 if(opsize==4 && th->syn)
2575                                                 {
2576                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2577                                                         mss_seen = 1;
2578                                                 }
2579                                                 break;
2580                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2581                                 }
2582                                 ptr+=opsize-2;
2583                                 length-=opsize;
2584                 }
2585         }
2586         if (th->syn) 
2587         {
2588                 if (! mss_seen)
2589                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2590         }
2591 #ifdef CONFIG_INET_PCTCP
2592         sk->mss = min(sk->max_window >> 1, sk->mtu);
2593 #else    
2594         sk->mss = min(sk->max_window, sk->mtu);
2595 #endif  
2596 }
2597 
2598 static inline unsigned long default_mask(unsigned long dst)
     /* [previous][next][first][last][top][bottom][index][help] */
2599 {
2600         dst = ntohl(dst);
2601         if (IN_CLASSA(dst))
2602                 return htonl(IN_CLASSA_NET);
2603         if (IN_CLASSB(dst))
2604                 return htonl(IN_CLASSB_NET);
2605         return htonl(IN_CLASSC_NET);
2606 }
2607 
2608 /*
2609  *      Default sequence number picking algorithm.
2610  *      As close as possible to RFC 793, which
2611  *      suggests using a 250kHz clock.
2612  *      Further reading shows this assumes 2MB/s networks.
2613  *      For 10MB/s ethernet, a 1MHz clock is appropriate.
2614  *      That's funny, Linux has one built in!  Use it!
2615  */
2616 
/*
 *	Pick the initial send sequence number from a 1MHz wall clock,
 *	per the RFC 793 clock-driven ISN scheme (scaled for 10Mb/s nets).
 *
 *	NOTE(review): a pure clock ISN is trivially predictable, which
 *	enables sequence-number guessing/spoofing attacks - consider
 *	mixing in per-connection randomness (todo; confirm threat model).
 */
extern inline u32 tcp_init_seq(void)
{
	struct timeval tv;
	do_gettimeofday(&tv);
	return tv.tv_usec+tv.tv_sec*1000000;	/* microseconds since the epoch, mod 2^32 */
}
2623 
2624 /*
2625  *      This routine handles a connection request.
2626  *      It should make sure we haven't already responded.
2627  *      Because of the way BSD works, we have to send a syn/ack now.
2628  *      This also means it will be harder to close a socket which is
2629  *      listening.
2630  */
2631  
/*
 *	Handle an incoming SYN on listening socket 'sk': clone the
 *	listener into a new sock in SYN_RECV, answer with a SYN|ACK
 *	(BSD semantics force us to reply now), and park the SYN skb on
 *	the listener's receive queue until accept() picks it up.
 *	'seq' is the initial send sequence for the new connection.
 */
static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev, u32 seq)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	struct device *ndev=NULL;
	int tmp;
	struct rtable *rt;
  
	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead) 
	{
		sk->data_ready(sk,0);
	}
	else 
	{
		if(sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * Make sure we can accept more.  This will prevent a
	 * flurry of syns from eating up all our memory.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog) 
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * We need to build a new sock struct.
	 * It is sort of bad to have a socket without an inode attached
	 * to it, but the wake_up's will just wake up the listening socket,
	 * and if the listening socket is destroyed before this is taken
	 * off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL) 
	{
		/* just ignore the syn.  It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	The child starts as a byte copy of the listener; every field
	 *	that must not be shared is re-initialised below.
	 */
	memcpy(newsk, sk, sizeof(*newsk));
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;	/* slow start from one segment */
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->h.th->seq+1;	/* the SYN consumes one sequence number */
	newsk->copied_seq = skb->h.th->seq+1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq; 
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger=0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function=&retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;
	
	/*
	 *	Swap these two, they are from our point of view. 
	 */
	 
	newsk->daddr = saddr;
	newsk->saddr = daddr;

	put_sock(newsk->num,newsk);
	newsk->dummy_th.res1 = 0;
	newsk->dummy_th.doff = 6;
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;	
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	/* NOTE(review): acked_seq/copied_seq were already set identically above. */
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them 
	 */

	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for 
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk 
	 */

	rt=ip_rt_route(saddr, NULL,NULL);
	
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;
		
	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
		newsk->mtu = rt->rt_mss - HEADER_SIZE;
	else 
	{
		/* Conservative 576-byte MTU off-net, big windows on the local net. */
#ifdef CONFIG_INET_SNARL	/* Sub Nets Are Local */
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			newsk->mtu = 576 - HEADER_SIZE;
		else
			newsk->mtu = MAX_WINDOW;
	}

	/*
	 *	But not bigger than device MTU 
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	This will min with what arrived in the packet 
	 */

	tcp_options(newsk,skb->h.th);
	
	tcp_cache_zap();

	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL) 
	{
		/* NOTE(review): the error is reported on the listener sk, not newsk. */
		sk->err = ENOMEM;
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		/* And this will destroy it */
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}
  
	buff->len = sizeof(struct tcphdr)+4;	/* header plus 4-byte MSS option */
	buff->sk = newsk;
	buff->localroute = newsk->localroute;

	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff. 
	 */

	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

	/*
	 *	Something went wrong. 
	 */

	if (tmp < 0) 
	{
		sk->err = tmp;
		buff->free = 1;
		kfree_skb(buff,FREE_WRITE);
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		release_sock(newsk);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);
  
	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->write_seq;
	/*
	 *	Swap the send and the receive. 
	 */
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->write_seq++);
	t1->ack = 1;
	newsk->window = tcp_select_window(newsk);
	newsk->sent_seq = newsk->write_seq;
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->ack_seq = ntohl(skb->h.th->seq+1);
	t1->doff = sizeof(*t1)/4+1;	/* one extra 32-bit word for the MSS option */
	ptr =(unsigned char *)(t1+1);
	ptr[0] = 2;			/* TCPOPT_MSS */
	ptr[1] = 4;			/* option length */
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] =(newsk->mtu) & 0xff;

	tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
	reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
	skb->sk = newsk;

	/*
	 *	Charge the sock_buff to newsk. 
	 */
	 
	sk->rmem_alloc -= skb->mem_len;
	newsk->rmem_alloc += skb->mem_len;
	
	skb_queue_tail(&sk->receive_queue,skb);
	sk->ack_backlog++;
	release_sock(newsk);
	tcp_statistics.TcpOutSegs++;
}
2905 
2906 
/*
 *	Close the socket.  'timeout' non-zero means a timed-out linger:
 *	abandon the connection immediately instead of running the FIN
 *	handshake.
 */
static void tcp_close(struct sock *sk, int timeout)
{
	/*
	 * We need to grab some memory, and put together a FIN, 
	 * and then put it into the queue to be sent.
	 */
	
	sk->inuse = 1;
	
	/* Drop the header-prediction cache if it points at us. */
	if(th_cache_sk==sk)
		tcp_cache_zap();
	if(sk->state == TCP_LISTEN)
	{
		/* Special case: just reap the pending, never-accepted connections. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}
	
	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;	/* both directions are now shut */

	if (!sk->dead) 
		sk->state_change(sk);

	if (timeout == 0) 
	{
		struct sk_buff *skb;
		
		/*
		 *  We need to flush the recv. buffs.  We do this only on the
		 *  descriptor close, not protocol-sourced closes, because the
		 *  reader process may not have drained the data yet!
		 */
		 
		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *	Get rid off any half-completed packets. 
		 */

		if (sk->partial) 
			tcp_send_partial(sk);
	}

		
	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */
	 
	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* Normal path: run the close state machine, FIN if required. */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}
2972 
2973 
2974 /*
2975  *      This routine takes stuff off of the write queue,
2976  *      and puts it in the xmit queue. This happens as incoming acks
2977  *      open up the remote window for us.
2978  */
2979  
/*
 *	Move segments from the write queue onto the wire as the peer's
 *	window and our congestion window permit.  Called when incoming
 *	acks open up the remote window for us.
 */
static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy 
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */
	 
	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window) 
	{
		IS_SKB(skb);
		skb_unlink(skb);
		
		/*
		 *	See if we really need to send the packet. 
		 */
		 
		if (before(skb->h.seq, sk->rcv_ack_seq +1)) 
		{
			/*
			 *	This is acked data. We can discard it. This 
			 *	cannot currently occur.
			 */
			 
			sk->retransmits = 0;
			kfree_skb(skb, FREE_WRITE);
			if (!sk->dead) 
				sk->write_space(sk);
		} 
		else
		{
			struct tcphdr *th;
			struct iphdr *iph;
			int size;
/*
 * put in the ack seq and window at this point rather than earlier,
 * in order to keep them monotonic.  We really want to avoid taking
 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
 * Ack and window will in general have changed since this packet was put
 * on the write queue.
 */
			iph = (struct iphdr *)(skb->data +
					       skb->dev->hard_header_len);
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			size = skb->len - (((unsigned char *) th) - skb->data);
			
			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			/* Header fields changed: the checksum must be redone. */
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;
			
			/*
			 *	IP manages our queue for some crazy reason
			 */
			 
			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
			
			/*
			 *	Again we slide the timer wrongly
			 */
			 
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}
3064 
3065 
3066 /*
3067  *      This routine deals with incoming acks, but not outgoing ones.
3068  */
3069 
3070 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
     /* [previous][next][first][last][top][bottom][index][help] */
3071 {
3072         u32 ack;
3073         int flag = 0;
3074 
3075         /* 
3076          * 1 - there was data in packet as well as ack or new data is sent or 
3077          *     in shutdown state
3078          * 2 - data from retransmit queue was acked and removed
3079          * 4 - window shrunk or data from retransmit queue was acked and removed
3080          */
3081 
3082         if(sk->zapped)
3083                 return(1);      /* Dead, cant ack any more so why bother */
3084 
3085         /*
3086          *      Have we discovered a larger window
3087          */
3088          
3089         ack = ntohl(th->ack_seq);
3090 
3091         if (ntohs(th->window) > sk->max_window) 
3092         {
3093                 sk->max_window = ntohs(th->window);
3094 #ifdef CONFIG_INET_PCTCP
3095                 /* Hack because we don't send partial packets to non SWS
3096                    handling hosts */
3097                 sk->mss = min(sk->max_window>>1, sk->mtu);
3098 #else
3099                 sk->mss = min(sk->max_window, sk->mtu);
3100 #endif  
3101         }
3102 
3103         /*
3104          *      We have dropped back to keepalive timeouts. Thus we have
3105          *      no retransmits pending.
3106          */
3107          
3108         if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3109                 sk->retransmits = 0;
3110 
3111         /*
3112          *      If the ack is newer than sent or older than previous acks
3113          *      then we can probably ignore it.
3114          */
3115          
3116         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
3117         {
3118                 if(sk->debug)
3119                         printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3120                         
3121                 /*
3122                  *      Keepalive processing.
3123                  */
3124                  
3125                 if (after(ack, sk->sent_seq)) 
3126                 {
3127                         return(0);
3128                 }
3129                 
3130                 /*
3131                  *      Restart the keepalive timer.
3132                  */
3133                  
3134                 if (sk->keepopen) 
3135                 {
3136                         if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3137                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3138                 }
3139                 return(1);
3140         }
3141 
3142         /*
3143          *      If there is data set flag 1
3144          */
3145          
3146         if (len != th->doff*4) 
3147                 flag |= 1;
3148 
3149         /*
3150          *      See if our window has been shrunk. 
3151          */
3152 
3153         if (after(sk->window_seq, ack+ntohs(th->window))) 
3154         {
3155                 /*
3156                  * We may need to move packets from the send queue
3157                  * to the write queue, if the window has been shrunk on us.
3158                  * The RFC says you are not allowed to shrink your window
3159                  * like this, but if the other end does, you must be able
3160                  * to deal with it.
3161                  */
3162                 struct sk_buff *skb;
3163                 struct sk_buff *skb2;
3164                 struct sk_buff *wskb = NULL;
3165         
3166                 skb2 = sk->send_head;
3167                 sk->send_head = NULL;
3168                 sk->send_tail = NULL;
3169         
3170                 /*
3171                  *      This is an artifact of a flawed concept. We want one
3172                  *      queue and a smarter send routine when we send all.
3173                  */
3174         
3175                 flag |= 4;      /* Window changed */
3176         
3177                 sk->window_seq = ack + ntohs(th->window);
3178                 cli();
3179                 while (skb2 != NULL) 
3180                 {
3181                         skb = skb2;
3182                         skb2 = skb->link3;
3183                         skb->link3 = NULL;
3184                         if (after(skb->h.seq, sk->window_seq)) 
3185                         {
3186                                 if (sk->packets_out > 0) 
3187                                         sk->packets_out--;
3188                                 /* We may need to remove this from the dev send list. */
3189                                 if (skb->next != NULL) 
3190                                 {
3191                                         skb_unlink(skb);                                
3192                                 }
3193                                 /* Now add it to the write_queue. */
3194                                 if (wskb == NULL)
3195                                         skb_queue_head(&sk->write_queue,skb);
3196                                 else
3197                                         skb_append(wskb,skb);
3198                                 wskb = skb;
3199                         } 
3200                         else 
3201                         {
3202                                 if (sk->send_head == NULL) 
3203                                 {
3204                                         sk->send_head = skb;
3205                                         sk->send_tail = skb;
3206                                 }
3207                                 else
3208                                 {
3209                                         sk->send_tail->link3 = skb;
3210                                         sk->send_tail = skb;
3211                                 }
3212                                 skb->link3 = NULL;
3213                         }
3214                 }
3215                 sti();
3216         }
3217 
3218         /*
3219          *      Pipe has emptied
3220          */
3221          
3222         if (sk->send_tail == NULL || sk->send_head == NULL) 
3223         {
3224                 sk->send_head = NULL;
3225                 sk->send_tail = NULL;
3226                 sk->packets_out= 0;
3227         }
3228 
3229         /*
3230          *      Update the right hand window edge of the host
3231          */
3232          
3233         sk->window_seq = ack + ntohs(th->window);
3234 
3235         /*
3236          *      We don't want too many packets out there. 
3237          */
3238          
3239         if (sk->ip_xmit_timeout == TIME_WRITE && 
3240                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
3241         {
3242                 /* 
3243                  * This is Jacobson's slow start and congestion avoidance. 
3244                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
3245                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
3246                  * counter and increment it once every cwnd times.  It's possible
3247                  * that this should be done only if sk->retransmits == 0.  I'm
3248                  * interpreting "new data is acked" as including data that has
3249                  * been retransmitted but is just now being acked.
3250                  */
3251                 if (sk->cong_window < sk->ssthresh)  
3252                         /* 
3253                          *      In "safe" area, increase
3254                          */
3255                         sk->cong_window++;
3256                 else 
3257                 {
3258                         /*
3259                          *      In dangerous area, increase slowly.  In theory this is
3260                          *      sk->cong_window += 1 / sk->cong_window
3261                          */
3262                         if (sk->cong_count >= sk->cong_window) 
3263                         {
3264                                 sk->cong_window++;
3265                                 sk->cong_count = 0;
3266                         }
3267                         else 
3268                                 sk->cong_count++;
3269                 }
3270         }
3271 
3272         /*
3273          *      Remember the highest ack received.
3274          */
3275          
3276         sk->rcv_ack_seq = ack;
3277 
3278         /*
3279          *      If this ack opens up a zero window, clear backoff.  It was
3280          *      being used to time the probes, and is probably far higher than
3281          *      it needs to be for normal retransmission.
3282          */
3283 
3284         if (sk->ip_xmit_timeout == TIME_PROBE0) 
3285         {
3286                 sk->retransmits = 0;    /* Our probe was answered */
3287                 
3288                 /*
3289                  *      Was it a usable window open ?
3290                  */
3291                  
3292                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
3293                     ! before (sk->window_seq, sk->write_queue.next->h.seq)) 
3294                 {
3295                         sk->backoff = 0;
3296                         
3297                         /*
3298                          *      Recompute rto from rtt.  this eliminates any backoff.
3299                          */
3300 
3301                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3302                         if (sk->rto > 120*HZ)
3303                                 sk->rto = 120*HZ;
3304                         if (sk->rto < 20)       /* Was 1*HZ, then 1 - turns out we must allow about
3305                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3306                                                    .2 of a second is going to need huge windows (SIGH) */
3307                         sk->rto = 20;
3308                 }
3309         }
3310 
3311         /* 
3312          *      See if we can take anything off of the retransmit queue.
3313          */
3314    
3315         while(sk->send_head != NULL) 
3316         {
3317                 /* Check for a bug. */
3318                 if (sk->send_head->link3 &&
3319                     after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) 
3320                         printk("INET: tcp.c: *** bug send_list out of order.\n");
3321                         
3322                 /*
3323                  *      If our packet is before the ack sequence we can
3324                  *      discard it as it's confirmed to have arrived the other end.
3325                  */
3326                  
3327                 if (before(sk->send_head->h.seq, ack+1)) 
3328                 {
3329                         struct sk_buff *oskb;   
3330                         if (sk->retransmits) 
3331                         {       
3332                                 /*
3333                                  *      We were retransmitting.  don't count this in RTT est 
3334                                  */
3335                                 flag |= 2;
3336 
3337                                 /*
3338                                  * even though we've gotten an ack, we're still
3339                                  * retransmitting as long as we're sending from
3340                                  * the retransmit queue.  Keeping retransmits non-zero
3341                                  * prevents us from getting new data interspersed with
3342                                  * retransmissions.
3343                                  */
3344 
3345                                 if (sk->send_head->link3)       /* Any more queued retransmits? */
3346                                         sk->retransmits = 1;
3347                                 else
3348                                         sk->retransmits = 0;
3349                         }
3350                         /*
3351                          * Note that we only reset backoff and rto in the
3352                          * rtt recomputation code.  And that doesn't happen
3353                          * if there were retransmissions in effect.  So the
3354                          * first new packet after the retransmissions is
3355                          * sent with the backoff still in effect.  Not until
3356                          * we get an ack from a non-retransmitted packet do
3357                          * we reset the backoff and rto.  This allows us to deal
3358                          * with a situation where the network delay has increased
3359                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3360                          */
3361 
3362                         /*
3363                          *      We have one less packet out there. 
3364                          */
3365                          
3366                         if (sk->packets_out > 0) 
3367                                 sk->packets_out --;
3368                         /* 
3369                          *      Wake up the process, it can probably write more. 
3370                          */
3371                         if (!sk->dead) 
3372                                 sk->write_space(sk);
3373                         oskb = sk->send_head;
3374 
3375                         if (!(flag&2))  /* Not retransmitting */
3376                         {
3377                                 long m;
3378         
3379                                 /*
3380                                  *      The following amusing code comes from Jacobson's
3381                                  *      article in SIGCOMM '88.  Note that rtt and mdev
3382                                  *      are scaled versions of rtt and mean deviation.
3383                                  *      This is designed to be as fast as possible 
3384                                  *      m stands for "measurement".
3385                                  */
3386         
3387                                 m = jiffies - oskb->when;  /* RTT */
3388                                 if(m<=0)
3389                                         m=1;            /* IS THIS RIGHT FOR <0 ??? */
3390                                 m -= (sk->rtt >> 3);    /* m is now error in rtt est */
3391                                 sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
3392                                 if (m < 0)
3393                                         m = -m;         /* m is now abs(error) */
3394                                 m -= (sk->mdev >> 2);   /* similar update on mdev */
3395                                 sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
3396         
3397                                 /*
3398                                  *      Now update timeout.  Note that this removes any backoff.
3399                                  */
3400                          
3401                                 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3402                                 if (sk->rto > 120*HZ)
3403                                         sk->rto = 120*HZ;
3404                                 if (sk->rto < 20)       /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3405                                         sk->rto = 20;
3406                                 sk->backoff = 0;
3407                         }
3408                         flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
3409                                            In this case as we just set it up */
3410                         cli();
3411                         oskb = sk->send_head;
3412                         IS_SKB(oskb);
3413                         sk->send_head = oskb->link3;
3414                         if (sk->send_head == NULL) 
3415                         {
3416                                 sk->send_tail = NULL;
3417                         }
3418 
3419                 /*
3420                  *      We may need to remove this from the dev send list. 
3421                  */
3422 
3423                         if (oskb->next)
3424                                 skb_unlink(oskb);
3425                         sti();
3426                         kfree_skb(oskb, FREE_WRITE); /* write. */
3427                         if (!sk->dead) 
3428                                 sk->write_space(sk);
3429                 }
3430                 else
3431                 {
3432                         break;
3433                 }
3434         }
3435 
3436         /*
3437          * XXX someone ought to look at this too.. at the moment, if skb_peek()
3438          * returns non-NULL, we complete ignore the timer stuff in the else
3439          * clause.  We ought to organize the code so that else clause can
3440          * (should) be executed regardless, possibly moving the PROBE timer
3441          * reset over.  The skb_peek() thing should only move stuff to the
3442          * write queue, NOT also manage the timer functions.
3443          */
3444 
3445         /*
3446          * Maybe we can take some stuff off of the write queue,
3447          * and put it onto the xmit queue.
3448          */
3449         if (skb_peek(&sk->write_queue) != NULL) 
3450         {
3451                 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3452                         (sk->retransmits == 0 || 
3453                          sk->ip_xmit_timeout != TIME_WRITE ||
3454                          before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3455                         && sk->packets_out < sk->cong_window) 
3456                 {
3457                         /*
3458                          *      Add more data to the send queue.
3459                          */
3460                         flag |= 1;
3461                         tcp_write_xmit(sk);
3462                 }
3463                 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3464                         sk->send_head == NULL &&
3465                         sk->ack_backlog == 0 &&
3466                         sk->state != TCP_TIME_WAIT) 
3467                 {
3468                         /*
3469                          *      Data to queue but no room.
3470                          */
3471                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3472                 }               
3473         }
3474         else
3475         {
3476                 /*
3477                  * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3478                  * from TCP_CLOSE we don't do anything
3479                  *
3480                  * from anything else, if there is write data (or fin) pending,
3481                  * we use a TIME_WRITE timeout, else if keepalive we reset to
3482                  * a KEEPALIVE timeout, else we delete the timer.
3483                  *
3484                  * We do not set flag for nominal write data, otherwise we may
3485                  * force a state where we start to write itsy bitsy tidbits
3486                  * of data.
3487                  */
3488 
3489                 switch(sk->state) {
3490                 case TCP_TIME_WAIT:
3491                         /*
3492                          * keep us in TIME_WAIT until we stop getting packets,
3493                          * reset the timeout.
3494                          */
3495                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3496                         break;
3497                 case TCP_CLOSE:
3498                         /*
3499                          * don't touch the timer.
3500                          */
3501                         break;
3502                 default:
3503                         /*
3504                          *      Must check send_head, write_queue, and ack_backlog
3505                          *      to determine which timeout to use.
3506                          */
3507                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3508                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3509                         } else if (sk->keepopen) {
3510                                 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3511                         } else {
3512                                 del_timer(&sk->retransmit_timer);
3513                                 sk->ip_xmit_timeout = 0;
3514                         }
3515                         break;
3516                 }
3517         }
3518 
3519         /*
3520          *      We have nothing queued but space to send. Send any partial
3521          *      packets immediately (end of Nagle rule application).
3522          */
3523          
3524         if (sk->packets_out == 0 && sk->partial != NULL &&
3525                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
3526         {
3527                 flag |= 1;
3528                 tcp_send_partial(sk);
3529         }
3530 
3531         /*
3532          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
3533          * we are now waiting for an acknowledge to our FIN.  The other end is
3534          * already in TIME_WAIT.
3535          *
3536          * Move to TCP_CLOSE on success.
3537          */
3538 
3539         if (sk->state == TCP_LAST_ACK) 
3540         {
3541                 if (!sk->dead)
3542                         sk->state_change(sk);
3543                 if(sk->debug)
3544                         printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3545                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3546                 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
3547                 {
3548                         flag |= 1;
3549                         tcp_set_state(sk,TCP_CLOSE);
3550                         sk->shutdown = SHUTDOWN_MASK;
3551                 }
3552         }
3553 
3554         /*
3555          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
3556          *
3557          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
3558          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3559          */
3560 
3561         if (sk->state == TCP_FIN_WAIT1) 
3562         {
3563 
3564                 if (!sk->dead) 
3565                         sk->state_change(sk);
3566                 if (sk->rcv_ack_seq == sk->write_seq) 
3567                 {
3568                         flag |= 1;
3569                         sk->shutdown |= SEND_SHUTDOWN;
3570                         tcp_set_state(sk, TCP_FIN_WAIT2);
3571                 }
3572         }
3573 
3574         /*
3575          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
3576          *
3577          *      Move to TIME_WAIT
3578          */
3579 
3580         if (sk->state == TCP_CLOSING) 
3581         {
3582 
3583                 if (!sk->dead) 
3584                         sk->state_change(sk);
3585                 if (sk->rcv_ack_seq == sk->write_seq) 
3586                 {
3587                         flag |= 1;
3588                         tcp_time_wait(sk);
3589                 }
3590         }
3591         
3592         /*
3593          *      Final ack of a three way shake 
3594          */
3595          
3596         if(sk->state==TCP_SYN_RECV)
3597         {
3598                 tcp_set_state(sk, TCP_ESTABLISHED);
3599                 tcp_options(sk,th);
3600                 sk->dummy_th.dest=th->source;
3601                 sk->copied_seq = sk->acked_seq;
3602                 if(!sk->dead)
3603                         sk->state_change(sk);
3604                 if(sk->max_window==0)
3605                 {
3606                         sk->max_window=32;      /* Sanity check */
3607                         sk->mss=min(sk->max_window,sk->mtu);
3608                 }
3609         }
3610         
3611         /*
3612          * I make no guarantees about the first clause in the following
3613          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
3614          * what conditions "!flag" would be true.  However I think the rest
3615          * of the conditions would prevent that from causing any
3616          * unnecessary retransmission. 
3617          *   Clearly if the first packet has expired it should be 
3618          * retransmitted.  The other alternative, "flag&2 && retransmits", is
3619          * harder to explain:  You have to look carefully at how and when the
3620          * timer is set and with what timeout.  The most recent transmission always
3621          * sets the timer.  So in general if the most recent thing has timed
3622          * out, everything before it has as well.  So we want to go ahead and
3623          * retransmit some more.  If we didn't explicitly test for this
3624          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3625          * would not be true.  If you look at the pattern of timing, you can
3626          * show that rto is increased fast enough that the next packet would
3627          * almost never be retransmitted immediately.  Then you'd end up
3628          * waiting for a timeout to send each packet on the retransmission
3629          * queue.  With my implementation of the Karn sampling algorithm,
3630          * the timeout would double each time.  The net result is that it would
3631          * take a hideous amount of time to recover from a single dropped packet.
3632          * It's possible that there should also be a test for TIME_WRITE, but
3633          * I think as long as "send_head != NULL" and "retransmit" is on, we've
3634          * got to be in real retransmission mode.
3635          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
3636          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3637          * As long as no further losses occur, this seems reasonable.
3638          */
3639         
3640         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3641                (((flag&2) && sk->retransmits) ||
3642                (sk->send_head->when + sk->rto < jiffies))) 
3643         {
3644                 if(sk->send_head->when + sk->rto < jiffies)
3645                         tcp_retransmit(sk,0);   
3646                 else
3647                 {
3648                         tcp_do_retransmit(sk, 1);
3649                         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3650                 }
3651         }
3652 
3653         return(1);
3654 }
3655 
3656 
3657 /*
3658  *      Process the FIN bit. This now behaves as it is supposed to work
3659  *      and the FIN takes effect when it is validly part of sequence
3660  *      space. Not before when we get holes.
3661  *
3662  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3663  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
3664  *      TIME-WAIT)
3665  *
3666  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
3667  *      close and we go into CLOSING (and later onto TIME-WAIT)
3668  *
3669  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3670  *
3671  */
3672  
3673 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
3674 {
3675         sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3676 
3677         if (!sk->dead) 
3678         {
3679                 sk->state_change(sk);
3680                 sock_wake_async(sk->socket, 1);
3681         }
3682 
3683         switch(sk->state) 
3684         {
3685                 case TCP_SYN_RECV:
3686                 case TCP_SYN_SENT:
3687                 case TCP_ESTABLISHED:
3688                         /*
3689                          * move to CLOSE_WAIT, tcp_data() already handled
3690                          * sending the ack.
3691                          */
3692                         tcp_set_state(sk,TCP_CLOSE_WAIT);
3693                         if (th->rst)
3694                                 sk->shutdown = SHUTDOWN_MASK;
3695                         break;
3696 
3697                 case TCP_CLOSE_WAIT:
3698                 case TCP_CLOSING:
3699                         /*
3700                          * received a retransmission of the FIN, do
3701                          * nothing.
3702                          */
3703                         break;
3704                 case TCP_TIME_WAIT:
3705                         /*
3706                          * received a retransmission of the FIN,
3707                          * restart the TIME_WAIT timer.
3708                          */
3709                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3710                         return(0);
3711                 case TCP_FIN_WAIT1:
3712                         /*
3713                          * This case occurs when a simultaneous close
3714                          * happens, we must ack the received FIN and
3715                          * enter the CLOSING state.
3716                          *
3717                          * This causes a WRITE timeout, which will either
3718                          * move on to TIME_WAIT when we timeout, or resend
3719                          * the FIN properly (maybe we get rid of that annoying
3720                          * FIN lost hang). The TIME_WRITE code is already correct
3721                          * for handling this timeout.
3722                          */
3723 
3724                         if(sk->ip_xmit_timeout != TIME_WRITE)
3725                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3726                         tcp_set_state(sk,TCP_CLOSING);
3727                         break;
3728                 case TCP_FIN_WAIT2:
3729                         /*
3730                          * received a FIN -- send ACK and enter TIME_WAIT
3731                          */
3732                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3733                         sk->shutdown|=SHUTDOWN_MASK;
3734                         tcp_set_state(sk,TCP_TIME_WAIT);
3735                         break;
3736                 case TCP_CLOSE:
3737                         /*
3738                          * already in CLOSE
3739                          */
3740                         break;
3741                 default:
3742                         tcp_set_state(sk,TCP_LAST_ACK);
3743         
3744                         /* Start the timers. */
3745                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3746                         return(0);
3747         }
3748 
3749         return(0);
3750 }
3751 
3752 
3753 
/*
 *      This routine handles the data.  If there is room in the buffer,
 *      it will have already been moved into it.  If there is no
 *      room, then we will just have to discard the packet.
 */
3759 
3760 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /* [previous][next][first][last][top][bottom][index][help] */
3761          unsigned long saddr, unsigned short len)
3762 {
3763         struct sk_buff *skb1, *skb2;
3764         struct tcphdr *th;
3765         int dup_dumped=0;
3766         u32 new_seq, shut_seq;
3767 
3768         th = skb->h.th;
3769         skb->len = len -(th->doff*4);
3770 
3771         /*
3772          *      The bytes in the receive read/assembly queue has increased. Needed for the
3773          *      low memory discard algorithm 
3774          */
3775            
3776         sk->bytes_rcv += skb->len;
3777         
3778         if (skb->len == 0 && !th->fin) 
3779         {
3780                 /* 
3781                  *      Don't want to keep passing ack's back and forth. 
3782                  *      (someone sent us dataless, boring frame)
3783                  */
3784                 if (!th->ack)
3785                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3786                 kfree_skb(skb, FREE_READ);
3787                 return(0);
3788         }
3789         
3790         /*
3791          *      We no longer have anyone receiving data on this connection.
3792          */
3793 
3794 #ifndef TCP_DONT_RST_SHUTDOWN            
3795 
3796         if(sk->shutdown & RCV_SHUTDOWN)
3797         {
3798                 /*
3799                  *      FIXME: BSD has some magic to avoid sending resets to
3800                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
3801                  *      BSD stacks still have broken keepalives so we want to
3802                  *      cope with it.
3803                  */
3804 
3805                 if(skb->len)    /* We don't care if it's just an ack or
3806                                    a keepalive/window probe */
3807                 {
3808                         new_seq= th->seq + skb->len + th->syn;  /* Right edge of _data_ part of frame */
3809                         
3810                         /* Do this the way 4.4BSD treats it. Not what I'd
3811                            regard as the meaning of the spec but it's what BSD
3812                            does and clearly they know everything 8) */
3813 
3814                         /*
3815                          *      This is valid because of two things
3816                          *
3817                          *      a) The way tcp_data behaves at the bottom.
3818                          *      b) A fin takes effect when read not when received.
3819                          */
3820                          
3821                         shut_seq=sk->acked_seq+1;       /* Last byte */
3822                         
3823                         if(after(new_seq,shut_seq))
3824                         {
3825                                 if(sk->debug)
3826                                         printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
3827                                                 sk, new_seq, shut_seq, sk->blog);
3828                                 if(sk->dead)
3829                                 {
3830                                         sk->acked_seq = new_seq + th->fin;
3831                                         tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3832                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3833                                         tcp_statistics.TcpEstabResets++;
3834                                         tcp_set_state(sk,TCP_CLOSE);
3835                                         sk->err = EPIPE;
3836                                         sk->shutdown = SHUTDOWN_MASK;
3837                                         kfree_skb(skb, FREE_READ);
3838                                         return 0;
3839                                 }
3840                         }
3841                 }
3842         }
3843 
3844 #endif
3845 
3846         /*
3847          *      Now we have to walk the chain, and figure out where this one
3848          *      goes into it.  This is set up so that the last packet we received
3849          *      will be the first one we look at, that way if everything comes
3850          *      in order, there will be no performance loss, and if they come
3851          *      out of order we will be able to fit things in nicely.
3852          *
3853          *      [AC: This is wrong. We should assume in order first and then walk
3854          *       forwards from the first hole based upon real traffic patterns.]
3855          *      
3856          */
3857 
3858         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
3859         {
3860                 skb_queue_head(&sk->receive_queue,skb);
3861                 skb1= NULL;
3862         } 
3863         else
3864         {
3865                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
3866                 {
3867                         if(sk->debug)
3868                         {
3869                                 printk("skb1=%p :", skb1);
3870                                 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
3871                                 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
3872                                 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
3873                                                 sk->acked_seq);
3874                         }
3875                         
3876                         /*
3877                          *      Optimisation: Duplicate frame or extension of previous frame from
3878                          *      same sequence point (lost ack case).
3879                          *      The frame contains duplicate data or replaces a previous frame
3880                          *      discard the previous frame (safe as sk->inuse is set) and put
3881                          *      the new one in its place.
3882                          */
3883                          
3884                         if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3885                         {
3886                                 skb_append(skb1,skb);
3887                                 skb_unlink(skb1);
3888                                 kfree_skb(skb1,FREE_READ);
3889                                 dup_dumped=1;
3890                                 skb1=NULL;
3891                                 break;
3892                         }
3893                         
3894                         /*
3895                          *      Found where it fits
3896                          */
3897                          
3898                         if (after(th->seq+1, skb1->h.th->seq))
3899                         {
3900                                 skb_append(skb1,skb);
3901                                 break;
3902                         }
3903                         
3904                         /*
3905                          *      See if we've hit the start. If so insert.
3906                          */
3907                         if (skb1 == skb_peek(&sk->receive_queue))
3908                         {
3909                                 skb_queue_head(&sk->receive_queue, skb);
3910                                 break;
3911                         }
3912                 }
3913         }
3914 
3915         /*
3916          *      Figure out what the ack value for this frame is
3917          */
3918          
3919         th->ack_seq = th->seq + skb->len;
3920         if (th->syn) 
3921                 th->ack_seq++;
3922         if (th->fin)
3923                 th->ack_seq++;
3924 
3925         if (before(sk->acked_seq, sk->copied_seq)) 
3926         {
3927                 printk("*** tcp.c:tcp_data bug acked < copied\n");
3928                 sk->acked_seq = sk->copied_seq;
3929         }
3930 
3931         /*
3932          *      Now figure out if we can ack anything. This is very messy because we really want two
3933          *      receive queues, a completed and an assembly queue. We also want only one transmit
3934          *      queue.
3935          */
3936 
3937         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) 
3938         {
3939                 if (before(th->seq, sk->acked_seq+1)) 
3940                 {
3941                         int newwindow;
3942 
3943                         if (after(th->ack_seq, sk->acked_seq)) 
3944                         {
3945                                 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3946                                 if (newwindow < 0)
3947                                         newwindow = 0;  
3948                                 sk->window = newwindow;
3949                                 sk->acked_seq = th->ack_seq;
3950                         }
3951                         skb->acked = 1;
3952 
3953                         /*
3954                          *      When we ack the fin, we do the FIN 
3955                          *      processing.
3956                          */
3957 
3958                         if (skb->h.th->fin) 
3959                         {
3960                                 tcp_fin(skb,sk,skb->h.th);
3961                         }
3962           
3963                         for(skb2 = skb->next;
3964                             skb2 != (struct sk_buff *)&sk->receive_queue;
3965                             skb2 = skb2->next) 
3966                         {
3967                                 if (before(skb2->h.th->seq, sk->acked_seq+1)) 
3968                                 {
3969                                         if (after(skb2->h.th->ack_seq, sk->acked_seq))
3970                                         {
3971                                                 newwindow = sk->window -
3972                                                  (skb2->h.th->ack_seq - sk->acked_seq);
3973                                                 if (newwindow < 0)
3974                                                         newwindow = 0;  
3975                                                 sk->window = newwindow;
3976                                                 sk->acked_seq = skb2->h.th->ack_seq;
3977                                         }
3978                                         skb2->acked = 1;
3979                                         /*
3980                                          *      When we ack the fin, we do
3981                                          *      the fin handling.
3982                                          */
3983                                         if (skb2->h.th->fin) 
3984                                         {
3985                                                 tcp_fin(skb,sk,skb->h.th);
3986                                         }
3987 
3988                                         /*
3989                                          *      Force an immediate ack.
3990                                          */
3991                                          
3992                                         sk->ack_backlog = sk->max_ack_backlog;
3993                                 }
3994                                 else
3995                                 {
3996                                         break;
3997                                 }
3998                         }
3999 
4000                         /*
4001                          *      This also takes care of updating the window.
4002                          *      This if statement needs to be simplified.
4003                          */
4004                         if (!sk->delay_acks ||
4005                             sk->ack_backlog >= sk->max_ack_backlog || 
4006                             sk->bytes_rcv > sk->max_unacked || th->fin) {
4007         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4008                         }
4009                         else 
4010                         {
4011                                 sk->ack_backlog++;
4012                                 if(sk->debug)
4013                                         printk("Ack queued.\n");
4014                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4015                         }
4016                 }
4017         }
4018 
4019         /*
4020          *      If we've missed a packet, send an ack.
4021          *      Also start a timer to send another.
4022          */
4023          
4024         if (!skb->acked) 
4025         {
4026         
4027         /*
4028          *      This is important.  If we don't have much room left,
4029          *      we need to throw out a few packets so we have a good
4030          *      window.  Note that mtu is used, not mss, because mss is really
4031          *      for the send side.  He could be sending us stuff as large as mtu.
4032          */
4033                  
4034                 while (sk->prot->rspace(sk) < sk->mtu) 
4035                 {
4036                         skb1 = skb_peek(&sk->receive_queue);
4037                         if (skb1 == NULL) 
4038                         {
4039                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4040                                 break;
4041                         }
4042 
4043                         /*
4044                          *      Don't throw out something that has been acked. 
4045                          */
4046                  
4047                         if (skb1->acked) 
4048                         {
4049                                 break;
4050                         }
4051                 
4052                         skb_unlink(skb1);
4053                         kfree_skb(skb1, FREE_READ);
4054                 }
4055                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4056                 sk->ack_backlog++;
4057                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4058         }
4059         else
4060         {
4061                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4062         }
4063 
4064         /*
4065          *      Now tell the user we may have some data. 
4066          */
4067          
4068         if (!sk->dead) 
4069         {
4070                 if(sk->debug)
4071                         printk("Data wakeup.\n");
4072                 sk->data_ready(sk,0);
4073         } 
4074         return(0);
4075 }
4076 
4077 
4078 /*
4079  *      This routine is only called when we have urgent data
4080  *      signalled. Its the 'slow' part of tcp_urg. It could be
4081  *      moved inline now as tcp_urg is only called from one
4082  *      place. We handle URGent data wrong. We have to - as
4083  *      BSD still doesn't use the correction from RFC961.
4084  */
4085  
4086 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* [previous][next][first][last][top][bottom][index][help] */
4087 {
4088         unsigned long ptr = ntohs(th->urg_ptr);
4089 
4090         if (ptr)
4091                 ptr--;
4092         ptr += th->seq;
4093 
4094         /* ignore urgent data that we've already seen and read */
4095         if (after(sk->copied_seq, ptr))
4096                 return;
4097 
4098         /* do we already have a newer (or duplicate) urgent pointer? */
4099         if (sk->urg_data && !after(ptr, sk->urg_seq))
4100                 return;
4101 
4102         /* tell the world about our new urgent pointer */
4103         if (sk->proc != 0) {
4104                 if (sk->proc > 0) {
4105                         kill_proc(sk->proc, SIGURG, 1);
4106                 } else {
4107                         kill_pg(-sk->proc, SIGURG, 1);
4108                 }
4109         }
4110         sk->urg_data = URG_NOTYET;
4111         sk->urg_seq = ptr;
4112 }
4113 
4114 /*
4115  *      This is the 'fast' part of urgent handling.
4116  */
4117  
4118 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
     /* [previous][next][first][last][top][bottom][index][help] */
4119         unsigned long saddr, unsigned long len)
4120 {
4121         unsigned long ptr;
4122 
4123         /*
4124          *      Check if we get a new urgent pointer - normally not 
4125          */
4126          
4127         if (th->urg)
4128                 tcp_check_urg(sk,th);
4129 
4130         /*
4131          *      Do we wait for any urgent data? - normally not
4132          */
4133          
4134         if (sk->urg_data != URG_NOTYET)
4135                 return 0;
4136 
4137         /*
4138          *      Is the urgent pointer pointing into this packet? 
4139          */
4140          
4141         ptr = sk->urg_seq - th->seq + th->doff*4;
4142         if (ptr >= len)
4143                 return 0;
4144 
4145         /*
4146          *      Ok, got the correct packet, update info 
4147          */
4148          
4149         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4150         if (!sk->dead)
4151                 sk->data_ready(sk,0);
4152         return 0;
4153 }
4154 
4155 /*
4156  *      This will accept the next outstanding connection. 
4157  */
4158  
4159 static struct sock *tcp_accept(struct sock *sk, int flags)
     /* [previous][next][first][last][top][bottom][index][help] */
4160 {
4161         struct sock *newsk;
4162         struct sk_buff *skb;
4163   
4164   /*
4165    * We need to make sure that this socket is listening,
4166    * and that it has something pending.
4167    */
4168 
4169         if (sk->state != TCP_LISTEN) 
4170         {
4171                 sk->err = EINVAL;
4172                 return(NULL); 
4173         }
4174 
4175         /* Avoid the race. */
4176         cli();
4177         sk->inuse = 1;
4178 
4179         while((skb = tcp_dequeue_established(sk)) == NULL) 
4180         {
4181                 if (flags & O_NONBLOCK) 
4182                 {
4183                         sti();
4184                         release_sock(sk);
4185                         sk->err = EAGAIN;
4186                         return(NULL);
4187                 }
4188 
4189                 release_sock(sk);
4190                 interruptible_sleep_on(sk->sleep);
4191                 if (current->signal & ~current->blocked) 
4192                 {
4193                         sti();
4194                         sk->err = ERESTARTSYS;
4195                         return(NULL);
4196                 }
4197                 sk->inuse = 1;
4198         }
4199         sti();
4200 
4201         /*
4202          *      Now all we need to do is return skb->sk. 
4203          */
4204 
4205         newsk = skb->sk;
4206 
4207         kfree_skb(skb, FREE_READ);
4208         sk->ack_backlog--;
4209         release_sock(sk);
4210         return(newsk);
4211 }
4212 
4213 
4214 /*
4215  *      This will initiate an outgoing connection. 
4216  */
4217  
4218 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
     /* [previous][next][first][last][top][bottom][index][help] */
4219 {
4220         struct sk_buff *buff;
4221         struct device *dev=NULL;
4222         unsigned char *ptr;
4223         int tmp;
4224         int atype;
4225         struct tcphdr *t1;
4226         struct rtable *rt;
4227 
4228         if (sk->state != TCP_CLOSE) 
4229         {
4230                 return(-EISCONN);
4231         }
4232         
4233         if (addr_len < 8) 
4234                 return(-EINVAL);
4235 
4236         if (usin->sin_family && usin->sin_family != AF_INET) 
4237                 return(-EAFNOSUPPORT);
4238 
4239         /*
4240          *      connect() to INADDR_ANY means loopback (BSD'ism).
4241          */
4242         
4243         if(usin->sin_addr.s_addr==INADDR_ANY)
4244                 usin->sin_addr.s_addr=ip_my_addr();
4245                   
4246         /*
4247          *      Don't want a TCP connection going to a broadcast address 
4248          */
4249 
4250         if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
4251                 return -ENETUNREACH;
4252   
4253         sk->inuse = 1;
4254         sk->daddr = usin->sin_addr.s_addr;
4255         sk->write_seq = tcp_init_seq();
4256         sk->window_seq = sk->write_seq;
4257         sk->rcv_ack_seq = sk->write_seq -1;
4258         sk->err = 0;
4259         sk->dummy_th.dest = usin->sin_port;
4260         release_sock(sk);
4261 
4262         buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4263         if (buff == NULL) 
4264         {
4265                 return(-ENOMEM);
4266         }
4267         sk->inuse = 1;
4268         buff->len = 24;
4269         buff->sk = sk;
4270         buff->free = 0;
4271         buff->localroute = sk->localroute;
4272         
4273         t1 = (struct tcphdr *) buff->data;
4274 
4275         /*
4276          *      Put in the IP header and routing stuff. 
4277          */
4278          
4279         rt=ip_rt_route(sk->daddr, NULL, NULL);
4280         
4281 
4282         /*
4283          *      We need to build the routing stuff from the things saved in skb. 
4284          */
4285 
4286         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4287                                         IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
4288         if (tmp < 0) 
4289         {
4290                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4291                 release_sock(sk);
4292                 return(-ENETUNREACH);
4293         }
4294 
4295         buff->len += tmp;
4296         t1 = (struct tcphdr *)((char *)t1 +tmp);
4297 
4298         memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4299         t1->seq = ntohl(sk->write_seq++);
4300         sk->sent_seq = sk->write_seq;
4301         buff->h.seq = sk->write_seq;
4302         t1->ack = 0;
4303         t1->window = 2;
4304         t1->res1=0;
4305         t1->res2=0;
4306         t1->rst = 0;
4307         t1->urg = 0;
4308         t1->psh = 0;
4309         t1->syn = 1;
4310         t1->urg_ptr = 0;
4311         t1->doff = 6;
4312         /* use 512 or whatever user asked for */
4313         
4314         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4315                 sk->window_clamp=rt->rt_window;
4316         else
4317                 sk->window_clamp=0;
4318 
4319         if (sk->user_mss)
4320                 sk->mtu = sk->user_mss;
4321         else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
4322                 sk->mtu = rt->rt_mss;
4323         else 
4324         {
4325 #ifdef CONFIG_INET_SNARL
4326                 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
4327 #else
4328                 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
4329 #endif
4330                         sk->mtu = 576 - HEADER_SIZE;
4331                 else
4332                         sk->mtu = MAX_WINDOW;
4333         }
4334         /*
4335          *      but not bigger than device MTU 
4336          */
4337 
4338         if(sk->mtu <32)
4339                 sk->mtu = 32;   /* Sanity limit */
4340                 
4341         sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
4342         
4343         /*
4344          *      Put in the TCP options to say MTU. 
4345          */
4346 
4347         ptr = (unsigned char *)(t1+1);
4348         ptr[0] = 2;
4349         ptr[1] = 4;
4350         ptr[2] = (sk->mtu) >> 8;
4351         ptr[3] = (sk->mtu) & 0xff;
4352         tcp_send_check(t1, sk->saddr, sk->daddr,
4353                   sizeof(struct tcphdr) + 4, sk);
4354 
4355         /*
4356          *      This must go first otherwise a really quick response will get reset. 
4357          */
4358 
4359         tcp_cache_zap();
4360         tcp_set_state(sk,TCP_SYN_SENT);
4361         if(rt&&rt->rt_flags&RTF_IRTT)
4362                 sk->rto = rt->rt_irtt;
4363         else
4364                 sk->rto = TCP_TIMEOUT_INIT;
4365         sk->retransmit_timer.function=&retransmit_timer;
4366         sk->retransmit_timer.data = (unsigned long)sk;
4367         reset_xmit_timer(sk, TIME_WRITE, sk->rto);      /* Timer for repeating the SYN until an answer */
4368         sk->retransmits = TCP_SYN_RETRIES;
4369 
4370         sk->prot->queue_xmit(sk, dev, buff, 0);  
4371         reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4372         tcp_statistics.TcpActiveOpens++;
4373         tcp_statistics.TcpOutSegs++;
4374   
4375         release_sock(sk);
4376         return(0);
4377 }
4378 
4379 
4380 /* This functions checks to see if the tcp header is actually acceptable. */
4381 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /* [previous][next][first][last][top][bottom][index][help] */
4382              struct options *opt, unsigned long saddr, struct device *dev)
4383 {
4384         u32 next_seq;
4385 
4386         next_seq = len - 4*th->doff;
4387         if (th->fin)
4388                 next_seq++;
4389         /* if we have a zero window, we can't have any data in the packet.. */
4390         if (next_seq && !sk->window)
4391                 goto ignore_it;
4392         next_seq += th->seq;
4393 
4394         /*
4395          * This isn't quite right.  sk->acked_seq could be more recent
4396          * than sk->window.  This is however close enough.  We will accept
4397          * slightly more packets than we should, but it should not cause
4398          * problems unless someone is trying to forge packets.
4399          */
4400 
4401         /* have we already seen all of this packet? */
4402         if (!after(next_seq+1, sk->acked_seq))
4403                 goto ignore_it;
4404         /* or does it start beyond the window? */
4405         if (!before(th->seq, sk->acked_seq + sk->window + 1))
4406                 goto ignore_it;
4407 
4408         /* ok, at least part of this packet would seem interesting.. */
4409         return 1;
4410 
4411 ignore_it:
4412         if (th->rst)
4413                 return 0;
4414 
4415         /*
4416          *      Send a reset if we get something not ours and we are
4417          *      unsynchronized. Note: We don't do anything to our end. We
4418          *      are just killing the bogus remote connection then we will
4419          *      connect again and it will work (with luck).
4420          */
4421          
4422         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
4423         {
4424                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4425                 return 1;
4426         }
4427 
4428         /* Try to resync things. */
4429         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4430         return 0;
4431 }
4432 
4433 /*
4434  *      When we get a reset we do this.
4435  */
4436 
4437 static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
     /* [previous][next][first][last][top][bottom][index][help] */
4438 {
4439         sk->zapped = 1;
4440         sk->err = ECONNRESET;
4441         if (sk->state == TCP_SYN_SENT)
4442                 sk->err = ECONNREFUSED;
4443         if (sk->state == TCP_CLOSE_WAIT)
4444                 sk->err = EPIPE;
4445 #ifdef TCP_DO_RFC1337           
4446         /*
4447          *      Time wait assassination protection [RFC1337]
4448          */
4449         if(sk->state!=TCP_TIME_WAIT)
4450         {       
4451                 tcp_set_state(sk,TCP_CLOSE);
4452                 sk->shutdown = SHUTDOWN_MASK;
4453         }
4454 #else   
4455         tcp_set_state(sk,TCP_CLOSE);
4456         sk->shutdown = SHUTDOWN_MASK;
4457 #endif  
4458         if (!sk->dead) 
4459                 sk->state_change(sk);
4460         kfree_skb(skb, FREE_READ);
4461         release_sock(sk);
4462         return(0);
4463 }
4464 
4465 /*
4466  *      A TCP packet has arrived.
4467  */
4468  
4469 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /* [previous][next][first][last][top][bottom][index][help] */
4470         unsigned long daddr, unsigned short len,
4471         unsigned long saddr, int redo, struct inet_protocol * protocol)
4472 {
4473         struct tcphdr *th;
4474         struct sock *sk;
4475         int syn_ok=0;
4476         
4477         tcp_statistics.TcpInSegs++;
4478   
4479         if(skb->pkt_type!=PACKET_HOST)
4480         {
4481                 kfree_skb(skb,FREE_READ);
4482                 return(0);
4483         }
4484   
4485         th = skb->h.th;
4486 
4487         /*
4488          *      Find the socket, using the last hit cache if applicable.
4489          */
4490 
4491         if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
4492                 sk=(struct sock *)th_cache_sk;
4493         else
4494         {
4495                 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4496                 th_cache_saddr=saddr;
4497                 th_cache_daddr=daddr;
4498                 th_cache_dport=th->dest;
4499                 th_cache_sport=th->source;
4500                 th_cache_sk=sk;
4501         }               
4502 
4503         /*
4504          *      If this socket has got a reset it's to all intents and purposes 
4505          *      really dead. Count closed sockets as dead.
4506          *
4507          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4508          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
4509          *      exist so should cause resets as if the port was unreachable.
4510          */
4511          
4512         if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4513                 sk=NULL;
4514 
4515         if (!redo) 
4516         {
4517                 if (tcp_check(th, len, saddr, daddr )) 
4518                 {
4519                         skb->sk = NULL;
4520                         kfree_skb(skb,FREE_READ);
4521                         /*
4522                          *      We don't release the socket because it was
4523                          *      never marked in use.
4524                          */
4525                         return(0);
4526                 }
4527                 th->seq = ntohl(th->seq);
4528 
4529                 /* See if we know about the socket. */
4530                 if (sk == NULL) 
4531                 {
4532                         /*
4533                          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4534                          */
4535                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4536                         skb->sk = NULL;
4537                         /*
4538                          *      Discard frame
4539                          */
4540                         kfree_skb(skb, FREE_READ);
4541                         return(0);
4542                 }
4543 
4544                 skb->len = len;
4545                 skb->acked = 0;
4546                 skb->used = 0;
4547                 skb->free = 0;
4548                 skb->saddr = daddr;
4549                 skb->daddr = saddr;
4550         
4551                 /* We may need to add it to the backlog here. */
4552                 cli();
4553                 if (sk->inuse) 
4554                 {
4555                         skb_queue_tail(&sk->back_log, skb);
4556                         sti();
4557                         return(0);
4558                 }
4559                 sk->inuse = 1;
4560                 sti();
4561         }
4562         else
4563         {
4564                 if (sk==NULL) 
4565                 {
4566                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4567                         skb->sk = NULL;
4568                         kfree_skb(skb, FREE_READ);
4569                         return(0);
4570                 }
4571         }
4572 
4573 
4574         if (!sk->prot) 
4575         {
4576                 printk("IMPOSSIBLE 3\n");
4577                 return(0);
4578         }
4579 
4580 
4581         /*
4582          *      Charge the memory to the socket. 
4583          */
4584          
4585         if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf) 
4586         {
4587                 kfree_skb(skb, FREE_READ);
4588                 release_sock(sk);
4589                 return(0);
4590         }
4591 
4592         skb->sk=sk;
4593         sk->rmem_alloc += skb->mem_len;
4594 
4595         /*
4596          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4597          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4598          *      compatibility. We also set up variables more thoroughly [Karn notes in the
4599          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4600          */
4601 
4602         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
4603         {
4604         
4605                 /*
4606                  *      Now deal with unusual cases.
4607                  */
4608          
4609                 if(sk->state==TCP_LISTEN)
4610                 {
4611                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
4612                                 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4613 
4614                         /*
4615                          *      We don't care for RST, and non SYN are absorbed (old segments)
4616                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
4617                          *      netmask on a running connection it can go broadcast. Even Sun's have
4618                          *      this problem so I'm ignoring it 
4619                          */
4620                            
4621                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4622                         {
4623                                 kfree_skb(skb, FREE_READ);
4624                                 release_sock(sk);
4625                                 return 0;
4626                         }
4627                 
4628                         /*      
4629                          *      Guess we need to make a new socket up 
4630                          */
4631                 
4632                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4633                 
4634                         /*
4635                          *      Now we have several options: In theory there is nothing else
4636                          *      in the frame. KA9Q has an option to send data with the syn,
4637                          *      BSD accepts data with the syn up to the [to be] advertised window
4638                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
4639                          *      it, that fits the spec precisely and avoids incompatibilities. It
4640                          *      would be nice in future to drop through and process the data.
4641                          */
4642                          
4643                         release_sock(sk);
4644                         return 0;
4645                 }
4646         
4647                 /* retransmitted SYN? */
4648                 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4649                 {
4650                         kfree_skb(skb, FREE_READ);
4651                         release_sock(sk);
4652                         return 0;
4653                 }
4654                 
4655                 /*
4656                  *      SYN sent means we have to look for a suitable ack and either reset
4657                  *      for bad matches or go to connected 
4658                  */
4659            
4660                 if(sk->state==TCP_SYN_SENT)
4661                 {
4662                         /* Crossed SYN or previous junk segment */
4663                         if(th->ack)
4664                         {
4665                                 /* We got an ack, but it's not a good ack */
4666                                 if(!tcp_ack(sk,th,saddr,len))
4667                                 {
4668                                         /* Reset the ack - its an ack from a 
4669                                            different connection  [ th->rst is checked in tcp_reset()] */
4670                                         tcp_statistics.TcpAttemptFails++;
4671                                         tcp_reset(daddr, saddr, th,
4672                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4673                                         kfree_skb(skb, FREE_READ);
4674                                         release_sock(sk);
4675                                         return(0);
4676                                 }
4677                                 if(th->rst)
4678                                         return tcp_std_reset(sk,skb);
4679                                 if(!th->syn)
4680                                 {
4681                                         /* A valid ack from a different connection
4682                                            start. Shouldn't happen but cover it */
4683                                         kfree_skb(skb, FREE_READ);
4684                                         release_sock(sk);
4685                                         return 0;
4686                                 }
4687                                 /*
4688                                  *      Ok.. it's good. Set up sequence numbers and
4689                                  *      move to established.
4690                                  */
4691                                 syn_ok=1;       /* Don't reset this connection for the syn */
4692                                 sk->acked_seq=th->seq+1;
4693                                 sk->fin_seq=th->seq;
4694                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4695                                 tcp_set_state(sk, TCP_ESTABLISHED);
4696                                 tcp_options(sk,th);
4697                                 sk->dummy_th.dest=th->source;
4698                                 sk->copied_seq = sk->acked_seq;
4699                                 if(!sk->dead)
4700                                 {
4701                                         sk->state_change(sk);
4702                                         sock_wake_async(sk->socket, 0);
4703                                 }
4704                                 if(sk->max_window==0)
4705                                 {
4706                                         sk->max_window = 32;
4707                                         sk->mss = min(sk->max_window, sk->mtu);
4708                                 }
4709                         }
4710                         else
4711                         {
4712                                 /* See if SYN's cross. Drop if boring */
4713                                 if(th->syn && !th->rst)
4714                                 {
4715                                         /* Crossed SYN's are fine - but talking to
4716                                            yourself is right out... */
4717                                         if(sk->saddr==saddr && sk->daddr==daddr &&
4718                                                 sk->dummy_th.source==th->source &&
4719                                                 sk->dummy_th.dest==th->dest)
4720                                         {
4721                                                 tcp_statistics.TcpAttemptFails++;
4722                                                 return tcp_std_reset(sk,skb);
4723                                         }
4724                                         tcp_set_state(sk,TCP_SYN_RECV);
4725                                         
4726                                         /*
4727                                          *      FIXME:
4728                                          *      Must send SYN|ACK here
4729                                          */
4730                                 }               
4731                                 /* Discard junk segment */
4732                                 kfree_skb(skb, FREE_READ);
4733                                 release_sock(sk);
4734                                 return 0;
4735                         }
4736                         /*
4737                          *      SYN_RECV with data maybe.. drop through
4738                          */
4739                         goto rfc_step6;
4740                 }
4741 
4742         /*
4743          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
4744          *      a more complex suggestion for fixing these reuse issues in RFC1644
4745          *      but not yet ready for general use. Also see RFC1379.
4746          */
4747         
4748 #define BSD_TIME_WAIT
4749 #ifdef BSD_TIME_WAIT
4750                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
4751                         after(th->seq, sk->acked_seq) && !th->rst)
4752                 {
4753                         u32 seq = sk->write_seq;
4754                         if(sk->debug)
4755                                 printk("Doing a BSD time wait\n");
4756                         tcp_statistics.TcpEstabResets++;           
4757                         sk->rmem_alloc -= skb->mem_len;
4758                         skb->sk = NULL;
4759                         sk->err=ECONNRESET;
4760                         tcp_set_state(sk, TCP_CLOSE);
4761                         sk->shutdown = SHUTDOWN_MASK;
4762                         release_sock(sk);
4763                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4764                         if (sk && sk->state==TCP_LISTEN)
4765                         {
4766                                 sk->inuse=1;
4767                                 skb->sk = sk;
4768                                 sk->rmem_alloc += skb->mem_len;
4769                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
4770                                 release_sock(sk);
4771                                 return 0;
4772                         }
4773                         kfree_skb(skb, FREE_READ);
4774                         return 0;
4775                 }
4776 #endif  
4777         }
4778 
4779         /*
4780          *      We are now in normal data flow (see the step list in the RFC)
4781          *      Note most of these are inline now. I'll inline the lot when
4782          *      I have time to test it hard and look at what gcc outputs 
4783          */
4784         
4785         if(!tcp_sequence(sk,th,len,opt,saddr,dev))
4786         {
4787                 kfree_skb(skb, FREE_READ);
4788                 release_sock(sk);
4789                 return 0;
4790         }
4791 
4792         if(th->rst)
4793                 return tcp_std_reset(sk,skb);
4794         
4795         /*
4796          *      !syn_ok is effectively the state test in RFC793.
4797          */
4798          
4799         if(th->syn && !syn_ok)
4800         {
4801                 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
4802                 return tcp_std_reset(sk,skb);   
4803         }
4804 
4805         /*
4806          *      Process the ACK
4807          */
4808          
4809 
4810         if(th->ack && !tcp_ack(sk,th,saddr,len))
4811         {
4812                 /*
4813                  *      Our three way handshake failed.
4814                  */
4815                  
4816                 if(sk->state==TCP_SYN_RECV)
4817                 {
4818                         tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
4819                 }
4820                 kfree_skb(skb, FREE_READ);
4821                 release_sock(sk);
4822                 return 0;
4823         }
4824         
4825 rfc_step6:              /* I'll clean this up later */
4826 
4827         /*
4828          *      Process urgent data
4829          */
4830                 
4831         if(tcp_urg(sk, th, saddr, len))
4832         {
4833                 kfree_skb(skb, FREE_READ);
4834                 release_sock(sk);
4835                 return 0;
4836         }
4837         
4838         
4839         /*
4840          *      Process the encapsulated data
4841          */
4842         
4843         if(tcp_data(skb,sk, saddr, len))
4844         {
4845                 kfree_skb(skb, FREE_READ);
4846                 release_sock(sk);
4847                 return 0;
4848         }
4849 
4850         /*
4851          *      And done
4852          */     
4853         
4854         release_sock(sk);
4855         return 0;
4856 }
4857 
4858 /*
4859  *      This routine sends a packet with an out of date sequence
4860  *      number. It assumes the other end will try to ack it.
4861  */
4862 
static void tcp_write_wakeup(struct sock *sk)
/* [previous][next][first][last][top][bottom][index][help] */
{
        struct sk_buff *buff,*skb;
        struct tcphdr *t1;
        struct device *dev=NULL;
        int tmp;

        if (sk->zapped)
                return; /* After a valid reset we can send no more */

        /*
         *      Write data can still be transmitted/retransmitted in the
         *      following states.  If any other state is encountered, return.
         *      [listen/close will never occur here anyway]
         */

        if (sk->state != TCP_ESTABLISHED && 
            sk->state != TCP_CLOSE_WAIT &&
            sk->state != TCP_FIN_WAIT1 && 
            sk->state != TCP_LAST_ACK &&
            sk->state != TCP_CLOSING
        ) 
        {
                return;
        }

        /*
         *      Two cases follow. If the peer's window is partially open
         *      (sent_seq is still before window_seq) and we have queued
         *      data, carve off just the part that fits and send it as a
         *      fresh segment. Otherwise fall through to the else branch
         *      and send a 1-segment "old sequence" ack to provoke a reply.
         */

        if (before(sk->sent_seq, sk->window_seq) && 
            (skb=skb_peek(&sk->write_queue)))
        {
                /*
                 * We are probing the opening of a window
                 * but the window size is != 0
                 * must have been a result of sender-side SWS avoidance
                 */
            
                struct iphdr *iph;
                struct tcphdr *th;
                struct tcphdr *nth;
                unsigned long win_size, ow_size;
                void * tcp_data_start;
        
                /* Number of data bytes the advertised window still allows. */
                win_size = sk->window_seq - sk->sent_seq;

                /* Locate the IP and TCP headers inside the queued frame. */
                iph = (struct iphdr *)(skb->data + skb->dev->hard_header_len);
                th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));

                /* New buffer: link header + IP header + TCP header + the
                   win_size bytes of payload we are about to peel off. */
                buff = sk->prot->wmalloc(sk, win_size + th->doff * 4 + 
                                     (iph->ihl << 2) +
                                     skb->dev->hard_header_len, 
                                     1, GFP_ATOMIC);
                if ( buff == NULL )
                        return;

                buff->len = 0;

                /* 
                 *      If we strip the packet on the write queue we must
                 *      be ready to retransmit this one 
                 */
            
                buff->free = 0;

                buff->sk = sk;
                buff->localroute = sk->localroute;

                tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
                                         IPPROTO_TCP, sk->opt, buff->mem_len,
                                         sk->ip_tos,sk->ip_ttl);
                if (tmp < 0) 
                {
                        /* Header build failed - drop the new buffer, the
                           original segment is still intact on the queue. */
                        sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
                        return;
                }

                buff->len += tmp;
                buff->dev = dev;

                /* Copy the TCP header of the queued segment verbatim,
                   then patch the fields that must be current. */
                nth = (struct tcphdr *) (buff->data + buff->len);
                buff->len += th->doff * 4;

                memcpy(nth, th, th->doff * 4);

                /* NOTE(review): ntohl/ntohs used below where htonl/htons are
                   meant; the two are byte-for-byte identical conversions, so
                   this is harmless, but htonl/htons would be clearer. */
                nth->ack = 1; 
                nth->ack_seq = ntohl(sk->acked_seq);
                nth->window = ntohs(tcp_select_window(sk));
                nth->check = 0;

                /* Start of the TCP payload in the queued frame. */
                tcp_data_start = skb->data + skb->dev->hard_header_len + 
                                (iph->ihl << 2) + th->doff * 4;

                memcpy(buff->data + buff->len, tcp_data_start, win_size);
                buff->len += win_size;
                buff->h.seq = sk->sent_seq + win_size;

                /*
                 *      now: shrink the queue head segment 
                 */
                 
                th->check = 0;
                /* Bytes remaining in the original segment after the copy. */
                ow_size = skb->len - win_size - 
                        ((unsigned long) (tcp_data_start - (void *) skb->data));

                /* Slide the leftover payload down and advance the original
                   segment's sequence number past the bytes just sent. */
                memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
                skb->len -= win_size;
                sk->sent_seq += win_size;
                th->seq = htonl(sk->sent_seq);

                /* If the segment carried urgent data, the urgent pointer of
                   both halves must be adjusted relative to the split point. */
                if (th->urg)
                {
                        unsigned short urg_ptr;
        
                        urg_ptr = ntohs(th->urg_ptr);
                        if (urg_ptr <= win_size)
                                th->urg = 0;    /* urgent data fully sent */
                        else
                        {
                                urg_ptr -= win_size;
                                th->urg_ptr = htons(urg_ptr);
                                nth->urg_ptr = htons(win_size);
                        }
                }

                /* Checksum the new segment (header + payload). */
                tcp_send_check(nth, sk->saddr, sk->daddr, 
                           nth->doff * 4 + win_size , sk);
        }
        else
        {
                /* Zero-window (or empty-queue) case: build a bare ack. */
                buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
                if (buff == NULL) 
                        return;

                buff->len = sizeof(struct tcphdr);
                buff->free = 1;
                buff->sk = sk;
                buff->localroute = sk->localroute;

                t1 = (struct tcphdr *) buff->data;

                /*
                 *      Put in the IP header and routing stuff. 
                 */
                 
                tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
                if (tmp < 0) 
                {
                        sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
                        return;
                }

                buff->len += tmp;
                t1 = (struct tcphdr *)((char *)t1 +tmp);

                /* Start from the template header kept in the socket. */
                memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

                /*
                 *      Use a previous sequence.
                 *      This should cause the other end to send an ack.
                 */
         
                t1->seq = htonl(sk->sent_seq-1);
                t1->ack = 1; 
                t1->res1= 0;
                t1->res2= 0;
                t1->rst = 0;
                t1->urg = 0;
                t1->psh = 0;
                t1->fin = 0;    /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
                t1->syn = 0;
                t1->ack_seq = ntohl(sk->acked_seq);
                t1->window = ntohs(tcp_select_window(sk));
                t1->doff = sizeof(*t1)/4;
                tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

        }               

        /*
         *      Send it.
         */
        
        sk->prot->queue_xmit(sk, dev, buff, 1);
        tcp_statistics.TcpOutSegs++;
}
5046 
5047 /*
5048  *      A window probe timeout has occurred.
5049  */
5050 
5051 void tcp_send_probe0(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
5052 {
5053         if (sk->zapped)
5054                 return;         /* After a valid reset we can send no more */
5055 
5056         tcp_write_wakeup(sk);
5057 
5058         sk->backoff++;
5059         sk->rto = min(sk->rto << 1, 120*HZ);
5060         reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5061         sk->retransmits++;
5062         sk->prot->retransmits ++;
5063 }
5064 
5065 /*
5066  *      Socket option code for TCP. 
5067  */
5068   
5069 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5070 {
5071         int val,err;
5072 
5073         if(level!=SOL_TCP)
5074                 return ip_setsockopt(sk,level,optname,optval,optlen);
5075 
5076         if (optval == NULL) 
5077                 return(-EINVAL);
5078 
5079         err=verify_area(VERIFY_READ, optval, sizeof(int));
5080         if(err)
5081                 return err;
5082         
5083         val = get_fs_long((unsigned long *)optval);
5084 
5085         switch(optname)
5086         {
5087                 case TCP_MAXSEG:
5088 /*
5089  * values greater than interface MTU won't take effect.  however at
5090  * the point when this call is done we typically don't yet know
5091  * which interface is going to be used
5092  */
5093                         if(val<1||val>MAX_WINDOW)
5094                                 return -EINVAL;
5095                         sk->user_mss=val;
5096                         return 0;
5097                 case TCP_NODELAY:
5098                         sk->nonagle=(val==0)?0:1;
5099                         return 0;
5100                 default:
5101                         return(-ENOPROTOOPT);
5102         }
5103 }
5104 
5105 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5106 {
5107         int val,err;
5108 
5109         if(level!=SOL_TCP)
5110                 return ip_getsockopt(sk,level,optname,optval,optlen);
5111                         
5112         switch(optname)
5113         {
5114                 case TCP_MAXSEG:
5115                         val=sk->user_mss;
5116                         break;
5117                 case TCP_NODELAY:
5118                         val=sk->nonagle;
5119                         break;
5120                 default:
5121                         return(-ENOPROTOOPT);
5122         }
5123         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5124         if(err)
5125                 return err;
5126         put_fs_long(sizeof(int),(unsigned long *) optlen);
5127 
5128         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5129         if(err)
5130                 return err;
5131         put_fs_long(val,(unsigned long *)optval);
5132 
5133         return(0);
5134 }       
5135 
5136 
/*
 *      TCP's protocol operations table. The initializers are positional,
 *      so their order must match the field order of struct proto (declared
 *      elsewhere); each entry is one of the handlers defined in this file
 *      or a generic sock_*/ip_* helper.
 */
struct proto tcp_prot = {
        sock_wmalloc,           /* write-buffer alloc */
        sock_rmalloc,           /* read-buffer alloc */
        sock_wfree,             /* write-buffer free */
        sock_rfree,             /* read-buffer free */
        sock_rspace,            /* read space available */
        sock_wspace,            /* write space available */
        tcp_close,
        tcp_read,
        tcp_write,
        tcp_sendto,
        tcp_recvfrom,
        ip_build_header,
        tcp_connect,
        tcp_accept,
        ip_queue_xmit,
        tcp_retransmit,
        tcp_write_wakeup,
        tcp_read_wakeup,
        tcp_rcv,
        tcp_select,
        tcp_ioctl,
        NULL,                   /* no init handler */
        tcp_shutdown,
        tcp_setsockopt,
        tcp_getsockopt,
        128,                    /* presumably max header space - confirm against struct proto */
        0,                      /* NOTE(review): counter slot (likely retransmits) - confirm */
        "TCP",
        0, 0,
        {NULL,}
};

/* [previous][next][first][last][top][bottom][index][help] */