root/net/inet/tcp.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. min
  2. tcp_set_state
  3. tcp_select_window
  4. tcp_find_established
  5. tcp_dequeue_established
  6. tcp_close_pending
  7. tcp_time_wait
  8. tcp_do_retransmit
  9. reset_xmit_timer
  10. tcp_retransmit_time
  11. tcp_retransmit
  12. tcp_write_timeout
  13. retransmit_timer
  14. tcp_err
  15. tcp_readable
  16. tcp_listen_select
  17. tcp_select
  18. tcp_ioctl
  19. tcp_check
  20. tcp_send_check
  21. tcp_send_skb
  22. tcp_dequeue_partial
  23. tcp_send_partial
  24. tcp_enqueue_partial
  25. tcp_send_ack
  26. tcp_build_header
  27. tcp_write
  28. tcp_sendto
  29. tcp_read_wakeup
  30. cleanup_rbuf
  31. tcp_read_urg
  32. tcp_read
  33. tcp_close_state
  34. tcp_send_fin
  35. tcp_shutdown
  36. tcp_recvfrom
  37. tcp_reset
  38. tcp_options
  39. default_mask
  40. tcp_init_seq
  41. tcp_conn_request
  42. tcp_close
  43. tcp_write_xmit
  44. tcp_ack
  45. tcp_fin
  46. tcp_data
  47. tcp_check_urg
  48. tcp_urg
  49. tcp_accept
  50. tcp_connect
  51. tcp_sequence
  52. tcp_std_reset
  53. tcp_rcv
  54. tcp_write_wakeup
  55. tcp_send_probe0
  56. tcp_setsockopt
  57. tcp_getsockopt

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@no.unit.nvg>
  20  *
  21  * Fixes:       
  22  *              Alan Cox        :       Numerous verify_area() calls
  23  *              Alan Cox        :       Set the ACK bit on a reset
  24  *              Alan Cox        :       Stopped it crashing if it closed while sk->inuse=1
  25  *                                      and was trying to connect (tcp_err()).
  26  *              Alan Cox        :       All icmp error handling was broken
  27  *                                      pointers passed where wrong and the
  28  *                                      socket was looked up backwards. Nobody
  29  *                                      tested any icmp error code obviously.
  30  *              Alan Cox        :       tcp_err() now handled properly. It wakes people
  31  *                                      on errors. select behaves and the icmp error race
  32  *                                      has gone by moving it into sock.c
  33  *              Alan Cox        :       tcp_reset() fixed to work for everything not just
  34  *                                      packets for unknown sockets.
  35  *              Alan Cox        :       tcp option processing.
  36  *              Alan Cox        :       Reset tweaked (still not 100%) [Had syn rule wrong]
  37  *              Herp Rosmanith  :       More reset fixes
  38  *              Alan Cox        :       No longer acks invalid rst frames. Acking
  39  *                                      any kind of RST is right out.
  40  *              Alan Cox        :       Sets an ignore me flag on an rst receive
  41  *                                      otherwise odd bits of prattle escape still
  42  *              Alan Cox        :       Fixed another acking RST frame bug. Should stop
  43  *                                      LAN workplace lockups.
  44  *              Alan Cox        :       Some tidyups using the new skb list facilities
  45  *              Alan Cox        :       sk->keepopen now seems to work
  46  *              Alan Cox        :       Pulls options out correctly on accepts
  47  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  48  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a bit to skb ops.
  49  *              Alan Cox        :       Tidied tcp_data to avoid a potential nasty.
  50  *              Alan Cox        :       Added some better commenting, as the tcp is hard to follow
  51  *              Alan Cox        :       Removed incorrect check for 20 * psh
  52  *      Michael O'Reilly        :       ack < copied bug fix.
  53  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  54  *              Alan Cox        :       FIN with no memory -> CRASH
  55  *              Alan Cox        :       Added socket option proto entries. Also added awareness of them to accept.
  56  *              Alan Cox        :       Added TCP options (SOL_TCP)
  57  *              Alan Cox        :       Switched wakeup calls to callbacks, so the kernel can layer network sockets.
  58  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  59  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  60  *              Alan Cox        :       RST frames sent on unsynchronised state ack error/
  61  *              Alan Cox        :       Put in missing check for SYN bit.
  62  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  63  *                                      window non shrink trick.
  64  *              Alan Cox        :       Added a couple of small NET2E timer fixes
  65  *              Charles Hedrick :       TCP fixes
  66  *              Toomas Tamm     :       TCP window fixes
  67  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  68  *              Charles Hedrick :       Rewrote most of it to actually work
  69  *              Linus           :       Rewrote tcp_read() and URG handling
  70  *                                      completely
  71  *              Gerhard Koerting:       Fixed some missing timer handling
  72  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  73  *              Gerhard Koerting:       PC/TCP workarounds
  74  *              Adam Caldwell   :       Assorted timer/timing errors
  75  *              Matthew Dillon  :       Fixed another RST bug
  76  *              Alan Cox        :       Move to kernel side addressing changes.
  77  *              Alan Cox        :       Beginning work on TCP fastpathing (not yet usable)
  78  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  79  *              Alan Cox        :       TCP fast path debugging
  80  *              Alan Cox        :       Window clamping
  81  *              Michael Riepe   :       Bug in tcp_check()
  82  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  83  *              Matt Dillon     :       Yet more small nasties remove from the TCP code
  84  *                                      (Be very nice to this man if tcp finally works 100%) 8)
  85  *              Alan Cox        :       BSD accept semantics. 
  86  *              Alan Cox        :       Reset on closedown bug.
  87  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
  88  *              Michael Pall    :       Handle select() after URG properly in all cases.
  89  *              Michael Pall    :       Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin).
  90  *              Michael Pall    :       Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now.
  91  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the BSD api.
  92  *              Alan Cox        :       Changed the semantics of sk->socket to 
  93  *                                      fix a race and a signal problem with
  94  *                                      accept() and async I/O.
  95  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
  96  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
  97  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
  98  *                                      clients/servers which listen in on
  99  *                                      fixed ports.
 100  *              Alan Cox        :       Cleaned the above up and shrank it to
 101  *                                      a sensible code size.
 102  *              Alan Cox        :       Self connect lockup fix.
 103  *              Alan Cox        :       No connect to multicast.
 104  *              Ross Biro       :       Close unaccepted children on master
 105  *                                      socket close.
 106  *              Alan Cox        :       Reset tracing code.
 107  *              Alan Cox        :       Spurious resets on shutdown.
 108  *              Alan Cox        :       Giant 15 minute/60 second timer error
 109  *              Alan Cox        :       Small whoops in selecting before an accept.
  110  *              Alan Cox        :       Kept the state trace facility since it's
 111  *                                      handy for debugging.
 112  *              Alan Cox        :       More reset handler fixes.
 113  *              Alan Cox        :       Started rewriting the code based on the RFC's
 114  *                                      for other useful protocol references see:  
 115  *                                      Comer, KA9Q NOS, and for a reference on the
 116  *                                      difference between specifications and how BSD
 117  *                                      works see the 4.4lite source.
 118  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 119  *                                      close.
 120  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 121  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 122  *              Alan Cox        :       Reimplemented timers as per the RFC and using multiple
 123  *                                      timers for sanity. 
 124  *              Alan Cox        :       Small bug fixes, and a lot of new
 125  *                                      comments.
 126  *              Alan Cox        :       Fixed dual reader crash by locking
 127  *                                      the buffers (much like datagram.c)
 128  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 129  *                                      now gets fed up of retrying without
 130  *                                      (even a no space) answer.
 131  *              Alan Cox        :       Extracted closing code better
 132  *              Alan Cox        :       Fixed the closing state machine to
 133  *                                      resemble the RFC.
 134  *
 135  *
 136  * To Fix:
 137  *              Fast path the code. Two things here - fix the window calculation
 138  *              so it doesn't iterate over the queue, also spot packets with no funny
 139  *              options arriving in order and process directly.
 140  *
 141  *              Implement RFC 1191 [Path MTU discovery]
 142  *              Look at the effect of implementing RFC 1337 suggestions and their impact.
 143  *              Rewrite output state machine to use a single queue and do low window
 144  *              situations as per the spec (RFC 1122)
 145  *              Speed up input assembly algorithm.
 146  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 147  *              could do with it working on IPv4
 148  *              User settable/learned rtt/max window/mtu
 149  *              Cope with MTU/device switches when retransmitting in tcp.
 150  *              Fix the window handling to use PR's new code.
 151  *
 152  *              Change the fundamental structure to a single send queue maintained
 153  *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 154  *              active routes too]). Cut the queue off in tcp_retransmit/
 155  *              tcp_transmit.
 156  *              Change the receive queue to assemble as it goes. This lets us
 157  *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 158  *              tcp_data/tcp_read as well as the window shrink crud.
  159  *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 160  *              tcp_queue_skb seem obvious routines to extract.
 161  *      
 162  *              This program is free software; you can redistribute it and/or
 163  *              modify it under the terms of the GNU General Public License
 164  *              as published by the Free Software Foundation; either version
 165  *              2 of the License, or(at your option) any later version.
 166  *
 167  * Description of States:
 168  *
 169  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 170  *
 171  *      TCP_SYN_RECV            received a connection request, sent ack,
 172  *                              waiting for final ack in three-way handshake.
 173  *
 174  *      TCP_ESTABLISHED         connection established
 175  *
 176  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 177  *                              transmission of remaining buffered data
 178  *
 179  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 180  *                              to shutdown
 181  *
 182  *      TCP_CLOSING             both sides have shutdown but we still have
 183  *                              data we have to finish sending
 184  *
 185  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 186  *                              closed, can only be entered from FIN_WAIT2
 187  *                              or CLOSING.  Required because the other end
 188  *                              may not have gotten our last ACK causing it
 189  *                              to retransmit the data packet (which we ignore)
 190  *
 191  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 192  *                              us to finish writing our data and to shutdown
 193  *                              (we have to close() to move on to LAST_ACK)
 194  *
  195  *      TCP_LAST_ACK            our side has shutdown after remote has
 196  *                              shutdown.  There may still be data in our
 197  *                              buffer that we have to finish sending
 198  *              
 199  *      TCP_CLOSE               socket is finished
 200  */
 201 
 202 #include <linux/types.h>
 203 #include <linux/sched.h>
 204 #include <linux/mm.h>
 205 #include <linux/string.h>
 206 #include <linux/config.h>
 207 #include <linux/socket.h>
 208 #include <linux/sockios.h>
 209 #include <linux/termios.h>
 210 #include <linux/in.h>
 211 #include <linux/fcntl.h>
 212 #include <linux/inet.h>
 213 #include <linux/netdevice.h>
 214 #include "snmp.h"
 215 #include "ip.h"
 216 #include "protocol.h"
 217 #include "icmp.h"
 218 #include "tcp.h"
 219 #include "arp.h"
 220 #include <linux/skbuff.h>
 221 #include "sock.h"
 222 #include "route.h"
 223 #include <linux/errno.h>
 224 #include <linux/timer.h>
 225 #include <asm/system.h>
 226 #include <asm/segment.h>
 227 #include <linux/mm.h>
 228 
 229 /*
 230  *      The MSL timer is the 'normal' timer.
 231  */
 232  
 233 #define reset_msl_timer(x,y,z)  reset_timer(x,y,z)
 234 
 235 #define SEQ_TICK 3
 236 unsigned long seq_offset;
 237 struct tcp_mib  tcp_statistics;
 238 
 239 static void tcp_close(struct sock *sk, int timeout);
 240 
 241 
 242 /*
 243  *      The less said about this the better, but it works and will do for 1.2 
 244  */
 245 
 246 static struct wait_queue *master_select_wakeup;
 247 
 248 static __inline__ int min(unsigned int a, unsigned int b)
     /* [previous][next][first][last][top][bottom][index][help] */
 249 {
 250         if (a < b) 
 251                 return(a);
 252         return(b);
 253 }
 254 
 255 #undef STATE_TRACE
 256 
 257 #ifdef STATE_TRACE
 258 static char *statename[]={
 259         "Unused","Established","Syn Sent","Syn Recv",
 260         "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
 261         "Close Wait","Last ACK","Listen","Closing"
 262 };
 263 #endif
 264 
/*
 *	Move a socket to a new TCP state, keeping the TcpCurrEstab SNMP
 *	counter in step and waking anyone select()ing on a listening
 *	master socket when a child completes the three way handshake.
 */
static __inline__ void tcp_set_state(struct sock *sk, int state)
{
	/* Leaving ESTABLISHED: one fewer current connection. */
	if(sk->state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab--;
#ifdef STATE_TRACE
	if(sk->debug)
		printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
#endif	
	/* This is a hack but it doesn't occur often and it is going to
	   be a real pain to fix nicely: a child socket completing the
	   handshake (SYN_RECV -> ESTABLISHED) must wake up select()
	   sleepers on the listening master socket. */
	   
	if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
	{
		wake_up_interruptible(&master_select_wakeup);
	}
	sk->state=state;
	/* Entering ESTABLISHED: one more current connection. */
	if(state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab++;
}
 284 
 285 /*
 286  *      This routine picks a TCP windows for a socket based on
 287  *      the following constraints
 288  *  
 289  *      1. The window can never be shrunk once it is offered (RFC 793)
 290  *      2. We limit memory per socket
 291  *   
 292  *      For now we use NET2E3's heuristic of offering half the memory
 293  *      we have handy. All is not as bad as this seems however because
 294  *      of two things. Firstly we will bin packets even within the window
 295  *      in order to get the data we are waiting for into the memory limit.
 296  *      Secondly we bin common duplicate forms at receive time
 297  *      Better heuristics welcome
 298  */
 299    
 300 int tcp_select_window(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
 301 {
 302         int new_window = sk->prot->rspace(sk);
 303         
 304         if(sk->window_clamp)
 305                 new_window=min(sk->window_clamp,new_window);
 306         /*
 307          *      Two things are going on here.  First, we don't ever offer a
 308          *      window less than min(sk->mss, MAX_WINDOW/2).  This is the
 309          *      receiver side of SWS as specified in RFC1122.
 310          *      Second, we always give them at least the window they
 311          *      had before, in order to avoid retracting window.  This
 312          *      is technically allowed, but RFC1122 advises against it and
 313          *      in practice it causes trouble.
 314          *
 315          *      Fixme: This doesn't correctly handle the case where
 316          *      new_window > sk->window but not by enough to allow for the
 317          *      shift in sequence space. 
 318          */
 319         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
 320                 return(sk->window);
 321         return(new_window);
 322 }
 323 
 324 /*
 325  *      Find someone to 'accept'. Must be called with
 326  *      sk->inuse=1 or cli()
 327  */ 
 328 
 329 static struct sk_buff *tcp_find_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 330 {
 331         struct sk_buff *p=skb_peek(&s->receive_queue);
 332         if(p==NULL)
 333                 return NULL;
 334         do
 335         {
 336                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 337                         return p;
 338                 p=p->next;
 339         }
 340         while(p!=(struct sk_buff *)&s->receive_queue);
 341         return NULL;
 342 }
 343 
 344 /*
 345  *      Remove a completed connection and return it. This is used by
 346  *      tcp_accept() to get connections from the queue.
 347  */
 348 
 349 static struct sk_buff *tcp_dequeue_established(struct sock *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 350 {
 351         struct sk_buff *skb;
 352         unsigned long flags;
 353         save_flags(flags);
 354         cli(); 
 355         skb=tcp_find_established(s);
 356         if(skb!=NULL)
 357                 skb_unlink(skb);        /* Take it off the queue */
 358         restore_flags(flags);
 359         return skb;
 360 }
 361 
 362 /* 
 363  *      This routine closes sockets which have been at least partially
 364  *      opened, but not yet accepted. Currently it is only called by
 365  *      tcp_close, and timeout mirrors the value there. 
 366  */
 367 
 368 static void tcp_close_pending (struct sock *sk, int timeout) 
     /* [previous][next][first][last][top][bottom][index][help] */
 369 {
 370         struct sk_buff *skb;
 371 
 372         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
 373                 tcp_close(skb->sk, timeout);
 374                 kfree_skb(skb, FREE_READ);
 375         }
 376         return;
 377 }
 378 
/*
 *	Enter the time wait state: shut the socket down completely, wake
 *	any sleeper, and arm the MSL timer after which the socket can
 *	finally be destroyed.
 */
static void tcp_time_wait(struct sock *sk)
{
	tcp_set_state(sk,TCP_TIME_WAIT);
	sk->shutdown = SHUTDOWN_MASK;	/* No further sends or receives */
	if (!sk->dead)
		sk->state_change(sk);	/* Tell anyone sleeping on the socket */
	reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}
 391 
/*
 *	A socket has timed out on its send queue and wants to do a
 *	little retransmitting.  Walk the send_head queue and push frames
 *	back to the device: each one gets a fresh IP id, the current
 *	ack/window values and a recomputed TCP checksum.
 *
 *	sk:	socket whose send queue is retransmitted
 *	all:	non-zero resends the whole queue (bounded by the
 *		congestion window); zero resends only the head.
 */
void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;	/* Restamp the send time */

		/*
		 * In general it's OK just to use the old packet.  However we
		 * need to use the current ack and window fields.  Urg and
		 * urg_ptr could possibly stand to be updated as well, but we
		 * don't keep the necessary data.  That shouldn't be a problem,
		 * if the other end is doing the right thing.  Since we're
		 * changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)(skb->data + dev->hard_header_len);
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = skb->len - (((unsigned char *) th) - skb->data);
		
		/*
		 *	Note: We ought to check for window limits here but
		 *	currently this is done (less efficiently) elsewhere.
		 *	We do need to check for a route change but can't handle
		 *	that until we have the new 1.3.x buffers in.
		 *
		 */

		iph->id = htons(ip_id_count++);
		ip_send_check(iph);	/* IP header changed - redo its checksum */

		/*
		 *	This is not the right way to handle this. We have to
		 *	issue an up to date window and ack report with this 
		 *	retransmit to keep the odd buggy tcp that relies on 
		 *	the fact BSD does this happy. 
		 *	We don't however need to recalculate the entire 
		 *	checksum, so someone wanting a small problem to play
		 *	with might like to implement RFC1141/RFC1624 and speed
		 *	this up by avoiding a full checksum.
		 */
		 
		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));
		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
		
		/*
		 *	If the interface is (still) up and running, kick it.
		 */

		if (dev->flags & IFF_UP)
		{
			/*
			 *	If the packet is still being sent by the device/protocol
			 *	below then don't retransmit. This is both needed, and good -
			 *	especially with connected mode AX.25 where it stops resends
			 *	occurring of an as yet unsent anyway frame!
			 *	We still add up the counts as the round trip time wants
			 *	adjusting.
			 */
			if (sk && !skb_device_locked(skb))
			{
				/* Remove it from any existing driver queue first! */
				skb_unlink(skb);
				/* Now queue it */
				ip_statistics.IpOutRequests++;
				dev_queue_xmit(skb, dev, sk->priority);
			}
		}

		/*
		 *	Count retransmissions
		 */
		 
		sk->retransmits++;
		sk->prot->retransmits ++;

		/*
		 *	Only one retransmit requested.
		 */
	
		if (!all)
			break;

		/*
		 *	This should cut it off before we send too many packets.
		 */

		if (sk->retransmits >= sk->cong_window)
			break;
		skb = skb->link3;	/* Next buffer on the send queue */
	}
}
 502 
/*
 *	(Re)arm the per-socket retransmission timer.
 *
 *	why:	a TIME_* code stored in sk->ip_xmit_timeout so the timer
 *		handler knows which event this timer represents.
 *	when:	the timer delay.  A negative value is a caller bug; it is
 *		reported and clamped to 3 ticks.
 *		NOTE(review): expires is assigned 'when' directly, which
 *		assumes add_timer() treats expires as a relative delay -
 *		confirm against the timer implementation in use.
 */
static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
{
	del_timer(&sk->retransmit_timer);	/* Stop any pending instance first */
	sk->ip_xmit_timeout = why;
	if((int)when < 0)
	{
		when=3;
		printk("Error: Negative timer in xmit_timer\n");
	}
	sk->retransmit_timer.expires=when;
	add_timer(&sk->retransmit_timer);
}
 519 
/*
 *	This is the normal code called for timeouts.  It does the
 *	retransmission and then does backoff.  tcp_do_retransmit is
 *	separated out because tcp_ack needs to send stuff from the
 *	retransmit queue without initiating a backoff.
 *
 *	sk:	socket to retransmit on
 *	all:	passed through to tcp_do_retransmit - resend the whole
 *		queue rather than just the head.
 */
void tcp_retransmit_time(struct sock *sk, int all)
{
	tcp_do_retransmit(sk, all);

	/*
	 * Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */

	sk->retransmits++;
	sk->backoff++;
	/* Exponential backoff, clamped at the 120 second maximum RTT. */
	sk->rto = min(sk->rto << 1, 120*HZ);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
}
 554 
 555 
 556 /*
 557  *      A timer event has trigger a tcp retransmit timeout. The
 558  *      socket xmit queue is ready and set up to send. Because
 559  *      the ack receive code keeps the queue straight we do
 560  *      nothing clever here.
 561  */
 562 
 563 static void tcp_retransmit(struct sock *sk, int all)
     /* [previous][next][first][last][top][bottom][index][help] */
 564 {
 565         if (all) 
 566         {
 567                 tcp_retransmit_time(sk, all);
 568                 return;
 569         }
 570 
 571         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 572         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 573         sk->cong_count = 0;
 574 
 575         sk->cong_window = 1;
 576 
 577         /* Do the actual retransmit. */
 578         tcp_retransmit_time(sk, all);
 579 }
 580 
/*
 *	A write timeout has occurred.  Process the after effects: nudge
 *	the arp/route caches on a 'soft' timeout, and give the connection
 *	up entirely on a 'hard' one.
 *
 *	Returns 1 if the socket is still alive (caller must release it),
 *	0 if it has been closed via sk->prot->close.
 */
static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout: every 8th retransmit on an
	 *	established connection, or past TCP_RETR1 otherwise.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		ip_route_check (sk->daddr);
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR2) 
	{
		sk->err = ETIMEDOUT;
		/*
		 *	Time wait the socket: closing states just go to
		 *	TIME_WAIT rather than dying outright.
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING) 
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time: close the socket for good.
			 */
			sk->prot->close (sk, 1);
			return 0;
		}
	}
	return 1;
}
 625 
 626 /*
 627  *      The TCP retransmit timer. This lacks a few small details.
 628  *
 629  *      1.      An initial rtt timeout on the probe0 should cause what we can
 630  *              of the first write queue buffer to be split and sent.
 631  *      2.      On a 'major timeout' as defined by RFC1122 we shouldn't report
 632  *              ETIMEDOUT if we know an additional 'soft' error caused this.
 633  *              tcp_err should save a 'soft error' for us.
 634  */
 635 
/*
 *      Timer callback: 'data' is the struct sock the timer belongs to.
 *      Flushes any pending ACK, then dispatches on sk->ip_xmit_timeout
 *      (the reason the timer was armed): zero-window probe, data
 *      retransmission, or keepalive.
 */
static void retransmit_timer(unsigned long data)
{
        struct sock *sk = (struct sock*)data;
        int why = sk->ip_xmit_timeout;  /* snapshot the reason before we touch the socket */

        /* 
         * only process if socket is not in use
         */

        cli();
        if (sk->inuse || in_bh) 
        {
                /* Try again in 1 second */
                sk->retransmit_timer.expires = HZ;
                add_timer(&sk->retransmit_timer);
                sti();
                return;
        }

        sk->inuse = 1;          /* we own the socket from here on */
        sti();

        /* Always see if we need to send an ack. */

        if (sk->ack_backlog && !sk->zapped) 
        {
                sk->prot->read_wakeup (sk);
                if (! sk->dead)
                        sk->data_ready(sk,0);
        }

        /* Now we need to figure out why the socket was on the timer. */

        switch (why) 
        {
                /* Window probing */
                case TIME_PROBE0:
                        tcp_send_probe0(sk);
                        /* tcp_write_timeout() returns 0 when it closed the socket */
                        if(tcp_write_timeout(sk))
                                release_sock (sk);
                        break;
                /* Retransmitting */
                case TIME_WRITE:
                        /* It could be we got here because we needed to send an ack.
                         * So we need to check for that.
                         */
                {
                        struct sk_buff *skb;
                        unsigned long flags;

                        save_flags(flags);
                        cli();
                        skb = sk->send_head;
                        if (!skb) 
                        {
                                /* Nothing unacked in flight: the timer was only an ACK kick */
                                restore_flags(flags);
                        } 
                        else 
                        {
                                /*
                                 *      Kicked by a delayed ack. Reset timer
                                 *      correctly now
                                 */
                                if (jiffies < skb->when + sk->rto) 
                                {
                                        reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
                                        restore_flags(flags);
                                        release_sock (sk);
                                        break;
                                }
                                restore_flags(flags);
                                /*
                                 *      Retransmission
                                 */
                                sk->prot->retransmit (sk, 0);
                                /* socket closed inside tcp_write_timeout: must not release it */
                                if(!tcp_write_timeout(sk))
                                        break;
                        }
                        release_sock (sk);
                        break;
                }
                /* Sending Keepalives */
                case TIME_KEEPOPEN:
                        /* 
                         * this reset_timer() call is a hack, this is not
                         * how KEEPOPEN is supposed to work.
                         */
                        reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

                        /* Send something to keep the connection open. */
                        if (sk->prot->write_wakeup)
                                  sk->prot->write_wakeup (sk);
                        sk->retransmits++;
                        if(tcp_write_timeout(sk))
                                release_sock (sk);
                        break;
                default:
                        printk ("rexmit_timer: timer expired - reason unknown\n");
                        release_sock (sk);
                        break;
        }
}
 738 
 739 /*
 740  * This routine is called by the ICMP module when it gets some
 741  * sort of error condition.  If err < 0 then the socket should
 742  * be closed and the error returned to the user.  If err > 0
 743  * it's just the icmp type << 8 | icmp code.  After adjustment
 744  * header points to the first 8 bytes of the tcp header.  We need
 745  * to find the appropriate port.
 746  */
 747 
 748 void tcp_err(int err, unsigned char *header, unsigned long daddr,
     /* [previous][next][first][last][top][bottom][index][help] */
 749         unsigned long saddr, struct inet_protocol *protocol)
 750 {
 751         struct tcphdr *th;
 752         struct sock *sk;
 753         struct iphdr *iph=(struct iphdr *)header;
 754   
 755         header+=4*iph->ihl;
 756    
 757 
 758         th =(struct tcphdr *)header;
 759         sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
 760 
 761         if (sk == NULL) 
 762                 return;
 763   
 764         if(err<0)
 765         {
 766                 sk->err = -err;
 767                 sk->error_report(sk);
 768                 return;
 769         }
 770 
 771         if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8)) 
 772         {
 773                 /*
 774                  * FIXME:
 775                  * For now we will just trigger a linear backoff.
 776                  * The slow start code should cause a real backoff here.
 777                  */
 778                 if (sk->cong_window > 4)
 779                         sk->cong_window--;
 780                 return;
 781         }
 782 
 783 /*      sk->err = icmp_err_convert[err & 0xff].errno;  -- moved as TCP should hide non fatals internally (and does) */
 784 
 785         /*
 786          * If we've already connected we will keep trying
 787          * until we time out, or the user gives up.
 788          */
 789 
 790         if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT) 
 791         {
 792                 if (sk->state == TCP_SYN_SENT) 
 793                 {
 794                         tcp_statistics.TcpAttemptFails++;
 795                         tcp_set_state(sk,TCP_CLOSE);
 796                         sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
 797                 }
 798                 sk->err = icmp_err_convert[err & 0xff].errno;           
 799         }
 800         return;
 801 }
 802 
 803 
 804 /*
 805  *      Walk down the receive queue counting readable data until we hit the end or we find a gap
 806  *      in the received data queue (ie a frame missing that needs sending to us). Not
 807  *      sorting using two queues as data arrives makes life so much harder.
 808  */
 809 
/*
 *      Count the bytes the user could read from the receive queue right
 *      now: walk in sequence order, stop at the first hole (missing
 *      segment) and, once some data has been counted, at a PSH boundary.
 *      Runs with interrupts off while touching the queue.
 */
static int tcp_readable(struct sock *sk)
{
        unsigned long counted;          /* sequence number we have counted up to */
        unsigned long amount;           /* readable byte total so far */
        struct sk_buff *skb;
        int sum;                        /* usable bytes contributed by this skb */
        unsigned long flags;

        if(sk && sk->debug)
                printk("tcp_readable: %p - ",sk);

        save_flags(flags);
        cli();
        if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
        {
                restore_flags(flags);
                if(sk && sk->debug) 
                        printk("empty\n");
                return(0);
        }
  
        counted = sk->copied_seq;       /* Where we are at the moment */
        amount = 0;
  
        /* 
         *      Do until a push or until we are out of data. 
         */
         
        do 
        {
                if (before(counted, skb->h.th->seq))    /* Found a hole so stops here */
                        break;
                sum = skb->len -(counted - skb->h.th->seq);     /* Length - header but start from where we are up to (avoid overlaps) */
                if (skb->h.th->syn)
                        sum++;          /* SYN occupies one sequence number but carries no data */
                if (sum > 0) 
                {                                       /* Add it up, move on */
                        amount += sum;
                        if (skb->h.th->syn) 
                                amount--;               /* ...so take the SYN back out of the byte count */
                        counted += sum;
                }
                /*
                 * Don't count urg data ... but do it in the right place!
                 * Consider: "old_data (ptr is here) URG PUSH data"
                 * The old code would stop at the first push because
                 * it counted the urg (amount==1) and then does amount--
                 * *after* the loop.  This means tcp_readable() always
                 * returned zero if any URG PUSH was in the queue, even
                 * though there was normal data available. If we subtract
                 * the urg data right here, we even get it to work for more
                 * than one URG PUSH skb without normal data.
                 * This means that select() finally works now with urg data
                 * in the queue.  Note that rlogin was never affected
                 * because it doesn't use select(); it uses two processes
                 * and a blocking read().  And the queue scan in tcp_read()
                 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
                 */
                if (skb->h.th->urg)
                        amount--;       /* don't count urg data */
                if (amount && skb->h.th->psh) break;
                skb = skb->next;
        }
        while(skb != (struct sk_buff *)&sk->receive_queue);

        restore_flags(flags);
        if(sk->debug)
                printk("got %lu bytes.\n",amount);
        return(amount);
}
 880 
 881 /*
 882  * LISTEN is a special case for select..
 883  */
 884 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
 885 {
 886         if (sel_type == SEL_IN) {
 887                 int retval;
 888 
 889                 sk->inuse = 1;
 890                 retval = (tcp_find_established(sk) != NULL);
 891                 release_sock(sk);
 892                 if (!retval)
 893                         select_wait(&master_select_wakeup,wait);
 894                 return retval;
 895         }
 896         return 0;
 897 }
 898 
 899 
 900 /*
 901  *      Wait for a TCP event.
 902  *
 903  *      Note that we don't need to set "sk->inuse", as the upper select layers
 904  *      take care of normal races (between the test and the event) and we don't
 905  *      go look at any of the socket buffers directly.
 906  */
 907 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
     /* [previous][next][first][last][top][bottom][index][help] */
 908 {
 909         if (sk->state == TCP_LISTEN)
 910                 return tcp_listen_select(sk, sel_type, wait);
 911 
 912         switch(sel_type) {
 913         case SEL_IN:
 914                 if (sk->err)
 915                         return 1;
 916                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
 917                         break;
 918 
 919                 if (sk->shutdown & RCV_SHUTDOWN)
 920                         return 1;
 921                         
 922                 if (sk->acked_seq == sk->copied_seq)
 923                         break;
 924 
 925                 if (sk->urg_seq != sk->copied_seq ||
 926                     sk->acked_seq != sk->copied_seq+1 ||
 927                     sk->urginline || !sk->urg_data)
 928                         return 1;
 929                 break;
 930 
 931         case SEL_OUT:
 932                 if (sk->shutdown & SEND_SHUTDOWN) 
 933                         return 0;
 934                 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
 935                         break;
 936                 /*
 937                  * This is now right thanks to a small fix
 938                  * by Matt Dillon.
 939                  */
 940 
 941                 if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
 942                         break;
 943                 return 1;
 944 
 945         case SEL_EX:
 946                 if (sk->err || sk->urg_data)
 947                         return 1;
 948                 break;
 949         }
 950         select_wait(sk->sleep, wait);
 951         return 0;
 952 }
 953 
 954 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
     /* [previous][next][first][last][top][bottom][index][help] */
 955 {
 956         int err;
 957         switch(cmd) 
 958         {
 959 
 960                 case TIOCINQ:
 961 #ifdef FIXME    /* FIXME: */
 962                 case FIONREAD:
 963 #endif
 964                 {
 965                         unsigned long amount;
 966 
 967                         if (sk->state == TCP_LISTEN) 
 968                                 return(-EINVAL);
 969 
 970                         sk->inuse = 1;
 971                         amount = tcp_readable(sk);
 972                         release_sock(sk);
 973                         err=verify_area(VERIFY_WRITE,(void *)arg,
 974                                                    sizeof(unsigned long));
 975                         if(err)
 976                                 return err;
 977                         put_fs_long(amount,(unsigned long *)arg);
 978                         return(0);
 979                 }
 980                 case SIOCATMARK:
 981                 {
 982                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
 983 
 984                         err = verify_area(VERIFY_WRITE,(void *) arg,
 985                                                   sizeof(unsigned long));
 986                         if (err)
 987                                 return err;
 988                         put_fs_long(answ,(int *) arg);
 989                         return(0);
 990                 }
 991                 case TIOCOUTQ:
 992                 {
 993                         unsigned long amount;
 994 
 995                         if (sk->state == TCP_LISTEN) return(-EINVAL);
 996                         amount = sk->prot->wspace(sk);
 997                         err=verify_area(VERIFY_WRITE,(void *)arg,
 998                                                    sizeof(unsigned long));
 999                         if(err)
1000                                 return err;
1001                         put_fs_long(amount,(unsigned long *)arg);
1002                         return(0);
1003                 }
1004                 default:
1005                         return(-EINVAL);
1006         }
1007 }
1008 
1009 
1010 /*
1011  *      This routine computes a TCP checksum. 
1012  */
1013  
/*
 *      Compute the 16-bit one's-complement TCP checksum over the pseudo
 *      header (saddr, daddr, protocol, segment length) followed by the
 *      TCP header and payload starting at 'th'.  'len' is the segment
 *      length in bytes.  i386-specific inline assembly.
 */
unsigned short tcp_check(struct tcphdr *th, int len,
          unsigned long saddr, unsigned long daddr)
{     
        unsigned long sum;
   
        if (saddr == 0) saddr = ip_my_addr();   /* 0 means "use our own address" */

/*
 * stupid, gcc complains when I use just one __asm__ block,
 * something about too many reloads, but this is just two
 * instructions longer than what I want
 */
        /* First block: fold the pseudo header fields into the running sum. */
        __asm__("
            addl %%ecx, %%ebx
            adcl %%edx, %%ebx
            adcl $0, %%ebx
            "
        : "=b"(sum)
        : "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
        : "bx", "cx", "dx" );
        /* Second block: sum the segment itself (32-byte unrolled main loop,
         * then remaining dwords, word and byte), finally fold to 16 bits. */
        __asm__("
            movl %%ecx, %%edx
            cld
            cmpl $32, %%ecx
            jb 2f
            shrl $5, %%ecx
            clc
1:          lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            lodsl
            adcl %%eax, %%ebx
            loop 1b
            adcl $0, %%ebx
            movl %%edx, %%ecx
2:          andl $28, %%ecx
            je 4f
            shrl $2, %%ecx
            clc
3:          lodsl
            adcl %%eax, %%ebx
            loop 3b
            adcl $0, %%ebx
4:          movl $0, %%eax
            testw $2, %%dx
            je 5f
            lodsw
            addl %%eax, %%ebx
            adcl $0, %%ebx
            movw $0, %%ax
5:          test $1, %%edx
            je 6f
            lodsb
            addl %%eax, %%ebx
            adcl $0, %%ebx
6:          movl %%ebx, %%eax
            shrl $16, %%eax
            addw %%ax, %%bx
            adcw $0, %%bx
            "
        : "=b"(sum)
        : "0"(sum), "c"(len), "S"(th)
        : "ax", "bx", "cx", "dx", "si" );

        /* We only want the bottom 16 bits, but we never cleared the top 16. */
  
        return((~sum) & 0xffff);
}
1093 
1094 
1095 
1096 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
     /* [previous][next][first][last][top][bottom][index][help] */
1097                 unsigned long daddr, int len, struct sock *sk)
1098 {
1099         th->check = 0;
1100         th->check = tcp_check(th, len, saddr, daddr);
1101         return;
1102 }
1103 
1104 /*
1105  *      This is the main buffer sending routine. We queue the buffer
1106  *      having checked it is sane seeming.
1107  */
1108  
1109 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
     /* [previous][next][first][last][top][bottom][index][help] */
1110 {
1111         int size;
1112         struct tcphdr * th = skb->h.th;
1113 
1114         /*
1115          *      length of packet (not counting length of pre-tcp headers) 
1116          */
1117          
1118         size = skb->len - ((unsigned char *) th - skb->data);
1119 
1120         /*
1121          *      Sanity check it.. 
1122          */
1123          
1124         if (size < sizeof(struct tcphdr) || size > skb->len) 
1125         {
1126                 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1127                         skb, skb->data, th, skb->len);
1128                 kfree_skb(skb, FREE_WRITE);
1129                 return;
1130         }
1131 
1132         /*
1133          *      If we have queued a header size packet.. (these crash a few
1134          *      tcp stacks if ack is not set)
1135          */
1136          
1137         if (size == sizeof(struct tcphdr)) 
1138         {
1139                 /* If its got a syn or fin its notionally included in the size..*/
1140                 if(!th->syn && !th->fin) 
1141                 {
1142                         printk("tcp_send_skb: attempt to queue a bogon.\n");
1143                         kfree_skb(skb,FREE_WRITE);
1144                         return;
1145                 }
1146         }
1147 
1148         /*
1149          *      Actual processing.
1150          */
1151          
1152         tcp_statistics.TcpOutSegs++;  
1153         skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1154         
1155         /*
1156          *      We must queue if
1157          *
1158          *      a) The right edge of this frame exceeds the window
1159          *      b) We are retransmitting (Nagle's rule)
1160          *      c) We have too many packets 'in flight'
1161          */
1162          
1163         if (after(skb->h.seq, sk->window_seq) ||
1164             (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1165              sk->packets_out >= sk->cong_window) 
1166         {
1167                 /* checksum will be supplied by tcp_write_xmit.  So
1168                  * we shouldn't need to set it at all.  I'm being paranoid */
1169                 th->check = 0;
1170                 if (skb->next != NULL) 
1171                 {
1172                         printk("tcp_send_partial: next != NULL\n");
1173                         skb_unlink(skb);
1174                 }
1175                 skb_queue_tail(&sk->write_queue, skb);
1176                 
1177                 /*
1178                  *      If we don't fit we have to start the zero window
1179                  *      probes. This is broken - we really need to do a partial
1180                  *      send _first_ (This is what causes the Cisco and PC/TCP
1181                  *      grief).
1182                  */
1183                  
1184                 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1185                     sk->send_head == NULL && sk->ack_backlog == 0)
1186                         reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1187         } 
1188         else 
1189         {
1190                 /*
1191                  *      This is going straight out
1192                  */
1193                  
1194                 th->ack_seq = ntohl(sk->acked_seq);
1195                 th->window = ntohs(tcp_select_window(sk));
1196 
1197                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1198 
1199                 sk->sent_seq = sk->write_seq;
1200                 
1201                 /*
1202                  *      This is mad. The tcp retransmit queue is put together
1203                  *      by the ip layer. This causes half the problems with
1204                  *      unroutable FIN's and other things.
1205                  */
1206                  
1207                 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1208                 
1209                 /*
1210                  *      Set for next retransmit based on expected ACK time.
1211                  *      FIXME: We set this every time which means our 
1212                  *      retransmits are really about a window behind.
1213                  */
1214 
1215                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1216         }
1217 }
1218 
1219 /*
1220  *      Locking problems lead us to a messy situation where we can have
1221  *      multiple partially complete buffers queued up. This is really bad
1222  *      as we don't want to be sending partial buffers. Fix this with
1223  *      a semaphore or similar to lock tcp_write per socket.
1224  *
1225  *      These routines are pretty self descriptive.
1226  */
1227  
1228 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1229 {
1230         struct sk_buff * skb;
1231         unsigned long flags;
1232 
1233         save_flags(flags);
1234         cli();
1235         skb = sk->partial;
1236         if (skb) {
1237                 sk->partial = NULL;
1238                 del_timer(&sk->partial_timer);
1239         }
1240         restore_flags(flags);
1241         return skb;
1242 }
1243 
1244 /*
1245  *      Empty the partial queue
1246  */
1247  
1248 static void tcp_send_partial(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1249 {
1250         struct sk_buff *skb;
1251 
1252         if (sk == NULL)
1253                 return;
1254         while ((skb = tcp_dequeue_partial(sk)) != NULL)
1255                 tcp_send_skb(sk, skb);
1256 }
1257 
1258 /*
1259  *      Queue a partial frame
1260  */
1261  
/*
 *      Install 'skb' as the socket's pending partial frame and (re)start
 *      the flush timer.  Any previously pending partial frame is sent
 *      immediately - but only after interrupts are re-enabled.
 */
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
        struct sk_buff * tmp;   /* previously queued partial frame, if any */
        unsigned long flags;

        save_flags(flags);
        cli();
        tmp = sk->partial;
        if (tmp)
                del_timer(&sk->partial_timer);
        sk->partial = skb;
        init_timer(&sk->partial_timer);
        /*
         *      Wait up to 1 second for the buffer to fill.
         */
        sk->partial_timer.expires = HZ; /* NOTE(review): relative expiry, matching retransmit_timer above - confirm against this kernel's add_timer semantics */
        sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
        sk->partial_timer.data = (unsigned long) sk;
        add_timer(&sk->partial_timer);
        restore_flags(flags);
        if (tmp)
                tcp_send_skb(sk, tmp);  /* flush the displaced frame outside the cli section */
}
1285 
1286 
1287 /*
1288  *      This routine sends an ack and also updates the window. 
1289  */
1290  
1291 static void tcp_send_ack(unsigned long sequence, unsigned long ack,
     /* [previous][next][first][last][top][bottom][index][help] */
1292              struct sock *sk,
1293              struct tcphdr *th, unsigned long daddr)
1294 {
1295         struct sk_buff *buff;
1296         struct tcphdr *t1;
1297         struct device *dev = NULL;
1298         int tmp;
1299 
1300         if(sk->zapped)
1301                 return;         /* We have been reset, we may not send again */
1302                 
1303         /*
1304          * We need to grab some memory, and put together an ack,
1305          * and then put it into the queue to be sent.
1306          */
1307 
1308         buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1309         if (buff == NULL) 
1310         {
1311                 /* 
1312                  *      Force it to send an ack. We don't have to do this
1313                  *      (ACK is unreliable) but its much better use of 
1314                  *      bandwidth on slow links to send a spare ack than
1315                  *      resend packets. 
1316                  */
1317                  
1318                 sk->ack_backlog++;
1319                 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) 
1320                 {
1321                         reset_xmit_timer(sk, TIME_WRITE, HZ);
1322                 }
1323                 return;
1324         }
1325 
1326         /*
1327          *      Assemble a suitable TCP frame
1328          */
1329          
1330         buff->len = sizeof(struct tcphdr);
1331         buff->sk = sk;
1332         buff->localroute = sk->localroute;
1333         t1 =(struct tcphdr *) buff->data;
1334 
1335         /* 
1336          *      Put in the IP header and routing stuff. 
1337          */
1338          
1339         tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1340                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1341         if (tmp < 0) 
1342         {
1343                 buff->free = 1;
1344                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1345                 return;
1346         }
1347         buff->len += tmp;
1348         t1 =(struct tcphdr *)((char *)t1 +tmp);
1349 
1350         memcpy(t1, th, sizeof(*t1));
1351 
1352         /*
1353          *      Swap the send and the receive. 
1354          */
1355          
1356         t1->dest = th->source;
1357         t1->source = th->dest;
1358         t1->seq = ntohl(sequence);
1359         t1->ack = 1;
1360         sk->window = tcp_select_window(sk);
1361         t1->window = ntohs(sk->window);
1362         t1->res1 = 0;
1363         t1->res2 = 0;
1364         t1->rst = 0;
1365         t1->urg = 0;
1366         t1->syn = 0;
1367         t1->psh = 0;
1368         t1->fin = 0;
1369         
1370         /*
1371          *      If we have nothing queued for transmit and the transmit timer
1372          *      is on we are just doing an ACK timeout and need to switch
1373          *      to a keepalive.
1374          */
1375          
1376         if (ack == sk->acked_seq) 
1377         {
1378                 sk->ack_backlog = 0;
1379                 sk->bytes_rcv = 0;
1380                 sk->ack_timed = 0;
1381                 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
1382                                   && sk->ip_xmit_timeout == TIME_WRITE) 
1383                 {
1384                         if(sk->keepopen) {
1385                                 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1386                         } else {
1387                                 delete_timer(sk);
1388                         }
1389                 }
1390         }
1391         
1392         /*
1393          *      Fill in the packet and send it
1394          */
1395          
1396         t1->ack_seq = ntohl(ack);
1397         t1->doff = sizeof(*t1)/4;
1398         tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1399         if (sk->debug)
1400                  printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
1401         tcp_statistics.TcpOutSegs++;
1402         sk->prot->queue_xmit(sk, dev, buff, 1);
1403 }
1404 
1405 
1406 /* 
1407  *      This routine builds a generic TCP header. 
1408  */
1409  
1410 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
     /* [previous][next][first][last][top][bottom][index][help] */
1411 {
1412 
1413         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1414         th->seq = htonl(sk->write_seq);
1415         th->psh =(push == 0) ? 1 : 0;
1416         th->doff = sizeof(*th)/4;
1417         th->ack = 1;
1418         th->fin = 0;
1419         sk->ack_backlog = 0;
1420         sk->bytes_rcv = 0;
1421         sk->ack_timed = 0;
1422         th->ack_seq = htonl(sk->acked_seq);
1423         sk->window = tcp_select_window(sk);
1424         th->window = htons(sk->window);
1425 
1426         return(sizeof(*th));
1427 }
1428 
1429 /*
1430  *      This routine copies from a user buffer into a socket,
1431  *      and starts the transmit system.
1432  */
1433 
static int tcp_write(struct sock *sk, unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;			/* Bytes queued to the socket so far */
	int copy;			/* Bytes to take on this iteration */
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;	/* Newly built skb that may be held as a partial */
	unsigned char *buff;
	struct proto *prot;
	struct device *dev = NULL;

	/* Lock the socket against the bottom half while we build frames */
	sk->inuse=1;
	prot = sk->prot;
	while(len > 0) 
	{
		if (sk->err) 
		{			/* Stop on an error */
			release_sock(sk);
			if (copied) 
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established. 
		 */
	
		if (sk->shutdown & SEND_SHUTDOWN) 
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied) 
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/* 
		 *	Wait for a connection to finish.
		 */
	
		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
		{
			if (sk->err) 
			{
				release_sock(sk);
				if (copied) 
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/*
			 *	Any state other than SYN_SENT/SYN_RECV can never
			 *	become writable again, so give up instead of
			 *	sleeping forever.
			 */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
			{
				release_sock(sk);
				if (copied) 
					return(copied);

				if (sk->err) 
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen) 
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied) 
			{
				release_sock(sk);
				if (copied) 
					return(copied);
				return(-EAGAIN);
			}

			release_sock(sk);
			cli();
		
			/*
			 *	Re-test the state with interrupts masked so the
			 *	wakeup cannot slip in between the test and the sleep.
			 */
			if (sk->state != TCP_ESTABLISHED &&
				sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked) 
				{
					sti();
					if (copied) 
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

	/*
	 * The following code can result in copy <= 0 if sk->mss is ever
	 * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
	 * sk->mtu is constant once SYN processing is finished.  I.e. we
	 * had better not get here until we've seen his SYN and at least one
	 * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
	 * But ESTABLISHED should guarantee that.  sk->max_window is by definition
	 * non-decreasing.  Note that any ioctl to set user_mss must be done
	 * before the exchange of SYN's.  If the initial ack from the other
	 * end has a window of 0, max_window and thus mss will both be 0.
	 */

	/* 
	 *	Now we need to check if we have a half built packet. 
	 */

		if ((skb = tcp_dequeue_partial(sk)) != NULL) 
		{
			int hdrlen;

			 /* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				 + sizeof(struct tcphdr);
	
			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB)) 
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0) 
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}
	  
				memcpy_fromfs(skb->data + skb->len, from, copy);
				skb->len += copy;
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/*
			 *	Send the partial now if it has grown to a full
			 *	segment, carries urgent data, or the pipe is idle;
			 *	otherwise park it again for further filling.
			 */
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

	/*
	 * We also need to worry about the window.
	 * If window < 1/2 the maximum window we've seen from this
	 *   host, don't use it.  This is sender side
	 *   silly window prevention, as specified in RFC1122.
	 *   (Note that this is different than earlier versions of
	 *   SWS prevention, e.g. RFC813.).  What we actually do is 
	 *   use the whole MSS.  Since the results in the right
	 *   edge of the packet being outside the window, it will
	 *   be queued for later rather than sent.
	 */

		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

	/*
	 *	We should really check the window here also. 
	 */
	 
		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB)) 
		{
			/*
			 *	We will release the socket incase we sleep here. 
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu 
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		} 
		else 
		{
			/*
			 *	We will release the socket incase we sleep here. 
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep. 
		 */

		if (skb == NULL) 
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock) 
			{
				release_sock(sk);
				if (copied) 
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition. 
			 */

			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it: only sleep if no
			 *	write memory was freed since we sampled wmem_alloc
			 *	and the connection is still healthy.
			 */
			if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0) 
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked) 
				{
					sti();
					if (copied) 
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		/* Fill in the fresh buffer: headers first, then user data */
		skb->len = 0;
		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
	
		buff = skb->data;
	
		/*
		 * FIXME: we need to optimize this.
		 * Perhaps some hints here would be good.
		 */
		
		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 ) 
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied) 
				return(copied);
			return(tmp);
		}
		skb->len += tmp;
		skb->dev = dev;
		buff += tmp;
		skb->h.th =(struct tcphdr *) buff;
		tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
		if (tmp < 0) 
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied) 
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB) 
		{
			((struct tcphdr *)buff)->urg = 1;
			((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
		}
		skb->len += tmp;
		/* Copy the payload from user space after the TCP header */
		memcpy_fromfs(buff+tmp, from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->len += copy;
		skb->free = 0;
		sk->write_seq += copy;
	
		/* Sub-MSS segment while data is in flight: hold it back (Nagle) */
		if (send_tmp != NULL && sk->packets_out) 
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

/*
 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *	interactive fast network servers. It's meant to be on and
 *	it really improves the throughput though not the echo time
 *	on my slow slip link - Alan
 */

/*
 *	Avoid possible race on send_tmp - c/o Johannes Stille 
 */
 
	if(sk->partial && ((!sk->packets_out) 
     /* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}
1758 
1759 /*
1760  *      This is just a wrapper. 
1761  */
1762 
1763 static int tcp_sendto(struct sock *sk, unsigned char *from,
     /* [previous][next][first][last][top][bottom][index][help] */
1764            int len, int nonblock, unsigned flags,
1765            struct sockaddr_in *addr, int addr_len)
1766 {
1767         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1768                 return -EINVAL;
1769         if (sk->state == TCP_CLOSE)
1770                 return -ENOTCONN;
1771         if (addr_len < sizeof(*addr))
1772                 return -EINVAL;
1773         if (addr->sin_family && addr->sin_family != AF_INET) 
1774                 return -EINVAL;
1775         if (addr->sin_port != sk->dummy_th.dest) 
1776                 return -EISCONN;
1777         if (addr->sin_addr.s_addr != sk->daddr) 
1778                 return -EISCONN;
1779         return tcp_write(sk, from, len, nonblock, flags);
1780 }
1781 
1782 
1783 /*
1784  *      Send an ack if one is backlogged at this point. Ought to merge
1785  *      this with tcp_send_ack().
1786  */
1787  
1788 static void tcp_read_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
1789 {
1790         int tmp;
1791         struct device *dev = NULL;
1792         struct tcphdr *t1;
1793         struct sk_buff *buff;
1794 
1795         if (!sk->ack_backlog) 
1796                 return;
1797 
1798         /*
1799          * FIXME: we need to put code here to prevent this routine from
1800          * being called.  Being called once in a while is ok, so only check
1801          * if this is the second time in a row.
1802          */
1803 
1804         /*
1805          * We need to grab some memory, and put together an ack,
1806          * and then put it into the queue to be sent.
1807          */
1808 
1809         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
1810         if (buff == NULL) 
1811         {
1812                 /* Try again real soon. */
1813                 reset_xmit_timer(sk, TIME_WRITE, HZ);
1814                 return;
1815         }
1816 
1817         buff->len = sizeof(struct tcphdr);
1818         buff->sk = sk;
1819         buff->localroute = sk->localroute;
1820         
1821         /*
1822          *      Put in the IP header and routing stuff. 
1823          */
1824 
1825         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1826                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1827         if (tmp < 0) 
1828         {
1829                 buff->free = 1;
1830                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1831                 return;
1832         }
1833 
1834         buff->len += tmp;
1835         t1 =(struct tcphdr *)(buff->data +tmp);
1836 
1837         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
1838         t1->seq = htonl(sk->sent_seq);
1839         t1->ack = 1;
1840         t1->res1 = 0;
1841         t1->res2 = 0;
1842         t1->rst = 0;
1843         t1->urg = 0;
1844         t1->syn = 0;
1845         t1->psh = 0;
1846         sk->ack_backlog = 0;
1847         sk->bytes_rcv = 0;
1848         sk->window = tcp_select_window(sk);
1849         t1->window = ntohs(sk->window);
1850         t1->ack_seq = ntohl(sk->acked_seq);
1851         t1->doff = sizeof(*t1)/4;
1852         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1853         sk->prot->queue_xmit(sk, dev, buff, 1);
1854         tcp_statistics.TcpOutSegs++;
1855 }
1856 
1857 
1858 /*
1859  *      FIXME:
1860  *      This routine frees used buffers.
1861  *      It should consider sending an ACK to let the
1862  *      other end know we now have a bigger window.
1863  */
1864 
static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;	/* Receive space before reaping, for comparison */
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
		printk("cleaning rbuf for sk=%p\n", sk);
  
	/* Reap under interrupt protection: the receive queue is also
	   manipulated from the network bottom half */
	save_flags(flags);
	cli();
  
	left = sk->prot->rspace(sk);
 
	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL) 
	{
		/* Stop at the first buffer still unread or in use by a reader */
		if (!skb->used || skb->users) 
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
					    left);
	if ((rspace=sk->prot->rspace(sk)) != left) 
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
	/*
	 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
	 * if the other end is offering a window smaller than the agreed on MSS
	 * (called sk->mtu here).  In theory there's no connection between send
	 * and receive, and so no reason to think that they're going to send
	 * small packets.  For the moment I'm using the hack of reducing the mss
	 * only on the send side, so I'm putting mtu here.
	 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		} 
		else 
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			if (!was_active || TCP_ACK_TIME < sk->timer.expires) 
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			} 
			else
				/* A sooner expiry was already pending: restore it */
				add_timer(&sk->retransmit_timer);
		}
	}
} 
1946 
1947 
1948 /*
1949  *      Handle reading urgent data. BSD has very simple semantics for
1950  *      this, no blocking and very strange errors 8)
1951  */
1952  
1953 static int tcp_read_urg(struct sock * sk, int nonblock,
     /* [previous][next][first][last][top][bottom][index][help] */
1954              unsigned char *to, int len, unsigned flags)
1955 {
1956         /*
1957          *      No URG data to read
1958          */
1959         if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
1960                 return -EINVAL; /* Yes this is right ! */
1961                 
1962         if (sk->err) 
1963         {
1964                 int tmp = -sk->err;
1965                 sk->err = 0;
1966                 return tmp;
1967         }
1968 
1969         if (sk->state == TCP_CLOSE || sk->done) 
1970         {
1971                 if (!sk->done) {
1972                         sk->done = 1;
1973                         return 0;
1974                 }
1975                 return -ENOTCONN;
1976         }
1977 
1978         if (sk->shutdown & RCV_SHUTDOWN) 
1979         {
1980                 sk->done = 1;
1981                 return 0;
1982         }
1983         sk->inuse = 1;
1984         if (sk->urg_data & URG_VALID) 
1985         {
1986                 char c = sk->urg_data;
1987                 if (!(flags & MSG_PEEK))
1988                         sk->urg_data = URG_READ;
1989                 put_fs_byte(c, to);
1990                 release_sock(sk);
1991                 return 1;
1992         }
1993         release_sock(sk);
1994         
1995         /*
1996          * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1997          * the available implementations agree in this case:
1998          * this call should never block, independent of the
1999          * blocking state of the socket.
2000          * Mike <pall@rz.uni-karlsruhe.de>
2001          */
2002         return -EAGAIN;
2003 }
2004 
2005 
2006 /*
2007  *      This routine copies from a sock struct into the user buffer. 
2008  */
2009  
static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	unsigned long peek_seq;		/* Private cursor for MSG_PEEK reads */
	volatile unsigned long *seq;	/* So gcc doesnt overoptimise */
	unsigned long used;

	/* 
	 *	This error should be checked. 
	 */
	 
	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially. 
	 */
	 
	if (flags & MSG_OOB)
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be 
	 *	inline and thus not flush cached variables otherwise).
	 *	A PEEK advances a local copy so the real cursor is untouched.
	 */
	 
	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	sk->inuse = 1;
	while (len > 0) 
	{
		struct sk_buff * skb;
		unsigned long offset;
	
		/*
		 * Are we at urgent data? Stop if we have read anything.
		 */
		 
		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.
		 */
		 
		current->state = TASK_INTERRUPTIBLE;

		/* Walk the receive queue looking for the skb covering *seq */
		skb = skb_peek(&sk->receive_queue);
		do 
		{
			if (!skb)
				break;
			/* A hole in the sequence space: wait for more data */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;	/* SYN occupies one sequence number */
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;	/* Fully consumed: reapable by cleanup_rbuf */
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* Partial read satisfied: return what we have */
		if (copied)
			break;

		if (sk->err) 
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE) 
		{
			if (!sk->done) 
			{
				sk->done = 1;	/* Report EOF exactly once */
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN) 
		{
			sk->done = 1;
			break;
		}
			
		if (nonblock) 
		{
			copied = -EAGAIN;
			break;
		}

		/* Ack what we've consumed, then sleep until more data arrives */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked) 
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are 
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */
		
		skb->users++;
		
		/*
		 *	Ok so how much can we use ? 
		 */
		 
		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here? 
		 */
		
		if (sk->urg_data) 
		{
			unsigned long urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used) 
			{
				if (!urg_offset) 
				{
					/* Cursor sits on the urgent byte: skip it
					   unless the user wants it inline */
					if (!sk->urginline) 
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					/* Stop this copy just short of the urgent byte */
					used = urg_offset;
			}
		}
		
		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */
		 
		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */
		 
		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;
		
		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */
		 
		skb->users --;
		
		/* Urgent byte has been passed: clear the urgent state */
		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		if (used + offset < skb->len)
			continue;
		
		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		++*seq;		/* FIN also occupies one sequence number */
		if (flags & MSG_PEEK)
			break;
			
		/*
		 *	All is done
		 */
		 
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
2236 
2237 /*
2238  *      State processing on a close. This implements the state shift for
2239  *      sending our FIN frame. Note that we only send a FIN for some 
2240  *      states. A shutdown() may have already sent the FIN, or we may be
2241  *      closed.
2242  */
2243  
static int tcp_close_state(struct sock *sk)
{
	int ns=TCP_CLOSE;	/* Next state; default: drop straight to CLOSE */
	int send_fin=0;		/* Tell the caller whether a FIN must go out */
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
	}
	
	tcp_set_state(sk,ns);
		
	/*
	 *	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 */
	if(sk->dead && ns==TCP_FIN_WAIT2)
	{
		/* Only arm the FIN timeout if no timer is already pending;
		   del_timer tells us whether one was, and we re-add it if so */
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);
		else
			reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}
	
	return send_fin;
}
2293 
2294 /*
2295  *      Send a fin.
2296  */
2297 
/*
 *      Build and send (or queue) a FIN segment for this socket.
 *      Advances sk->write_seq by one to account for the FIN.  If data is
 *      still pending on the write queue the FIN is appended behind it;
 *      otherwise it is transmitted immediately and the retransmit timer
 *      is armed.
 */
static void tcp_send_fin(struct sock *sk)
{
        struct proto *prot =(struct proto *)sk->prot;
        struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
        struct tcphdr *t1;
        struct sk_buff *buff;
        struct device *dev=NULL;
        int tmp;
                
        release_sock(sk); /* in case the malloc sleeps. */
        
        /* GFP_KERNEL: this allocation is allowed to sleep, hence the
           release_sock() above and re-lock below. */
        buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
        sk->inuse = 1;

        if (buff == NULL)
        {
                /* This is a disaster if it occurs.
                   NOTE(review): this bail-out does not advance write_seq,
                   unlike the build_header failure path below - confirm
                   which behaviour is intended. */
                printk("tcp_send_fin: Impossible malloc failure");
                return;
        }

        /*
         *      Administrivia
         */
         
        buff->sk = sk;
        buff->len = sizeof(*t1);
        buff->localroute = sk->localroute;
        t1 =(struct tcphdr *) buff->data;

        /*
         *      Put in the IP header and routing stuff. 
         */

        tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
                           IPPROTO_TCP, sk->opt,
                           sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                /*
                 *      Finish anyway, treat this as a send that got lost. 
                 *      (Not good).  write_seq is still bumped so the FIN
                 *      is accounted for in sequence space.
                 */
                 
                buff->free = 1;
                prot->wfree(sk,buff->mem_addr, buff->mem_len);
                sk->write_seq++;
                return;
        }
        
        /*
         *      We ought to check if the end of the queue is a buffer and
         *      if so simply add the fin to that buffer, not send it ahead.
         */

        t1 =(struct tcphdr *)((char *)t1 +tmp);
        buff->len += tmp;
        buff->dev = dev;
        /* Start from the socket's template header, then fill in the
           FIN-specific fields. */
        memcpy(t1, th, sizeof(*t1));
        t1->seq = ntohl(sk->write_seq);
        sk->write_seq++;                /* the FIN consumes one sequence number */
        buff->h.seq = sk->write_seq;
        t1->ack = 1;
        t1->ack_seq = ntohl(sk->acked_seq);
        t1->window = ntohs(sk->window=tcp_select_window(sk));
        t1->fin = 1;
        t1->rst = 0;
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

        /*
         * If there is data in the write queue, the fin must be appended to
         * the write queue so it goes out in order behind the data.
         */
        
        if (skb_peek(&sk->write_queue) != NULL) 
        {
                buff->free = 0;
                if (buff->next != NULL) 
                {
                        /* Should never happen: a freshly built skb must not
                           already be on a list. */
                        printk("tcp_send_fin: next != NULL\n");
                        skb_unlink(buff);
                }
                skb_queue_tail(&sk->write_queue, buff);
        } 
        else 
        {
                /* Queue empty: transmit the FIN now and arm the
                   retransmit timer. */
                sk->sent_seq = sk->write_seq;
                sk->prot->queue_xmit(sk, dev, buff, 0);
                reset_xmit_timer(sk, TIME_WRITE, sk->rto);
        }
}
2390 
2391 /*
2392  *      Shutdown the sending side of a connection. Much like close except
2393  *      that we don't receive shut down or set sk->dead=1.
2394  */
2395 
2396 void tcp_shutdown(struct sock *sk, int how)
     /* [previous][next][first][last][top][bottom][index][help] */
2397 {
2398         /*
2399          *      We need to grab some memory, and put together a FIN,
2400          *      and then put it into the queue to be sent.
2401          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2402          */
2403 
2404         if (!(how & SEND_SHUTDOWN)) 
2405                 return;
2406          
2407         /*
2408          *      If we've already sent a FIN, or its a closed state
2409          */
2410          
2411         if (sk->state == TCP_FIN_WAIT1 ||
2412             sk->state == TCP_FIN_WAIT2 ||
2413             sk->state == TCP_CLOSING ||
2414             sk->state == TCP_LAST_ACK ||
2415             sk->state == TCP_TIME_WAIT || 
2416             sk->state == TCP_CLOSE ||
2417             sk->state == TCP_LISTEN
2418           )
2419         {
2420                 return;
2421         }
2422         sk->inuse = 1;
2423 
2424         /*
2425          * flag that the sender has shutdown
2426          */
2427 
2428         sk->shutdown |= SEND_SHUTDOWN;
2429 
2430         /*
2431          *  Clear out any half completed packets. 
2432          */
2433 
2434         if (sk->partial)
2435                 tcp_send_partial(sk);
2436                 
2437         /*
2438          *      FIN if needed
2439          */
2440          
2441         if(tcp_close_state(sk))
2442                 tcp_send_fin(sk);
2443                 
2444         release_sock(sk);
2445 }
2446 
2447 
2448 static int
2449 tcp_recvfrom(struct sock *sk, unsigned char *to,
     /* [previous][next][first][last][top][bottom][index][help] */
2450              int to_len, int nonblock, unsigned flags,
2451              struct sockaddr_in *addr, int *addr_len)
2452 {
2453         int result;
2454   
2455         /* 
2456          *      Have to check these first unlike the old code. If 
2457          *      we check them after we lose data on an error
2458          *      which is wrong 
2459          */
2460 
2461         if(addr_len)
2462                 *addr_len = sizeof(*addr);
2463         result=tcp_read(sk, to, to_len, nonblock, flags);
2464 
2465         if (result < 0) 
2466                 return(result);
2467   
2468         if(addr)
2469         {
2470                 addr->sin_family = AF_INET;
2471                 addr->sin_port = sk->dummy_th.dest;
2472                 addr->sin_addr.s_addr = sk->daddr;
2473         }
2474         return(result);
2475 }
2476 
2477 
2478 /*
2479  *      This routine will send an RST to the other tcp. 
2480  */
2481  
/*
 *      Build and transmit an RST in response to the segment in 'th'.
 *      Follows RFC 793 reset generation: if the offending segment had
 *      an ACK, the RST carries that ack number as its sequence; if not,
 *      the RST acks the offending segment instead.  No socket is
 *      involved - the skb is owned by nobody (buff->sk = NULL).
 */
static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
          struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
        struct sk_buff *buff;
        struct tcphdr *t1;
        int tmp;
        struct device *ndev=NULL;

        /*
         *      Cannot reset a reset (Think about it).
         */
         
        if(th->rst)
                return;
  
        /*
         * We need to grab some memory, and put together an RST,
         * and then put it into the queue to be sent.
         */

        /* GFP_ATOMIC: may run from interrupt/bottom-half context, so a
           failed allocation just drops the reset silently. */
        buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
        if (buff == NULL) 
                return;

        buff->len = sizeof(*t1);
        buff->sk = NULL;
        buff->dev = dev;
        buff->localroute = 0;

        t1 =(struct tcphdr *) buff->data;

        /*
         *      Put in the IP header and routing stuff. 
         */

        tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
                           sizeof(struct tcphdr),tos,ttl);
        if (tmp < 0) 
        {
                buff->free = 1;
                prot->wfree(NULL, buff->mem_addr, buff->mem_len);
                return;
        }

        t1 =(struct tcphdr *)((char *)t1 +tmp);
        buff->len += tmp;
        /* Copy the offending header as a template, then rewrite it. */
        memcpy(t1, th, sizeof(*t1));

        /*
         *      Swap the send and the receive. 
         */

        t1->dest = th->source;
        t1->source = th->dest;
        t1->rst = 1;  
        t1->window = 0;
  
        if(th->ack)
        {
                /* Peer's ack number tells us what sequence it expects:
                   use it so the RST is acceptable, and send no ACK. */
                t1->ack = 0;
                t1->seq = th->ack_seq;
                t1->ack_seq = 0;
        }
        else
        {
                /* No ACK on the incoming segment: ack it instead.
                   A SYN consumes one sequence number, hence seq+1. */
                t1->ack = 1;
                if(!th->syn)
                        t1->ack_seq=htonl(th->seq);
                else
                        t1->ack_seq=htonl(th->seq+1);
                t1->seq=0;
        }

        t1->syn = 0;
        t1->urg = 0;
        t1->fin = 0;
        t1->psh = 0;
        t1->doff = sizeof(*t1)/4;
        tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
        prot->queue_xmit(NULL, ndev, buff, 1);
        tcp_statistics.TcpOutSegs++;
}
2564 
2565 
2566 /*
2567  *      Look for tcp options. Parses everything but only knows about MSS.
2568  *      This routine is always called with the packet containing the SYN.
2569  *      However it may also be called with the ack to the SYN.  So you
2570  *      can't assume this is always the SYN.  It's always called after
2571  *      we have set up sk->mtu to our own MTU.
2572  *
2573  *      We need at minimum to add PAWS support here. Possibly large windows
2574  *      as Linux gets deployed on 100Mb/sec networks.
2575  */
2576  
2577 static void tcp_options(struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
2578 {
2579         unsigned char *ptr;
2580         int length=(th->doff*4)-sizeof(struct tcphdr);
2581         int mss_seen = 0;
2582     
2583         ptr = (unsigned char *)(th + 1);
2584   
2585         while(length>0)
2586         {
2587                 int opcode=*ptr++;
2588                 int opsize=*ptr++;
2589                 switch(opcode)
2590                 {
2591                         case TCPOPT_EOL:
2592                                 return;
2593                         case TCPOPT_NOP:
2594                                 length-=2;
2595                                 continue;
2596                         
2597                         default:
2598                                 if(opsize<=2)   /* Avoid silly options looping forever */
2599                                         return;
2600                                 switch(opcode)
2601                                 {
2602                                         case TCPOPT_MSS:
2603                                                 if(opsize==4 && th->syn)
2604                                                 {
2605                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2606                                                         mss_seen = 1;
2607                                                 }
2608                                                 break;
2609                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2610                                 }
2611                                 ptr+=opsize-2;
2612                                 length-=opsize;
2613                 }
2614         }
2615         if (th->syn) 
2616         {
2617                 if (! mss_seen)
2618                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2619         }
2620 #ifdef CONFIG_INET_PCTCP
2621         sk->mss = min(sk->max_window >> 1, sk->mtu);
2622 #else    
2623         sk->mss = min(sk->max_window, sk->mtu);
2624 #endif  
2625 }
2626 
/*
 *      Return the classful network mask (network byte order) for the
 *      given destination address (also network byte order).
 */
static inline unsigned long default_mask(unsigned long dst)
{
        unsigned long host = ntohl(dst);

        if (IN_CLASSA(host))
                return htonl(IN_CLASSA_NET);
        return IN_CLASSB(host) ? htonl(IN_CLASSB_NET) : htonl(IN_CLASSC_NET);
}
2636 
2637 /*
2638  *      Default sequence number picking algorithm.
2639  */
2640 
/*
 *      Default initial sequence number picking algorithm: a simple
 *      clock driven off jiffies (RFC 793 style ISN clock).
 *      NOTE(review): this is trivially predictable, which makes
 *      sequence-number guessing attacks easy - a stronger generator
 *      would mix in per-connection data.
 */
extern inline long tcp_init_seq(void)
{
        return jiffies * SEQ_TICK - seq_offset; 
}
2645 
2646 /*
2647  *      This routine handles a connection request.
2648  *      It should make sure we haven't already responded.
2649  *      Because of the way BSD works, we have to send a syn/ack now.
2650  *      This also means it will be harder to close a socket which is
2651  *      listening.
2652  */
2653  
2654 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
     /* [previous][next][first][last][top][bottom][index][help] */
2655                  unsigned long daddr, unsigned long saddr,
2656                  struct options *opt, struct device *dev, unsigned long seq)
2657 {
2658         struct sk_buff *buff;
2659         struct tcphdr *t1;
2660         unsigned char *ptr;
2661         struct sock *newsk;
2662         struct tcphdr *th;
2663         struct device *ndev=NULL;
2664         int tmp;
2665         struct rtable *rt;
2666   
2667         th = skb->h.th;
2668 
2669         /* If the socket is dead, don't accept the connection. */
2670         if (!sk->dead) 
2671         {
2672                 sk->data_ready(sk,0);
2673         }
2674         else 
2675         {
2676                 if(sk->debug)
2677                         printk("Reset on %p: Connect on dead socket.\n",sk);
2678                 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2679                 tcp_statistics.TcpAttemptFails++;
2680                 kfree_skb(skb, FREE_READ);
2681                 return;
2682         }
2683 
2684         /*
2685          * Make sure we can accept more.  This will prevent a
2686          * flurry of syns from eating up all our memory.
2687          */
2688 
2689         if (sk->ack_backlog >= sk->max_ack_backlog) 
2690         {
2691                 tcp_statistics.TcpAttemptFails++;
2692                 kfree_skb(skb, FREE_READ);
2693                 return;
2694         }
2695 
2696         /*
2697          * We need to build a new sock struct.
2698          * It is sort of bad to have a socket without an inode attached
2699          * to it, but the wake_up's will just wake up the listening socket,
2700          * and if the listening socket is destroyed before this is taken
2701          * off of the queue, this will take care of it.
2702          */
2703 
2704         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2705         if (newsk == NULL) 
2706         {
2707                 /* just ignore the syn.  It will get retransmitted. */
2708                 tcp_statistics.TcpAttemptFails++;
2709                 kfree_skb(skb, FREE_READ);
2710                 return;
2711         }
2712 
2713         memcpy(newsk, sk, sizeof(*newsk));
2714         skb_queue_head_init(&newsk->write_queue);
2715         skb_queue_head_init(&newsk->receive_queue);
2716         newsk->send_head = NULL;
2717         newsk->send_tail = NULL;
2718         skb_queue_head_init(&newsk->back_log);
2719         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
2720         newsk->rto = TCP_TIMEOUT_INIT;
2721         newsk->mdev = 0;
2722         newsk->max_window = 0;
2723         newsk->cong_window = 1;
2724         newsk->cong_count = 0;
2725         newsk->ssthresh = 0;
2726         newsk->backoff = 0;
2727         newsk->blog = 0;
2728         newsk->intr = 0;
2729         newsk->proc = 0;
2730         newsk->done = 0;
2731         newsk->partial = NULL;
2732         newsk->pair = NULL;
2733         newsk->wmem_alloc = 0;
2734         newsk->rmem_alloc = 0;
2735         newsk->localroute = sk->localroute;
2736 
2737         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
2738 
2739         newsk->err = 0;
2740         newsk->shutdown = 0;
2741         newsk->ack_backlog = 0;
2742         newsk->acked_seq = skb->h.th->seq+1;
2743         newsk->copied_seq = skb->h.th->seq+1;
2744         newsk->fin_seq = skb->h.th->seq;
2745         newsk->state = TCP_SYN_RECV;
2746         newsk->timeout = 0;
2747         newsk->ip_xmit_timeout = 0;
2748         newsk->write_seq = seq; 
2749         newsk->window_seq = newsk->write_seq;
2750         newsk->rcv_ack_seq = newsk->write_seq;
2751         newsk->urg_data = 0;
2752         newsk->retransmits = 0;
2753         newsk->linger=0;
2754         newsk->destroy = 0;
2755         init_timer(&newsk->timer);
2756         init_timer(&newsk->retransmit_timer);
2757         newsk->timer.data = (unsigned long)newsk;
2758         newsk->timer.function = &net_timer;
2759         newsk->retransmit_timer.data = (unsigned long)newsk;
2760         newsk->retransmit_timer.function=&retransmit_timer;
2761         newsk->dummy_th.source = skb->h.th->dest;
2762         newsk->dummy_th.dest = skb->h.th->source;
2763         
2764         /*
2765          *      Swap these two, they are from our point of view. 
2766          */
2767          
2768         newsk->daddr = saddr;
2769         newsk->saddr = daddr;
2770 
2771         put_sock(newsk->num,newsk);
2772         newsk->dummy_th.res1 = 0;
2773         newsk->dummy_th.doff = 6;
2774         newsk->dummy_th.fin = 0;
2775         newsk->dummy_th.syn = 0;
2776         newsk->dummy_th.rst = 0;        
2777         newsk->dummy_th.psh = 0;
2778         newsk->dummy_th.ack = 0;
2779         newsk->dummy_th.urg = 0;
2780         newsk->dummy_th.res2 = 0;
2781         newsk->acked_seq = skb->h.th->seq + 1;
2782         newsk->copied_seq = skb->h.th->seq + 1;
2783         newsk->socket = NULL;
2784 
2785         /*
2786          *      Grab the ttl and tos values and use them 
2787          */
2788 
2789         newsk->ip_ttl=sk->ip_ttl;
2790         newsk->ip_tos=skb->ip_hdr->tos;
2791 
2792         /*
2793          *      Use 512 or whatever user asked for 
2794          */
2795 
2796         /*
2797          *      Note use of sk->user_mss, since user has no direct access to newsk 
2798          */
2799 
2800         rt=ip_rt_route(saddr, NULL,NULL);
2801         
2802         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
2803                 newsk->window_clamp = rt->rt_window;
2804         else
2805                 newsk->window_clamp = 0;
2806                 
2807         if (sk->user_mss)
2808                 newsk->mtu = sk->user_mss;
2809         else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
2810                 newsk->mtu = rt->rt_mss - HEADER_SIZE;
2811         else 
2812         {
2813 #ifdef CONFIG_INET_SNARL        /* Sub Nets Are Local */
2814                 if ((saddr ^ daddr) & default_mask(saddr))
2815 #else
2816                 if ((saddr ^ daddr) & dev->pa_mask)
2817 #endif
2818                         newsk->mtu = 576 - HEADER_SIZE;
2819                 else
2820                         newsk->mtu = MAX_WINDOW;
2821         }
2822 
2823         /*
2824          *      But not bigger than device MTU 
2825          */
2826 
2827         newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);
2828 
2829         /*
2830          *      This will min with what arrived in the packet 
2831          */
2832 
2833         tcp_options(newsk,skb->h.th);
2834 
2835         buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
2836         if (buff == NULL) 
2837         {
2838                 sk->err = -ENOMEM;
2839                 newsk->dead = 1;
2840                 release_sock(newsk);
2841                 kfree_skb(skb, FREE_READ);
2842                 tcp_statistics.TcpAttemptFails++;
2843                 return;
2844         }
2845   
2846         buff->len = sizeof(struct tcphdr)+4;
2847         buff->sk = newsk;
2848         buff->localroute = newsk->localroute;
2849 
2850         t1 =(struct tcphdr *) buff->data;
2851 
2852         /*
2853          *      Put in the IP header and routing stuff. 
2854          */
2855 
2856         tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
2857                                IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
2858 
2859         /*
2860          *      Something went wrong. 
2861          */
2862 
2863         if (tmp < 0) 
2864         {
2865                 sk->err = tmp;
2866                 buff->free = 1;
2867                 kfree_skb(buff,FREE_WRITE);
2868                 newsk->dead = 1;
2869                 release_sock(newsk);
2870                 skb->sk = sk;
2871                 kfree_skb(skb, FREE_READ);
2872                 tcp_statistics.TcpAttemptFails++;
2873                 return;
2874         }
2875 
2876         buff->len += tmp;
2877         t1 =(struct tcphdr *)((char *)t1 +tmp);
2878   
2879         memcpy(t1, skb->h.th, sizeof(*t1));
2880         buff->h.seq = newsk->write_seq;
2881         /*
2882          *      Swap the send and the receive. 
2883          */
2884         t1->dest = skb->h.th->source;
2885         t1->source = newsk->dummy_th.source;
2886         t1->seq = ntohl(newsk->write_seq++);
2887         t1->ack = 1;
2888         newsk->window = tcp_select_window(newsk);
2889         newsk->sent_seq = newsk->write_seq;
2890         t1->window = ntohs(newsk->window);
2891         t1->res1 = 0;
2892         t1->res2 = 0;
2893         t1->rst = 0;
2894         t1->urg = 0;
2895         t1->psh = 0;
2896         t1->syn = 1;
2897         t1->ack_seq = ntohl(skb->h.th->seq+1);
2898         t1->doff = sizeof(*t1)/4+1;
2899         ptr =(unsigned char *)(t1+1);
2900         ptr[0] = 2;
2901         ptr[1] = 4;
2902         ptr[2] = ((newsk->mtu) >> 8) & 0xff;
2903         ptr[3] =(newsk->mtu) & 0xff;
2904 
2905         tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
2906         newsk->prot->queue_xmit(newsk, ndev, buff, 0);
2907         reset_xmit_timer(newsk, TIME_WRITE, newsk->rto);
2908 
2909         reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
2910         skb->sk = newsk;
2911 
2912         /*
2913          *      Charge the sock_buff to newsk. 
2914          */
2915          
2916         sk->rmem_alloc -= skb->mem_len;
2917         newsk->rmem_alloc += skb->mem_len;
2918         
2919         skb_queue_tail(&sk->receive_queue,skb);
2920         sk->ack_backlog++;
2921         release_sock(newsk);
2922         tcp_statistics.TcpOutSegs++;
2923 }
2924 
2925 
/*
 *      Close a TCP socket.  'timeout' non-zero means we are being
 *      closed by a protocol timeout rather than a user close(), in
 *      which case we move straight towards TIME_WAIT/CLOSE instead of
 *      sending a FIN.
 */
static void tcp_close(struct sock *sk, int timeout)
{
        /*
         * We need to grab some memory, and put together a FIN, 
         * and then put it into the queue to be sent.
         */
        
        sk->inuse = 1;
        
        if(sk->state == TCP_LISTEN)
        {
                /* Special case: a listener has no peer, so just reject
                   any pending embryonic connections and go to CLOSE. */
                tcp_set_state(sk, TCP_CLOSE);
                tcp_close_pending(sk, timeout);
                release_sock(sk);
                return;
        }
        
        /* keepopen keeps the connection probed while it winds down;
           both directions are now shut. */
        sk->keepopen = 1;
        sk->shutdown = SHUTDOWN_MASK;

        if (!sk->dead) 
                sk->state_change(sk);

        if (timeout == 0) 
        {
                struct sk_buff *skb;
                
                /*
                 *  We need to flush the recv. buffs.  We do this only on the
                 *  descriptor close, not protocol-sourced closes, because the
                 *  reader process may not have drained the data yet!
                 */
                 
                while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
                        kfree_skb(skb, FREE_READ);
        }

        /*
         *      Get rid off any half-completed packets. 
         */
         
        if (sk->partial) 
                tcp_send_partial(sk);
                
        /*
         *      Timeout is not the same thing - however the code likes
         *      to send both the same way (sigh).
         */
         
        if(timeout)
        {
                /*
                 *      Time wait to avoid port reusage accidents if 
                 *      appropriate. If we have timed out from one
                 *      of these states then move straight to close.
                 */
                 
                if( sk->state == TCP_TIME_WAIT || sk->state == TCP_LAST_ACK 
                        || sk->state == TCP_SYN_SENT || sk->state == TCP_CLOSE)
                        tcp_set_state(sk, TCP_CLOSE);   /* Dead */
                else
                        tcp_time_wait(sk);                      
        }
        else
        {
                /* Normal close: shift state and send a FIN if the
                   transition requires one. */
                if(tcp_close_state(sk)==1)
                {
                        tcp_send_fin(sk);
                }
        }
        release_sock(sk);
}
2999 
3000 
3001 /*
3002  *      This routine takes stuff off of the write queue,
3003  *      and puts it in the xmit queue. This happens as incoming acks
3004  *      open up the remote window for us.
3005  */
3006  
/*
 *      This routine takes stuff off of the write queue,
 *      and puts it in the xmit queue. This happens as incoming acks
 *      open up the remote window for us.  Transmission is gated by
 *      the advertised window, Nagle/retransmit state, and the
 *      congestion window.
 */
static void tcp_write_xmit(struct sock *sk)
{
        struct sk_buff *skb;

        /*
         *      The bytes will have to remain here. In time closedown will
         *      empty the write queue and all will be happy 
         */

        if(sk->zapped)
                return;

        /*
         *      Anything on the transmit queue that fits the window can
         *      be added providing we are not
         *
         *      a) retransmitting (Nagle's rule)
         *      b) exceeding our congestion window.
         */
         
        while((skb = skb_peek(&sk->write_queue)) != NULL &&
                before(skb->h.seq, sk->window_seq + 1) &&
                (sk->retransmits == 0 ||
                 sk->ip_xmit_timeout != TIME_WRITE ||
                 before(skb->h.seq, sk->rcv_ack_seq + 1))
                && sk->packets_out < sk->cong_window) 
        {
                IS_SKB(skb);
                skb_unlink(skb);
                
                /*
                 *      See if we really need to send the packet. 
                 */
                 
                if (before(skb->h.seq, sk->rcv_ack_seq +1)) 
                {
                        /*
                         *      This is acked data. We can discard it. This 
                         *      cannot currently occur.
                         */
                         
                        sk->retransmits = 0;
                        kfree_skb(skb, FREE_WRITE);
                        if (!sk->dead) 
                                sk->write_space(sk);
                } 
                else
                {
                        struct tcphdr *th;
                        struct iphdr *iph;
                        int size;
/*
 * put in the ack seq and window at this point rather than earlier,
 * in order to keep them monotonic.  We really want to avoid taking
 * back window allocations.  That's legal, but RFC1122 says it's frowned on.
 * Ack and window will in general have changed since this packet was put
 * on the write queue.
 */
                        iph = (struct iphdr *)(skb->data +
                                               skb->dev->hard_header_len);
                        th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
                        /* size = TCP header + payload, for the checksum */
                        size = skb->len - (((unsigned char *) th) - skb->data);
                        
                        th->ack_seq = ntohl(sk->acked_seq);
                        th->window = ntohs(tcp_select_window(sk));

                        /* Checksum must be redone since ack/window changed. */
                        tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

                        sk->sent_seq = skb->h.seq;
                        
                        /*
                         *      IP manages our queue for some crazy reason
                         */
                         
                        sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
                        
                        /*
                         *      Again we slide the timer wrongly
                         */
                         
                        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
                }
        }
}
3091 
3092 
/*
 *      This routine deals with incoming acks, but not outgoing ones.
 *
 *      sk    - the socket this segment was demultiplexed to
 *      th    - TCP header of the arriving segment (fields still in
 *              network byte order; converted with ntohl/ntohs below)
 *      saddr - source address of the segment (not used in this routine)
 *      len   - length of the TCP segment INCLUDING the header, so
 *              (len != th->doff*4) means the segment also carries data
 *
 *      Returns 0 only when the ack acknowledges data we never sent
 *      (ack beyond sent_seq); returns 1 in every other case, including
 *      old duplicate acks and the normal path.
 */

extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
        unsigned long ack;
        int flag = 0;

        /* 
         * 1 - there was data in packet as well as ack or new data is sent or 
         *     in shutdown state
         * 2 - data from retransmit queue was acked and removed
         * 4 - window shrunk or data from retransmit queue was acked and removed
         */

        if(sk->zapped)
                return(1);      /* Dead, cant ack any more so why bother */

        /*
         *      Have we discovered a larger window
         */
         
        ack = ntohl(th->ack_seq);

        if (ntohs(th->window) > sk->max_window) 
        {
                sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
                /* Hack because we don't send partial packets to non SWS
                   handling hosts */
                sk->mss = min(sk->max_window>>1, sk->mtu);
#else
                sk->mss = min(sk->max_window, sk->mtu);
#endif  
        }

        /*
         *      We have dropped back to keepalive timeouts. Thus we have
         *      no retransmits pending.
         */
         
        if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
                sk->retransmits = 0;

        /*
         *      If the ack is newer than sent or older than previous acks
         *      then we can probably ignore it.
         */
         
        if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
        {
                if(sk->debug)
                        printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);
                        
                /*
                 *      Keepalive processing.  An ack for data we never sent is
                 *      the one case where the caller is told to drop the frame.
                 */
                 
                if (after(ack, sk->sent_seq)) 
                {
                        return(0);
                }
                
                /*
                 *      Old duplicate ack: restart the keepalive timer (the
                 *      peer is clearly alive) and otherwise ignore it.
                 */
                 
                if (sk->keepopen) 
                {
                        if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
                                reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
                }
                return(1);
        }

        /*
         *      If there is data set flag 1
         */
         
        if (len != th->doff*4) 
                flag |= 1;

        /*
         *      See if our window has been shrunk. 
         */

        if (after(sk->window_seq, ack+ntohs(th->window))) 
        {
                /*
                 * We may need to move packets from the send queue
                 * to the write queue, if the window has been shrunk on us.
                 * The RFC says you are not allowed to shrink your window
                 * like this, but if the other end does, you must be able
                 * to deal with it.
                 */
                struct sk_buff *skb;
                struct sk_buff *skb2;
                struct sk_buff *wskb = NULL;
        
                /* Detach the whole retransmit queue; packets are re-sorted
                   below into either a rebuilt send queue (still inside the
                   new window) or back onto the write queue (now outside). */
                skb2 = sk->send_head;
                sk->send_head = NULL;
                sk->send_tail = NULL;
        
                /*
                 *      This is an artifact of a flawed concept. We want one
                 *      queue and a smarter send routine when we send all.
                 */
        
                flag |= 4;      /* Window changed */
        
                sk->window_seq = ack + ntohs(th->window);
                cli();          /* Queues are shared with interrupt context */
                while (skb2 != NULL) 
                {
                        skb = skb2;
                        skb2 = skb->link3;
                        skb->link3 = NULL;
                        if (after(skb->h.seq, sk->window_seq)) 
                        {
                                /* Fell outside the shrunk window: it must not
                                   stay queued for transmission. */
                                if (sk->packets_out > 0) 
                                        sk->packets_out--;
                                /* We may need to remove this from the dev send list. */
                                if (skb->next != NULL) 
                                {
                                        skb_unlink(skb);                                
                                }
                                /* Now add it to the write_queue, preserving
                                   the original sequence order via wskb. */
                                if (wskb == NULL)
                                        skb_queue_head(&sk->write_queue,skb);
                                else
                                        skb_append(wskb,skb);
                                wskb = skb;
                        } 
                        else 
                        {
                                /* Still within the window: relink onto the
                                   (rebuilt) retransmit queue. */
                                if (sk->send_head == NULL) 
                                {
                                        sk->send_head = skb;
                                        sk->send_tail = skb;
                                }
                                else
                                {
                                        sk->send_tail->link3 = skb;
                                        sk->send_tail = skb;
                                }
                                skb->link3 = NULL;
                        }
                }
                sti();
        }

        /*
         *      Pipe has emptied
         */
         
        if (sk->send_tail == NULL || sk->send_head == NULL) 
        {
                sk->send_head = NULL;
                sk->send_tail = NULL;
                sk->packets_out= 0;
        }

        /*
         *      Update the right hand window edge of the host
         */
         
        sk->window_seq = ack + ntohs(th->window);

        /*
         *      We don't want too many packets out there. 
         */
         
        if (sk->ip_xmit_timeout == TIME_WRITE && 
                sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
        {
                /* 
                 * This is Jacobson's slow start and congestion avoidance. 
                 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
                 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
                 * counter and increment it once every cwnd times.  It's possible
                 * that this should be done only if sk->retransmits == 0.  I'm
                 * interpreting "new data is acked" as including data that has
                 * been retransmitted but is just now being acked.
                 */
                if (sk->cong_window < sk->ssthresh)  
                        /* 
                         *      In "safe" area, increase (slow start: one
                         *      window unit per ack).
                         */
                        sk->cong_window++;
                else 
                {
                        /*
                         *      In dangerous area, increase slowly.  In theory this is
                         *      sk->cong_window += 1 / sk->cong_window
                         */
                        if (sk->cong_count >= sk->cong_window) 
                        {
                                sk->cong_window++;
                                sk->cong_count = 0;
                        }
                        else 
                                sk->cong_count++;
                }
        }

        /*
         *      Remember the highest ack received.
         */
         
        sk->rcv_ack_seq = ack;

        /*
         *      If this ack opens up a zero window, clear backoff.  It was
         *      being used to time the probes, and is probably far higher than
         *      it needs to be for normal retransmission.
         */

        if (sk->ip_xmit_timeout == TIME_PROBE0) 
        {
                sk->retransmits = 0;    /* Our probe was answered */
                
                /*
                 *      Was it a usable window open ?
                 */
                 
                if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
                    ! before (sk->window_seq, sk->write_queue.next->h.seq)) 
                {
                        sk->backoff = 0;
                        
                        /*
                         *      Recompute rto from rtt.  this eliminates any backoff.
                         */

                        sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
                        if (sk->rto > 120*HZ)
                                sk->rto = 120*HZ;
                        if (sk->rto < 20)       /* Was 1*HZ, then 1 - turns out we must allow about
                                                   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
                                                   .2 of a second is going to need huge windows (SIGH) */
                        sk->rto = 20;
                }
        }

        /* 
         *      See if we can take anything off of the retransmit queue.
         */
   
        while(sk->send_head != NULL) 
        {
                /* Check for a bug. */
                if (sk->send_head->link3 &&
                    after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) 
                        printk("INET: tcp.c: *** bug send_list out of order.\n");
                        
                /*
                 *      If our packet is before the ack sequence we can
                 *      discard it as its confirmed to have arrived the other end.
                 */
                 
                if (before(sk->send_head->h.seq, ack+1)) 
                {
                        struct sk_buff *oskb;   
                        if (sk->retransmits) 
                        {       
                                /*
                                 *      We were retransmitting.  don't count this in RTT est 
                                 */
                                flag |= 2;

                                /*
                                 * even though we've gotten an ack, we're still
                                 * retransmitting as long as we're sending from
                                 * the retransmit queue.  Keeping retransmits non-zero
                                 * prevents us from getting new data interspersed with
                                 * retransmissions.
                                 */

                                if (sk->send_head->link3)       /* Any more queued retransmits? */
                                        sk->retransmits = 1;
                                else
                                        sk->retransmits = 0;
                        }
                        /*
                         * Note that we only reset backoff and rto in the
                         * rtt recomputation code.  And that doesn't happen
                         * if there were retransmissions in effect.  So the
                         * first new packet after the retransmissions is
                         * sent with the backoff still in effect.  Not until
                         * we get an ack from a non-retransmitted packet do
                         * we reset the backoff and rto.  This allows us to deal
                         * with a situation where the network delay has increased
                         * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
                         */

                        /*
                         *      We have one less packet out there. 
                         */
                         
                        if (sk->packets_out > 0) 
                                sk->packets_out --;
                        /* 
                         *      Wake up the process, it can probably write more. 
                         */
                        if (!sk->dead) 
                                sk->write_space(sk);
                        oskb = sk->send_head;

                        if (!(flag&2))  /* Not retransmitting */
                        {
                                long m;
        
                                /*
                                 *      The following amusing code comes from Jacobson's
                                 *      article in SIGCOMM '88.  Note that rtt and mdev
                                 *      are scaled versions of rtt and mean deviation.
                                 *      This is designed to be as fast as possible 
                                 *      m stands for "measurement".
                                 */
        
                                m = jiffies - oskb->when;  /* RTT */
                                if(m<=0)
                                        m=1;            /* IS THIS RIGHT FOR <0 ??? */
                                m -= (sk->rtt >> 3);    /* m is now error in rtt est */
                                sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
                                if (m < 0)
                                        m = -m;         /* m is now abs(error) */
                                m -= (sk->mdev >> 2);   /* similar update on mdev */
                                sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
        
                                /*
                                 *      Now update timeout.  Note that this removes any backoff.
                                 */
                         
                                sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
                                if (sk->rto > 120*HZ)
                                        sk->rto = 120*HZ;
                                if (sk->rto < 20)       /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
                                        sk->rto = 20;
                                sk->backoff = 0;
                        }
                        flag |= (2|4);  /* 2 is really more like 'don't adjust the rtt 
                                           In this case as we just set it up */
                        cli();          /* Unlink under interrupt protection */
                        oskb = sk->send_head;
                        IS_SKB(oskb);
                        sk->send_head = oskb->link3;
                        if (sk->send_head == NULL) 
                        {
                                sk->send_tail = NULL;
                        }

                /*
                 *      We may need to remove this from the dev send list. 
                 */

                        if (oskb->next)
                                skb_unlink(oskb);
                        sti();
                        kfree_skb(oskb, FREE_WRITE); /* write. */
                        if (!sk->dead) 
                                sk->write_space(sk);
                }
                else
                {
                        /* Queue is in sequence order, so nothing further
                           can be covered by this ack. */
                        break;
                }
        }

        /*
         * XXX someone ought to look at this too.. at the moment, if skb_peek()
         * returns non-NULL, we complete ignore the timer stuff in the else
         * clause.  We ought to organize the code so that else clause can
         * (should) be executed regardless, possibly moving the PROBE timer
         * reset over.  The skb_peek() thing should only move stuff to the
         * write queue, NOT also manage the timer functions.
         */

        /*
         * Maybe we can take some stuff off of the write queue,
         * and put it onto the xmit queue.
         */
        if (skb_peek(&sk->write_queue) != NULL) 
        {
                if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
                        (sk->retransmits == 0 || 
                         sk->ip_xmit_timeout != TIME_WRITE ||
                         before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
                        && sk->packets_out < sk->cong_window) 
                {
                        /*
                         *      Add more data to the send queue.
                         */
                        flag |= 1;
                        tcp_write_xmit(sk);
                }
                else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
                        sk->send_head == NULL &&
                        sk->ack_backlog == 0 &&
                        sk->state != TCP_TIME_WAIT) 
                {
                        /*
                         *      Data to queue but no room.  Start the zero
                         *      window probe timer.
                         */
                        reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
                }               
        }
        else
        {
                /*
                 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
                 * from TCP_CLOSE we don't do anything
                 *
                 * from anything else, if there is write data (or fin) pending,
                 * we use a TIME_WRITE timeout, else if keepalive we reset to
                 * a KEEPALIVE timeout, else we delete the timer.
                 *
                 * We do not set flag for nominal write data, otherwise we may
                 * force a state where we start to write itsy bitsy tidbits
                 * of data.
                 */

                switch(sk->state) {
                case TCP_TIME_WAIT:
                        /*
                         * keep us in TIME_WAIT until we stop getting packets,
                         * reset the timeout.
                         */
                        reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
                        break;
                case TCP_CLOSE:
                        /*
                         * don't touch the timer.
                         */
                        break;
                default:
                        /*
                         *      Must check send_head, write_queue, and ack_backlog
                         *      to determine which timeout to use.
                         */
                        if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
                                reset_xmit_timer(sk, TIME_WRITE, sk->rto);
                        } else if (sk->keepopen) {
                                reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
                        } else {
                                del_timer(&sk->retransmit_timer);
                                sk->ip_xmit_timeout = 0;
                        }
                        break;
                }
        }

        /*
         *      We have nothing queued but space to send. Send any partial
         *      packets immediately (end of Nagle rule application).
         */
         
        if (sk->packets_out == 0 && sk->partial != NULL &&
                skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
        {
                flag |= 1;
                tcp_send_partial(sk);
        }

        /*
         * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
         * we are now waiting for an acknowledge to our FIN.  The other end is
         * already in TIME_WAIT.
         *
         * Move to TCP_CLOSE on success.
         */

        if (sk->state == TCP_LAST_ACK) 
        {
                if (!sk->dead)
                        sk->state_change(sk);
                if(sk->debug)
                        printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
                                sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
                if (sk->rcv_ack_seq == sk->write_seq && sk->acked_seq == sk->fin_seq) 
                {
                        flag |= 1;
                        tcp_set_state(sk,TCP_CLOSE);
                        sk->shutdown = SHUTDOWN_MASK;
                }
        }

        /*
         *      Incoming ACK to a FIN we sent in the case of our initiating the close.
         *
         *      Move to FIN_WAIT2 to await a FIN from the other end. Set
         *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
         */

        if (sk->state == TCP_FIN_WAIT1) 
        {

                if (!sk->dead) 
                        sk->state_change(sk);
                if (sk->rcv_ack_seq == sk->write_seq) 
                {
                        flag |= 1;
                        sk->shutdown |= SEND_SHUTDOWN;
                        tcp_set_state(sk, TCP_FIN_WAIT2);
                }
        }

        /*
         *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
         *
         *      Move to TIME_WAIT
         */

        if (sk->state == TCP_CLOSING) 
        {

                if (!sk->dead) 
                        sk->state_change(sk);
                if (sk->rcv_ack_seq == sk->write_seq) 
                {
                        flag |= 1;
                        tcp_time_wait(sk);
                }
        }
        
        /*
         *      Final ack of a three way shake 
         */
         
        if(sk->state==TCP_SYN_RECV)
        {
                tcp_set_state(sk, TCP_ESTABLISHED);
                tcp_options(sk,th);
                sk->dummy_th.dest=th->source;
                sk->copied_seq = sk->acked_seq;
                if(!sk->dead)
                        sk->state_change(sk);
                if(sk->max_window==0)
                {
                        sk->max_window=32;      /* Sanity check */
                        sk->mss=min(sk->max_window,sk->mtu);
                }
        }
        
        /*
         * I make no guarantees about the first clause in the following
         * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
         * what conditions "!flag" would be true.  However I think the rest
         * of the conditions would prevent that from causing any
         * unnecessary retransmission. 
         *   Clearly if the first packet has expired it should be 
         * retransmitted.  The other alternative, "flag&2 && retransmits", is
         * harder to explain:  You have to look carefully at how and when the
         * timer is set and with what timeout.  The most recent transmission always
         * sets the timer.  So in general if the most recent thing has timed
         * out, everything before it has as well.  So we want to go ahead and
         * retransmit some more.  If we didn't explicitly test for this
         * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
         * would not be true.  If you look at the pattern of timing, you can
         * show that rto is increased fast enough that the next packet would
         * almost never be retransmitted immediately.  Then you'd end up
         * waiting for a timeout to send each packet on the retransmission
         * queue.  With my implementation of the Karn sampling algorithm,
         * the timeout would double each time.  The net result is that it would
         * take a hideous amount of time to recover from a single dropped packet.
         * It's possible that there should also be a test for TIME_WRITE, but
         * I think as long as "send_head != NULL" and "retransmit" is on, we've
         * got to be in real retransmission mode.
         *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
         * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
         * As long as no further losses occur, this seems reasonable.
         */
        
        if (((!flag) || (flag&4)) && sk->send_head != NULL &&
               (((flag&2) && sk->retransmits) ||
               (sk->send_head->when + sk->rto < jiffies))) 
        {
                if(sk->send_head->when + sk->rto < jiffies)
                        tcp_retransmit(sk,0);   
                else
                {
                        tcp_do_retransmit(sk, 1);
                        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
                }
        }

        return(1);
}
3682 
3683 
3684 /*
3685  *      Process the FIN bit. This now behaves as it is supposed to work
3686  *      and the FIN takes effect when it is validly part of sequence
3687  *      space. Not before when we get holes.
3688  *
3689  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3690  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
3691  *      TIME-WAIT)
3692  *
3693  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
3694  *      close and we go into CLOSING (and later onto TIME-WAIT)
3695  *
3696  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3697  *
3698  */
3699  
3700 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     /* [previous][next][first][last][top][bottom][index][help] */
3701 {
3702         sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3703 
3704         if (!sk->dead) 
3705         {
3706                 sk->state_change(sk);
3707                 sock_wake_async(sk->socket, 1);
3708         }
3709 
3710         switch(sk->state) 
3711         {
3712                 case TCP_SYN_RECV:
3713                 case TCP_SYN_SENT:
3714                 case TCP_ESTABLISHED:
3715                         /*
3716                          * move to CLOSE_WAIT, tcp_data() already handled
3717                          * sending the ack.
3718                          */     /* Check me --------------vvvvvvv */
3719                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3720                         tcp_set_state(sk,TCP_CLOSE_WAIT);
3721                         if (th->rst)
3722                                 sk->shutdown = SHUTDOWN_MASK;
3723                         break;
3724 
3725                 case TCP_CLOSE_WAIT:
3726                 case TCP_CLOSING:
3727                         /*
3728                          * received a retransmission of the FIN, do
3729                          * nothing.
3730                          */
3731                         break;
3732                 case TCP_TIME_WAIT:
3733                         /*
3734                          * received a retransmission of the FIN,
3735                          * restart the TIME_WAIT timer.
3736                          */
3737                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3738                         return(0);
3739                 case TCP_FIN_WAIT1:
3740                         /*
3741                          * This case occurs when a simultaneous close
3742                          * happens, we must ack the received FIN and
3743                          * enter the CLOSING state.
3744                          *
3745                          * This causes a WRITE timeout, which will either
3746                          * move on to TIME_WAIT when we timeout, or resend
3747                          * the FIN properly (maybe we get rid of that annoying
3748                          * FIN lost hang). The TIME_WRITE code is already correct
3749                          * for handling this timeout.
3750                          */
3751 
3752                         if(sk->ip_xmit_timeout != TIME_WRITE)
3753                                 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3754                         tcp_set_state(sk,TCP_CLOSING);
3755                         break;
3756                 case TCP_FIN_WAIT2:
3757                         /*
3758                          * received a FIN -- send ACK and enter TIME_WAIT
3759                          */
3760                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3761                         sk->shutdown|=SHUTDOWN_MASK;
3762                         tcp_set_state(sk,TCP_TIME_WAIT);
3763                         break;
3764                 case TCP_CLOSE:
3765                         /*
3766                          * already in CLOSE
3767                          */
3768                         break;
3769                 default:
3770                         tcp_set_state(sk,TCP_LAST_ACK);
3771         
3772                         /* Start the timers. */
3773                         reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3774                         return(0);
3775         }
3776 
3777         return(0);
3778 }
3779 
3780 
3781 
3782 /*
3783  *      This routine handles the data.  If there is room in the buffer,
3784  *      it will have already been moved into it.  If there is no
3785  *      room, then we will just have to discard the packet.
3786  */
3787 
3788 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
     /* [previous][next][first][last][top][bottom][index][help] */
3789          unsigned long saddr, unsigned short len)
3790 {
3791         struct sk_buff *skb1, *skb2;
3792         struct tcphdr *th;
3793         int dup_dumped=0;
3794         unsigned long new_seq;
3795         unsigned long shut_seq;
3796 
3797         th = skb->h.th;
3798         skb->len = len -(th->doff*4);
3799 
3800         /*
3801          *      The bytes in the receive read/assembly queue has increased. Needed for the
3802          *      low memory discard algorithm 
3803          */
3804            
3805         sk->bytes_rcv += skb->len;
3806         
3807         if (skb->len == 0 && !th->fin && !th->urg && !th->psh) 
3808         {
3809                 /* 
3810                  *      Don't want to keep passing ack's back and forth. 
3811                  *      (someone sent us dataless, boring frame)
3812                  */
3813                 if (!th->ack)
3814                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3815                 kfree_skb(skb, FREE_READ);
3816                 return(0);
3817         }
3818         
3819         /*
3820          *      We no longer have anyone receiving data on this connection.
3821          */
3822 
3823 #ifndef TCP_DONT_RST_SHUTDOWN            
3824 
3825         if(sk->shutdown & RCV_SHUTDOWN)
3826         {
3827                 /*
3828                  *      FIXME: BSD has some magic to avoid sending resets to
3829                  *      broken 4.2 BSD keepalives. Much to my surprise a few non
3830                  *      BSD stacks still have broken keepalives so we want to
3831                  *      cope with it.
3832                  */
3833 
3834                 if(skb->len)    /* We don't care if its just an ack or
3835                                    a keepalive/window probe */
3836                 {
3837                         new_seq= th->seq + skb->len + th->syn;  /* Right edge of _data_ part of frame */
3838                         
3839                         /* Do this the way 4.4BSD treats it. Not what I'd
3840                            regard as the meaning of the spec but its what BSD
3841                            does and clearly they know everything 8) */
3842 
3843                         /*
3844                          *      This is valid because of two things
3845                          *
3846                          *      a) The way tcp_data behaves at the bottom.
3847                          *      b) A fin takes effect when read not when received.
3848                          */
3849                          
3850                         shut_seq=sk->acked_seq+1;       /* Last byte */
3851                         
3852                         if(after(new_seq,shut_seq))
3853                         {
3854                                 if(sk->debug)
3855                                         printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
3856                                                 sk, new_seq, shut_seq, sk->blog);
3857                                 if(sk->dead)
3858                                 {
3859                                         sk->acked_seq = new_seq + th->fin;
3860                                         tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3861                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3862                                         tcp_statistics.TcpEstabResets++;
3863                                         tcp_set_state(sk,TCP_CLOSE);
3864                                         sk->err = EPIPE;
3865                                         sk->shutdown = SHUTDOWN_MASK;
3866                                         kfree_skb(skb, FREE_READ);
3867                                         return 0;
3868                                 }
3869                         }
3870                 }
3871         }
3872 
3873 #endif
3874 
3875         /*
3876          *      Now we have to walk the chain, and figure out where this one
3877          *      goes into it.  This is set up so that the last packet we received
3878          *      will be the first one we look at, that way if everything comes
3879          *      in order, there will be no performance loss, and if they come
3880          *      out of order we will be able to fit things in nicely.
3881          *
3882          *      [AC: This is wrong. We should assume in order first and then walk
3883          *       forwards from the first hole based upon real traffic patterns.]
3884          *      
3885          */
3886 
3887         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
3888         {
3889                 skb_queue_head(&sk->receive_queue,skb);
3890                 skb1= NULL;
3891         } 
3892         else
3893         {
3894                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
3895                 {
3896                         if(sk->debug)
3897                         {
3898                                 printk("skb1=%p :", skb1);
3899                                 printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
3900                                 printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
3901                                 printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
3902                                                 sk->acked_seq);
3903                         }
3904                         
3905                         /*
3906                          *      Optimisation: Duplicate frame or extension of previous frame from
3907                          *      same sequence point (lost ack case).
3908                          *      The frame contains duplicate data or replaces a previous frame
3909                          *      discard the previous frame (safe as sk->inuse is set) and put
3910                          *      the new one in its place.
3911                          */
3912                          
3913                         if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3914                         {
3915                                 skb_append(skb1,skb);
3916                                 skb_unlink(skb1);
3917                                 kfree_skb(skb1,FREE_READ);
3918                                 dup_dumped=1;
3919                                 skb1=NULL;
3920                                 break;
3921                         }
3922                         
3923                         /*
3924                          *      Found where it fits
3925                          */
3926                          
3927                         if (after(th->seq+1, skb1->h.th->seq))
3928                         {
3929                                 skb_append(skb1,skb);
3930                                 break;
3931                         }
3932                         
3933                         /*
3934                          *      See if we've hit the start. If so insert.
3935                          */
3936                         if (skb1 == skb_peek(&sk->receive_queue))
3937                         {
3938                                 skb_queue_head(&sk->receive_queue, skb);
3939                                 break;
3940                         }
3941                 }
3942         }
3943 
3944         /*
3945          *      Figure out what the ack value for this frame is
3946          */
3947          
3948         th->ack_seq = th->seq + skb->len;
3949         if (th->syn) 
3950                 th->ack_seq++;
3951         if (th->fin)
3952                 th->ack_seq++;
3953 
3954         if (before(sk->acked_seq, sk->copied_seq)) 
3955         {
3956                 printk("*** tcp.c:tcp_data bug acked < copied\n");
3957                 sk->acked_seq = sk->copied_seq;
3958         }
3959 
3960         /*
3961          *      Now figure out if we can ack anything. This is very messy because we really want two
3962          *      receive queues, a completed and an assembly queue. We also want only one transmit
3963          *      queue.
3964          */
3965 
3966         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) 
3967         {
3968                 if (before(th->seq, sk->acked_seq+1)) 
3969                 {
3970                         int newwindow;
3971 
3972                         if (after(th->ack_seq, sk->acked_seq)) 
3973                         {
3974                                 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3975                                 if (newwindow < 0)
3976                                         newwindow = 0;  
3977                                 sk->window = newwindow;
3978                                 sk->acked_seq = th->ack_seq;
3979                         }
3980                         skb->acked = 1;
3981 
3982                         /*
3983                          *      When we ack the fin, we do the FIN 
3984                          *      processing.
3985                          */
3986 
3987                         if (skb->h.th->fin) 
3988                         {
3989                                 tcp_fin(skb,sk,skb->h.th);
3990                         }
3991           
3992                         for(skb2 = skb->next;
3993                             skb2 != (struct sk_buff *)&sk->receive_queue;
3994                             skb2 = skb2->next) 
3995                         {
3996                                 if (before(skb2->h.th->seq, sk->acked_seq+1)) 
3997                                 {
3998                                         if (after(skb2->h.th->ack_seq, sk->acked_seq))
3999                                         {
4000                                                 newwindow = sk->window -
4001                                                  (skb2->h.th->ack_seq - sk->acked_seq);
4002                                                 if (newwindow < 0)
4003                                                         newwindow = 0;  
4004                                                 sk->window = newwindow;
4005                                                 sk->acked_seq = skb2->h.th->ack_seq;
4006                                         }
4007                                         skb2->acked = 1;
4008                                         /*
4009                                          *      When we ack the fin, we do
4010                                          *      the fin handling.
4011                                          */
4012                                         if (skb2->h.th->fin) 
4013                                         {
4014                                                 tcp_fin(skb,sk,skb->h.th);
4015                                         }
4016 
4017                                         /*
4018                                          *      Force an immediate ack.
4019                                          */
4020                                          
4021                                         sk->ack_backlog = sk->max_ack_backlog;
4022                                 }
4023                                 else
4024                                 {
4025                                         break;
4026                                 }
4027                         }
4028 
4029                         /*
4030                          *      This also takes care of updating the window.
4031                          *      This if statement needs to be simplified.
4032                          */
4033                         if (!sk->delay_acks ||
4034                             sk->ack_backlog >= sk->max_ack_backlog || 
4035                             sk->bytes_rcv > sk->max_unacked || th->fin) {
4036         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4037                         }
4038                         else 
4039                         {
4040                                 sk->ack_backlog++;
4041                                 if(sk->debug)
4042                                         printk("Ack queued.\n");
4043                                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4044                         }
4045                 }
4046         }
4047 
4048         /*
4049          *      If we've missed a packet, send an ack.
4050          *      Also start a timer to send another.
4051          */
4052          
4053         if (!skb->acked) 
4054         {
4055         
4056         /*
4057          *      This is important.  If we don't have much room left,
4058          *      we need to throw out a few packets so we have a good
4059          *      window.  Note that mtu is used, not mss, because mss is really
4060          *      for the send side.  He could be sending us stuff as large as mtu.
4061          */
4062                  
4063                 while (sk->prot->rspace(sk) < sk->mtu) 
4064                 {
4065                         skb1 = skb_peek(&sk->receive_queue);
4066                         if (skb1 == NULL) 
4067                         {
4068                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4069                                 break;
4070                         }
4071 
4072                         /*
4073                          *      Don't throw out something that has been acked. 
4074                          */
4075                  
4076                         if (skb1->acked) 
4077                         {
4078                                 break;
4079                         }
4080                 
4081                         skb_unlink(skb1);
4082                         kfree_skb(skb1, FREE_READ);
4083                 }
4084                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4085                 sk->ack_backlog++;
4086                 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4087         }
4088         else
4089         {
4090                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4091         }
4092 
4093         /*
4094          *      Now tell the user we may have some data. 
4095          */
4096          
4097         if (!sk->dead) 
4098         {
4099                 if(sk->debug)
4100                         printk("Data wakeup.\n");
4101                 sk->data_ready(sk,0);
4102         } 
4103         return(0);
4104 }
4105 
4106 
/*
 *	This routine is only called when we have urgent data
 *	signalled. It's the 'slow' part of tcp_urg. It could be
 *	moved inline now as tcp_urg is only called from one
 *	place. We handle URGent data wrong. We have to - as
 *	BSD still doesn't use the correction from RFC961.
 */
4114  
4115 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
     /* [previous][next][first][last][top][bottom][index][help] */
4116 {
4117         unsigned long ptr = ntohs(th->urg_ptr);
4118 
4119         if (ptr)
4120                 ptr--;
4121         ptr += th->seq;
4122 
4123         /* ignore urgent data that we've already seen and read */
4124         if (after(sk->copied_seq, ptr))
4125                 return;
4126 
4127         /* do we already have a newer (or duplicate) urgent pointer? */
4128         if (sk->urg_data && !after(ptr, sk->urg_seq))
4129                 return;
4130 
4131         /* tell the world about our new urgent pointer */
4132         if (sk->proc != 0) {
4133                 if (sk->proc > 0) {
4134                         kill_proc(sk->proc, SIGURG, 1);
4135                 } else {
4136                         kill_pg(-sk->proc, SIGURG, 1);
4137                 }
4138         }
4139         sk->urg_data = URG_NOTYET;
4140         sk->urg_seq = ptr;
4141 }
4142 
4143 /*
4144  *      This is the 'fast' part of urgent handling.
4145  */
4146  
4147 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
     /* [previous][next][first][last][top][bottom][index][help] */
4148         unsigned long saddr, unsigned long len)
4149 {
4150         unsigned long ptr;
4151 
4152         /*
4153          *      Check if we get a new urgent pointer - normally not 
4154          */
4155          
4156         if (th->urg)
4157                 tcp_check_urg(sk,th);
4158 
4159         /*
4160          *      Do we wait for any urgent data? - normally not
4161          */
4162          
4163         if (sk->urg_data != URG_NOTYET)
4164                 return 0;
4165 
4166         /*
4167          *      Is the urgent pointer pointing into this packet? 
4168          */
4169          
4170         ptr = sk->urg_seq - th->seq + th->doff*4;
4171         if (ptr >= len)
4172                 return 0;
4173 
4174         /*
4175          *      Ok, got the correct packet, update info 
4176          */
4177          
4178         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4179         if (!sk->dead)
4180                 sk->data_ready(sk,0);
4181         return 0;
4182 }
4183 
4184 /*
4185  *      This will accept the next outstanding connection. 
4186  */
4187  
static struct sock *tcp_accept(struct sock *sk, int flags)
{
        struct sock *newsk;
        struct sk_buff *skb;

        /*
         * We need to make sure that this socket is listening,
         * and that it has something pending. On failure we return
         * NULL with sk->err holding the error for the caller.
         */

        if (sk->state != TCP_LISTEN) 
        {
                sk->err = EINVAL;
                return(NULL); 
        }

        /* Avoid the race. */
        cli();
        sk->inuse = 1;

        /*
         *      Wait until an established connection is queued. Each pass
         *      round the loop re-acquires the socket before re-checking.
         */
        while((skb = tcp_dequeue_established(sk)) == NULL) 
        {
                if (flags & O_NONBLOCK) 
                {
                        sti();
                        release_sock(sk);
                        /* NOTE(review): err is set after the release - looks racy; confirm */
                        sk->err = EAGAIN;
                        return(NULL);
                }

                /* Drop the lock and sleep until something arrives */
                release_sock(sk);
                interruptible_sleep_on(sk->sleep);
                if (current->signal & ~current->blocked) 
                {
                        /* Interrupted by a signal; socket was already released above */
                        sti();
                        sk->err = ERESTARTSYS;
                        return(NULL);
                }
                sk->inuse = 1;
        }
        sti();

        /*
         *      Now all we need to do is return skb->sk. 
         */

        newsk = skb->sk;

        kfree_skb(skb, FREE_READ);
        sk->ack_backlog--;
        release_sock(sk);
        return(newsk);
}
4241 
4242 
4243 /*
4244  *      This will initiate an outgoing connection. 
4245  */
4246  
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
        struct sk_buff *buff;
        struct device *dev=NULL;        /* filled in by build_header() below */
        unsigned char *ptr;
        int tmp;
        int atype;
        struct tcphdr *t1;
        struct rtable *rt;

        /* Only a fully closed socket may start an active open */
        if (sk->state != TCP_CLOSE) 
        {
                return(-EISCONN);
        }
        
        /* Need at least family + port + address (fixed part of sockaddr_in) */
        if (addr_len < 8) 
                return(-EINVAL);

        if (usin->sin_family && usin->sin_family != AF_INET) 
                return(-EAFNOSUPPORT);

        /*
         *      connect() to INADDR_ANY means loopback (BSD'ism).
         */
        
        if(usin->sin_addr.s_addr==INADDR_ANY)
                usin->sin_addr.s_addr=ip_my_addr();
                  
        /*
         *      Don't want a TCP connection going to a broadcast address 
         */

        if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
                return -ENETUNREACH;
  
        /*
         *      Lock the socket, record the peer and pick the initial send
         *      sequence from the free-running jiffies clock.
         */
        sk->inuse = 1;
        sk->daddr = usin->sin_addr.s_addr;
        sk->write_seq = jiffies * SEQ_TICK - seq_offset;
        sk->window_seq = sk->write_seq;
        sk->rcv_ack_seq = sk->write_seq -1;
        sk->err = 0;
        sk->dummy_th.dest = usin->sin_port;
        release_sock(sk);

        /* Allocation may sleep (GFP_KERNEL) - hence the release_sock() above */
        buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
        if (buff == NULL) 
        {
                return(-ENOMEM);
        }
        sk->inuse = 1;
        buff->len = 24;         /* 20 byte TCP header + 4 bytes of MSS option */
        buff->sk = sk;
        buff->free = 0;
        buff->localroute = sk->localroute;
        
        t1 = (struct tcphdr *) buff->data;

        /*
         *      Put in the IP header and routing stuff. 
         */
         
        rt=ip_rt_route(sk->daddr, NULL, NULL);  /* may be NULL; checked below */
        

        /*
         *      We need to build the routing stuff from the things saved in skb. 
         */

        tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
                                        IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
        if (tmp < 0) 
        {
                sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
                release_sock(sk);
                return(-ENETUNREACH);
        }

        /* Step past the link/IP headers build_header just wrote */
        buff->len += tmp;
        t1 = (struct tcphdr *)((char *)t1 +tmp);

        memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
        t1->seq = ntohl(sk->write_seq++);
        sk->sent_seq = sk->write_seq;
        buff->h.seq = sk->write_seq;
        t1->ack = 0;
        t1->window = 2;         /* tiny initial window - presumably widened after establish; TODO confirm */
        t1->res1=0;
        t1->res2=0;
        t1->rst = 0;
        t1->urg = 0;
        t1->psh = 0;
        t1->syn = 1;
        t1->urg_ptr = 0;
        t1->doff = 6;           /* 6*4 = 24 bytes: header plus the MSS option */
        /* use 512 or whatever user asked for */
        
        /* Apply any per-route window clamp */
        if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
                sk->window_clamp=rt->rt_window;
        else
                sk->window_clamp=0;

        /*
         *      Pick the MSS to advertise: explicit user setting first, then
         *      the route's, else guess from whether the target looks local.
         */
        if (sk->user_mss)
                sk->mtu = sk->user_mss;
        else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
                sk->mtu = rt->rt_mss;
        else 
        {
#ifdef CONFIG_INET_SNARL
                if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
                if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
                        sk->mtu = 576 - HEADER_SIZE;
                else
                        sk->mtu = MAX_WINDOW;
        }
        /*
         *      but not bigger than device MTU 
         */

        if(sk->mtu <32)
                sk->mtu = 32;   /* Sanity limit */
                
        sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
        
        /*
         *      Put in the TCP options to say MTU. 
         */

        ptr = (unsigned char *)(t1+1);
        ptr[0] = 2;             /* option kind: MSS */
        ptr[1] = 4;             /* option length */
        ptr[2] = (sk->mtu) >> 8;        /* MSS value, network byte order */
        ptr[3] = (sk->mtu) & 0xff;
        tcp_send_check(t1, sk->saddr, sk->daddr,
                  sizeof(struct tcphdr) + 4, sk);

        /*
         *      This must go first otherwise a really quick response will get reset. 
         */

        tcp_set_state(sk,TCP_SYN_SENT);
        sk->rto = TCP_TIMEOUT_INIT;
        init_timer(&sk->retransmit_timer);
        sk->retransmit_timer.function=&retransmit_timer;
        sk->retransmit_timer.data = (unsigned long)sk;
        reset_xmit_timer(sk, TIME_WRITE, sk->rto);      /* Timer for repeating the SYN until an answer */
        sk->retransmits = TCP_SYN_RETRIES;

        sk->prot->queue_xmit(sk, dev, buff, 0);  
        /* NOTE(review): second reset of the same timer - looks redundant with the one above */
        reset_xmit_timer(sk, TIME_WRITE, sk->rto);
        tcp_statistics.TcpActiveOpens++;
        tcp_statistics.TcpOutSegs++;
  
        release_sock(sk);
        return(0);
}
4404 
4405 
/* This function checks to see if the tcp header is actually acceptable. */
4407 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
     /* [previous][next][first][last][top][bottom][index][help] */
4408              struct options *opt, unsigned long saddr, struct device *dev)
4409 {
4410         unsigned long next_seq;
4411 
4412         next_seq = len - 4*th->doff;
4413         if (th->fin)
4414                 next_seq++;
4415         /* if we have a zero window, we can't have any data in the packet.. */
4416         if (next_seq && !sk->window)
4417                 goto ignore_it;
4418         next_seq += th->seq;
4419 
4420         /*
4421          * This isn't quite right.  sk->acked_seq could be more recent
4422          * than sk->window.  This is however close enough.  We will accept
4423          * slightly more packets than we should, but it should not cause
4424          * problems unless someone is trying to forge packets.
4425          */
4426 
4427         /* have we already seen all of this packet? */
4428         if (!after(next_seq+1, sk->acked_seq))
4429                 goto ignore_it;
4430         /* or does it start beyond the window? */
4431         if (!before(th->seq, sk->acked_seq + sk->window + 1))
4432                 goto ignore_it;
4433 
4434         /* ok, at least part of this packet would seem interesting.. */
4435         return 1;
4436 
4437 ignore_it:
4438         if (th->rst)
4439                 return 0;
4440 
4441         /*
4442          *      Send a reset if we get something not ours and we are
4443          *      unsynchronized. Note: We don't do anything to our end. We
4444          *      are just killing the bogus remote connection then we will
4445          *      connect again and it will work (with luck).
4446          */
4447          
4448         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
4449         {
4450                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4451                 return 1;
4452         }
4453 
4454         /* Try to resync things. */
4455         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4456         return 0;
4457 }
4458 
4459 /*
4460  *      When we get a reset we do this.
4461  */
4462 
static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
        sk->zapped = 1;

        /* Translate the state we were in into the error the user will see */
        switch(sk->state)
        {
                case TCP_SYN_SENT:
                        sk->err = ECONNREFUSED;
                        break;
                case TCP_CLOSE_WAIT:
                        sk->err = EPIPE;
                        break;
                default:
                        sk->err = ECONNRESET;
                        break;
        }
#ifdef TCP_DO_RFC1337           
        /*
         *      Time wait assassination protection [RFC1337]
         */
        if(sk->state!=TCP_TIME_WAIT)
        {       
                tcp_set_state(sk,TCP_CLOSE);
                sk->shutdown = SHUTDOWN_MASK;
        }
#else   
        tcp_set_state(sk,TCP_CLOSE);
        sk->shutdown = SHUTDOWN_MASK;
#endif  
        /* Wake anyone waiting on the socket, drop the frame and unlock */
        if (!sk->dead) 
                sk->state_change(sk);
        kfree_skb(skb, FREE_READ);
        release_sock(sk);
        return(0);
}
4490 
4491 /*
4492  *      A TCP packet has arrived.
4493  */
4494  
4495 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
     /* [previous][next][first][last][top][bottom][index][help] */
4496         unsigned long daddr, unsigned short len,
4497         unsigned long saddr, int redo, struct inet_protocol * protocol)
4498 {
4499         struct tcphdr *th;
4500         struct sock *sk;
4501         int syn_ok=0;
4502         
4503         if (!skb) 
4504         {
4505                 printk("IMPOSSIBLE 1\n");
4506                 return(0);
4507         }
4508 
4509         if (!dev) 
4510         {
4511                 printk("IMPOSSIBLE 2\n");
4512                 return(0);
4513         }
4514   
4515         tcp_statistics.TcpInSegs++;
4516   
4517         if(skb->pkt_type!=PACKET_HOST)
4518         {
4519                 kfree_skb(skb,FREE_READ);
4520                 return(0);
4521         }
4522   
4523         th = skb->h.th;
4524 
4525         /*
4526          *      Find the socket.
4527          */
4528 
4529         sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4530 
4531         /*
4532          *      If this socket has got a reset its to all intents and purposes 
4533          *      really dead. Count closed sockets as dead.
4534          *
4535          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4536          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
4537          *      exist so should cause resets as if the port was unreachable.
4538          */
4539          
4540         if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4541                 sk=NULL;
4542 
4543         if (!redo) 
4544         {
4545                 if (tcp_check(th, len, saddr, daddr )) 
4546                 {
4547                         skb->sk = NULL;
4548                         kfree_skb(skb,FREE_READ);
4549                         /*
4550                          *      We don't release the socket because it was
4551                          *      never marked in use.
4552                          */
4553                         return(0);
4554                 }
4555                 th->seq = ntohl(th->seq);
4556 
4557                 /* See if we know about the socket. */
4558                 if (sk == NULL) 
4559                 {
4560                         /*
4561                          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4562                          */
4563                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4564                         skb->sk = NULL;
4565                         /*
4566                          *      Discard frame
4567                          */
4568                         kfree_skb(skb, FREE_READ);
4569                         return(0);
4570                 }
4571 
4572                 skb->len = len;
4573                 skb->acked = 0;
4574                 skb->used = 0;
4575                 skb->free = 0;
4576                 skb->saddr = daddr;
4577                 skb->daddr = saddr;
4578         
4579                 /* We may need to add it to the backlog here. */
4580                 cli();
4581                 if (sk->inuse) 
4582                 {
4583                         skb_queue_tail(&sk->back_log, skb);
4584                         sti();
4585                         return(0);
4586                 }
4587                 sk->inuse = 1;
4588                 sti();
4589         }
4590         else
4591         {
4592                 if (sk==NULL) 
4593                 {
4594                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4595                         skb->sk = NULL;
4596                         kfree_skb(skb, FREE_READ);
4597                         return(0);
4598                 }
4599         }
4600 
4601 
4602         if (!sk->prot) 
4603         {
4604                 printk("IMPOSSIBLE 3\n");
4605                 return(0);
4606         }
4607 
4608 
4609         /*
4610          *      Charge the memory to the socket. 
4611          */
4612          
4613         if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf) 
4614         {
4615                 kfree_skb(skb, FREE_READ);
4616                 release_sock(sk);
4617                 return(0);
4618         }
4619 
4620         skb->sk=sk;
4621         sk->rmem_alloc += skb->mem_len;
4622 
4623         /*
4624          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4625          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4626          *      compatibility. We also set up variables more thoroughly [Karn notes in the
4627          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4628          */
4629 
4630         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
4631         {
4632         
4633                 /*
4634                  *      Now deal with unusual cases.
4635                  */
4636          
4637                 if(sk->state==TCP_LISTEN)
4638                 {
4639                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
4640                                 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4641 
4642                         /*
4643                          *      We don't care for RST, and non SYN are absorbed (old segments)
4644                          *      Broadcast/multicast SYN isn't allowed. Note - bug if you change the
4645                          *      netmask on a running connection it can go broadcast. Even Sun's have
4646                          *      this problem so I'm ignoring it 
4647                          */
4648                            
4649                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4650                         {
4651                                 kfree_skb(skb, FREE_READ);
4652                                 release_sock(sk);
4653                                 return 0;
4654                         }
4655                 
4656                         /*      
4657                          *      Guess we need to make a new socket up 
4658                          */
4659                 
4660                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4661                 
4662                         /*
4663                          *      Now we have several options: In theory there is nothing else
4664                          *      in the frame. KA9Q has an option to send data with the syn,
4665                          *      BSD accepts data with the syn up to the [to be] advertised window
4666                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
4667                          *      it, that fits the spec precisely and avoids incompatibilities. It
4668                          *      would be nice in future to drop through and process the data.
4669                          */
4670                          
4671                         release_sock(sk);
4672                         return 0;
4673                 }
4674         
4675                 /* retransmitted SYN? */
4676                 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4677                 {
4678                         kfree_skb(skb, FREE_READ);
4679                         release_sock(sk);
4680                         return 0;
4681                 }
4682                 
4683                 /*
4684                  *      SYN sent means we have to look for a suitable ack and either reset
4685                  *      for bad matches or go to connected 
4686                  */
4687            
4688                 if(sk->state==TCP_SYN_SENT)
4689                 {
4690                         /* Crossed SYN or previous junk segment */
4691                         if(th->ack)
4692                         {
4693                                 /* We got an ack, but its not a good ack */
4694                                 if(!tcp_ack(sk,th,saddr,len))
4695                                 {
4696                                         /* Reset the ack - its an ack from a 
4697                                            different connection  [ th->rst is checked in tcp_reset()] */
4698                                         tcp_statistics.TcpAttemptFails++;
4699                                         tcp_reset(daddr, saddr, th,
4700                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4701                                         kfree_skb(skb, FREE_READ);
4702                                         release_sock(sk);
4703                                         return(0);
4704                                 }
4705                                 if(th->rst)
4706                                         return tcp_std_reset(sk,skb);
4707                                 if(!th->syn)
4708                                 {
4709                                         /* A valid ack from a different connection
4710                                            start. Shouldn't happen but cover it */
4711                                         kfree_skb(skb, FREE_READ);
4712                                         release_sock(sk);
4713                                         return 0;
4714                                 }
4715                                 /*
4716                                  *      Ok.. its good. Set up sequence numbers and
4717                                  *      move to established.
4718                                  */
4719                                 syn_ok=1;       /* Don't reset this connection for the syn */
4720                                 sk->acked_seq=th->seq+1;
4721                                 sk->fin_seq=th->seq;
4722                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4723                                 tcp_set_state(sk, TCP_ESTABLISHED);
4724                                 tcp_options(sk,th);
4725                                 sk->dummy_th.dest=th->source;
4726                                 sk->copied_seq = sk->acked_seq;
4727                                 if(!sk->dead)
4728                                 {
4729                                         sk->state_change(sk);
4730                                         sock_wake_async(sk->socket, 0);
4731                                 }
4732                                 if(sk->max_window==0)
4733                                 {
4734                                         sk->max_window = 32;
4735                                         sk->mss = min(sk->max_window, sk->mtu);
4736                                 }
4737                         }
4738                         else
4739                         {
4740                                 /* See if SYN's cross. Drop if boring */
4741                                 if(th->syn && !th->rst)
4742                                 {
4743                                         /* Crossed SYN's are fine - but talking to
4744                                            yourself is right out... */
4745                                         if(sk->saddr==saddr && sk->daddr==daddr &&
4746                                                 sk->dummy_th.source==th->source &&
4747                                                 sk->dummy_th.dest==th->dest)
4748                                         {
4749                                                 tcp_statistics.TcpAttemptFails++;
4750                                                 return tcp_std_reset(sk,skb);
4751                                         }
4752                                         tcp_set_state(sk,TCP_SYN_RECV);
4753                                         
4754                                         /*
4755                                          *      FIXME:
4756                                          *      Must send SYN|ACK here
4757                                          */
4758                                 }               
4759                                 /* Discard junk segment */
4760                                 kfree_skb(skb, FREE_READ);
4761                                 release_sock(sk);
4762                                 return 0;
4763                         }
4764                         /*
4765                          *      SYN_RECV with data maybe.. drop through
4766                          */
4767                         goto rfc_step6;
4768                 }
4769 
4770         /*
4771          *      BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
4772          *      a more complex suggestion for fixing these reuse issues in RFC1644
4773          *      but not yet ready for general use. Also see RFC1379.
4774          */
4775         
4776 #define BSD_TIME_WAIT
4777 #ifdef BSD_TIME_WAIT
4778                 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
4779                         after(th->seq, sk->acked_seq) && !th->rst)
4780                 {
4781                         long seq=sk->write_seq;
4782                         if(sk->debug)
4783                                 printk("Doing a BSD time wait\n");
4784                         tcp_statistics.TcpEstabResets++;           
4785                         sk->rmem_alloc -= skb->mem_len;
4786                         skb->sk = NULL;
4787                         sk->err=ECONNRESET;
4788                         tcp_set_state(sk, TCP_CLOSE);
4789                         sk->shutdown = SHUTDOWN_MASK;
4790                         release_sock(sk);
4791                         sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4792                         if (sk && sk->state==TCP_LISTEN)
4793                         {
4794                                 sk->inuse=1;
4795                                 skb->sk = sk;
4796                                 sk->rmem_alloc += skb->mem_len;
4797                                 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
4798                                 release_sock(sk);
4799                                 return 0;
4800                         }
4801                         kfree_skb(skb, FREE_READ);
4802                         return 0;
4803                 }
4804 #endif  
4805         }
4806 
4807         /*
4808          *      We are now in normal data flow (see the step list in the RFC)
4809          *      Note most of these are inline now. I'll inline the lot when
4810          *      I have time to test it hard and look at what gcc outputs 
4811          */
4812         
4813         if(!tcp_sequence(sk,th,len,opt,saddr,dev))
4814         {
4815                 kfree_skb(skb, FREE_READ);
4816                 release_sock(sk);
4817                 return 0;
4818         }
4819 
4820         if(th->rst)
4821                 return tcp_std_reset(sk,skb);
4822         
4823         /*
4824          *      !syn_ok is effectively the state test in RFC793.
4825          */
4826          
4827         if(th->syn && !syn_ok)
4828         {
4829                 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
4830                 return tcp_std_reset(sk,skb);   
4831         }
4832 
4833         /*
4834          *      Process the ACK
4835          */
4836          
4837 
4838         if(th->ack && !tcp_ack(sk,th,saddr,len))
4839         {
4840                 /*
4841                  *      Our three way handshake failed.
4842                  */
4843                  
4844                 if(sk->state==TCP_SYN_RECV)
4845                 {
4846                         tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
4847                 }
4848                 kfree_skb(skb, FREE_READ);
4849                 release_sock(sk);
4850                 return 0;
4851         }
4852         
4853 rfc_step6:              /* I'll clean this up later */
4854 
4855         /*
4856          *      Process urgent data
4857          */
4858                 
4859         if(tcp_urg(sk, th, saddr, len))
4860         {
4861                 kfree_skb(skb, FREE_READ);
4862                 release_sock(sk);
4863                 return 0;
4864         }
4865         
4866         
4867         /*
4868          *      Process the encapsulated data
4869          */
4870         
4871         if(tcp_data(skb,sk, saddr, len))
4872         {
4873                 kfree_skb(skb, FREE_READ);
4874                 release_sock(sk);
4875                 return 0;
4876         }
4877 
4878         /*
4879          *      And done
4880          */     
4881         
4882         release_sock(sk);
4883         return 0;
4884 }
4885 
4886 /*
4887  *      This routine sends a packet with an out of date sequence
4888  *      number. It assumes the other end will try to ack it.
4889  */
4890 
4891 static void tcp_write_wakeup(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
4892 {
4893         struct sk_buff *buff;
4894         struct tcphdr *t1;
4895         struct device *dev=NULL;
4896         int tmp;
4897 
4898         if (sk->zapped)
4899                 return; /* After a valid reset we can send no more */
4900 
4901         /*
4902          *      Write data can still be transmitted/retransmitted in the
4903          *      following states.  If any other state is encountered, return.
4904          *      [listen/close will never occur here anyway]
4905          */
4906 
4907         if (sk->state != TCP_ESTABLISHED && 
4908             sk->state != TCP_CLOSE_WAIT &&
4909             sk->state != TCP_FIN_WAIT1 && 
4910             sk->state != TCP_LAST_ACK &&
4911             sk->state != TCP_CLOSING
4912         ) 
4913         {
4914                 return;
4915         }
4916 
4917         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
4918         if (buff == NULL) 
4919                 return;
4920 
4921         buff->len = sizeof(struct tcphdr);
4922         buff->free = 1;
4923         buff->sk = sk;
4924         buff->localroute = sk->localroute;
4925 
4926         t1 = (struct tcphdr *) buff->data;
4927 
4928         /* Put in the IP header and routing stuff. */
4929         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4930                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
4931         if (tmp < 0) 
4932         {
4933                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4934                 return;
4935         }
4936 
4937         buff->len += tmp;
4938         t1 = (struct tcphdr *)((char *)t1 +tmp);
4939 
4940         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
4941 
4942         /*
4943          *      Use a previous sequence.
4944          *      This should cause the other end to send an ack.
4945          */
4946          
4947         t1->seq = htonl(sk->sent_seq-1);
4948         t1->ack = 1; 
4949         t1->res1= 0;
4950         t1->res2= 0;
4951         t1->rst = 0;
4952         t1->urg = 0;
4953         t1->psh = 0;
4954         t1->fin = 0;    /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
4955         t1->syn = 0;
4956         t1->ack_seq = ntohl(sk->acked_seq);
4957         t1->window = ntohs(tcp_select_window(sk));
4958         t1->doff = sizeof(*t1)/4;
4959         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
4960          /*
4961           *     Send it and free it.
4962           *     This will prevent the timer from automatically being restarted.
4963           */
4964         sk->prot->queue_xmit(sk, dev, buff, 1);
4965         tcp_statistics.TcpOutSegs++;
4966 }
4967 
4968 /*
4969  *      A window probe timeout has occurred.
4970  */
4971 
4972 void tcp_send_probe0(struct sock *sk)
     /* [previous][next][first][last][top][bottom][index][help] */
4973 {
4974         if (sk->zapped)
4975                 return;         /* After a valid reset we can send no more */
4976 
4977         tcp_write_wakeup(sk);
4978 
4979         sk->backoff++;
4980         sk->rto = min(sk->rto << 1, 120*HZ);
4981         reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
4982         sk->retransmits++;
4983         sk->prot->retransmits ++;
4984 }
4985 
4986 /*
4987  *      Socket option code for TCP. 
4988  */
4989   
4990 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
4991 {
4992         int val,err;
4993 
4994         if(level!=SOL_TCP)
4995                 return ip_setsockopt(sk,level,optname,optval,optlen);
4996 
4997         if (optval == NULL) 
4998                 return(-EINVAL);
4999 
5000         err=verify_area(VERIFY_READ, optval, sizeof(int));
5001         if(err)
5002                 return err;
5003         
5004         val = get_fs_long((unsigned long *)optval);
5005 
5006         switch(optname)
5007         {
5008                 case TCP_MAXSEG:
5009 /*
5010  * values greater than interface MTU won't take effect.  however at
5011  * the point when this call is done we typically don't yet know
5012  * which interface is going to be used
5013  */
5014                         if(val<1||val>MAX_WINDOW)
5015                                 return -EINVAL;
5016                         sk->user_mss=val;
5017                         return 0;
5018                 case TCP_NODELAY:
5019                         sk->nonagle=(val==0)?0:1;
5020                         return 0;
5021                 default:
5022                         return(-ENOPROTOOPT);
5023         }
5024 }
5025 
5026 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
     /* [previous][next][first][last][top][bottom][index][help] */
5027 {
5028         int val,err;
5029 
5030         if(level!=SOL_TCP)
5031                 return ip_getsockopt(sk,level,optname,optval,optlen);
5032                         
5033         switch(optname)
5034         {
5035                 case TCP_MAXSEG:
5036                         val=sk->user_mss;
5037                         break;
5038                 case TCP_NODELAY:
5039                         val=sk->nonagle;
5040                         break;
5041                 default:
5042                         return(-ENOPROTOOPT);
5043         }
5044         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5045         if(err)
5046                 return err;
5047         put_fs_long(sizeof(int),(unsigned long *) optlen);
5048 
5049         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5050         if(err)
5051                 return err;
5052         put_fs_long(val,(unsigned long *)optval);
5053 
5054         return(0);
5055 }       
5056 
5057 
/*
 *      The TCP protocol operations vector. The initializer is positional,
 *      following struct proto's member order (declared elsewhere); most
 *      slots are self-describing from the function names plugged in.
 */
struct proto tcp_prot = {
        sock_wmalloc,           /* write-buffer allocation */
        sock_rmalloc,           /* read-buffer allocation */
        sock_wfree,
        sock_rfree,
        sock_rspace,            /* free receive-buffer space */
        sock_wspace,            /* free send-buffer space */
        tcp_close,
        tcp_read,
        tcp_write,
        tcp_sendto,
        tcp_recvfrom,
        ip_build_header,        /* TCP uses IP's header builder directly... */
        tcp_connect,
        tcp_accept,
        ip_queue_xmit,          /* ...and IP's transmit queueing */
        tcp_retransmit,
        tcp_write_wakeup,       /* zero-window probe sender (defined above) */
        tcp_read_wakeup,
        tcp_rcv,                /* inbound segment handler */
        tcp_select,
        tcp_ioctl,
        NULL,                   /* NOTE(review): empty slot (init hook?) - confirm against struct proto */
        tcp_shutdown,
        tcp_setsockopt,
        tcp_getsockopt,
        128,                    /* NOTE(review): presumably max header reservation - confirm against struct proto */
        0,                      /* NOTE(review): presumably a retransmit counter, starting at zero */
        {NULL,},                /* socket array - initially empty */
        "TCP",                  /* protocol name string */
        0, 0                    /* NOTE(review): likely inuse/high-water bookkeeping - confirm */
};

/* [previous][next][first][last][top][bottom][index][help] */