root/net/inet/tcp.c


DEFINITIONS

This source file includes the following definitions.
  1. min
  2. tcp_set_state
  3. tcp_select_window
  4. tcp_find_established
  5. tcp_close_pending
  6. tcp_dequeue_established
  7. tcp_time_wait
  8. tcp_do_retransmit
  9. tcp_retransmit_time
  10. tcp_retransmit
  11. tcp_err
  12. tcp_readable
  13. tcp_select
  14. tcp_ioctl
  15. tcp_check
  16. tcp_send_check
  17. tcp_send_skb
  18. tcp_dequeue_partial
  19. tcp_send_partial
  20. tcp_enqueue_partial
  21. tcp_send_ack
  22. tcp_build_header
  23. tcp_write
  24. tcp_sendto
  25. tcp_read_wakeup
  26. cleanup_rbuf
  27. tcp_read_urg
  28. tcp_read
  29. tcp_shutdown
  30. tcp_recvfrom
  31. tcp_reset
  32. tcp_options
  33. default_mask
  34. tcp_init_seq
  35. tcp_conn_request
  36. tcp_close
  37. tcp_write_xmit
  38. tcp_ack
  39. tcp_fin
  40. tcp_data
  41. tcp_check_urg
  42. tcp_urg
  43. tcp_accept
  44. tcp_connect
  45. tcp_sequence
  46. tcp_std_reset
  47. tcp_rcv
  48. tcp_write_wakeup
  49. tcp_send_probe0
  50. tcp_setsockopt
  51. tcp_getsockopt

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     @(#)tcp.c       1.0.16  05/25/93
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@no.unit.nvg>
  20  *
  21  * Fixes:       
  22  *              Alan Cox        :       Numerous verify_area() calls
  23  *              Alan Cox        :       Set the ACK bit on a reset
  24  *              Alan Cox        :       Stopped it crashing if it closed while sk->inuse=1
  25  *                                      and was trying to connect (tcp_err()).
  26  *              Alan Cox        :       All icmp error handling was broken
  27  *                                      pointers passed where wrong and the
  28  *                                      socket was looked up backwards. Nobody
  29  *                                      tested any icmp error code obviously.
  30  *              Alan Cox        :       tcp_err() now handled properly. It wakes people
  31  *                                      on errors. select behaves and the icmp error race
  32  *                                      has gone by moving it into sock.c
  33  *              Alan Cox        :       tcp_reset() fixed to work for everything not just
  34  *                                      packets for unknown sockets.
  35  *              Alan Cox        :       tcp option processing.
  36  *              Alan Cox        :       Reset tweaked (still not 100%) [Had syn rule wrong]
  37  *              Herp Rosmanith  :       More reset fixes
  38  *              Alan Cox        :       No longer acks invalid rst frames. Acking
  39  *                                      any kind of RST is right out.
  40  *              Alan Cox        :       Sets an ignore me flag on an rst receive
  41  *                                      otherwise odd bits of prattle escape still
  42  *              Alan Cox        :       Fixed another acking RST frame bug. Should stop
  43  *                                      LAN workplace lockups.
  44  *              Alan Cox        :       Some tidyups using the new skb list facilities
  45  *              Alan Cox        :       sk->keepopen now seems to work
  46  *              Alan Cox        :       Pulls options out correctly on accepts
  47  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  48  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a bit to skb ops.
  49  *              Alan Cox        :       Tidied tcp_data to avoid a potential nasty.
  50  *              Alan Cox        :       Added some better commenting, as the tcp is hard to follow
  51  *              Alan Cox        :       Removed incorrect check for 20 * psh
  52  *      Michael O'Reilly        :       ack < copied bug fix.
  53  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  54  *              Alan Cox        :       FIN with no memory -> CRASH
  55  *              Alan Cox        :       Added socket option proto entries. Also added awareness of them to accept.
  56  *              Alan Cox        :       Added TCP options (SOL_TCP)
  57  *              Alan Cox        :       Switched wakeup calls to callbacks, so the kernel can layer network sockets.
  58  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  59  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  60  *              Alan Cox        :       RST frames sent on unsynchronised state ack error/
  61  *              Alan Cox        :       Put in missing check for SYN bit.
  62  *              Alan Cox        :       Added tcp_select_window() aka NET2E 
  63  *                                      window non shrink trick.
  64  *              Alan Cox        :       Added a couple of small NET2E timer fixes
  65  *              Charles Hedrick :       TCP fixes
  66  *              Toomas Tamm     :       TCP window fixes
  67  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  68  *              Charles Hedrick :       Rewrote most of it to actually work
  69  *              Linus           :       Rewrote tcp_read() and URG handling
  70  *                                      completely
  71  *              Gerhard Koerting:       Fixed some missing timer handling
  72  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  73  *              Gerhard Koerting:       PC/TCP workarounds
  74  *              Adam Caldwell   :       Assorted timer/timing errors
  75  *              Matthew Dillon  :       Fixed another RST bug
  76  *              Alan Cox        :       Move to kernel side addressing changes.
  77  *              Alan Cox        :       Beginning work on TCP fastpathing (not yet usable)
  78  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  79  *              Alan Cox        :       TCP fast path debugging
  80  *              Alan Cox        :       Window clamping
  81  *              Michael Riepe   :       Bug in tcp_check()
  82  *              Matt Dillon     :       More TCP improvements and RST bug fixes
   83  *              Matt Dillon     :       Yet more small nasties removed from the TCP code
  84  *                                      (Be very nice to this man if tcp finally works 100%) 8)
  85  *              Alan Cox        :       BSD accept semantics. 
  86  *              Alan Cox        :       Reset on closedown bug.
  87  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
  88  *              Michael Pall    :       Handle select() after URG properly in all cases.
  89  *              Michael Pall    :       Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin).
  90  *              Michael Pall    :       Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now.
  91  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the BSD api.
  92  *              Alan Cox        :       Changed the semantics of sk->socket to 
  93  *                                      fix a race and a signal problem with
  94  *                                      accept() and async I/O.
  95  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
  96  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
  97  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
  98  *                                      clients/servers which listen in on
  99  *                                      fixed ports.
 100  *              Alan Cox        :       Cleaned the above up and shrank it to
 101  *                                      a sensible code size.
 102  *              Alan Cox        :       Self connect lockup fix.
 103  *              Alan Cox        :       No connect to multicast.
 104  *              Ross Biro       :       Close unaccepted children on master
 105  *                                      socket close.
 106  *              Alan Cox        :       Reset tracing code.
 107  *              Alan Cox        :       Spurious resets on shutdown.
 108  *              Alan Cox        :       Giant 15 minute/60 second timer error
 109  *              Alan Cox        :       Small whoops in selecting before an accept.
  110  *              Alan Cox        :       Kept the state trace facility since it's
 111  *                                      handy for debugging.
 112  *              Alan Cox        :       More reset handler fixes.
 113  *              Alan Cox        :       Started rewriting the code based on the RFC's
 114  *                                      for other useful protocol references see:  
 115  *                                      Comer, KA9Q NOS, and for a reference on the
 116  *                                      difference between specifications and how BSD
 117  *                                      works see the 4.4lite source.
 118  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy 
 119  *                                      close.
 120  *
 121  *
 122  * To Fix:
 123  *              Fast path the code. Two things here - fix the window calculation
 124  *              so it doesn't iterate over the queue, also spot packets with no funny
 125  *              options arriving in order and process directly.
 126  *
 127  *              Implement RFC 1191 [Path MTU discovery]
 128  *              Look at the effect of implementing RFC 1337 suggestions and their impact.
 129  *              Rewrite output state machine to use a single queue and do low window
 130  *              situations as per the spec (RFC 1122)
 131  *              Speed up input assembly algorithm.
 132  *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 133  *              could do with it working on IPv4
 134  *              User settable/learned rtt/max window/mtu
 135  *              Cope with MTU/device switches when retransmitting in tcp.
 136  *
 137  *
 138  *
 139  *              This program is free software; you can redistribute it and/or
 140  *              modify it under the terms of the GNU General Public License
 141  *              as published by the Free Software Foundation; either version
 142  *              2 of the License, or(at your option) any later version.
 143  *
 144  * Description of States:
 145  *
 146  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 147  *
 148  *      TCP_SYN_RECV            received a connection request, sent ack,
 149  *                              waiting for final ack in three-way handshake.
 150  *
 151  *      TCP_ESTABLISHED         connection established
 152  *
 153  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 154  *                              transmission of remaining buffered data
 155  *
 156  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 157  *                              to shutdown
 158  *
 159  *      TCP_CLOSING             both sides have shutdown but we still have
 160  *                              data we have to finish sending
 161  *
 162  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 163  *                              closed, can only be entered from FIN_WAIT2
 164  *                              or CLOSING.  Required because the other end
 165  *                              may not have gotten our last ACK causing it
 166  *                              to retransmit the data packet (which we ignore)
 167  *
 168  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 169  *                              us to finish writing our data and to shutdown
 170  *                              (we have to close() to move on to LAST_ACK)
 171  *
  172  *      TCP_LAST_ACK            our side has shutdown after remote has
 173  *                              shutdown.  There may still be data in our
 174  *                              buffer that we have to finish sending
 175  *              
 176  *      TCP_CLOSE               socket is finished
 177  */
 178 #include <linux/types.h>
 179 #include <linux/sched.h>
 180 #include <linux/mm.h>
 181 #include <linux/string.h>
 182 #include <linux/socket.h>
 183 #include <linux/sockios.h>
 184 #include <linux/termios.h>
 185 #include <linux/in.h>
 186 #include <linux/fcntl.h>
 187 #include <linux/inet.h>
 188 #include <linux/netdevice.h>
 189 #include "snmp.h"
 190 #include "ip.h"
 191 #include "protocol.h"
 192 #include "icmp.h"
 193 #include "tcp.h"
 194 #include <linux/skbuff.h>
 195 #include "sock.h"
 196 #include "route.h"
 197 #include <linux/errno.h>
 198 #include <linux/timer.h>
 199 #include <asm/system.h>
 200 #include <asm/segment.h>
 201 #include <linux/mm.h>
 202 
 203 #undef TCP_FASTPATH
 204 
 205 #define SEQ_TICK 3
 206 unsigned long seq_offset;
 207 struct tcp_mib  tcp_statistics;
 208 
 209 static void tcp_close(struct sock *sk, int timeout);
 210 
 211 #ifdef TCP_FASTPATH
 212 unsigned long tcp_rx_miss=0, tcp_rx_hit1=0, tcp_rx_hit2=0;
 213 #endif
 214 
 215 /* The less said about this the better, but it works and will do for 1.2 */
 216 
 217 static struct wait_queue *master_select_wakeup;
 218 
 219 static __inline__ int min(unsigned int a, unsigned int b)
 220 {
 221         if (a < b) 
 222                 return(a);
 223         return(b);
 224 }
 225 
 226 #undef STATE_TRACE
 227 
 228 #ifdef STATE_TRACE
 229 static char *statename[]={
 230         "Unused","Established","Syn Sent","Syn Recv",
 231         "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
 232         "Close Wait","Last ACK","Listen","Closing"
 233 };
 234 #endif
 235 
 236 static __inline__ void tcp_set_state(struct sock *sk, int state)
 237 {
 238         if(sk->state==TCP_ESTABLISHED)
 239                 tcp_statistics.TcpCurrEstab--;
 240 #ifdef STATE_TRACE
 241         if(sk->debug)
 242                 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
 243 #endif  
  244         /* This is a hack but it doesn't occur often and it's going to
  245            be a real pain to fix nicely */
 246            
 247         if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
 248         {
 249                 wake_up_interruptible(&master_select_wakeup);
 250         }
 251         sk->state=state;
 252         if(state==TCP_ESTABLISHED)
 253                 tcp_statistics.TcpCurrEstab++;
 254 }
 255 
  256 /* This routine picks a TCP window for a socket based on
 257    the following constraints
 258    
 259    1. The window can never be shrunk once it is offered (RFC 793)
 260    2. We limit memory per socket
 261    
 262    For now we use NET2E3's heuristic of offering half the memory
 263    we have handy. All is not as bad as this seems however because
 264    of two things. Firstly we will bin packets even within the window
 265    in order to get the data we are waiting for into the memory limit.
 266    Secondly we bin common duplicate forms at receive time
 267    
 268    Better heuristics welcome
 269 */
 270    
 271 int tcp_select_window(struct sock *sk)
 272 {
 273         int new_window = sk->prot->rspace(sk);
 274         
 275         if(sk->window_clamp)
 276                 new_window=min(sk->window_clamp,new_window);
 277 /*
 278  * two things are going on here.  First, we don't ever offer a
 279  * window less than min(sk->mss, MAX_WINDOW/2).  This is the
 280  * receiver side of SWS as specified in RFC1122.
 281  * Second, we always give them at least the window they
 282  * had before, in order to avoid retracting window.  This
 283  * is technically allowed, but RFC1122 advises against it and
 284  * in practice it causes trouble.
 285  */
 286         if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
 287                 return(sk->window);
 288         return(new_window);
 289 }
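/*
 * Worked example (illustrative numbers only): suppose rspace() reports
 * 8192 bytes free, there is no window_clamp, sk->mss is 1460 and the
 * window we last offered was 4096.  8192 clears both the
 * min(sk->mss, MAX_WINDOW/2) floor and the old window, so 8192 is
 * offered.  If rspace() later dropped to 512 we would keep advertising
 * 4096 rather than shrink the window.
 */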
 290 
 291 /*
 292  *      Find someone to 'accept'. Must be called with
 293  *      sk->inuse=1 or cli()
 294  */ 
 295 
 296 static struct sk_buff *tcp_find_established(struct sock *s)
 297 {
 298         struct sk_buff *p=skb_peek(&s->receive_queue);
 299         if(p==NULL)
 300                 return NULL;
 301         do
 302         {
 303                 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
 304                         return p;
 305                 p=p->next;
 306         }
 307         while(p!=(struct sk_buff *)&s->receive_queue);
 308         return NULL;
 309 }
 310 
 311 
 312 /* 
 313  *      This routine closes sockets which have been at least partially
 314  *      opened, but not yet accepted. Currently it is only called by
 315  *      tcp_close, and timeout mirrors the value there. 
 316  */
 317 
 318 static void tcp_close_pending (struct sock *sk, int timeout) 
 319 {
 320         struct sk_buff *skb;
 321 
 322         while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
 323                 tcp_close(skb->sk, timeout);
 324                 kfree_skb(skb, FREE_READ);
 325         }
 326         return;
 327 }
 328 
 329 static struct sk_buff *tcp_dequeue_established(struct sock *s)
 330 {
 331         struct sk_buff *skb;
 332         unsigned long flags;
 333         save_flags(flags);
 334         cli(); 
 335         skb=tcp_find_established(s);
 336         if(skb!=NULL)
 337                 skb_unlink(skb);        /* Take it off the queue */
 338         restore_flags(flags);
 339         return skb;
 340 }
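/*
 * Usage sketch: on a listening socket the receive_queue holds one skb
 * per not-yet-accepted child connection (skb->sk points at the child).
 * tcp_dequeue_established() unlinks the first child that has reached
 * ESTABLISHED or beyond, which is what the accept() path wants, while
 * still-handshaking children stay queued.
 */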
 341 
 342 
 343 /*
 344  *      Enter the time wait state. 
 345  */
 346 
 347 static void tcp_time_wait(struct sock *sk)
 348 {
 349         tcp_set_state(sk,TCP_TIME_WAIT);
 350         sk->shutdown = SHUTDOWN_MASK;
 351         if (!sk->dead)
 352                 sk->state_change(sk);
 353         reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
 354 }
 355 
 356 /*
 357  *      A socket has timed out on its send queue and wants to do a
 358  *      little retransmitting. Currently this means TCP.
 359  */
 360 
 361 void tcp_do_retransmit(struct sock *sk, int all)
 362 {
 363         struct sk_buff * skb;
 364         struct proto *prot;
 365         struct device *dev;
 366 
 367         prot = sk->prot;
 368         skb = sk->send_head;
 369 
 370         while (skb != NULL)
 371         {
 372                 struct tcphdr *th;
 373                 struct iphdr *iph;
 374                 int size;
 375 
 376                 dev = skb->dev;
 377                 IS_SKB(skb);
 378                 skb->when = jiffies;
 379 
 380                 /*
 381                  * In general it's OK just to use the old packet.  However we
 382                  * need to use the current ack and window fields.  Urg and
 383                  * urg_ptr could possibly stand to be updated as well, but we
 384                  * don't keep the necessary data.  That shouldn't be a problem,
 385                  * if the other end is doing the right thing.  Since we're
 386                  * changing the packet, we have to issue a new IP identifier.
 387                  */
 388 
 389 
 390                 iph = (struct iphdr *)(skb->data + dev->hard_header_len);
 391                 th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
 392                 size = skb->len - (((unsigned char *) th) - skb->data);
 393 
 394                 iph->id = htons(ip_id_count++);
 395                 ip_send_check(iph);
 396 
 397                 /*
 398                  *      This is not the right way to handle this. We have to
 399                  *      issue an up to date window and ack report with this 
 400                  *      retransmit to keep the odd buggy tcp that relies on 
 401                  *      the fact BSD does this happy. 
 402                  *      We don't however need to recalculate the entire 
 403                  *      checksum, so someone wanting a small problem to play
 404                  *      with might like to implement RFC1141/RFC1624 and speed
 405                  *      this up by avoiding a full checksum.
 406                  */
 407                  
 408                 th->ack_seq = ntohl(sk->acked_seq);
 409                 th->window = ntohs(tcp_select_window(sk));
 410                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
 411                 
 412                 /*
 413                  *      If the interface is (still) up and running, kick it.
 414                  */
 415 
 416                 if (dev->flags & IFF_UP)
 417                 {
 418                         /*
 419                          *      If the packet is still being sent by the device/protocol
 420                          *      below then don't retransmit. This is both needed, and good -
  421                          *      especially with connected mode AX.25 where it stops
  422                          *      retransmission of a frame that has not actually been sent yet!
 423                          *      We still add up the counts as the round trip time wants
 424                          *      adjusting.
 425                          */
 426                         if (sk && !skb_device_locked(skb))
 427                         {
 428                                 /* Remove it from any existing driver queue first! */
 429                                 skb_unlink(skb);
 430                                 /* Now queue it */
 431                                 ip_statistics.IpOutRequests++;
 432                                 dev_queue_xmit(skb, dev, sk->priority);
 433                         }
 434                 }
 435 
 436                 /*
 437                  *      Count retransmissions
 438                  */
 439                 sk->retransmits++;
 440                 sk->prot->retransmits ++;
 441 
 442                 /*
 443                  *      Only one retransmit requested.
 444                  */
 445                 if (!all)
 446                         break;
 447 
 448                 /*
 449                  *      This should cut it off before we send too many packets.
 450                  */
 451                 if (sk->retransmits >= sk->cong_window)
 452                         break;
 453                 skb = skb->link3;
 454         }
 455 }
 456 
 457 /*
 458  *      This is the normal code called for timeouts.  It does the retransmission
 459  *      and then does backoff.  tcp_do_retransmit is separated out because
 460  *      tcp_ack needs to send stuff from the retransmit queue without
 461  *      initiating a backoff.
 462  */
 463 
 464 void tcp_retransmit_time(struct sock *sk, int all)
 465 {
 466         tcp_do_retransmit(sk, all);
 467 
 468         /*
 469          * Increase the timeout each time we retransmit.  Note that
 470          * we do not increase the rtt estimate.  rto is initialized
 471          * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
 472          * that doubling rto each time is the least we can get away with.
 473          * In KA9Q, Karn uses this for the first few times, and then
 474          * goes to quadratic.  netBSD doubles, but only goes up to *64,
 475          * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
 476          * defined in the protocol as the maximum possible RTT.  I guess
 477          * we'll have to use something other than TCP to talk to the
 478          * University of Mars.
 479          */
 480 
 481         sk->retransmits++;
 482         sk->backoff++;
 483         sk->rto = min(sk->rto << 1, 120*HZ);
 484         reset_timer(sk, TIME_WRITE, sk->rto);
 485 }
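/*
 * Rough picture of the backoff above: starting from an rto of, say,
 * 1*HZ, successive timeouts give 2*HZ, 4*HZ, 8*HZ and so on until the
 * min(sk->rto << 1, 120*HZ) clamp pins it at 120 seconds, the maximum
 * RTT the protocol allows for.
 */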
 486 
 487 
 488 /*
  489  *      A timer event has triggered a tcp retransmit timeout. The
 490  *      socket xmit queue is ready and set up to send. Because
 491  *      the ack receive code keeps the queue straight we do
 492  *      nothing clever here.
 493  */
 494 
 495 static void tcp_retransmit(struct sock *sk, int all)
 496 {
 497         if (all) 
 498         {
 499                 tcp_retransmit_time(sk, all);
 500                 return;
 501         }
 502 
 503         sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
 504         /* sk->ssthresh in theory can be zero.  I guess that's OK */
 505         sk->cong_count = 0;
 506 
 507         sk->cong_window = 1;
 508 
 509         /* Do the actual retransmit. */
 510         tcp_retransmit_time(sk, all);
 511 }
 512 
 513 
 514 /*
 515  * This routine is called by the ICMP module when it gets some
 516  * sort of error condition.  If err < 0 then the socket should
 517  * be closed and the error returned to the user.  If err > 0
 518  * it's just the icmp type << 8 | icmp code.  After adjustment
 519  * header points to the first 8 bytes of the tcp header.  We need
 520  * to find the appropriate port.
 521  */
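/*
 * Example of the convention: a source quench shows up here as
 * err = ICMP_SOURCE_QUENCH << 8 (type in the high byte, code in the
 * low byte) and merely shrinks cong_window a little below; a fatal
 * unreachable is instead looked up in icmp_err_convert[] and reported
 * to the user.
 */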
 522 
 523 void tcp_err(int err, unsigned char *header, unsigned long daddr,
 524         unsigned long saddr, struct inet_protocol *protocol)
 525 {
 526         struct tcphdr *th;
 527         struct sock *sk;
 528         struct iphdr *iph=(struct iphdr *)header;
 529   
 530         header+=4*iph->ihl;
 531    
 532 
 533         th =(struct tcphdr *)header;
 534         sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
 535 
 536         if (sk == NULL) 
 537                 return;
 538   
 539         if(err<0)
 540         {
 541                 sk->err = -err;
 542                 sk->error_report(sk);
 543                 return;
 544         }
 545 
 546         if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8)) 
 547         {
 548                 /*
 549                  * FIXME:
 550                  * For now we will just trigger a linear backoff.
 551                  * The slow start code should cause a real backoff here.
 552                  */
 553                 if (sk->cong_window > 4)
 554                         sk->cong_window--;
 555                 return;
 556         }
 557 
 558 /*      sk->err = icmp_err_convert[err & 0xff].errno;  -- moved as TCP should hide non fatals internally (and does) */
 559 
 560         /*
 561          * If we've already connected we will keep trying
 562          * until we time out, or the user gives up.
 563          */
 564 
 565         if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT) 
 566         {
 567                 if (sk->state == TCP_SYN_SENT) 
 568                 {
 569                         tcp_statistics.TcpAttemptFails++;
 570                         tcp_set_state(sk,TCP_CLOSE);
 571                         sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
 572                 }
 573                 sk->err = icmp_err_convert[err & 0xff].errno;           
 574         }
 575         return;
 576 }
 577 
 578 
 579 /*
 580  *      Walk down the receive queue counting readable data until we hit the end or we find a gap
 581  *      in the received data queue (ie a frame missing that needs sending to us)
 582  */
 583 
 584 static int tcp_readable(struct sock *sk)
 585 {
 586         unsigned long counted;
 587         unsigned long amount;
 588         struct sk_buff *skb;
 589         int sum;
 590         unsigned long flags;
 591 
 592         if(sk && sk->debug)
 593                 printk("tcp_readable: %p - ",sk);
 594 
 595         save_flags(flags);
 596         cli();
 597         if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
 598         {
 599                 restore_flags(flags);
 600                 if(sk && sk->debug) 
 601                         printk("empty\n");
 602                 return(0);
 603         }
 604   
 605         counted = sk->copied_seq+1;     /* Where we are at the moment */
 606         amount = 0;
 607   
 608         /* Do until a push or until we are out of data. */
 609         do 
 610         {
  611                 if (before(counted, skb->h.th->seq))    /* Found a hole so stop here */
 612                         break;
 613                 sum = skb->len -(counted - skb->h.th->seq);     /* Length - header but start from where we are up to (avoid overlaps) */
 614                 if (skb->h.th->syn)
 615                         sum++;
 616                 if (sum > 0) 
 617                 {                                       /* Add it up, move on */
 618                         amount += sum;
 619                         if (skb->h.th->syn) 
 620                                 amount--;
 621                         counted += sum;
 622                 }
 623                 /*
 624                  * Don't count urg data ... but do it in the right place!
 625                  * Consider: "old_data (ptr is here) URG PUSH data"
 626                  * The old code would stop at the first push because
 627                  * it counted the urg (amount==1) and then does amount--
 628                  * *after* the loop.  This means tcp_readable() always
 629                  * returned zero if any URG PUSH was in the queue, even
 630                  * though there was normal data available. If we subtract
 631                  * the urg data right here, we even get it to work for more
 632                  * than one URG PUSH skb without normal data.
 633                  * This means that select() finally works now with urg data
 634                  * in the queue.  Note that rlogin was never affected
 635                  * because it doesn't use select(); it uses two processes
 636                  * and a blocking read().  And the queue scan in tcp_read()
 637                  * was correct.  Mike <pall@rz.uni-karlsruhe.de>
 638                  */
 639                 if (skb->h.th->urg)
 640                         amount--;       /* don't count urg data */
 641                 if (amount && skb->h.th->psh) break;
 642                 skb = skb->next;
 643         }
 644         while(skb != (struct sk_buff *)&sk->receive_queue);
 645 
 646         restore_flags(flags);
 647         if(sk->debug)
 648                 printk("got %lu bytes.\n",amount);
 649         return(amount);
 650 }
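/*
 * Example with made-up sequence numbers (and no PSH bits on the way):
 * segments covering 1-1000 and 1001-2000 followed by one starting at
 * 2501 give a count of 2000 readable bytes; the walk stops at the
 * 2001-2500 hole even though more data sits further down the queue.
 */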
 651 
 652 
 653 /*
 654  *      Wait for a TCP event. Note the oddity with SEL_IN and reading. The
 655  *      listening socket has a receive queue of sockets to accept.
 656  */
 657 
 658 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
 659 {
 660         sk->inuse = 1;
 661 
 662         switch(sel_type) 
 663         {
 664                 case SEL_IN:
 665                         select_wait(sk->sleep, wait);
 666                         if(sk->state==TCP_LISTEN)
 667                                 select_wait(&master_select_wakeup,wait);
 668                         if (skb_peek(&sk->receive_queue) != NULL) 
 669                         {
 670                                 if ((sk->state == TCP_LISTEN && tcp_find_established(sk)) || tcp_readable(sk)) 
 671                                 {
 672                                         release_sock(sk);
 673                                         return(1);
 674                                 }
 675                         }
 676                         if (sk->err != 0)       /* Receiver error */
 677                         {
 678                                 release_sock(sk);
 679                                 return(1);
 680                         }
 681                         if (sk->shutdown & RCV_SHUTDOWN) 
 682                         {
 683                                 release_sock(sk);
 684                                 return(1);
 685                         } 
 686                         release_sock(sk);
 687                         return(0);
 688                 case SEL_OUT:
 689                         select_wait(sk->sleep, wait);
 690                         if (sk->shutdown & SEND_SHUTDOWN) 
 691                         {
 692                                 /* FIXME: should this return an error? */
 693                                 release_sock(sk);
 694                                 return(0);
 695                         }
 696 
 697                         /*
 698                          * This is now right thanks to a small fix
 699                          * by Matt Dillon.
 700                          */
 701                         
 702                         if (sk->prot->wspace(sk) >= sk->mtu+128+sk->prot->max_header) 
 703                         {
 704                                 release_sock(sk);
 705                                 /* This should cause connect to work ok. */
 706                                 if (sk->state == TCP_SYN_RECV ||
 707                                     sk->state == TCP_SYN_SENT) return(0);
 708                                 return(1);
 709                         }
 710                         release_sock(sk);
 711                         return(0);
 712                 case SEL_EX:
 713                         select_wait(sk->sleep,wait);
 714                         if (sk->err || sk->urg_data) 
 715                         {
 716                                 release_sock(sk);
 717                                 return(1);
 718                         }
 719                         release_sock(sk);
 720                         return(0);
 721         }
 722 
 723         release_sock(sk);
 724         return(0);
 725 }
 726 
 727 
 728 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 729 {
 730         int err;
 731         switch(cmd) 
 732         {
 733 
 734                 case TIOCINQ:
 735 #ifdef FIXME    /* FIXME: */
 736                 case FIONREAD:
 737 #endif
 738                 {
 739                         unsigned long amount;
 740 
 741                         if (sk->state == TCP_LISTEN) 
 742                                 return(-EINVAL);
 743 
 744                         sk->inuse = 1;
 745                         amount = tcp_readable(sk);
 746                         release_sock(sk);
 747                         err=verify_area(VERIFY_WRITE,(void *)arg,
 748                                                    sizeof(unsigned long));
 749                         if(err)
 750                                 return err;
 751                         put_fs_long(amount,(unsigned long *)arg);
 752                         return(0);
 753                 }
 754                 case SIOCATMARK:
 755                 {
 756                         int answ = sk->urg_data && sk->urg_seq == sk->copied_seq+1;
 757 
 758                         err = verify_area(VERIFY_WRITE,(void *) arg,
 759                                                   sizeof(unsigned long));
 760                         if (err)
 761                                 return err;
 762                         put_fs_long(answ,(int *) arg);
 763                         return(0);
 764                 }
 765                 case TIOCOUTQ:
 766                 {
 767                         unsigned long amount;
 768 
 769                         if (sk->state == TCP_LISTEN) return(-EINVAL);
 770                         amount = sk->prot->wspace(sk);
 771                         err=verify_area(VERIFY_WRITE,(void *)arg,
 772                                                    sizeof(unsigned long));
 773                         if(err)
 774                                 return err;
 775                         put_fs_long(amount,(unsigned long *)arg);
 776                         return(0);
 777                 }
 778                 default:
 779                         return(-EINVAL);
 780         }
 781 }
 782 
 783 
 784 /*
 785  *      This routine computes a TCP checksum. 
 786  */
 787  
 788 unsigned short tcp_check(struct tcphdr *th, int len,
 789           unsigned long saddr, unsigned long daddr)
 790 {     
 791         unsigned long sum;
 792    
 793         if (saddr == 0) saddr = ip_my_addr();
 794 
 795 /*
 796  * stupid, gcc complains when I use just one __asm__ block,
 797  * something about too many reloads, but this is just two
 798  * instructions longer than what I want
 799  */
 800         __asm__("
 801             addl %%ecx, %%ebx
 802             adcl %%edx, %%ebx
 803             adcl $0, %%ebx
 804             "
 805         : "=b"(sum)
 806         : "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
 807         : "bx", "cx", "dx" );
 808         __asm__("
 809             movl %%ecx, %%edx
 810             cld
 811             cmpl $32, %%ecx
 812             jb 2f
 813             shrl $5, %%ecx
 814             clc
 815 1:          lodsl
 816             adcl %%eax, %%ebx
 817             lodsl
 818             adcl %%eax, %%ebx
 819             lodsl
 820             adcl %%eax, %%ebx
 821             lodsl
 822             adcl %%eax, %%ebx
 823             lodsl
 824             adcl %%eax, %%ebx
 825             lodsl
 826             adcl %%eax, %%ebx
 827             lodsl
 828             adcl %%eax, %%ebx
 829             lodsl
 830             adcl %%eax, %%ebx
 831             loop 1b
 832             adcl $0, %%ebx
 833             movl %%edx, %%ecx
 834 2:          andl $28, %%ecx
 835             je 4f
 836             shrl $2, %%ecx
 837             clc
 838 3:          lodsl
 839             adcl %%eax, %%ebx
 840             loop 3b
 841             adcl $0, %%ebx
 842 4:          movl $0, %%eax
 843             testw $2, %%dx
 844             je 5f
 845             lodsw
 846             addl %%eax, %%ebx
 847             adcl $0, %%ebx
 848             movw $0, %%ax
 849 5:          test $1, %%edx
 850             je 6f
 851             lodsb
 852             addl %%eax, %%ebx
 853             adcl $0, %%ebx
 854 6:          movl %%ebx, %%eax
 855             shrl $16, %%eax
 856             addw %%ax, %%bx
 857             adcw $0, %%bx
 858             "
 859         : "=b"(sum)
 860         : "0"(sum), "c"(len), "S"(th)
 861         : "ax", "bx", "cx", "dx", "si" );
 862 
 863         /* We only want the bottom 16 bits, but we never cleared the top 16. */
 864   
 865         return((~sum) & 0xffff);
 866 }
 867 
 868 
 869 
 870 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
 871                 unsigned long daddr, int len, struct sock *sk)
 872 {
 873         th->check = 0;
 874         th->check = tcp_check(th, len, saddr, daddr);
 875         return;
 876 }
 877 
 878 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
 879 {
 880         int size;
 881         struct tcphdr * th = skb->h.th;
 882 
 883         /* length of packet (not counting length of pre-tcp headers) */
 884         size = skb->len - ((unsigned char *) th - skb->data);
 885 
 886         /* sanity check it.. */
 887         if (size < sizeof(struct tcphdr) || size > skb->len) 
 888         {
 889                 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
 890                         skb, skb->data, th, skb->len);
 891                 kfree_skb(skb, FREE_WRITE);
 892                 return;
 893         }
 894 
 895         /* If we have queued a header size packet.. */
 896         if (size == sizeof(struct tcphdr)) 
 897         {
  898                 /* If it's got a syn or fin it's notionally included in the size.. */
 899                 if(!th->syn && !th->fin) 
 900                 {
 901                         printk("tcp_send_skb: attempt to queue a bogon.\n");
 902                         kfree_skb(skb,FREE_WRITE);
 903                         return;
 904                 }
 905         }
 906 
 907         tcp_statistics.TcpOutSegs++;  
 908 
 909         skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
 910         if (after(skb->h.seq, sk->window_seq) ||
 911             (sk->retransmits && sk->timeout == TIME_WRITE) ||
 912              sk->packets_out >= sk->cong_window) 
 913         {
 914                 /* checksum will be supplied by tcp_write_xmit.  So
 915                  * we shouldn't need to set it at all.  I'm being paranoid */
 916                 th->check = 0;
 917                 if (skb->next != NULL) 
 918                 {
 919                         printk("tcp_send_partial: next != NULL\n");
 920                         skb_unlink(skb);
 921                 }
 922                 skb_queue_tail(&sk->write_queue, skb);
 923                 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
 924                     sk->send_head == NULL &&
 925                     sk->ack_backlog == 0)
 926                         reset_timer(sk, TIME_PROBE0, sk->rto);
 927         } 
 928         else 
 929         {
 930                 th->ack_seq = ntohl(sk->acked_seq);
 931                 th->window = ntohs(tcp_select_window(sk));
 932 
 933                 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
 934 
 935                 sk->sent_seq = sk->write_seq;
 936                 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
 937         }
 938 }
 939 
 940 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
 941 {
 942         struct sk_buff * skb;
 943         unsigned long flags;
 944 
 945         save_flags(flags);
 946         cli();
 947         skb = sk->partial;
 948         if (skb) {
 949                 sk->partial = NULL;
 950                 del_timer(&sk->partial_timer);
 951         }
 952         restore_flags(flags);
 953         return skb;
 954 }
 955 
 956 static void tcp_send_partial(struct sock *sk)
 957 {
 958         struct sk_buff *skb;
 959 
 960         if (sk == NULL)
 961                 return;
 962         while ((skb = tcp_dequeue_partial(sk)) != NULL)
 963                 tcp_send_skb(sk, skb);
 964 }
 965 
 966 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
 967 {
 968         struct sk_buff * tmp;
 969         unsigned long flags;
 970 
 971         save_flags(flags);
 972         cli();
 973         tmp = sk->partial;
 974         if (tmp)
 975                 del_timer(&sk->partial_timer);
 976         sk->partial = skb;
 977         init_timer(&sk->partial_timer);
 978         sk->partial_timer.expires = HZ;
 979         sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
 980         sk->partial_timer.data = (unsigned long) sk;
 981         add_timer(&sk->partial_timer);
 982         restore_flags(flags);
 983         if (tmp)
 984                 tcp_send_skb(sk, tmp);
 985 }
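/*
 * Usage sketch: tcp_write() parks a sub-MSS skb here via
 * tcp_enqueue_partial() and later writes pull it back with
 * tcp_dequeue_partial() to append more data.  If nothing does so before
 * partial_timer expires, tcp_send_partial() pushes the short segment
 * out anyway.
 */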
 986 
 987 
 988 /*
 989  *      This routine sends an ack and also updates the window. 
 990  */
 991  
 992 static void tcp_send_ack(unsigned long sequence, unsigned long ack,
 993              struct sock *sk,
 994              struct tcphdr *th, unsigned long daddr)
 995 {
 996         struct sk_buff *buff;
 997         struct tcphdr *t1;
 998         struct device *dev = NULL;
 999         int tmp;
1000 
1001         if(sk->zapped)
1002                 return;         /* We have been reset, we may not send again */
1003         /*
1004          * We need to grab some memory, and put together an ack,
1005          * and then put it into the queue to be sent.
1006          */
1007 
1008         buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1009         if (buff == NULL) 
1010         {
1011                 /* Force it to send an ack. */
1012                 sk->ack_backlog++;
1013                 if (sk->timeout != TIME_WRITE && tcp_connected(sk->state)) 
1014                 {
1015                         reset_timer(sk, TIME_WRITE, 10);
1016                 }
1017                 return;
1018         }
1019 
1020         buff->len = sizeof(struct tcphdr);
1021         buff->sk = sk;
1022         buff->localroute = sk->localroute;
1023         t1 =(struct tcphdr *) buff->data;
1024 
1025         /* Put in the IP header and routing stuff. */
1026         tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1027                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1028         if (tmp < 0) 
1029         {
1030                 buff->free=1;
1031                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1032                 return;
1033         }
1034         buff->len += tmp;
1035         t1 =(struct tcphdr *)((char *)t1 +tmp);
1036 
1037         /* FIXME: */
1038         memcpy(t1, th, sizeof(*t1)); /* this should probably be removed */
1039 
1040         /*
1041          *      Swap the send and the receive. 
1042          */
1043          
1044         t1->dest = th->source;
1045         t1->source = th->dest;
1046         t1->seq = ntohl(sequence);
1047         t1->ack = 1;
1048         sk->window = tcp_select_window(sk);
1049         t1->window = ntohs(sk->window);
1050         t1->res1 = 0;
1051         t1->res2 = 0;
1052         t1->rst = 0;
1053         t1->urg = 0;
1054         t1->syn = 0;
1055         t1->psh = 0;
1056         t1->fin = 0;
1057         if (ack == sk->acked_seq) 
1058         {
1059                 sk->ack_backlog = 0;
1060                 sk->bytes_rcv = 0;
1061                 sk->ack_timed = 0;
1062                 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
1063                                   && sk->timeout == TIME_WRITE) 
1064                 {
1065                         if(sk->keepopen) {
1066                                 reset_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1067                         } else {
1068                                 delete_timer(sk);
1069                         }
1070                 }
1071         }
1072         t1->ack_seq = ntohl(ack);
1073         t1->doff = sizeof(*t1)/4;
1074         tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1075         if (sk->debug)
1076                  printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
1077         tcp_statistics.TcpOutSegs++;
1078         sk->prot->queue_xmit(sk, dev, buff, 1);
1079 }
1080 
1081 
1082 /* 
1083  *      This routine builds a generic TCP header. 
1084  */
1085  
1086 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
1087 {
1088 
1089         /* FIXME: want to get rid of this. */
1090         memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1091         th->seq = htonl(sk->write_seq);
1092         th->psh =(push == 0) ? 1 : 0;
1093         th->doff = sizeof(*th)/4;
1094         th->ack = 1;
1095         th->fin = 0;
1096         sk->ack_backlog = 0;
1097         sk->bytes_rcv = 0;
1098         sk->ack_timed = 0;
1099         th->ack_seq = htonl(sk->acked_seq);
1100         sk->window = tcp_select_window(sk);
1101         th->window = htons(sk->window);
1102 
1103         return(sizeof(*th));
1104 }
1105 
1106 /*
1107  *      This routine copies from a user buffer into a socket,
1108  *      and starts the transmit system.
1109  */
1110 
1111 static int tcp_write(struct sock *sk, unsigned char *from,
1112           int len, int nonblock, unsigned flags)
1113 {
1114         int copied = 0;
1115         int copy;
1116         int tmp;
1117         struct sk_buff *skb;
1118         struct sk_buff *send_tmp;
1119         unsigned char *buff;
1120         struct proto *prot;
1121         struct device *dev = NULL;
1122 
1123         sk->inuse=1;
1124         prot = sk->prot;
1125         while(len > 0) 
1126         {
1127                 if (sk->err) 
1128                 {                       /* Stop on an error */
1129                         release_sock(sk);
1130                         if (copied) 
1131                                 return(copied);
1132                         tmp = -sk->err;
1133                         sk->err = 0;
1134                         return(tmp);
1135                 }
1136 
1137         /*
1138          *      First thing we do is make sure that we are established. 
1139          */
1140         
1141                 if (sk->shutdown & SEND_SHUTDOWN) 
1142                 {
1143                         release_sock(sk);
1144                         sk->err = EPIPE;
1145                         if (copied) 
1146                                 return(copied);
1147                         sk->err = 0;
1148                         return(-EPIPE);
1149                 }
1150 
1151 
1152         /* 
1153          *      Wait for a connection to finish.
1154          */
1155         
1156                 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
1157                 {
1158                         if (sk->err) 
1159                         {
1160                                 release_sock(sk);
1161                                 if (copied) 
1162                                         return(copied);
1163                                 tmp = -sk->err;
1164                                 sk->err = 0;
1165                                 return(tmp);
1166                         }
1167 
1168                         if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
1169                         {
1170                                 release_sock(sk);
1171                                 if (copied) 
1172                                         return(copied);
1173 
1174                                 if (sk->err) 
1175                                 {
1176                                         tmp = -sk->err;
1177                                         sk->err = 0;
1178                                         return(tmp);
1179                                 }
1180 
1181                                 if (sk->keepopen) 
1182                                 {
1183                                         send_sig(SIGPIPE, current, 0);
1184                                 }
1185                                 return(-EPIPE);
1186                         }
1187 
1188                         if (nonblock || copied) 
1189                         {
1190                                 release_sock(sk);
1191                                 if (copied) 
1192                                         return(copied);
1193                                 return(-EAGAIN);
1194                         }
1195 
1196                         release_sock(sk);
1197                         cli();
1198                 
1199                         if (sk->state != TCP_ESTABLISHED &&
1200                                 sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
1201                         {
1202                                 interruptible_sleep_on(sk->sleep);
1203                                 if (current->signal & ~current->blocked) 
1204                                 {
1205                                         sti();
1206                                         if (copied) 
1207                                                 return(copied);
1208                                         return(-ERESTARTSYS);
1209                                 }
1210                         }
1211                         sk->inuse = 1;
1212                         sti();
1213                 }
1214 
1215         /*
 1216          * The following code can result in copy <= 0 if sk->mss is ever
1217          * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
1218          * sk->mtu is constant once SYN processing is finished.  I.e. we
1219          * had better not get here until we've seen his SYN and at least one
1220          * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
1221          * But ESTABLISHED should guarantee that.  sk->max_window is by definition
1222          * non-decreasing.  Note that any ioctl to set user_mss must be done
1223          * before the exchange of SYN's.  If the initial ack from the other
1224          * end has a window of 0, max_window and thus mss will both be 0.
1225          */
1226 
1227         /* 
1228          *      Now we need to check if we have a half built packet. 
1229          */
1230 
1231                 if ((skb = tcp_dequeue_partial(sk)) != NULL) 
1232                 {
1233                         int hdrlen;
1234 
1235                          /* IP header + TCP header */
1236                         hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1237                                  + sizeof(struct tcphdr);
1238         
1239                         /* Add more stuff to the end of skb->len */
1240                         if (!(flags & MSG_OOB)) 
1241                         {
1242                                 copy = min(sk->mss - (skb->len - hdrlen), len);
1243                                 /* FIXME: this is really a bug. */
1244                                 if (copy <= 0) 
1245                                 {
1246                                         printk("TCP: **bug**: \"copy\" <= 0!!\n");
1247                                         copy = 0;
1248                                 }
1249           
1250                                 memcpy_fromfs(skb->data + skb->len, from, copy);
1251                                 skb->len += copy;
1252                                 from += copy;
1253                                 copied += copy;
1254                                 len -= copy;
1255                                 sk->write_seq += copy;
1256                         }
1257                         if ((skb->len - hdrlen) >= sk->mss ||
1258                                 (flags & MSG_OOB) || !sk->packets_out)
1259                                 tcp_send_skb(sk, skb);
1260                         else
1261                                 tcp_enqueue_partial(skb, sk);
1262                         continue;
1263                 }
1264 
1265         /*
1266          * We also need to worry about the window.
1267          * If window < 1/2 the maximum window we've seen from this
1268          *   host, don't use it.  This is sender side
1269          *   silly window prevention, as specified in RFC1122.
1270          *   (Note that this is different than earlier versions of
 1271          *   SWS prevention, e.g. RFC813).  What we actually do is
 1272          *   use the whole MSS.  Since this results in the right
1273          *   edge of the packet being outside the window, it will
1274          *   be queued for later rather than sent.
1275          */
1276 
1277                 copy = sk->window_seq - sk->write_seq;
1278                 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1279                         copy = sk->mss;
1280                 if (copy > len)
1281                         copy = len;
1282 
1283         /*
1284          *      We should really check the window here also. 
1285          */
1286          
1287                 send_tmp = NULL;
1288                 if (copy < sk->mss && !(flags & MSG_OOB)) 
1289                 {
1290                         /*
1291                          *      We will release the socket in case we sleep here. 
1292                          */
1293                         release_sock(sk);
1294                         /*
1295                          *      NB: following must be mtu, because mss can be increased.
1296                          *      mss is always <= mtu 
1297                          */
1298                         skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
1299                         sk->inuse = 1;
1300                         send_tmp = skb;
1301                 } 
1302                 else 
1303                 {
1304                         /*
1305                          *      We will release the socket in case we sleep here. 
1306                          */
1307                         release_sock(sk);
1308                         skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
1309                         sk->inuse = 1;
1310                 }
1311 
1312                 /*
1313                  *      If we didn't get any memory, we need to sleep. 
1314                  */
1315 
1316                 if (skb == NULL) 
1317                 {
1318                         if (nonblock) 
1319                         {
1320                                 release_sock(sk);
1321                                 if (copied) 
1322                                         return(copied);
1323                                 return(-EAGAIN);
1324                         }
1325 
1326                         /*
1327                          *      FIXME: here is another race condition. 
1328                          */
1329 
1330                         tmp = sk->wmem_alloc;
1331                         release_sock(sk);
1332                         cli();
1333                         /*
1334                          *      Again we will try to avoid it. 
1335                          */
1336                         if (tmp <= sk->wmem_alloc &&
1337                                   (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1338                                 && sk->err == 0) 
1339                         {
1340                                 interruptible_sleep_on(sk->sleep);
1341                                 if (current->signal & ~current->blocked) 
1342                                 {
1343                                         sti();
1344                                         if (copied) 
1345                                                 return(copied);
1346                                         return(-ERESTARTSYS);
1347                                 }
1348                         }
1349                         sk->inuse = 1;
1350                         sti();
1351                         continue;
1352                 }
1353 
1354                 skb->len = 0;
1355                 skb->sk = sk;
1356                 skb->free = 0;
1357                 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1358         
1359                 buff = skb->data;
1360         
1361                 /*
1362                  * FIXME: we need to optimize this.
1363                  * Perhaps some hints here would be good.
1364                  */
1365                 
1366                 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1367                                  IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
1368                 if (tmp < 0 ) 
1369                 {
1370                         prot->wfree(sk, skb->mem_addr, skb->mem_len);
1371                         release_sock(sk);
1372                         if (copied) 
1373                                 return(copied);
1374                         return(tmp);
1375                 }
1376                 skb->len += tmp;
1377                 skb->dev = dev;
1378                 buff += tmp;
1379                 skb->h.th =(struct tcphdr *) buff;
1380                 tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
1381                 if (tmp < 0) 
1382                 {
1383                         prot->wfree(sk, skb->mem_addr, skb->mem_len);
1384                         release_sock(sk);
1385                         if (copied) 
1386                                 return(copied);
1387                         return(tmp);
1388                 }
1389 
1390                 if (flags & MSG_OOB) 
1391                 {
1392                         ((struct tcphdr *)buff)->urg = 1;
1393                         ((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
1394                 }
1395                 skb->len += tmp;
1396                 memcpy_fromfs(buff+tmp, from, copy);
1397 
1398                 from += copy;
1399                 copied += copy;
1400                 len -= copy;
1401                 skb->len += copy;
1402                 skb->free = 0;
1403                 sk->write_seq += copy;
1404         
1405                 if (send_tmp != NULL && sk->packets_out) 
1406                 {
1407                         tcp_enqueue_partial(send_tmp, sk);
1408                         continue;
1409                 }
1410                 tcp_send_skb(sk, skb);
1411         }
1412         sk->err = 0;
1413 
1414 /*
1415  *      Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
1416  *      interactive fast network servers. It's meant to be on and
1417  *      it really improves the throughput though not the echo time
1418  *      on my slow slip link - Alan
1419  */
1420 
1421 /*
1422  *      Avoid possible race on send_tmp - c/o Johannes Stille 
1423  */
1424  
1425         if(sk->partial && ((!sk->packets_out) 
1426      /* If not nagling we can send on the before case too.. */
1427               || (sk->nonagle && before(sk->write_seq , sk->window_seq))
1428         ))
1429                 tcp_send_partial(sk);
1430 
1431         release_sock(sk);
1432         return(copied);
1433 }
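/*
 * Illustrative sketch (not part of tcp.c): the sender-side silly window
 * avoidance and Nagle checks used in tcp_write() above, restated as
 * standalone helpers.  The parameters are simplified stand-ins for the
 * sk->window_seq, sk->write_seq, sk->max_window, sk->mss, sk->partial,
 * sk->packets_out and sk->nonagle fields.
 */
static unsigned long ex_usable_send_size(unsigned long window_seq,
                                         unsigned long write_seq,
                                         unsigned long max_window,
                                         unsigned long mss,
                                         unsigned long len)
{
        long copy = (long)(window_seq - write_seq); /* room left in the offered window */

        /*
         * If the usable window is gone, has shrunk below half the largest
         * window the peer ever offered, or is larger than one MSS, aim for
         * a full MSS; a segment whose right edge lands outside the window
         * is simply queued for later instead of being sent.
         */
        if (copy <= 0 || copy < (long)(max_window >> 1) || copy > (long)mss)
                copy = (long)mss;
        if ((unsigned long)copy > len)
                copy = (long)len;
        return (unsigned long)copy;
}

static int ex_flush_partial(int have_partial, unsigned long packets_out,
                            int nonagle, unsigned long write_seq,
                            unsigned long window_seq)
{
        /*
         * A half-built packet is pushed out when nothing else is in
         * flight, or when Nagle is disabled and the data still fits in
         * the window ((long)(a - b) < 0 models before(a, b)).
         */
        return have_partial &&
               (packets_out == 0 ||
                (nonagle && (long)(write_seq - window_seq) < 0));
}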
1434 
1435 
1436 static int tcp_sendto(struct sock *sk, unsigned char *from,
1437            int len, int nonblock, unsigned flags,
1438            struct sockaddr_in *addr, int addr_len)
1439 {
1440         if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1441                 return -EINVAL;
1442         if (sk->state == TCP_CLOSE)
1443                 return -ENOTCONN;
1444         if (addr_len < sizeof(*addr))
1445                 return -EINVAL;
1446         if (addr->sin_family && addr->sin_family != AF_INET) 
1447                 return -EINVAL;
1448         if (addr->sin_port != sk->dummy_th.dest) 
1449                 return -EISCONN;
1450         if (addr->sin_addr.s_addr != sk->daddr) 
1451                 return -EISCONN;
1452         return tcp_write(sk, from, len, nonblock, flags);
1453 }
1454 
1455 
1456 static void tcp_read_wakeup(struct sock *sk)
1457 {
1458         int tmp;
1459         struct device *dev = NULL;
1460         struct tcphdr *t1;
1461         struct sk_buff *buff;
1462 
1463         if (!sk->ack_backlog) 
1464                 return;
1465 
1466         /*
1467          * FIXME: we need to put code here to prevent this routine from
1468          * being called.  Being called once in a while is ok, so only check
1469          * if this is the second time in a row.
1470          */
1471 
1472         /*
1473          * We need to grab some memory, and put together an ack,
1474          * and then put it into the queue to be sent.
1475          */
1476 
1477         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
1478         if (buff == NULL) 
1479         {
1480                 /* Try again real soon. */
1481                 reset_timer(sk, TIME_WRITE, 10);
1482                 return;
1483         }
1484 
1485         buff->len = sizeof(struct tcphdr);
1486         buff->sk = sk;
1487         buff->localroute = sk->localroute;
1488         
1489         /*
1490          *      Put in the IP header and routing stuff. 
1491          */
1492 
1493         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1494                                IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1495         if (tmp < 0) 
1496         {
1497                 buff->free=1;
1498                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1499                 return;
1500         }
1501 
1502         buff->len += tmp;
1503         t1 =(struct tcphdr *)(buff->data +tmp);
1504 
1505         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
1506         t1->seq = htonl(sk->sent_seq);
1507         t1->ack = 1;
1508         t1->res1 = 0;
1509         t1->res2 = 0;
1510         t1->rst = 0;
1511         t1->urg = 0;
1512         t1->syn = 0;
1513         t1->psh = 0;
1514         sk->ack_backlog = 0;
1515         sk->bytes_rcv = 0;
1516         sk->window = tcp_select_window(sk);
1517         t1->window = ntohs(sk->window);
1518         t1->ack_seq = ntohl(sk->acked_seq);
1519         t1->doff = sizeof(*t1)/4;
1520         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1521         sk->prot->queue_xmit(sk, dev, buff, 1);
1522         tcp_statistics.TcpOutSegs++;
1523 }
1524 
1525 
1526 /*
1527  *      FIXME:
1528  *      This routine frees used buffers.
1529  *      It should consider sending an ACK to let the
1530  *      other end know we now have a bigger window.
1531  */
1532 
1533 static void cleanup_rbuf(struct sock *sk)
1534 {
1535         unsigned long flags;
1536         unsigned long left;
1537         struct sk_buff *skb;
1538         unsigned long rspace;
1539 
1540         if(sk->debug)
1541                 printk("cleaning rbuf for sk=%p\n", sk);
1542   
1543         save_flags(flags);
1544         cli();
1545   
1546         left = sk->prot->rspace(sk);
1547  
1548         /*
1549          * We have to loop through all the buffer headers,
1550          * and try to free up all the space we can.
1551          */
1552 
1553         while((skb=skb_peek(&sk->receive_queue)) != NULL) 
1554         {
1555                 if (!skb->used) 
1556                         break;
1557                 skb_unlink(skb);
1558                 skb->sk = sk;
1559                 kfree_skb(skb, FREE_READ);
1560         }
1561 
1562         restore_flags(flags);
1563 
1564         /*
1565          * FIXME:
1566          * At this point we should send an ack if the difference
1567          * in the window, and the amount of space is bigger than
1568          * TCP_WINDOW_DIFF.
1569          */
1570 
1571         if(sk->debug)
1572                 printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
1573                                             left);
1574         if ((rspace=sk->prot->rspace(sk)) != left) 
1575         {
1576                 /*
1577                  * This area has caused the most trouble.  The current strategy
1578                  * is to simply do nothing if the other end has room to send at
1579                  * least 3 full packets, because the ack from those will auto-
1580                  * matically update the window.  If the other end doesn't think
1581                  * we have much space left, but we have room for at least 1 more
1582                  * complete packet than it thinks we do, we will send an ack
1583                  * immediately.  Otherwise we will wait up to .5 seconds in case
1584                  * the user reads some more.
1585                  */
1586                 sk->ack_backlog++;
1587         /*
1588          * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
1589          * if the other end is offering a window smaller than the agreed on MSS
1590          * (called sk->mtu here).  In theory there's no connection between send
1591          * and receive, and so no reason to think that they're going to send
1592          * small packets.  For the moment I'm using the hack of reducing the mss
1593          * only on the send side, so I'm putting mtu here.
1594          */
1595 
1596                 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) 
1597                 {
1598                         /* Send an ack right now. */
1599                         tcp_read_wakeup(sk);
1600                 } 
1601                 else 
1602                 {
1603                         /* Force it to send an ack soon. */
1604                         int was_active = del_timer(&sk->timer);
1605                         if (!was_active || TCP_ACK_TIME < sk->timer.expires) 
1606                         {
1607                                 reset_timer(sk, TIME_WRITE, TCP_ACK_TIME);
1608                         } 
1609                         else
1610                                 add_timer(&sk->timer);
1611                 }
1612         }
1613 } 
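/*
 * Illustrative sketch (not part of tcp.c): the ACK decision cleanup_rbuf()
 * makes above once receive buffers have been freed.  The parameters are
 * simplified stand-ins for sk->prot->rspace(sk), sk->window, sk->bytes_rcv
 * and sk->mtu; a non-zero result means "send the window update now", zero
 * means "leave it to the delayed-ACK timer".
 */
static int ex_should_ack_now(unsigned long rspace, unsigned long window,
                             unsigned long bytes_rcv, unsigned long mtu)
{
        /*
         * The peer believes it may still send (window - bytes_rcv) bytes.
         * If freeing buffers has opened up at least one full packet more
         * than that, advertise the bigger window immediately; otherwise a
         * short delay is fine, since the user may read more soon.
         */
        return rspace > (window - bytes_rcv + mtu);
}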
1614 
1615 
1616 /*
1617  *      Handle reading urgent data. 
1618  */
1619  
1620 static int tcp_read_urg(struct sock * sk, int nonblock,
1621              unsigned char *to, int len, unsigned flags)
1622 {
1623         if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
1624                 return -EINVAL;
1625         if (sk->err) 
1626         {
1627                 int tmp = -sk->err;
1628                 sk->err = 0;
1629                 return tmp;
1630         }
1631 
1632         if (sk->state == TCP_CLOSE || sk->done) 
1633         {
1634                 if (!sk->done) {
1635                         sk->done = 1;
1636                         return 0;
1637                 }
1638                 return -ENOTCONN;
1639         }
1640 
1641         if (sk->shutdown & RCV_SHUTDOWN) 
1642         {
1643                 sk->done = 1;
1644                 return 0;
1645         }
1646         sk->inuse = 1;
1647         if (sk->urg_data & URG_VALID) 
1648         {
1649                 char c = sk->urg_data;
1650                 if (!(flags & MSG_PEEK))
1651                         sk->urg_data = URG_READ;
1652                 put_fs_byte(c, to);
1653                 release_sock(sk);
1654                 return 1;
1655         }
1656         release_sock(sk);
1657         
1658         /*
1659          * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1660          * the available implementations agree in this case:
1661          * this call should never block, independent of the
1662          * blocking state of the socket.
1663          * Mike <pall@rz.uni-karlsruhe.de>
1664          */
1665         return -EAGAIN;
1666 }
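/*
 * Illustrative sketch (not part of tcp.c): how tcp_read_urg() above keeps
 * its single byte of urgent data in sk->urg_data.  The EX_* flag values are
 * hypothetical stand-ins for the kernel's URG_VALID/URG_READ constants, but
 * the layout follows the code: the OOB byte sits in the low 8 bits and a
 * flag above it marks it valid until it has been read.
 */
#define EX_URG_VALID   0x0100  /* hypothetical stand-in for URG_VALID */
#define EX_URG_READ    0x0001  /* hypothetical stand-in for URG_READ  */

static int ex_read_urgent_byte(unsigned int *urg_data, unsigned char *to, int peek)
{
        if (*urg_data & EX_URG_VALID)
        {
                unsigned char c = (unsigned char)*urg_data;  /* low byte is the OOB byte */
                if (!peek)
                        *urg_data = EX_URG_READ;     /* consumed: later reads see nothing */
                *to = c;
                return 1;                            /* one byte delivered */
        }
        return -1;                                   /* would be -EAGAIN in tcp_read_urg() */
}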
1667 
1668 
1669 /*
1670  *      This routine copies from a sock struct into the user buffer. 
1671  */
1672  
1673 static int tcp_read(struct sock *sk, unsigned char *to,
1674         int len, int nonblock, unsigned flags)
1675 {
1676         struct wait_queue wait = { current, NULL };
1677         int copied = 0;
1678         unsigned long peek_seq;
1679         unsigned long *seq;
1680         unsigned long used;
1681 
1682         /* This error should be checked. */
1683         if (sk->state == TCP_LISTEN)
1684                 return -ENOTCONN;
1685 
1686         /* Urgent data needs to be handled specially. */
1687         if (flags & MSG_OOB)
1688                 return tcp_read_urg(sk, nonblock, to, len, flags);
1689 
1690         peek_seq = sk->copied_seq;
1691         seq = &sk->copied_seq;
1692         if (flags & MSG_PEEK)
1693                 seq = &peek_seq;
1694 
1695         add_wait_queue(sk->sleep, &wait);
1696         sk->inuse = 1;
1697         while (len > 0) 
1698         {
1699                 struct sk_buff * skb;
1700                 unsigned long offset;
1701         
1702                 /*
1703                  * are we at urgent data? Stop if we have read anything.
1704                  */
1705                 if (copied && sk->urg_data && sk->urg_seq == 1+*seq)
1706                         break;
1707 
1708                 current->state = TASK_INTERRUPTIBLE;
1709 
1710                 skb = skb_peek(&sk->receive_queue);
1711                 do 
1712                 {
1713                         if (!skb)
1714                                 break;
1715                         if (before(1+*seq, skb->h.th->seq))
1716                                 break;
1717                         offset = 1 + *seq - skb->h.th->seq;
1718                         if (skb->h.th->syn)
1719                                 offset--;
1720                         if (offset < skb->len)
1721                                 goto found_ok_skb;
1722                         if (!(flags & MSG_PEEK))
1723                                 skb->used = 1;
1724                         skb = skb->next;
1725                 }
1726                 while (skb != (struct sk_buff *)&sk->receive_queue);
1727 
1728                 if (copied)
1729                         break;
1730 
1731                 if (sk->err) 
1732                 {
1733                         copied = -sk->err;
1734                         sk->err = 0;
1735                         break;
1736                 }
1737 
1738                 if (sk->state == TCP_CLOSE) 
1739                 {
1740                         if (!sk->done) 
1741                         {
1742                                 sk->done = 1;
1743                                 break;
1744                         }
1745                         copied = -ENOTCONN;
1746                         break;
1747                 }
1748 
1749                 if (sk->shutdown & RCV_SHUTDOWN) 
1750                 {
1751                         sk->done = 1;
1752                         break;
1753                 }
1754                         
1755                 if (nonblock) 
1756                 {
1757                         copied = -EAGAIN;
1758                         break;
1759                 }
1760 
1761                 cleanup_rbuf(sk);
1762                 release_sock(sk);
1763                 schedule();
1764                 sk->inuse = 1;
1765 
1766                 if (current->signal & ~current->blocked) 
1767                 {
1768                         copied = -ERESTARTSYS;
1769                         break;
1770                 }
1771                 continue;
1772 
1773         found_ok_skb:
1774                 /* Ok so how much can we use ? */
1775                 used = skb->len - offset;
1776                 if (len < used)
1777                         used = len;
1778                 /* do we have urgent data here? */
1779                 if (sk->urg_data) 
1780                 {
1781                         unsigned long urg_offset = sk->urg_seq - (1 + *seq);
1782                         if (urg_offset < used) 
1783                         {
1784                                 if (!urg_offset) 
1785                                 {
1786                                         if (!sk->urginline) 
1787                                         {
1788                                                 ++*seq;
1789                                                 offset++;
1790                                                 used--;
1791                                         }
1792                                 }
1793                                 else
1794                                         used = urg_offset;
1795                         }
1796                 }
1797                 /* Copy it */
1798                 memcpy_tofs(to,((unsigned char *)skb->h.th) +
1799                         skb->h.th->doff*4 + offset, used);
1800                 copied += used;
1801                 len -= used;
1802                 to += used;
1803                 *seq += used;
1804                 if (after(sk->copied_seq+1,sk->urg_seq))
1805                         sk->urg_data = 0;
1806                 if (!(flags & MSG_PEEK) && (used + offset >= skb->len))
1807                         skb->used = 1;
1808         }
1809         remove_wait_queue(sk->sleep, &wait);
1810         current->state = TASK_RUNNING;
1811 
1812         /* Clean up data we have read: This will do ACK frames */
1813         cleanup_rbuf(sk);
1814         release_sock(sk);
1815         return copied;
1816 }
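/*
 * Illustrative sketch (not part of tcp.c): how the loop in tcp_read() above
 * maps the next byte the user wants onto an offset inside a queued segment.
 * copied_seq, th_seq, syn and skb_len are simplified stand-ins for
 * sk->copied_seq, skb->h.th->seq, skb->h.th->syn and skb->len; a negative
 * return means this segment has nothing deliverable yet.
 */
static long ex_skb_read_offset(unsigned long copied_seq, unsigned long th_seq,
                               int syn, unsigned long skb_len)
{
        unsigned long offset;

        /* The first byte still owed to the user is copied_seq + 1. */
        if ((long)(copied_seq + 1 - th_seq) < 0)
                return -1;              /* segment starts beyond it: sequence hole */

        offset = copied_seq + 1 - th_seq;
        if (syn)
                offset--;               /* the SYN consumes one sequence number */

        if (offset < skb_len)
                return (long)offset;    /* fresh data starts here */
        return -1;                      /* everything in this segment was read already */
}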
1817 
1818  
1819 /*
1820  *      Shutdown the sending side of a connection.
1821  */
1822 
1823 void tcp_shutdown(struct sock *sk, int how)
1824 {
1825         struct sk_buff *buff;
1826         struct tcphdr *t1, *th;
1827         struct proto *prot;
1828         int tmp;
1829         struct device *dev = NULL;
1830 
1831         /*
1832          * We need to grab some memory, and put together a FIN,
1833          * and then put it into the queue to be sent.
1834          * FIXME:
1835          *
1836          *      Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1837          *      Most of this is guesswork, so maybe it will work...
1838          */
1839 
1840         if (!(how & SEND_SHUTDOWN)) 
1841                 return;
1842          
1843         /*
1844          *      If we've already sent a FIN, return. 
1845          */
1846          
1847         if (sk->state == TCP_FIN_WAIT1 ||
1848             sk->state == TCP_FIN_WAIT2 ||
1849             sk->state == TCP_CLOSING ||
1850             sk->state == TCP_LAST_ACK ||
1851             sk->state == TCP_TIME_WAIT
1852         ) 
1853         {
1854                 return;
1855         }
1856         sk->inuse = 1;
1857 
1858         /*
1859          * flag that the sender has shutdown
1860          */
1861 
1862         sk->shutdown |= SEND_SHUTDOWN;
1863 
1864         /*
1865          *  Clear out any half completed packets. 
1866          */
1867 
1868         if (sk->partial)
1869                 tcp_send_partial(sk);
1870 
1871         prot =(struct proto *)sk->prot;
1872         th =(struct tcphdr *)&sk->dummy_th;
1873         release_sock(sk); /* in case the malloc sleeps. */
1874         buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
1875         if (buff == NULL)
1876                 return;
1877         sk->inuse = 1;
1878 
1879         buff->sk = sk;
1880         buff->len = sizeof(*t1);
1881         buff->localroute = sk->localroute;
1882         t1 =(struct tcphdr *) buff->data;
1883 
1884         /*
1885          *      Put in the IP header and routing stuff. 
1886          */
1887 
1888         tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
1889                            IPPROTO_TCP, sk->opt,
1890                            sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
1891         if (tmp < 0) 
1892         {
1893                 /*
1894                  *      Finish anyway, treat this as a send that got lost. 
1895                  *
1896                  *      Enter FIN_WAIT1 on normal shutdown, which waits for
1897                  *      written data to be completely acknowledged along
1898                  *      with an acknowledge to our FIN.
1899                  *
1900                  *      Enter FIN_WAIT2 on abnormal shutdown -- close before
1901                  *      connection established.
1902                  */
1903                 buff->free=1;
1904                 prot->wfree(sk,buff->mem_addr, buff->mem_len);
1905 
1906                 if (sk->state == TCP_ESTABLISHED)
1907                         tcp_set_state(sk,TCP_FIN_WAIT1);
1908                 else if(sk->state == TCP_CLOSE_WAIT)
1909                         tcp_set_state(sk,TCP_LAST_ACK);
1910                 else
1911                         tcp_set_state(sk,TCP_FIN_WAIT2);
1912 
1913                 release_sock(sk);
1914                 return;
1915         }
1916 
1917         t1 =(struct tcphdr *)((char *)t1 +tmp);
1918         buff->len += tmp;
1919         buff->dev = dev;
1920         memcpy(t1, th, sizeof(*t1));
1921         t1->seq = ntohl(sk->write_seq);
1922         sk->write_seq++;
1923         buff->h.seq = sk->write_seq;
1924         t1->ack = 1;
1925         t1->ack_seq = ntohl(sk->acked_seq);
1926         t1->window = ntohs(sk->window=tcp_select_window(sk));
1927         t1->fin = 1;
1928         t1->rst = 0;
1929         t1->doff = sizeof(*t1)/4;
1930         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1931 
1932         /*
1933          * If there is data in the write queue, the fin must be appended to
1934          * the write queue.
1935          */
1936         
1937         if (skb_peek(&sk->write_queue) != NULL) 
1938         {
1939                 buff->free=0;
1940                 if (buff->next != NULL) 
1941                 {
1942                         printk("tcp_shutdown: next != NULL\n");
1943                         skb_unlink(buff);
1944                 }
1945                 skb_queue_tail(&sk->write_queue, buff);
1946         } 
1947         else 
1948         {
1949                 sk->sent_seq = sk->write_seq;
1950                 sk->prot->queue_xmit(sk, dev, buff, 0);
1951         }
1952 
1953         if (sk->state == TCP_ESTABLISHED) 
1954                 tcp_set_state(sk,TCP_FIN_WAIT1);
1955         else if (sk->state == TCP_CLOSE_WAIT)
1956                 tcp_set_state(sk,TCP_LAST_ACK);
1957         else
1958                 tcp_set_state(sk,TCP_FIN_WAIT2);
1959 
1960         release_sock(sk);
1961 }
1962 
1963 
1964 static int
1965 tcp_recvfrom(struct sock *sk, unsigned char *to,
1966              int to_len, int nonblock, unsigned flags,
1967              struct sockaddr_in *addr, int *addr_len)
1968 {
1969         int result;
1970   
1971         /* 
1972          *      Have to check these first, unlike the old code. If we
1973          *      check them after the read we lose data on an error,
1974          *      which is wrong.
1975          */
1976 
1977         if(addr_len)
1978                 *addr_len = sizeof(*addr);
1979         result=tcp_read(sk, to, to_len, nonblock, flags);
1980 
1981         if (result < 0) 
1982                 return(result);
1983   
1984         if(addr)
1985         {
1986                 addr->sin_family = AF_INET;
1987                 addr->sin_port = sk->dummy_th.dest;
1988                 addr->sin_addr.s_addr = sk->daddr;
1989         }
1990         return(result);
1991 }
1992 
1993 
1994 /*
1995  *      This routine will send an RST to the other tcp. 
1996  */
1997  
1998 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
1999           struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2000 {
2001         struct sk_buff *buff;
2002         struct tcphdr *t1;
2003         int tmp;
2004         struct device *ndev=NULL;
2005 
2006         /*
2007          *      Cannot reset a reset (Think about it).
2008          */
2009          
2010         if(th->rst)
2011                 return;
2012   
2013         /*
2014          * We need to grab some memory, and put together an RST,
2015          * and then put it into the queue to be sent.
2016          */
2017 
2018         buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2019         if (buff == NULL) 
2020                 return;
2021 
2022         buff->len = sizeof(*t1);
2023         buff->sk = NULL;
2024         buff->dev = dev;
2025         buff->localroute = 0;
2026 
2027         t1 =(struct tcphdr *) buff->data;
2028 
2029         /*
2030          *      Put in the IP header and routing stuff. 
2031          */
2032 
2033         tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2034                            sizeof(struct tcphdr),tos,ttl);
2035         if (tmp < 0) 
2036         {
2037                 buff->free = 1;
2038                 prot->wfree(NULL, buff->mem_addr, buff->mem_len);
2039                 return;
2040         }
2041 
2042         t1 =(struct tcphdr *)((char *)t1 +tmp);
2043         buff->len += tmp;
2044         memcpy(t1, th, sizeof(*t1));
2045 
2046         /*
2047          *      Swap the send and the receive. 
2048          */
2049 
2050         t1->dest = th->source;
2051         t1->source = th->dest;
2052         t1->rst = 1;  
2053         t1->window = 0;
2054   
2055         if(th->ack)
2056         {
2057                 t1->ack = 0;
2058                 t1->seq = th->ack_seq;
2059                 t1->ack_seq = 0;
2060         }
2061         else
2062         {
2063                 t1->ack = 1;
2064                 if(!th->syn)
2065                         t1->ack_seq=htonl(th->seq);
2066                 else
2067                         t1->ack_seq=htonl(th->seq+1);
2068                 t1->seq=0;
2069         }
2070 
2071         t1->syn = 0;
2072         t1->urg = 0;
2073         t1->fin = 0;
2074         t1->psh = 0;
2075         t1->doff = sizeof(*t1)/4;
2076         tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2077         prot->queue_xmit(NULL, ndev, buff, 1);
2078         tcp_statistics.TcpOutSegs++;
2079 }
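/*
 * Illustrative sketch (not part of tcp.c): the RFC 793 sequence/ack rules
 * tcp_reset() applies above when building the RST.  The struct and field
 * names are simplified stand-ins for the few tcphdr fields involved, and
 * values are kept in host byte order purely for readability.
 */
struct ex_rst_fields {
        unsigned long seq;
        unsigned long ack_seq;
        int ack;
};

static struct ex_rst_fields ex_rst_for(int in_ack, int in_syn,
                                       unsigned long in_seq,
                                       unsigned long in_ack_seq)
{
        struct ex_rst_fields out;

        if (in_ack) {
                /* The offending segment carried an ACK: reset at that
                   point and carry no ACK of our own. */
                out.ack = 0;
                out.seq = in_ack_seq;
                out.ack_seq = 0;
        } else {
                /* No ACK: use sequence 0 and acknowledge what arrived;
                   a SYN occupies one extra sequence number. */
                out.ack = 1;
                out.seq = 0;
                out.ack_seq = in_seq + (in_syn ? 1 : 0);
        }
        return out;
}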
2080 
2081 
2082 /*
2083  *      Look for tcp options. Parses everything but only knows about MSS.
2084  *      This routine is normally called with the packet containing the SYN.
2085  *      However, it may also be called with the ACK to the SYN, so you
2086  *      can't assume this is always the SYN.  It's always called after
2087  *      we have set up sk->mtu to our own MTU.
2088  *
2089  *      We need at minimum to add PAWS support here. Possibly large windows
2090  *      as Linux gets deployed on 100Mb/sec networks.
2091  */
2092  
2093 static void tcp_options(struct sock *sk, struct tcphdr *th)
2094 {
2095         unsigned char *ptr;
2096         int length=(th->doff*4)-sizeof(struct tcphdr);
2097         int mss_seen = 0;
2098     
2099         ptr = (unsigned char *)(th + 1);
2100   
2101         while(length>0)
2102         {
2103                 int opcode=*ptr++;
2104                 int opsize=*ptr++;
2105                 switch(opcode)
2106                 {
2107                         case TCPOPT_EOL:
2108                                 return;
2109                         case TCPOPT_NOP:
2110                                 length-=2;
2111                                 continue;
2112                         
2113                         default:
2114                                 if(opsize<=2)   /* Avoid silly options looping forever */
2115                                         return;
2116                                 switch(opcode)
2117                                 {
2118                                         case TCPOPT_MSS:
2119                                                 if(opsize==4 && th->syn)
2120                                                 {
2121                                                         sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2122                                                         mss_seen = 1;
2123                                                 }
2124                                                 break;
2125                                                 /* Add other options here as people feel the urge to implement stuff like large windows */
2126                                 }
2127                                 ptr+=opsize-2;
2128                                 length-=opsize;
2129                 }
2130         }
2131         if (th->syn) 
2132         {
2133                 if (! mss_seen)
2134                       sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
2135         }
2136 #ifdef CONFIG_INET_PCTCP
2137         sk->mss = min(sk->max_window >> 1, sk->mtu);
2138 #else    
2139         sk->mss = min(sk->max_window, sk->mtu);
2140 #endif  
2141 }
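/*
 * Illustrative sketch (not part of tcp.c): a minimal, self-contained walk
 * over TCP option bytes that recognises only the MSS option handled above.
 * The kind/length values (EOL = 0, NOP = 1, MSS = 2 with length 4) are the
 * standard TCP ones; anything else is skipped by its length byte, and NOP
 * is treated as its standard single-byte form.
 */
static int ex_parse_mss(const unsigned char *opt, int length, unsigned int *mss)
{
        while (length > 0) {
                int opcode = *opt++;
                int opsize;

                if (opcode == 0)                /* EOL: end of option list      */
                        return 0;
                if (opcode == 1) {              /* NOP: one byte of padding     */
                        length--;
                        continue;
                }
                opsize = *opt++;
                if (opsize <= 2)                /* malformed: avoid looping forever */
                        return 0;
                if (opsize > length)            /* ran off the end of the options   */
                        return 0;
                if (opcode == 2 && opsize == 4) {
                        *mss = (opt[0] << 8) | opt[1];  /* 16-bit value, network order */
                        return 1;
                }
                opt += opsize - 2;              /* skip an option we do not know */
                length -= opsize;
        }
        return 0;
}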
2142 
2143 static inline unsigned long default_mask(unsigned long dst)
2144 {
2145         dst = ntohl(dst);
2146         if (IN_CLASSA(dst))
2147                 return htonl(IN_CLASSA_NET);
2148         if (IN_CLASSB(dst))
2149                 return htonl(IN_CLASSB_NET);
2150         return htonl(IN_CLASSC_NET);
2151 }
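/*
 * Illustrative sketch (not part of tcp.c): what default_mask() computes,
 * written against the raw address bits instead of the IN_CLASS* macros.
 * The address is taken in host byte order; class A (leading 0 bit) gets a
 * /8 mask, class B (leading 10) a /16, everything else a /24.
 */
static unsigned long ex_classful_mask(unsigned long host_order_addr)
{
        if ((host_order_addr & 0x80000000UL) == 0)
                return 0xff000000UL;            /* class A: 255.0.0.0     */
        if ((host_order_addr & 0xc0000000UL) == 0x80000000UL)
                return 0xffff0000UL;            /* class B: 255.255.0.0   */
        return 0xffffff00UL;                    /* class C: 255.255.255.0 */
}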
2152 
2153 /*
2154  *      Default sequence number picking algorithm.
2155  */
2156 
2157 extern inline long tcp_init_seq(void)
2158 {
2159         return jiffies * SEQ_TICK - seq_offset; 
2160 }
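/*
 * Illustrative sketch (not part of tcp.c): a user-space model of the
 * clock-driven initial sequence number above.  SEQ_TICK and seq_offset are
 * kernel values not reproduced here; this only shows the shape -- a counter
 * that advances with time so successive connections start far apart.
 */
#include <time.h>

static unsigned long ex_init_seq(unsigned long ticks_per_unit, unsigned long offset)
{
        /* time(NULL) stands in for jiffies purely for illustration. */
        return (unsigned long)time(NULL) * ticks_per_unit - offset;
}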
2161 
2162 /*
2163  *      This routine handles a connection request.
2164  *      It should make sure we haven't already responded.
2165  *      Because of the way BSD works, we have to send a syn/ack now.
2166  *      This also means it will be harder to close a socket which is
2167  *      listening.
2168  */
2169  
2170 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
2171                  unsigned long daddr, unsigned long saddr,
2172                  struct options *opt, struct device *dev, unsigned long seq)
2173 {
2174         struct sk_buff *buff;
2175         struct tcphdr *t1;
2176         unsigned char *ptr;
2177         struct sock *newsk;
2178         struct tcphdr *th;
2179         struct device *ndev=NULL;
2180         int tmp;
2181         struct rtable *rt;
2182   
2183         th = skb->h.th;
2184 
2185         /* If the socket is dead, don't accept the connection. */
2186         if (!sk->dead) 
2187         {
2188                 sk->data_ready(sk,0);
2189         }
2190         else 
2191         {
2192                 if(sk->debug)
2193                         printk("Reset on %p: Connect on dead socket.\n",sk);
2194                 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2195                 tcp_statistics.TcpAttemptFails++;
2196                 kfree_skb(skb, FREE_READ);
2197                 return;
2198         }
2199 
2200         /*
2201          * Make sure we can accept more.  This will prevent a
2202          * flurry of syns from eating up all our memory.
2203          */
2204 
2205         if (sk->ack_backlog >= sk->max_ack_backlog) 
2206         {
2207                 tcp_statistics.TcpAttemptFails++;
2208                 kfree_skb(skb, FREE_READ);
2209                 return;
2210         }
2211 
2212         /*
2213          * We need to build a new sock struct.
2214          * It is sort of bad to have a socket without an inode attached
2215          * to it, but the wake_up's will just wake up the listening socket,
2216          * and if the listening socket is destroyed before this is taken
2217          * off of the queue, this will take care of it.
2218          */
2219 
2220         newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2221         if (newsk == NULL) 
2222         {
2223                 /* just ignore the syn.  It will get retransmitted. */
2224                 tcp_statistics.TcpAttemptFails++;
2225                 kfree_skb(skb, FREE_READ);
2226                 return;
2227         }
2228 
2229         memcpy(newsk, sk, sizeof(*newsk));
2230         skb_queue_head_init(&newsk->write_queue);
2231         skb_queue_head_init(&newsk->receive_queue);
2232         newsk->send_head = NULL;
2233         newsk->send_tail = NULL;
2234         skb_queue_head_init(&newsk->back_log);
2235         newsk->rtt = 0;         /*TCP_CONNECT_TIME<<3*/
2236         newsk->rto = TCP_TIMEOUT_INIT;
2237         newsk->mdev = 0;
2238         newsk->max_window = 0;
2239         newsk->cong_window = 1;
2240         newsk->cong_count = 0;
2241         newsk->ssthresh = 0;
2242         newsk->backoff = 0;
2243         newsk->blog = 0;
2244         newsk->intr = 0;
2245         newsk->proc = 0;
2246         newsk->done = 0;
2247         newsk->partial = NULL;
2248         newsk->pair = NULL;
2249         newsk->wmem_alloc = 0;
2250         newsk->rmem_alloc = 0;
2251         newsk->localroute = sk->localroute;
2252 
2253         newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
2254 
2255         newsk->err = 0;
2256         newsk->shutdown = 0;
2257         newsk->ack_backlog = 0;
2258         newsk->acked_seq = skb->h.th->seq+1;
2259         newsk->fin_seq = skb->h.th->seq;
2260         newsk->copied_seq = skb->h.th->seq;
2261         newsk->state = TCP_SYN_RECV;
2262         newsk->timeout = 0;
2263         newsk->write_seq = seq; 
2264         newsk->window_seq = newsk->write_seq;
2265         newsk->rcv_ack_seq = newsk->write_seq;
2266         newsk->urg_data = 0;
2267         newsk->retransmits = 0;
2268         newsk->linger=0;
2269         newsk->destroy = 0;
2270         init_timer(&newsk->timer);
2271         newsk->timer.data = (unsigned long)newsk;
2272         newsk->timer.function = &net_timer;
2273         newsk->dummy_th.source = skb->h.th->dest;
2274         newsk->dummy_th.dest = skb->h.th->source;
2275         
2276         /*
2277          *      Swap these two, they are from our point of view. 
2278          */
2279          
2280         newsk->daddr = saddr;
2281         newsk->saddr = daddr;
2282 
2283         put_sock(newsk->num,newsk);
2284         newsk->dummy_th.res1 = 0;
2285         newsk->dummy_th.doff = 6;
2286         newsk->dummy_th.fin = 0;
2287         newsk->dummy_th.syn = 0;
2288         newsk->dummy_th.rst = 0;        
2289         newsk->dummy_th.psh = 0;
2290         newsk->dummy_th.ack = 0;
2291         newsk->dummy_th.urg = 0;
2292         newsk->dummy_th.res2 = 0;
2293         newsk->acked_seq = skb->h.th->seq + 1;
2294         newsk->copied_seq = skb->h.th->seq;
2295         newsk->socket = NULL;
2296 
2297         /*
2298          *      Grab the ttl and tos values and use them 
2299          */
2300 
2301         newsk->ip_ttl=sk->ip_ttl;
2302         newsk->ip_tos=skb->ip_hdr->tos;
2303 
2304         /*
2305          *      Use 512 or whatever user asked for 
2306          */
2307 
2308         /*
2309          *      Note use of sk->user_mss, since user has no direct access to newsk 
2310          */
2311 
2312         rt=ip_rt_route(saddr, NULL,NULL);
2313         
2314         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
2315                 newsk->window_clamp = rt->rt_window;
2316         else
2317                 newsk->window_clamp = 0;
2318                 
2319         if (sk->user_mss)
2320                 newsk->mtu = sk->user_mss;
2321         else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
2322                 newsk->mtu = rt->rt_mss - HEADER_SIZE;
2323         else 
2324         {
2325 #ifdef CONFIG_INET_SNARL        /* Sub Nets Are Local */
2326                 if ((saddr ^ daddr) & default_mask(saddr))
2327 #else
2328                 if ((saddr ^ daddr) & dev->pa_mask)
2329 #endif
2330                         newsk->mtu = 576 - HEADER_SIZE;
2331                 else
2332                         newsk->mtu = MAX_WINDOW;
2333         }
2334 
2335         /*
2336          *      But not bigger than device MTU 
2337          */
2338 
2339         newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);
2340 
2341         /*
2342          *      This will min with what arrived in the packet 
2343          */
2344 
2345         tcp_options(newsk,skb->h.th);
2346 
2347         buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
2348         if (buff == NULL) 
2349         {
2350                 sk->err = -ENOMEM;
2351                 newsk->dead = 1;
2352                 release_sock(newsk);
2353                 kfree_skb(skb, FREE_READ);
2354                 tcp_statistics.TcpAttemptFails++;
2355                 return;
2356         }
2357   
2358         buff->len = sizeof(struct tcphdr)+4;
2359         buff->sk = newsk;
2360         buff->localroute = newsk->localroute;
2361 
2362         t1 =(struct tcphdr *) buff->data;
2363 
2364         /*
2365          *      Put in the IP header and routing stuff. 
2366          */
2367 
2368         tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
2369                                IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
2370 
2371         /*
2372          *      Something went wrong. 
2373          */
2374 
2375         if (tmp < 0) 
2376         {
2377                 sk->err = tmp;
2378                 buff->free=1;
2379                 kfree_skb(buff,FREE_WRITE);
2380                 newsk->dead = 1;
2381                 release_sock(newsk);
2382                 skb->sk = sk;
2383                 kfree_skb(skb, FREE_READ);
2384                 tcp_statistics.TcpAttemptFails++;
2385                 return;
2386         }
2387 
2388         buff->len += tmp;
2389         t1 =(struct tcphdr *)((char *)t1 +tmp);
2390   
2391         memcpy(t1, skb->h.th, sizeof(*t1));
2392         buff->h.seq = newsk->write_seq;
2393         /*
2394          *      Swap the send and the receive. 
2395          */
2396         t1->dest = skb->h.th->source;
2397         t1->source = newsk->dummy_th.source;
2398         t1->seq = ntohl(newsk->write_seq++);
2399         t1->ack = 1;
2400         newsk->window = tcp_select_window(newsk);
2401         newsk->sent_seq = newsk->write_seq;
2402         t1->window = ntohs(newsk->window);
2403         t1->res1 = 0;
2404         t1->res2 = 0;
2405         t1->rst = 0;
2406         t1->urg = 0;
2407         t1->psh = 0;
2408         t1->syn = 1;
2409         t1->ack_seq = ntohl(skb->h.th->seq+1);
2410         t1->doff = sizeof(*t1)/4+1;
2411         ptr =(unsigned char *)(t1+1);
2412         ptr[0] = 2;
2413         ptr[1] = 4;
2414         ptr[2] = ((newsk->mtu) >> 8) & 0xff;
2415         ptr[3] =(newsk->mtu) & 0xff;
2416 
2417         tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
2418         newsk->prot->queue_xmit(newsk, ndev, buff, 0);
2419 
2420         reset_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
2421         skb->sk = newsk;
2422 
2423         /*
2424          *      Charge the sock_buff to newsk. 
2425          */
2426          
2427         sk->rmem_alloc -= skb->mem_len;
2428         newsk->rmem_alloc += skb->mem_len;
2429         
2430         skb_queue_tail(&sk->receive_queue,skb);
2431         sk->ack_backlog++;
2432         release_sock(newsk);
2433         tcp_statistics.TcpOutSegs++;
2434 }
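/*
 * Illustrative sketch (not part of tcp.c): the order of preference
 * tcp_conn_request() uses above when choosing the MSS/MTU it will advertise
 * in the SYN-ACK.  header_size, max_window, dev_mtu and peer_is_local are
 * simplified stand-ins for HEADER_SIZE, MAX_WINDOW, dev->mtu and the subnet
 * test; user_mss/route_mss of 0 mean "not set".
 */
static unsigned int ex_pick_advertised_mtu(unsigned int user_mss,
                                           unsigned int route_mss,
                                           int peer_is_local,
                                           unsigned int dev_mtu,
                                           unsigned int header_size,
                                           unsigned int max_window)
{
        unsigned int mtu;

        if (user_mss)                           /* explicit user setting wins        */
                mtu = user_mss;
        else if (route_mss)                     /* then a per-route MSS, if any      */
                mtu = route_mss - header_size;
        else if (!peer_is_local)                /* off-net peers get the 576 default */
                mtu = 576 - header_size;
        else                                    /* on the local net be generous      */
                mtu = max_window;

        /* Never advertise more than the device itself can carry. */
        if (mtu > dev_mtu - header_size)
                mtu = dev_mtu - header_size;
        return mtu;
}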
2435 
2436 
2437 static void tcp_close(struct sock *sk, int timeout)
2438 {
2439         struct sk_buff *buff;
2440         struct tcphdr *t1, *th;
2441         struct proto *prot;
2442         struct device *dev=NULL;
2443         int tmp;
2444 
2445         /*
2446          * We need to grab some memory, and put together a FIN, 
2447          * and then put it into the queue to be sent.
2448          */
2449         sk->inuse = 1;
2450         sk->keepopen = 1;
2451         sk->shutdown = SHUTDOWN_MASK;
2452 
2453         if (!sk->dead) 
2454                 sk->state_change(sk);
2455 
2456         if (timeout == 0) 
2457         {
2458                 /*
2459                  *  We need to flush the recv. buffs.  We do this only on the
2460                  *  descriptor close, not protocol-sourced closes, because the
2461                  *  reader process may not have drained the data yet!
2462                  */
2463 
2464                 if (skb_peek(&sk->receive_queue) != NULL) 
2465                 {
2466                         struct sk_buff *skb;
2467                         if(sk->debug)
2468                                 printk("Clean rcv queue\n");
2469                         while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
2470                                 kfree_skb(skb, FREE_READ);
2471                         if(sk->debug)
2472                                 printk("Cleaned.\n");
2473                 }
2474         }
2475 
2476         /*
2477          *      Get rid of any half-completed packets. 
2478          */
2479          
2480         if (sk->partial) 
2481         {
2482                 tcp_send_partial(sk);
2483         }
2484 
2485         switch(sk->state) 
2486         {
2487                 case TCP_FIN_WAIT1:
2488                 case TCP_FIN_WAIT2:
2489                 case TCP_CLOSING:
2490                         /*
2491                          * These states occur when we have already closed out
2492                          * our end.  If there is no timeout, we do not do
2493                          * anything.  We may still be in the middle of sending
2494                          * the remainder of our buffer, for example...
2495                          * resetting the timer would be inappropriate.
2496                          *
2497                          * XXX if retransmit count reaches limit, is tcp_close()
2498                          * called with timeout == 1 ? if not, we need to fix that.
2499                          */
2500                         if (!timeout) {
2501                                 int timer_active;
2502 
2503                                 timer_active = del_timer(&sk->timer);
2504                                 if (timer_active)
2505                                         add_timer(&sk->timer);
2506                                 else
2507                                         reset_timer(sk, TIME_CLOSE, 4 * sk->rto);
2508                         }
2509                         if (timeout) 
2510                                 tcp_time_wait(sk);
2511                         release_sock(sk);
2512                         return; /* break causes a double release - messy */
2513                 case TCP_TIME_WAIT:
2514                 case TCP_LAST_ACK:
2515                         /*
2516                          * A timeout from these states terminates the TCB.
2517                          */
2518                         if (timeout) 
2519                         {
2520                                 tcp_set_state(sk,TCP_CLOSE);
2521                         }
2522                         release_sock(sk);
2523                         return;
2524                 case TCP_LISTEN:
2525                         /* we need to drop any sockets which have been connected,
2526                            but have not yet been accepted. */
2527                         tcp_set_state(sk,TCP_CLOSE);
2528                         tcp_close_pending(sk, timeout);
2529                         release_sock(sk);
2530                         return;
2531                 case TCP_CLOSE:
2532                         release_sock(sk);
2533                         return;
2534                 case TCP_CLOSE_WAIT:
2535                 case TCP_ESTABLISHED:
2536                 case TCP_SYN_SENT:
2537                 case TCP_SYN_RECV:
2538                         prot =(struct proto *)sk->prot;
2539                         th =(struct tcphdr *)&sk->dummy_th;
2540                         buff = prot->wmalloc(sk, MAX_FIN_SIZE, 1, GFP_ATOMIC);
2541                         if (buff == NULL) 
2542                         {
2543                                 /* This will force it to try again later. */
2544                                 /* Or it would have if someone released the socket
2545                                    first. Anyway it might work now */
2546                                 release_sock(sk);
2547                                 if (sk->state != TCP_CLOSE_WAIT)
2548                                         tcp_set_state(sk,TCP_ESTABLISHED);
2549                                 reset_timer(sk, TIME_CLOSE, 100);
2550                                 return;
2551                         }
2552                         buff->sk = sk;
2553                         buff->free = 1;
2554                         buff->len = sizeof(*t1);
2555                         buff->localroute = sk->localroute;
2556                         t1 =(struct tcphdr *) buff->data;
2557         
2558                         /*
2559                          *      Put in the IP header and routing stuff. 
2560                          */
2561                         tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2562                                          IPPROTO_TCP, sk->opt,
2563                                          sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
2564                         if (tmp < 0) 
2565                         {
2566                                 sk->write_seq++;        /* Very important 8) */
2567                                 kfree_skb(buff,FREE_WRITE);
2568 
2569                                 /*
2570                                  * Enter FIN_WAIT1 to await completion of
2571                                  * written out data and ACK to our FIN.
2572                                  */
2573 
2574                                 if(sk->state==TCP_ESTABLISHED)
2575                                         tcp_set_state(sk,TCP_FIN_WAIT1);
2576                                 else
2577                                         tcp_set_state(sk,TCP_FIN_WAIT2);
2578                                 reset_timer(sk, TIME_CLOSE,4*sk->rto);
2579                                 if(timeout)
2580                                         tcp_time_wait(sk);
2581 
2582                                 release_sock(sk);
2583                                 return;
2584                         }
2585 
2586                         t1 =(struct tcphdr *)((char *)t1 +tmp);
2587                         buff->len += tmp;
2588                         buff->dev = dev;
2589                         memcpy(t1, th, sizeof(*t1));
2590                         t1->seq = ntohl(sk->write_seq);
2591                         sk->write_seq++;
2592                         buff->h.seq = sk->write_seq;
2593                         t1->ack = 1;
2594         
2595                         /* 
2596                          *      Ack everything immediately from now on. 
2597                          */
2598 
2599                         sk->delay_acks = 0;
2600                         t1->ack_seq = ntohl(sk->acked_seq);
2601                         t1->window = ntohs(sk->window=tcp_select_window(sk));
2602                         t1->fin = 1;
2603                         t1->rst = 0;
2604                         t1->doff = sizeof(*t1)/4;
2605                         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2606 
2607                         tcp_statistics.TcpOutSegs++;
2608         
2609                         if (skb_peek(&sk->write_queue) == NULL) 
2610                         {
2611                                 sk->sent_seq = sk->write_seq;
2612                                 prot->queue_xmit(sk, dev, buff, 0);
2613                         } 
2614                         else 
2615                         {
2616                                 reset_timer(sk, TIME_WRITE, sk->rto);
2617                                 if (buff->next != NULL) 
2618                                 {
2619                                         printk("tcp_close: next != NULL\n");
2620                                         skb_unlink(buff);
2621                                 }
2622                                 skb_queue_tail(&sk->write_queue, buff);
2623                         }
2624 
2625                         /*
2626                          * If established (normal close), enter FIN_WAIT1.
2627                          * If in CLOSE_WAIT, enter LAST_ACK
2628                          * If in CLOSING, remain in CLOSING
2629                          * otherwise enter FIN_WAIT2
2630                          */
2631 
2632                         if (sk->state == TCP_ESTABLISHED)
2633                                 tcp_set_state(sk,TCP_FIN_WAIT1);
2634                         else if (sk->state == TCP_CLOSE_WAIT)
2635                                 tcp_set_state(sk,TCP_LAST_ACK);
2636                         else if (sk->state != TCP_CLOSING)
2637                                 tcp_set_state(sk,TCP_FIN_WAIT2);
2638         }
2639         release_sock(sk);
2640 }
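/*
 * Illustrative sketch (not part of tcp.c): the state transition applied at
 * the bottom of both tcp_shutdown() and tcp_close() once a FIN has been
 * queued.  The enum is a local stand-in for the handful of TCP_* states
 * involved, not the kernel's definitions.
 */
enum ex_tcp_state { EX_ESTABLISHED, EX_CLOSE_WAIT, EX_CLOSING,
                    EX_FIN_WAIT1, EX_FIN_WAIT2, EX_LAST_ACK };

static enum ex_tcp_state ex_state_after_fin(enum ex_tcp_state state)
{
        if (state == EX_ESTABLISHED)
                return EX_FIN_WAIT1;    /* normal close: await data + FIN ack   */
        if (state == EX_CLOSE_WAIT)
                return EX_LAST_ACK;     /* peer closed first: only our FIN left */
        if (state == EX_CLOSING)
                return EX_CLOSING;      /* simultaneous close: stay put         */
        return EX_FIN_WAIT2;            /* e.g. close before fully established  */
}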
2641 
2642 
2643 /*
2644  * This routine takes stuff off of the write queue,
2645  * and puts it in the xmit queue.
2646  */
2647 static void
2648 tcp_write_xmit(struct sock *sk)
2649 {
2650         struct sk_buff *skb;
2651 
2652         /*
2653          *      The bytes will have to remain here. In time closedown will
2654          *      empty the write queue and all will be happy 
2655          */
2656 
2657         if(sk->zapped)
2658                 return;
2659 
2660         while((skb = skb_peek(&sk->write_queue)) != NULL &&
2661                 before(skb->h.seq, sk->window_seq + 1) &&
2662                 (sk->retransmits == 0 ||
2663                  sk->timeout != TIME_WRITE ||
2664                  before(skb->h.seq, sk->rcv_ack_seq + 1))
2665                 && sk->packets_out < sk->cong_window) 
2666         {
2667                 IS_SKB(skb);
2668                 skb_unlink(skb);
2669                 /* See if we really need to send the packet. */
2670                 if (before(skb->h.seq, sk->rcv_ack_seq +1)) 
2671                 {
2672                         sk->retransmits = 0;
2673                         kfree_skb(skb, FREE_WRITE);
2674                         if (!sk->dead) 
2675                                 sk->write_space(sk);
2676                 } 
2677                 else
2678                 {
2679                         struct tcphdr *th;
2680                         struct iphdr *iph;
2681                         int size;
2682 /*
2683  * put in the ack seq and window at this point rather than earlier,
2684  * in order to keep them monotonic.  We really want to avoid taking
2685  * back window allocations.  That's legal, but RFC1122 says it's frowned on.
2686  * Ack and window will in general have changed since this packet was put
2687  * on the write queue.
2688  */
2689                         iph = (struct iphdr *)(skb->data +
2690                                                skb->dev->hard_header_len);
2691                         th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
2692                         size = skb->len - (((unsigned char *) th) - skb->data);
2693                         
2694                         th->ack_seq = ntohl(sk->acked_seq);
2695                         th->window = ntohs(tcp_select_window(sk));
2696 
2697                         tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
2698 
2699                         sk->sent_seq = skb->h.seq;
2700                         sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
2701                 }
2702         }
2703 }
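/*
 * Illustrative sketch (not part of tcp.c): the gate at the top of the
 * tcp_write_xmit() loop above, restated as a predicate.  seq, window_seq,
 * rcv_ack_seq, retransmitting, packets_out and cong_window are simplified
 * stand-ins for the corresponding skb and sock fields; (long)(a - b) < 0
 * models the usual before(a, b) sequence comparison.
 */
static int ex_seq_before(unsigned long a, unsigned long b)
{
        return (long)(a - b) < 0;
}

static int ex_may_transmit(unsigned long seq, unsigned long window_seq,
                           unsigned long rcv_ack_seq, int retransmitting,
                           unsigned long packets_out, unsigned long cong_window)
{
        /* The segment must start inside the offered window... */
        if (!ex_seq_before(seq, window_seq + 1))
                return 0;
        /* ...while retransmitting, only already-acked territory may go out... */
        if (retransmitting && !ex_seq_before(seq, rcv_ack_seq + 1))
                return 0;
        /* ...and the congestion window must not be full. */
        return packets_out < cong_window;
}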
2704 
2705 
2706 /*
2707  *      This routine deals with incoming acks, but not outgoing ones.
2708  */
2709 
2710 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
2711 {
2712         unsigned long ack;
2713         int flag = 0;
2714 
2715         /* 
2716          * 1 - there was data in packet as well as ack or new data is sent or 
2717          *     in shutdown state
2718          * 2 - data from retransmit queue was acked and removed
2719          * 4 - window shrunk or data from retransmit queue was acked and removed
2720          */
2721 
2722         if(sk->zapped)
2723                 return(1);      /* Dead, can't ack any more so why bother */
2724 
2725         ack = ntohl(th->ack_seq);
2726         if (ntohs(th->window) > sk->max_window) 
2727         {
2728                 sk->max_window = ntohs(th->window);
2729 #ifdef CONFIG_INET_PCTCP
2730                 sk->mss = min(sk->max_window>>1, sk->mtu);
2731 #else
2732                 sk->mss = min(sk->max_window, sk->mtu);
2733 #endif  
2734         }
2735 
2736         if (sk->retransmits && sk->timeout == TIME_KEEPOPEN)
2737                 sk->retransmits = 0;
2738 
2739         if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
2740         {
2741                 if(sk->debug)
2742                         printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);
2743                         
2744                 /*
2745                  *      Keepalive processing.
2746                  */
2747                  
2748                 if (after(ack, sk->sent_seq)) 
2749                 {
2750                         return(0);
2751                 }
2752                 if (sk->keepopen) 
2753                 {
2754                         if(sk->timeout==TIME_KEEPOPEN)
2755                                 reset_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
2756                 }
2757                 return(1);
2758         }
2759 
2760         if (len != th->doff*4) 
2761                 flag |= 1;
2762 
2763         /* See if our window has been shrunk. */
2764 
2765         if (after(sk->window_seq, ack+ntohs(th->window))) 
2766         {
2767                 /*
2768                  * We may need to move packets from the send queue
2769                  * to the write queue, if the window has been shrunk on us.
2770                  * The RFC says you are not allowed to shrink your window
2771                  * like this, but if the other end does, you must be able
2772                  * to deal with it.
2773                  */
2774                 struct sk_buff *skb;
2775                 struct sk_buff *skb2;
2776                 struct sk_buff *wskb = NULL;
2777         
2778                 skb2 = sk->send_head;
2779                 sk->send_head = NULL;
2780                 sk->send_tail = NULL;
2781         
2782                 /*
2783                  *      This is an artefact of a flawed concept. We want one
2784                  *      queue and a smarter send routine when we send all.
2785                  */
2786         
2787                 flag |= 4;
2788         
2789                 sk->window_seq = ack + ntohs(th->window);
2790                 cli();
2791                 while (skb2 != NULL) 
2792                 {
2793                         skb = skb2;
2794                         skb2 = skb->link3;
2795                         skb->link3 = NULL;
2796                         if (after(skb->h.seq, sk->window_seq)) 
2797                         {
2798                                 if (sk->packets_out > 0) 
2799                                         sk->packets_out--;
2800                                 /* We may need to remove this from the dev send list. */
2801                                 if (skb->next != NULL) 
2802                                 {
2803                                         skb_unlink(skb);                                
2804                                 }
2805                                 /* Now add it to the write_queue. */
2806                                 if (wskb == NULL)
2807                                         skb_queue_head(&sk->write_queue,skb);
2808                                 else
2809                                         skb_append(wskb,skb);
2810                                 wskb = skb;
2811                         } 
2812                         else 
2813                         {
2814                                 if (sk->send_head == NULL) 
2815                                 {
2816                                         sk->send_head = skb;
2817                                         sk->send_tail = skb;
2818                                 }
2819                                 else
2820                                 {
2821                                         sk->send_tail->link3 = skb;
2822                                         sk->send_tail = skb;
2823                                 }
2824                                 skb->link3 = NULL;
2825                         }
2826                 }
2827                 sti();
2828         }
2829 
2830         /*
2831          *      Pipe has emptied
2832          */
2833          
2834         if (sk->send_tail == NULL || sk->send_head == NULL) 
2835         {
2836                 sk->send_head = NULL;
2837                 sk->send_tail = NULL;
2838                 sk->packets_out= 0;
2839         }
2840 
2841         sk->window_seq = ack + ntohs(th->window);
2842 
2843         /* We don't want too many packets out there. */
2844         if (sk->timeout == TIME_WRITE && 
2845                 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
2846         {
2847                 /* 
2848                  * This is Jacobson's slow start and congestion avoidance. 
2849                  * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
2850                  * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
2851                  * counter and increment it once every cwnd times.  It's possible
2852                  * that this should be done only if sk->retransmits == 0.  I'm
2853                  * interpreting "new data is acked" as including data that has
2854          * been retransmitted but is just now being acked.  (A standalone sketch of this update follows tcp_ack.)
2855                  */
2856                 if (sk->cong_window < sk->ssthresh)  
2857                         /* 
2858                          *      In "safe" area, increase
2859                          */
2860                         sk->cong_window++;
2861                 else 
2862                 {
2863                         /*
2864                          *      In dangerous area, increase slowly.  In theory this is
2865                          *      sk->cong_window += 1 / sk->cong_window
2866                          */
2867                         if (sk->cong_count >= sk->cong_window) 
2868                         {
2869                                 sk->cong_window++;
2870                                 sk->cong_count = 0;
2871                         }
2872                         else 
2873                                 sk->cong_count++;
2874                 }
2875         }
2876 
2877         sk->rcv_ack_seq = ack;
2878 
2879         /*
2880          *      If this ack opens up a zero window, clear backoff.  It was
2881          *      being used to time the probes, and is probably far higher than
2882          *      it needs to be for normal retransmission.
2883          */
2884 
2885         if (sk->timeout == TIME_PROBE0) 
2886         {
2887                 if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
2888                     ! before (sk->window_seq, sk->write_queue.next->h.seq)) 
2889                 {
2890                         sk->retransmits = 0;
2891                         sk->backoff = 0;
2892                         
2893                         /*
2894                          *      Recompute rto from rtt.  This eliminates any backoff.
2895                          */
2896 
2897                         sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
2898                         if (sk->rto > 120*HZ)
2899                                 sk->rto = 120*HZ;
2900                         if (sk->rto < 20)       /* Was 1*HZ, then 1 - turns out we must allow about
2901                                                    .2 of a second because of BSD delayed acks - on a 100Mb/sec link
2902                                                    .2 of a second is going to need huge windows (SIGH) */
2903                                 sk->rto = 20;
2904                 }
2905         }
2906 
2907         /* 
2908          *      See if we can take anything off of the retransmit queue.
2909          */
2910    
2911         while(sk->send_head != NULL) 
2912         {
2913                 /* Check for a bug. */
2914                 if (sk->send_head->link3 &&
2915                     after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) 
2916                         printk("INET: tcp.c: *** bug send_list out of order.\n");
2917                 if (before(sk->send_head->h.seq, ack+1)) 
2918                 {
2919                         struct sk_buff *oskb;   
2920                         if (sk->retransmits) 
2921                         {       
2922                                 /*
2923                                  *      We were retransmitting.  Don't count this in the RTT estimate.
2924                                  */
2925                                 flag |= 2;
2926 
2927                                 /*
2928                                  * even though we've gotten an ack, we're still
2929                                  * retransmitting as long as we're sending from
2930                                  * the retransmit queue.  Keeping retransmits non-zero
2931                                  * prevents us from getting new data interspersed with
2932                                  * retransmissions.
2933                                  */
2934 
2935                                 if (sk->send_head->link3)
2936                                         sk->retransmits = 1;
2937                                 else
2938                                         sk->retransmits = 0;
2939                         }
2940                         /*
2941                          * Note that we only reset backoff and rto in the
2942                          * rtt recomputation code.  And that doesn't happen
2943                          * if there were retransmissions in effect.  So the
2944                          * first new packet after the retransmissions is
2945                          * sent with the backoff still in effect.  Not until
2946                          * we get an ack from a non-retransmitted packet do
2947                          * we reset the backoff and rto.  This allows us to deal
2948                          * with a situation where the network delay has increased
2949                          * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
2950                          */
2951 
2952                         /*
2953                          *      We have one less packet out there. 
2954                          */
2955                          
2956                         if (sk->packets_out > 0) 
2957                                 sk->packets_out --;
2958                         /* 
2959                          *      Wake up the process, it can probably write more. 
2960                          */
2961                         if (!sk->dead) 
2962                                 sk->write_space(sk);
2963                         oskb = sk->send_head;
2964 
2965                         if (!(flag&2)) 
2966                         {
2967                                 long m;
2968         
2969                                 /*
2970                                  *      The following amusing code comes from Jacobson's
2971                                  *      article in SIGCOMM '88.  Note that rtt and mdev
2972                                  *      are scaled versions of rtt and mean deviation.
2973                                  *      This is designed to be as fast as possible;
2974                                  *      m stands for "measurement".  (A standalone sketch of this estimator follows tcp_ack.)
2975                                  */
2976         
2977                                 m = jiffies - oskb->when;  /* RTT */
2978                                 if(m<=0)
2979                                         m=1;            /* IS THIS RIGHT FOR <0 ??? */
2980                                 m -= (sk->rtt >> 3);    /* m is now error in rtt est */
2981                                 sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
2982                                 if (m < 0)
2983                                         m = -m;         /* m is now abs(error) */
2984                                 m -= (sk->mdev >> 2);   /* similar update on mdev */
2985                                 sk->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
2986         
2987                                 /*
2988                                  *      Now update timeout.  Note that this removes any backoff.
2989                                  */
2990                          
2991                                 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
2992                                 if (sk->rto > 120*HZ)
2993                                         sk->rto = 120*HZ;
2994                                 if (sk->rto < 20)       /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
2995                                         sk->rto = 20;
2996                                 sk->backoff = 0;
2997                         }
2998                         flag |= (2|4);
2999                         cli();
3000                         oskb = sk->send_head;
3001                         IS_SKB(oskb);
3002                         sk->send_head = oskb->link3;
3003                         if (sk->send_head == NULL) 
3004                         {
3005                                 sk->send_tail = NULL;
3006                         }
3007 
3008                 /*
3009                  *      We may need to remove this from the dev send list. 
3010                  */
3011 
3012                         if (oskb->next)
3013                                 skb_unlink(oskb);
3014                         sti();
3015                         kfree_skb(oskb, FREE_WRITE); /* write. */
3016                         if (!sk->dead) 
3017                                 sk->write_space(sk);
3018                 }
3019                 else
3020                 {
3021                         break;
3022                 }
3023         }
3024 
3025         /*
3026          * XXX someone ought to look at this too.. at the moment, if skb_peek()
3027          * returns non-NULL, we completely ignore the timer stuff in the else
3028          * clause.  We ought to organize the code so that the else clause can
3029          * (should) be executed regardless, possibly moving the PROBE timer
3030          * reset over.  The skb_peek() thing should only move stuff to the
3031          * write queue, NOT also manage the timer functions.
3032          */
3033 
3034         /*
3035          * Maybe we can take some stuff off of the write queue,
3036          * and put it onto the xmit queue.
3037          */
3038         if (skb_peek(&sk->write_queue) != NULL) 
3039         {
3040                 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3041                         (sk->retransmits == 0 || 
3042                          sk->timeout != TIME_WRITE ||
3043                          before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3044                         && sk->packets_out < sk->cong_window) 
3045                 {
3046                         flag |= 1;
3047                         tcp_write_xmit(sk);
3048                 }
3049                 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3050                         sk->send_head == NULL &&
3051                         sk->ack_backlog == 0 &&
3052                         sk->state != TCP_TIME_WAIT) 
3053                 {
3054                         reset_timer(sk, TIME_PROBE0, sk->rto);
3055                 }               
3056         }
3057         else
3058         {
3059                 /*
3060                  * from TIME_WAIT we stay in TIME_WAIT as long as we receive packets;
3061                  * from TCP_CLOSE we don't do anything.
3062                  *
3063                  * from anything else, if there is write data (or fin) pending,
3064                  * we use a TIME_WRITE timeout, else if keepalive we reset to
3065                  * a KEEPALIVE timeout, else we delete the timer.
3066                  *
3067                  * We do not set flag for nominal write data, otherwise we may
3068                  * force a state where we start to write itsy bitsy tidbits
3069                  * of data.
3070                  */
3071 
3072                 switch(sk->state) {
3073                 case TCP_TIME_WAIT:
3074                         /*
3075                          * keep us in TIME_WAIT until we stop getting packets,
3076                          * reset the timeout.
3077                          */
3078                         reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3079                         break;
3080                 case TCP_CLOSE:
3081                         /*
3082                          * don't touch the timer.
3083                          */
3084                         break;
3085                 default:
3086                         /*
3087                          * must check send_head, write_queue, and ack_backlog
3088                          * to determine which timeout to use.
3089                          */
3090                         if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3091                                 reset_timer(sk, TIME_WRITE, sk->rto);
3092                         } else if (sk->keepopen) {
3093                                 reset_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3094                         } else {
3095                                 delete_timer(sk);
3096                         }
3097                         break;
3098                 }
3099         }
3100 
3101         if (sk->packets_out == 0 && sk->partial != NULL &&
3102                 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) 
3103         {
3104                 flag |= 1;
3105                 tcp_send_partial(sk);
3106         }
3107 
3108         /*
3109          * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
3110          * we are now waiting for an acknowledgement of our FIN.  The other end is
3111          * already in TIME_WAIT.
3112          *
3113          * Move to TCP_CLOSE on success.
3114          */
3115 
3116         if (sk->state == TCP_LAST_ACK) 
3117         {
3118                 if (!sk->dead)
3119                         sk->state_change(sk);
3120                 if(sk->debug)
3121                         printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
3122                                 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3123                 if (sk->rcv_ack_seq == sk->write_seq && sk->acked_seq == sk->fin_seq) 
3124                 {
3125                         flag |= 1;
3126                         tcp_set_state(sk,TCP_CLOSE);
3127                         sk->shutdown = SHUTDOWN_MASK;
3128                 }
3129         }
3130 
3131         /*
3132          *      Incoming ACK to a FIN we sent in the case of our initiating the close.
3133          *
3134          *      Move to FIN_WAIT2 to await a FIN from the other end. Set
3135          *      SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3136          */
3137 
3138         if (sk->state == TCP_FIN_WAIT1) 
3139         {
3140 
3141                 if (!sk->dead) 
3142                         sk->state_change(sk);
3143                 if (sk->rcv_ack_seq == sk->write_seq) 
3144                 {
3145                         flag |= 1;
3146                         sk->shutdown |= SEND_SHUTDOWN;
3147                         tcp_set_state(sk, TCP_FIN_WAIT2);
3148                 }
3149         }
3150 
3151         /*
3152          *      Incoming ACK to a FIN we sent in the case of a simultaneous close.
3153          *
3154          *      Move to TIME_WAIT
3155          */
3156 
3157         if (sk->state == TCP_CLOSING) 
3158         {
3159 
3160                 if (!sk->dead) 
3161                         sk->state_change(sk);
3162                 if (sk->rcv_ack_seq == sk->write_seq) 
3163                 {
3164                         flag |= 1;
3165                         tcp_time_wait(sk);
3166                 }
3167         }
3168         
3169         /*
3170          *      Final ack of the three way handshake 
3171          */
3172          
3173         if(sk->state==TCP_SYN_RECV)
3174         {
3175                 tcp_set_state(sk, TCP_ESTABLISHED);
3176                 tcp_options(sk,th);
3177                 sk->dummy_th.dest=th->source;
3178                 sk->copied_seq=sk->acked_seq-1;
3179                 if(!sk->dead)
3180                         sk->state_change(sk);
3181                 if(sk->max_window==0)
3182                 {
3183                         sk->max_window=32;
3184                         sk->mss=min(sk->max_window,sk->mtu);
3185                 }
3186         }
3187         
3188         /*
3189          * I make no guarantees about the first clause in the following
3190          * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
3191          * what conditions "!flag" would be true.  However I think the rest
3192          * of the conditions would prevent that from causing any
3193          * unnecessary retransmission. 
3194          *   Clearly if the first packet has expired it should be 
3195          * retransmitted.  The other alternative, "flag&2 && retransmits", is
3196          * harder to explain:  You have to look carefully at how and when the
3197          * timer is set and with what timeout.  The most recent transmission always
3198          * sets the timer.  So in general if the most recent thing has timed
3199          * out, everything before it has as well.  So we want to go ahead and
3200          * retransmit some more.  If we didn't explicitly test for this
3201          * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3202          * would not be true.  If you look at the pattern of timing, you can
3203          * show that rto is increased fast enough that the next packet would
3204          * almost never be retransmitted immediately.  Then you'd end up
3205          * waiting for a timeout to send each packet on the retransmission
3206          * queue.  With my implementation of the Karn sampling algorithm,
3207          * the timeout would double each time.  The net result is that it would
3208          * take a hideous amount of time to recover from a single dropped packet.
3209          * It's possible that there should also be a test for TIME_WRITE, but
3210          * I think as long as "send_head != NULL" and "retransmit" is on, we've
3211          * got to be in real retransmission mode.
3212          *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
3213          * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3214          * As long as no further losses occur, this seems reasonable.
3215          */
3216         
3217         if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3218                (((flag&2) && sk->retransmits) ||
3219                (sk->send_head->when + sk->rto < jiffies))) 
3220         {
3221                 if(sk->send_head->when + sk->rto < jiffies)
3222                         tcp_retransmit(sk,0);   
3223                 else
3224                 {
3225                         tcp_do_retransmit(sk, 1);
3226                         reset_timer(sk, TIME_WRITE, sk->rto);
3227                 }
3228         }
3229 
3230         return(1);
3231 }
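/*
 *      Illustrative sketch (not part of the original file, names assumed):
 *      the slow start / congestion avoidance update performed inside
 *      tcp_ack() above, isolated from the socket structure.  cong_window is
 *      counted in whole segments, so the congestion-avoidance step of
 *      "cwnd += 1/cwnd" is approximated with a counter: once the window has
 *      reached ssthresh it grows by one segment per window's worth of acks.
 */
static void tcp_cong_update_sketch(unsigned long *cong_window,
        unsigned long *cong_count, unsigned long ssthresh)
{
        if (*cong_window < ssthresh)
        {
                /* Slow start: one extra segment for every ack of new data */
                (*cong_window)++;
        }
        else if (*cong_count >= *cong_window)
        {
                /* Congestion avoidance: one extra segment per full window */
                (*cong_window)++;
                *cong_count = 0;
        }
        else
                (*cong_count)++;
}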
3232 
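/*
 *      Illustrative sketch (not part of the original file, names assumed):
 *      the scaled RTT estimator from tcp_ack() above.  rtt is held as eight
 *      times the smoothed round trip time and mdev as four times the mean
 *      deviation, so the shifts below give srtt = 7/8 srtt + 1/8 m and
 *      mdev = 3/4 mdev + 1/4 |err| with no divisions.  The timeout comes out
 *      as srtt + 2*mdev, clamped between 20 jiffies and 120*HZ (0.2 and 120
 *      seconds on the assumption that HZ is 100, as in the original comments).
 */
static void tcp_rtt_update_sketch(long m, long *rtt, long *mdev, long *rto)
{
        if (m <= 0)
                m = 1;                  /* never trust a zero or negative sample */
        m -= (*rtt >> 3);               /* error in the current estimate */
        *rtt += m;                      /* rtt = 7/8 rtt + 1/8 new */
        if (m < 0)
                m = -m;                 /* m is now abs(error) */
        m -= (*mdev >> 2);
        *mdev += m;                     /* mdev = 3/4 mdev + 1/4 new */

        *rto = ((*rtt >> 2) + *mdev) >> 1;
        if (*rto > 120*HZ)
                *rto = 120*HZ;
        if (*rto < 20)
                *rto = 20;              /* allow ~.2s for BSD delayed acks */
}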
3233 
3234 /*
3235  *      Process the FIN bit. This now behaves as it is supposed to:
3236  *      the FIN takes effect when it is validly part of the sequence
3237  *      space, not earlier, while there are still holes.
3238  *
3239  *      If we are ESTABLISHED, a received FIN moves us to CLOSE-WAIT
3240  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
3241  *      TIME-WAIT)
3242  *
3243  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
3244  *      close and we go into CLOSING (and later onto TIME-WAIT)
3245  *
3246  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3247  *
3248  */
3249  
3250 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
3251 {
3252         sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3253 
3254         if (!sk->dead) 
3255         {
3256                 sk->state_change(sk);
3257         }
3258 
3259         switch(sk->state) 
3260         {
3261                 case TCP_SYN_RECV:
3262                 case TCP_SYN_SENT:
3263                 case TCP_ESTABLISHED:
3264                         /*
3265                          * move to CLOSE_WAIT, tcp_data() already handled
3266                          * sending the ack.
3267                          */     /* Check me --------------vvvvvvv */
3268                         reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3269                         tcp_set_state(sk,TCP_CLOSE_WAIT);
3270                         if (th->rst)
3271                                 sk->shutdown = SHUTDOWN_MASK;
3272                         break;
3273 
3274                 case TCP_CLOSE_WAIT:
3275                 case TCP_CLOSING:
3276                         /*
3277                          * received a retransmission of the FIN, do
3278                          * nothing.
3279                          */
3280                         break;
3281                 case TCP_TIME_WAIT:
3282                         /*
3283                          * received a retransmission of the FIN,
3284                          * restart the TIME_WAIT timer.
3285                          */
3286                         reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3287                         return(0);
3288                 case TCP_FIN_WAIT1:
3289                         /*
3290                          * This case occurs when a simultaneous close
3291                          * happens, we must ack the received FIN and
3292                          * enter the CLOSING state.
3293                          *
3294                          * This causes a WRITE timeout, which will either
3295                          * move on to TIME_WAIT when we timeout, or resend
3296                          * the FIN properly (maybe we get rid of that annoying
3297                          * FIN lost hang). The TIME_WRITE code is already correct
3298                          * for handling this timeout.
3299                          */
3300 
3301                         if(sk->timeout != TIME_WRITE)
3302                                 reset_timer(sk, TIME_WRITE, sk->rto);
3303                         tcp_set_state(sk,TCP_CLOSING);
3304                         break;
3305                 case TCP_FIN_WAIT2:
3306                         /*
3307                          * received a FIN -- send ACK and enter TIME_WAIT
3308                          */
3309                         reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3310                         sk->shutdown|=SHUTDOWN_MASK;
3311                         tcp_set_state(sk,TCP_TIME_WAIT);
3312                         break;
3313                 case TCP_CLOSE:
3314                         /*
3315                          * already in CLOSE
3316                          */
3317                         break;
3318                 default:
3319                         tcp_set_state(sk,TCP_LAST_ACK);
3320         
3321                         /* Start the timers. */
3322                         reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3323                         return(0);
3324         }
3325 
3326         return(0);
3327 }
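/*
 *      Illustrative sketch (not part of the original file): the state
 *      transitions that tcp_fin() above performs once a FIN is validly part
 *      of the sequence space, with the timer handling stripped out.
 */
static int tcp_fin_next_state_sketch(int state)
{
        switch(state)
        {
                case TCP_SYN_RECV:
                case TCP_SYN_SENT:
                case TCP_ESTABLISHED:
                        return TCP_CLOSE_WAIT;  /* passive close begins */
                case TCP_FIN_WAIT1:
                        return TCP_CLOSING;     /* simultaneous close */
                case TCP_FIN_WAIT2:
                        return TCP_TIME_WAIT;   /* our FIN is already acked */
                case TCP_CLOSE_WAIT:
                case TCP_CLOSING:
                case TCP_TIME_WAIT:
                case TCP_CLOSE:
                        return state;           /* retransmitted FIN, no change */
                default:
                        return TCP_LAST_ACK;    /* as in the original default case */
        }
}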
3328 
3329 
3330 
3331 /*
3332  *      This routine handles the data.  If there is room in the buffer,
3333  *      it will have already been moved into it.  If there is no
3334  *      room, then we will just have to discard the packet.
3335  */
3336 
3337 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, 
3338          unsigned long saddr, unsigned short len)
3339 {
3340         struct sk_buff *skb1, *skb2;
3341         struct tcphdr *th;
3342         int dup_dumped=0;
3343         unsigned long new_seq;
3344         unsigned long shut_seq;
3345 
3346         th = skb->h.th;
3347         skb->len = len -(th->doff*4);
3348 
3349         /*
3350          *      The bytes in the receive read/assembly queue have increased. Needed for the
3351          *      low memory discard algorithm.
3352          */
3353            
3354         sk->bytes_rcv += skb->len;
3355         
3356         if (skb->len == 0 && !th->fin && !th->urg && !th->psh) 
3357         {
3358                 /* 
3359                  *      Don't want to keep passing ACKs back and forth. 
3360                  *      (someone sent us a dataless, boring frame)
3361                  */
3362                 if (!th->ack)
3363                         tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3364                 kfree_skb(skb, FREE_READ);
3365                 return(0);
3366         }
3367         
3368         /*
3369          *      We no longer have anyone receiving data on this connection.
3370          */
3371 
3372         if(sk->shutdown & RCV_SHUTDOWN)
3373         {
3374                 /*
3375                  *      FIXME: BSD has some magic to avoid sending resets to
3376                  *      broken 4.2 BSD keepalives. Much to my surprise a few non-BSD
3377                  *      stacks still have broken keepalives, so we want to
3378                  *      cope with them.
3379                  */
3380                  
3381                 if(skb->len)    /* We don't care if it's just an ack or
3382                                    a keepalive/window probe */
3383                 {
3384                         new_seq= th->seq + skb->len + th->syn;  /* Right edge of _data_ part of frame */
3385                         
3386                         /* Do this the way 4.4BSD treats it. Not what I'd
3387                            regard as the meaning of the spec but it's what BSD
3388                            does and clearly they know everything 8) */
3389                         
3390                         /*
3391                          *      This is valid because of two things:
3392                          *
3393                          *      a) The way tcp_data behaves at the bottom.
3394                          *      b) A FIN takes effect when read, not when received.
3395                          */
3396                          
3397                         shut_seq=sk->acked_seq+1;       /* Last byte */
3398                         
3399                         if(after(new_seq,shut_seq))
3400                         {
3401                                 if(sk->debug)
3402                                         printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n",
3403                                                 sk, new_seq, shut_seq, sk->blog);
3404                                 if(sk->dead)
3405                                 {
3406                                         sk->acked_seq = new_seq + th->fin;
3407                                         tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3408                                                 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3409                                         tcp_statistics.TcpEstabResets++;
3410                                         tcp_set_state(sk,TCP_CLOSE);
3411                                         sk->err = EPIPE;
3412                                         sk->shutdown = SHUTDOWN_MASK;
3413                                         kfree_skb(skb, FREE_READ);
3414                                         return 0;
3415                                 }
3416                         }
3417                 }
3418         }
3419 
3420         /*
3421          *      Now we have to walk the chain, and figure out where this one
3422          *      goes into it.  This is set up so that the last packet we received
3423          *      will be the first one we look at, that way if everything comes
3424          *      in order, there will be no performance loss, and if they come
3425          *      out of order we will be able to fit things in nicely.
3426          *
3427          *      [AC: This is wrong. We should assume in order first and then walk
3428          *       forwards from the first hole based upon real traffic patterns.]
3429          *      
3430          */
3431 
3432         /* 
3433          *      This should start at the last one, and then go around forwards.
3434          */
3435 
3436         if (skb_peek(&sk->receive_queue) == NULL)       /* Empty queue is easy case */
3437         {
3438                 skb_queue_head(&sk->receive_queue,skb);
3439                 skb1= NULL;
3440         } 
3441         else
3442         {
3443                 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) 
3444                 {
3445                         if(sk->debug)
3446                         {
3447                                 printk("skb1=%p :", skb1);
3448                                 printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
3449                                 printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
3450                                 printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
3451                                                 sk->acked_seq);
3452                         }
3453                         
3454                         /*
3455                          *      Optimisation: Duplicate frame or extension of previous frame from
3456                          *      same sequence point (lost ack case).
3457                          *      The frame contains duplicate data or replaces a previous frame:
3458                          *      discard the previous frame (safe as sk->inuse is set) and put
3459                          *      the new one in its place.
3460                          */
3461                          
3462                         if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3463                         {
3464                                 skb_append(skb1,skb);
3465                                 skb_unlink(skb1);
3466                                 kfree_skb(skb1,FREE_READ);
3467                                 dup_dumped=1;
3468                                 skb1=NULL;
3469                                 break;
3470                         }
3471                         
3472                         /*
3473                          *      Found where it fits
3474                          */
3475                          
3476                         if (after(th->seq+1, skb1->h.th->seq))
3477                         {
3478                                 skb_append(skb1,skb);
3479                                 break;
3480                         }
3481                         
3482                         /*
3483                          *      See if we've hit the start. If so insert.
3484                          */
3485                         if (skb1 == skb_peek(&sk->receive_queue))
3486                         {
3487                                 skb_queue_head(&sk->receive_queue, skb);
3488                                 break;
3489                         }
3490                 }
3491         }
3492 
3493         /*
3494          *      Figure out what the ack value for this frame is
3495          */
3496          
3497         th->ack_seq = th->seq + skb->len;
3498         if (th->syn) 
3499                 th->ack_seq++;
3500         if (th->fin)
3501                 th->ack_seq++;
3502 
3503         if (before(sk->acked_seq, sk->copied_seq)) 
3504         {
3505                 printk("*** tcp.c:tcp_data bug acked < copied\n");
3506                 sk->acked_seq = sk->copied_seq;
3507         }
3508 
3509         /*
3510          *      Now figure out if we can ack anything. This is very messy because we really want two
3511          *      receive queues, a completed queue and an assembly queue. We also want only one transmit
3512          *      queue.
3513          */
3514 
3515         if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) 
3516         {
3517                 if (before(th->seq, sk->acked_seq+1)) 
3518                 {
3519                         int newwindow;
3520 
3521                         if (after(th->ack_seq, sk->acked_seq)) 
3522                         {
3523                                 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3524                                 if (newwindow < 0)
3525                                         newwindow = 0;  
3526                                 sk->window = newwindow;
3527                                 sk->acked_seq = th->ack_seq;
3528                         }
3529                         skb->acked = 1;
3530 
3531                         /*
3532                          *      When we ack the fin, we turn on the RCV_SHUTDOWN flag. Also do the FIN 
3533                          *      processing.
3534                          */
3535 
3536                         if (skb->h.th->fin) 
3537                         {
3538                                 if (!sk->dead) 
3539                                         sk->state_change(sk);
3540                                 sk->shutdown |= RCV_SHUTDOWN;
3541                                 tcp_fin(skb,sk,skb->h.th);
3542                         }
3543           
3544                         for(skb2 = skb->next;
3545                             skb2 != (struct sk_buff *)&sk->receive_queue;
3546                             skb2 = skb2->next) 
3547                         {
3548                                 if (before(skb2->h.th->seq, sk->acked_seq+1)) 
3549                                 {
3550                                         if (after(skb2->h.th->ack_seq, sk->acked_seq))
3551                                         {
3552                                                 newwindow = sk->window -
3553                                                  (skb2->h.th->ack_seq - sk->acked_seq);
3554                                                 if (newwindow < 0)
3555                                                         newwindow = 0;  
3556                                                 sk->window = newwindow;
3557                                                 sk->acked_seq = skb2->h.th->ack_seq;
3558                                         }
3559                                         skb2->acked = 1;
3560                                         /*
3561                                          *      When we ack the fin, we turn on
3562                                          *      the RCV_SHUTDOWN flag.
3563                                          */
3564                                         if (skb2->h.th->fin) 
3565                                         {
3566                                                 sk->shutdown |= RCV_SHUTDOWN;
3567                                                 if (!sk->dead)
3568                                                         sk->state_change(sk);
3569                                                 tcp_fin(skb,sk,skb->h.th);
3570                                         }
3571 
3572                                         /*
3573                                          *      Force an immediate ack.
3574                                          */
3575                                          
3576                                         sk->ack_backlog = sk->max_ack_backlog;
3577                                 }
3578                                 else
3579                                 {
3580                                         break;
3581                                 }
3582                         }
3583 
3584                         /*
3585                          *      This also takes care of updating the window.
3586                          *      This if statement needs to be simplified.
3587                          */
3588                         if (!sk->delay_acks ||
3589                             sk->ack_backlog >= sk->max_ack_backlog || 
3590                             sk->bytes_rcv > sk->max_unacked || th->fin) {
3591         /*                      tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
3592                         }
3593                         else 
3594                         {
3595                                 sk->ack_backlog++;
3596                                 if(sk->debug)
3597                                         printk("Ack queued.\n");
3598                                 reset_timer(sk, TIME_WRITE, TCP_ACK_TIME);
3599                         }
3600                 }
3601         }
3602 
3603         /*
3604          *      If we've missed a packet, send an ack.
3605          *      Also start a timer to send another.
3606          */
3607          
3608         if (!skb->acked) 
3609         {
3610         
3611         /*
3612          *      This is important.  If we don't have much room left,
3613          *      we need to throw out a few packets so we have a good
3614          *      window.  Note that mtu is used, not mss, because mss is really
3615          *      for the send side.  The other end could be sending us frames as large as the mtu.
3616          */
3617                  
3618                 while (sk->prot->rspace(sk) < sk->mtu) 
3619                 {
3620                         skb1 = skb_peek(&sk->receive_queue);
3621                         if (skb1 == NULL) 
3622                         {
3623                                 printk("INET: tcp.c:tcp_data memory leak detected.\n");
3624                                 break;
3625                         }
3626 
3627                         /*
3628                          *      Don't throw out something that has been acked. 
3629                          */
3630                  
3631                         if (skb1->acked) 
3632                         {
3633                                 break;
3634                         }
3635                 
3636                         skb_unlink(skb1);
3637                         kfree_skb(skb1, FREE_READ);
3638                 }
3639                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
3640                 sk->ack_backlog++;
3641                 reset_timer(sk, TIME_WRITE, TCP_ACK_TIME);
3642         }
3643         else
3644         {
3645                 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
3646         }
3647 
3648         /*
3649          *      Now tell the user we may have some data. 
3650          */
3651          
3652         if (!sk->dead) 
3653         {
3654                 if(sk->debug)
3655                         printk("Data wakeup.\n");
3656                 sk->data_ready(sk,0);
3657         } 
3658         return(0);
3659 }
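/*
 *      Illustrative sketch (not part of the original file, names assumed):
 *      the per-frame ack value computed in tcp_data() above.  A segment
 *      consumes sequence space equal to its data length, plus one for a SYN
 *      and one for a FIN, so the cumulative ack may advance to
 *      seq + len + syn + fin once everything before the segment has arrived.
 */
static unsigned long tcp_frame_ack_seq_sketch(unsigned long seq,
        unsigned long data_len, int syn, int fin)
{
        unsigned long ack_seq = seq + data_len;
        if (syn)
                ack_seq++;              /* a SYN occupies one sequence number */
        if (fin)
                ack_seq++;              /* and so does a FIN */
        return ack_seq;
}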
3660 
3661 
3662 /*
3663  *      This routine is only called when we have urgent data
3664  *      signalled. Its the 'slow' part of tcp_urg. It could be
3665  *      moved inline now as tcp_urg is only called from one
3666  *      place. We handle URGent data wrong. We have to - as
3667  *      BSD still doesn't use the correction from RFC961.
3668  */
3669  
3670 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
3671 {
3672         unsigned long ptr = ntohs(th->urg_ptr);
3673 
3674         if (ptr)
3675                 ptr--;
3676         ptr += th->seq;
3677 
3678         /* ignore urgent data that we've already seen and read */
3679         if (after(sk->copied_seq+1, ptr))
3680                 return;
3681 
3682         /* do we already have a newer (or duplicate) urgent pointer? */
3683         if (sk->urg_data && !after(ptr, sk->urg_seq))
3684                 return;
3685 
3686         /* tell the world about our new urgent pointer */
3687         if (sk->proc != 0) {
3688                 if (sk->proc > 0) {
3689                         kill_proc(sk->proc, SIGURG, 1);
3690                 } else {
3691                         kill_pg(-sk->proc, SIGURG, 1);
3692                 }
3693         }
3694         sk->urg_data = URG_NOTYET;
3695         sk->urg_seq = ptr;
3696 }
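/*
 *      Illustrative sketch (not part of the original file, names assumed):
 *      how tcp_check_urg() above turns the urgent pointer field into an
 *      absolute sequence number: the (host order) pointer is decremented by
 *      one when non-zero and then offset by the segment's sequence number,
 *      matching the BSD-style interpretation the comment above refers to.
 */
static unsigned long tcp_urg_seq_sketch(unsigned short urg_ptr_host,
        unsigned long seq)
{
        unsigned long ptr = urg_ptr_host;       /* already in host order */
        if (ptr)
                ptr--;                          /* back up by one, as above */
        return ptr + seq;
}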
3697 
3698 /*
3699  *      This is the 'fast' part of urgent handling.
3700  */
3701  
3702 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
3703         unsigned long saddr, unsigned long len)
3704 {
3705         unsigned long ptr;
3706 
3707         /*
3708          *      Check if we get a new urgent pointer - normally not 
3709          */
3710          
3711         if (th->urg)
3712                 tcp_check_urg(sk,th);
3713 
3714         /*
3715          *      Do we wait for any urgent data? - normally not
3716          */
3717          
3718         if (sk->urg_data != URG_NOTYET)
3719                 return 0;
3720 
3721         /*
3722          *      Is the urgent pointer pointing into this packet? 
3723          */
3724          
3725         ptr = sk->urg_seq - th->seq + th->doff*4;
3726         if (ptr >= len)
3727                 return 0;
3728 
3729         /*
3730          *      Ok, got the correct packet, update info 
3731          */
3732          
3733         sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
3734         if (!sk->dead)
3735                 sk->data_ready(sk,0);
3736         return 0;
3737 }
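/*
 *      Illustrative sketch (not part of the original file, names assumed):
 *      the offset calculation tcp_urg() above uses to decide whether the
 *      urgent byte lies inside this particular segment.  The urgent sequence
 *      number is converted into a byte offset from the start of the TCP
 *      header (the data begins doff*4 bytes in); the byte can only be read
 *      out if that offset falls within the segment.
 */
static int tcp_urg_in_segment_sketch(unsigned long urg_seq, unsigned long seq,
        int doff, unsigned long seg_len, unsigned long *offset)
{
        *offset = urg_seq - seq + doff*4;
        return *offset < seg_len;       /* 1 if the urgent byte is in this frame */
}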
3738 
3739 /*
3740  *      This will accept the next outstanding connection. 
3741  */
3742  
3743 static struct sock *tcp_accept(struct sock *sk, int flags)
3744 {
3745         struct sock *newsk;
3746         struct sk_buff *skb;
3747   
3748         /*
3749          * We need to make sure that this socket is listening,
3750          * and that it has something pending.
3751          */
3752 
3753         if (sk->state != TCP_LISTEN) 
3754         {
3755                 sk->err = EINVAL;
3756                 return(NULL); 
3757         }
3758 
3759         /* Avoid the race. */
3760         cli();
3761         sk->inuse = 1;
3762 
3763         while((skb = tcp_dequeue_established(sk)) == NULL) 
3764         {
3765                 if (flags & O_NONBLOCK) 
3766                 {
3767                         sti();
3768                         release_sock(sk);
3769                         sk->err = EAGAIN;
3770                         return(NULL);
3771                 }
3772 
3773                 release_sock(sk);
3774                 interruptible_sleep_on(sk->sleep);
3775                 if (current->signal & ~current->blocked) 
3776                 {
3777                         sti();
3778                         sk->err = ERESTARTSYS;
3779                         return(NULL);
3780                 }
3781                 sk->inuse = 1;
3782         }
3783         sti();
3784 
3785         /*
3786          *      Now all we need to do is return skb->sk. 
3787          */
3788 
3789         newsk = skb->sk;
3790 
3791         kfree_skb(skb, FREE_READ);
3792         sk->ack_backlog--;
3793         release_sock(sk);
3794         return(newsk);
3795 }
3796 
3797 
3798 /*
3799  *      This will initiate an outgoing connection. 
3800  */
3801  
3802 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
3803 {
3804         struct sk_buff *buff;
3805         struct device *dev=NULL;
3806         unsigned char *ptr;
3807         int tmp;
3808         int atype;
3809         struct tcphdr *t1;
3810         struct rtable *rt;
3811 
3812         if (sk->state != TCP_CLOSE) 
3813         {
3814                 return(-EISCONN);
3815         }
3816         
3817         if (addr_len < 8) 
3818                 return(-EINVAL);
3819 
3820         if (usin->sin_family && usin->sin_family != AF_INET) 
3821                 return(-EAFNOSUPPORT);
3822 
3823         /*
3824          *      connect() to INADDR_ANY means loopback (BSD'ism).
3825          */
3826         
3827         if(usin->sin_addr.s_addr==INADDR_ANY)
3828                 usin->sin_addr.s_addr=ip_my_addr();
3829                   
3830         /*
3831          *      Don't want a TCP connection going to a broadcast address 
3832          */
3833 
3834         if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) 
3835                 return -ENETUNREACH;
3836   
3837         sk->inuse = 1;
3838         sk->daddr = usin->sin_addr.s_addr;
3839         sk->write_seq = jiffies * SEQ_TICK - seq_offset;
3840         sk->window_seq = sk->write_seq;
3841         sk->rcv_ack_seq = sk->write_seq -1;
3842         sk->err = 0;
3843         sk->dummy_th.dest = usin->sin_port;
3844         release_sock(sk);
3845 
3846         buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
3847         if (buff == NULL) 
3848         {
3849                 return(-ENOMEM);
3850         }
3851         sk->inuse = 1;
3852         buff->len = 24;
3853         buff->sk = sk;
3854         buff->free = 1;
3855         buff->localroute = sk->localroute;
3856         
3857         t1 = (struct tcphdr *) buff->data;
3858 
3859         /*
3860          *      Put in the IP header and routing stuff. 
3861          */
3862          
3863         rt=ip_rt_route(sk->daddr, NULL, NULL);
3864         
3865 
3866         /*
3867          *      We need to build the routing stuff from the things saved in skb. 
3868          */
3869 
3870         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
3871                                         IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
3872         if (tmp < 0) 
3873         {
3874                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
3875                 release_sock(sk);
3876                 return(-ENETUNREACH);
3877         }
3878 
3879         buff->len += tmp;
3880         t1 = (struct tcphdr *)((char *)t1 +tmp);
3881 
3882         memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
3883         t1->seq = ntohl(sk->write_seq++);
3884         sk->sent_seq = sk->write_seq;
3885         buff->h.seq = sk->write_seq;
3886         t1->ack = 0;
3887         t1->window = 2;
3888         t1->res1=0;
3889         t1->res2=0;
3890         t1->rst = 0;
3891         t1->urg = 0;
3892         t1->psh = 0;
3893         t1->syn = 1;
3894         t1->urg_ptr = 0;
3895         t1->doff = 6;
3896         /* use 512 or whatever user asked for */
3897         
3898         if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3899                 sk->window_clamp=rt->rt_window;
3900         else
3901                 sk->window_clamp=0;
3902 
3903         if (sk->user_mss)
3904                 sk->mtu = sk->user_mss;
3905         else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
3906                 sk->mtu = rt->rt_mss;
3907         else 
3908         {
3909 #ifdef CONFIG_INET_SNARL
3910                 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
3911 #else
3912                 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
3913 #endif
3914                         sk->mtu = 576 - HEADER_SIZE;
3915                 else
3916                         sk->mtu = MAX_WINDOW;
3917         }
3918         /*
3919          *      but not bigger than device MTU 
3920          */
3921 
3922         if(sk->mtu <32)
3923                 sk->mtu = 32;   /* Sanity limit */
3924                 
3925         sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
3926         
3927         /*
3928          *      Put in the TCP option advertising our maximum segment size (kept in sk->mtu). 
3929          */
3930 
3931         ptr = (unsigned char *)(t1+1);
3932         ptr[0] = 2;
3933         ptr[1] = 4;
3934         ptr[2] = (sk->mtu) >> 8;
3935         ptr[3] = (sk->mtu) & 0xff;
3936         tcp_send_check(t1, sk->saddr, sk->daddr,
3937                   sizeof(struct tcphdr) + 4, sk);
3938 
3939         /*
3940          *      This must go first, otherwise a really quick response will get reset. 
3941          */
3942 
3943         tcp_set_state(sk,TCP_SYN_SENT);
3944         sk->rto = TCP_TIMEOUT_INIT;
3945         reset_timer(sk, TIME_WRITE, sk->rto);   /* Timer for repeating the SYN until an answer */
3946         sk->retransmits = TCP_RETR2 - TCP_SYN_RETRIES;
3947 
3948         sk->prot->queue_xmit(sk, dev, buff, 0);  
3949         tcp_statistics.TcpActiveOpens++;
3950         tcp_statistics.TcpOutSegs++;
3951   
3952         release_sock(sk);
3953         return(0);
3954 }
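/*
 *      Illustrative sketch (not part of the original file, names assumed):
 *      the four byte MSS option written by tcp_connect() above.  Option kind
 *      2, length 4, then the 16 bit segment size in network byte order; with
 *      doff set to 6 the header is 24 bytes, the basic 20 plus this option.
 */
static void tcp_write_mss_option_sketch(unsigned char *ptr, unsigned short mss)
{
        ptr[0] = 2;                     /* kind: maximum segment size */
        ptr[1] = 4;                     /* total length of the option */
        ptr[2] = (mss >> 8) & 0xff;     /* high byte first (network order) */
        ptr[3] = mss & 0xff;
}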
3955 
3956 
3957 /* This function checks to see if the tcp header is actually acceptable. */
3958 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
3959              struct options *opt, unsigned long saddr, struct device *dev)
3960 {
3961         unsigned long next_seq;
3962 
3963         next_seq = len - 4*th->doff;
3964         if (th->fin)
3965                 next_seq++;
3966         /* if we have a zero window, we can't have any data in the packet.. */
3967         if (next_seq && !sk->window)
3968                 goto ignore_it;
3969         next_seq += th->seq;
3970 
3971         /*
3972          * This isn't quite right.  sk->acked_seq could be more recent
3973          * than sk->window.  This is however close enough.  We will accept
3974          * slightly more packets than we should, but it should not cause
3975          * problems unless someone is trying to forge packets.
3976          */
3977 
3978         /* have we already seen all of this packet? */
3979         if (!after(next_seq+1, sk->acked_seq))
3980                 goto ignore_it;
3981         /* or does it start beyond the window? */
3982         if (!before(th->seq, sk->acked_seq + sk->window + 1))
3983                 goto ignore_it;
3984 
3985         /* ok, at least part of this packet would seem interesting.. */
3986         return 1;
3987 
3988 ignore_it:
3989         if (th->rst)
3990                 return 0;
3991 
3992         /*
3993          *      Send a reset if we get something not ours and we are
3994          *      unsynchronized. Note: We don't do anything to our end. We
3995          *      are just killing the bogus remote connection; then we will
3996          *      connect again and it will work (with luck).
3997          */
3998          
3999         if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
4000         {
4001                 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4002                 return 1;
4003         }
4004 
4005         /* Try to resync things. */
4006         tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4007         return 0;
4008 }
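/*
 *      Illustrative sketch (not part of the original file, names assumed):
 *      the acceptability test applied by tcp_sequence() above.  A segment is
 *      worth looking at if some part of it lies beyond what we have already
 *      acked and it does not start beyond the right edge of the window we
 *      offered.  next_seq is the sequence number just past the segment's
 *      data (plus one for a FIN); the separate zero-window check made by the
 *      original code is omitted here.
 */
static int tcp_seq_acceptable_sketch(unsigned long seq, unsigned long next_seq,
        unsigned long acked_seq, unsigned long window)
{
        if (!after(next_seq+1, acked_seq))
                return 0;               /* we have already seen all of it */
        if (!before(seq, acked_seq + window + 1))
                return 0;               /* starts beyond the offered window */
        return 1;
}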
4009 
4010 /*
4011  *      When we get a reset we do this.
4012  */
4013 
4014 static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
4015 {
4016         sk->zapped = 1;
4017         sk->err = ECONNRESET;
4018         if (sk->state == TCP_SYN_SENT)
4019                 sk->err = ECONNREFUSED;
4020         if (sk->state == TCP_CLOSE_WAIT)
4021                 sk->err = EPIPE;
4022 #ifdef TCP_DO_RFC1337           
4023         /*
4024          *      Time wait assassination protection [RFC1337]
4025          */
4026         if(sk->state!=TCP_TIME_WAIT)
4027         {       
4028                 tcp_set_state(sk,TCP_CLOSE);
4029                 sk->shutdown = SHUTDOWN_MASK;
4030         }
4031 #else   
4032         tcp_set_state(sk,TCP_CLOSE);
4033         sk->shutdown = SHUTDOWN_MASK;
4034 #endif  
4035         if (!sk->dead) 
4036                 sk->state_change(sk);
4037         kfree_skb(skb, FREE_READ);
4038         release_sock(sk);
4039         return(0);
4040 }
4041 
4042 /*
4043  *      A TCP packet has arrived.
4044  */
4045  
4046 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
4047         unsigned long daddr, unsigned short len,
4048         unsigned long saddr, int redo, struct inet_protocol * protocol)
4049 {
4050         struct tcphdr *th;
4051         struct sock *sk;
4052         int syn_ok=0;
4053         
4054         if (!skb) 
4055         {
4056                 printk("IMPOSSIBLE 1\n");
4057                 return(0);
4058         }
4059 
4060         if (!dev) 
4061         {
4062                 printk("IMPOSSIBLE 2\n");
4063                 return(0);
4064         }
4065   
4066         tcp_statistics.TcpInSegs++;
4067   
4068         if(skb->pkt_type!=PACKET_HOST)
4069         {
4070                 kfree_skb(skb,FREE_READ);
4071                 return(0);
4072         }
4073   
4074         th = skb->h.th;
4075 
4076         /*
4077          *      Find the socket.
4078          */
4079 
4080         sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4081 
4082         /*
4083          *      If this socket has got a reset it's to all intents and purposes 
4084          *      really dead. Count closed sockets as dead.
4085          *
4086          *      Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4087          *      simply drops data. This seems incorrect as a 'closed' TCP doesn't
4088          *      exist, so it should cause resets as if the port were unreachable.
4089          */
4090          
4091         if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4092                 sk=NULL;
4093 
4094         if (!redo) 
4095         {
4096                 if (tcp_check(th, len, saddr, daddr )) 
4097                 {
4098                         skb->sk = NULL;
4099                         kfree_skb(skb,FREE_READ);
4100                         /*
4101                          * We don't release the socket because it was
4102                          * never marked in use.
4103                          */
4104                         return(0);
4105                 }
4106                 th->seq = ntohl(th->seq);
4107 
4108                 /* See if we know about the socket. */
4109                 if (sk == NULL) 
4110                 {
4111                         /*
4112                          *      No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4113                          */
4114                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4115                         skb->sk = NULL;
4116                         /*
4117                          *      Discard frame
4118                          */
4119                         kfree_skb(skb, FREE_READ);
4120                         return(0);
4121                 }
4122 
4123                 skb->len = len;
4124                 skb->acked = 0;
4125                 skb->used = 0;
4126                 skb->free = 0;
4127                 skb->saddr = daddr;
4128                 skb->daddr = saddr;
4129         
4130                 /* We may need to add it to the backlog here. */
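                     /*
                      *      (Added note) While a process owns the socket (sk->inuse)
                      *      the segment is parked on sk->back_log and replayed later
                      *      by release_sock() with redo set - which is why the redo
                      *      path above skips the checksum and the ntohl() of th->seq:
                      *      both were already done on the first pass.
                      */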
4131                 cli();
4132                 if (sk->inuse) 
4133                 {
4134                         skb_queue_tail(&sk->back_log, skb);
4135                         sti();
4136                         return(0);
4137                 }
4138                 sk->inuse = 1;
4139                 sti();
4140         }
4141         else
4142         {
4143                 if (sk==NULL) 
4144                 {
4145                         tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4146                         skb->sk = NULL;
4147                         kfree_skb(skb, FREE_READ);
4148                         return(0);
4149                 }
4150         }
4151 
4152 
4153         if (!sk->prot) 
4154         {
4155                 printk("IMPOSSIBLE 3\n");
4156                 return(0);
4157         }
4158 
4159 
4160         /*
4161          *      Charge the memory to the socket. 
4162          */
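             /*
              *      (Added note) If accepting this buffer would push rmem_alloc past
              *      sk->rcvbuf the segment is dropped without an ACK, so the sender
              *      will retransmit it once the reader has drained the queue.
              */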
4163          
4164         if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf) 
4165         {
4166                 kfree_skb(skb, FREE_READ);
4167                 release_sock(sk);
4168                 return(0);
4169         }
4170 
4171         skb->sk=sk;
4172         sk->rmem_alloc += skb->mem_len;
4173 
4174         /*
4175          *      This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4176          *      don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4177          *      compatibility. We also set up variables more thoroughly [Karn notes in the
4178          *      KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4179          */
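             /*
              *      (Added outline) From here the order broadly follows the RFC793
              *      "SEGMENT ARRIVES" steps: special-case LISTEN and SYN_SENT, then
              *      check the sequence number, the RST bit, an unexpected SYN, the
              *      ACK, urgent data, and finally the segment text itself.
              */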
4180 
4181         if(sk->state!=TCP_ESTABLISHED)          /* Skip this lot for normal flow */
4182         {
4183         
4184                 /*
4185                  *      Now deal with unusual cases.
4186                  */
4187          
4188                 if(sk->state==TCP_LISTEN)
4189                 {
4190                         if(th->ack)     /* These use the socket TOS.. might want to be the received TOS */
4191                                 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4192 
4193                         /*
4194          *      We don't care for RST, and non-SYN segments are absorbed (old segments).
4195          *      Broadcast/multicast SYN isn't allowed. Note - bug: if you change the
4196          *      netmask on a running connection it can go broadcast. Even Suns have
4197          *      this problem, so I'm ignoring it.
4198                          */
4199                            
4200                         if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4201                         {
4202                                 kfree_skb(skb, FREE_READ);
4203                                 release_sock(sk);
4204                                 return 0;
4205                         }
4206                 
4207                         /*      
4208                          *      Guess we need to make a new socket up 
4209                          */
4210                 
4211                         tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4212                 
4213                         /*
4214                          *      Now we have several options: In theory there is nothing else
4215                          *      in the frame. KA9Q has an option to send data with the syn,
4216                          *      BSD accepts data with the syn up to the [to be] advertised window
4217                          *      and Solaris 2.1 gives you a protocol error. For now we just ignore
4218          *      it; that fits the spec precisely and avoids incompatibilities. It
4219          *      would be nice in the future to drop through and process the data.
4220                          */
4221                          
4222                         release_sock(sk);
4223                         return 0;
4224                 }
4225         
4226                 /*
4227                  *      SYN sent means we have to look for a suitable ack and either reset
4228                  *      for bad matches or go to connected 
4229                  */
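             /*
              *      (Added note) This mirrors the RFC793 SYN-SENT rules: check the
              *      ACK for acceptability first (tcp_ack), then RST, then SYN. A
              *      good SYN+ACK completes the three way handshake and moves us to
              *      ESTABLISHED; a crossed SYN without an ACK moves us to SYN_RECV.
              */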
4230            
4231                 else if(sk->state==TCP_SYN_SENT)
4232                 {
4233                         /* Crossed SYN or previous junk segment */
4234                         if(th->ack)
4235                         {
4236                                 /* We got an ack, but it's not a good ack */
4237                                 if(!tcp_ack(sk,th,saddr,len))
4238                                 {
4239                                         /* Reset the ack - it's an ack from a 
4240                                            different connection  [ th->rst is checked in tcp_reset()] */
4241                                         tcp_statistics.TcpAttemptFails++;
4242                                         tcp_reset(daddr, saddr, th,
4243                                                 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4244                                         kfree_skb(skb, FREE_READ);
4245                                         release_sock(sk);
4246                                         return(0);
4247                                 }
4248                                 if(th->rst)
4249                                         return tcp_std_reset(sk,skb);
4250                                 if(!th->syn)
4251                                 {
4252                                         /* A valid ack from a different connection
4253                                            start. Shouldn't happen but cover it */
4254                                         kfree_skb(skb, FREE_READ);
4255                                         release_sock(sk);
4256                                         return 0;
4257                                 }
4258                                 /*
4259                                  *      Ok.. it's good. Set up sequence numbers and
4260                                  *      move to established.
4261                                  */
4262                                 syn_ok=1;       /* Don't reset this connection for the syn */
4263                                 sk->acked_seq=th->seq+1;
4264                                 sk->fin_seq=th->seq;
4265                                 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4266                                 tcp_set_state(sk, TCP_ESTABLISHED);
4267                                 tcp_options(sk,th);
4268                                 sk->dummy_th.dest=th->source;
4269                                 sk->copied_seq=sk->acked_seq-1;
4270                                 if(!sk->dead)
4271                                         sk->state_change(sk);
4272                                 if(sk->max_window==0)
4273                                 {
4274                                         sk->max_window = 32;
4275                                         sk->mss = min(sk->max_window, sk->mtu);
4276                                 }
4277                         }
4278                         else
4279                         {
4280                                 /* See if SYNs cross. Drop if boring */
4281                                 if(th->syn && !th->rst)
4282                                 {
4283                                         /* Crossed SYNs are fine - but talking to
4284                                            yourself is right out... */
4285                                         if(sk->saddr==saddr && sk->daddr==daddr &&
4286                                                 sk->dummy_th.source==th->source &&
4287                                                 sk->dummy_th.dest==th->dest)
4288                                         {
4289                                                 tcp_statistics.TcpAttemptFails++;
4290                                                 return tcp_std_reset(sk,skb);
4291                                         }
4292                                         tcp_set_state(sk,TCP_SYN_RECV);
4293                                         
4294                                         /*
4295                                          *      FIXME:
4296                                          *      Must send SYN|ACK here
4297                                          */
4298                                 }               
4299                                 /* Discard junk segment */
4300                                 kfree_skb(skb, FREE_READ);
4301                                 release_sock(sk);
4302                                 return 0;
4303                         }
4304                         /*
4305                          *      SYN_RECV with data maybe.. drop through
4306                          */
4307                         goto rfc_step6;
4308                 }
4309                 
4310         }
4311                 
4312         /* BSD has a funny hack with TIME_WAIT and fast reuse of a port. We
4313            don't use it, we don't need it and it's not in the spec. There is
4314            a more complex suggestion for fixing these reuse issues in RFC1644
4315            but it is not yet ready for general use. Also see RFC1379. */
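             /*
              *      (Added note) The BSD_TIME_WAIT block below is that hack: a fresh
              *      SYN for a dead TIME_WAIT socket, with a sequence number beyond
              *      anything we have acked, kills the old control block and is handed
              *      to a matching LISTEN socket. The new connection is seeded with an
              *      initial sequence well above the old write_seq (seq+128000) so old
              *      segments cannot be mistaken for new data.
              */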
4316         
4317         /* We are now in normal data flow (see the step list in the RFC) */
4318         /* Note most of these are inline now. I'll inline the lot when
4319            I have time to test it hard and look at what gcc outputs */
4320 
4321 #define BSD_TIME_WAIT
4322 #ifdef BSD_TIME_WAIT
4323         if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
4324                 after(th->seq, sk->acked_seq) && !th->rst)
4325         {
4326                 long seq=sk->write_seq;
4327                 if(sk->debug)
4328                         printk("Doing a BSD time wait\n");
4329                 tcp_statistics.TcpEstabResets++;           
4330                 sk->rmem_alloc -= skb->mem_len;
4331                 skb->sk = NULL;
4332                 sk->err=ECONNRESET;
4333                 tcp_set_state(sk, TCP_CLOSE);
4334                 sk->shutdown = SHUTDOWN_MASK;
4335                 release_sock(sk);
4336                 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4337                 if(sk && sk->state==TCP_LISTEN)
4338                 {
4339                         sk->inuse=1;
4340                         skb->sk = sk;
4341                         sk->rmem_alloc += skb->mem_len;
4342                         tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
4343                         release_sock(sk);
4344                         return 0;
4345                 }
4346                 kfree_skb(skb, FREE_READ);
4347                 return 0;
4348         }
4349 #endif  
4350         
4351         if(/*sk->state!=TCP_SYN_RECV &&*/ !tcp_sequence(sk,th,len,opt,saddr,dev))
4352         {
4353                 kfree_skb(skb, FREE_READ);
4354                 return 0;
4355         }
4356 
4357         if(th->rst)
4358                 return tcp_std_reset(sk,skb);
4359         
4360         /*
4361          *      !syn_ok is effectively the state test in RFC793.
4362          */
4363          
4364         if(th->syn && !syn_ok)
4365         {
4366                 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
4367                 return tcp_std_reset(sk,skb);   
4368         }
4369 
4370         /*
4371          *      Process the ACK
4372          */
4373          
4374 
4375         if(th->ack && !tcp_ack(sk,th,saddr,len))
4376         {
4377                 /*
4378                  *      Our three way handshake failed.
4379                  */
4380                  
4381                 if(sk->state==TCP_SYN_RECV)
4382                 {
4383                         tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
4384                 }
4385                 kfree_skb(skb, FREE_READ);
4386                 release_sock(sk);
4387                 return 0;
4388         }
4389         
4390 rfc_step6:              /* I'll clean this up later */
4391 
4392         /*
4393          *      Process urgent data
4394          */
4395                 
4396         if(tcp_urg(sk, th, saddr, len))
4397         {
4398                 kfree_skb(skb, FREE_READ);
4399                 release_sock(sk);
4400                 return 0;
4401         }
4402         
4403         
4404         /*
4405          *      Process the encapsulated data
4406          */
4407         
4408         if(tcp_data(skb,sk, saddr, len))
4409         {
4410                 kfree_skb(skb, FREE_READ);
4411                 release_sock(sk);
4412                 return 0;
4413         }
4414 
4415         /*
4416          *      And done
4417          */     
4418         
4419         release_sock(sk);
4420         return 0;
4421 }
4422 
4423 /*
4424  *      This routine sends a packet with an out-of-date sequence
4425  *      number. It assumes the other end will try to ack it.
4426  */
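     /*
      *      (Added note) A segment carrying sk->sent_seq-1 and no data has already
      *      been acknowledged by the peer; it will be discarded, but RFC793 requires
      *      the receiver to answer an unacceptable segment with an ACK carrying its
      *      current window. tcp_send_probe0() relies on this to probe a closed window.
      */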
4427 
4428 static void tcp_write_wakeup(struct sock *sk)
4429 {
4430         struct sk_buff *buff;
4431         struct tcphdr *t1;
4432         struct device *dev=NULL;
4433         int tmp;
4434 
4435         if (sk->zapped)
4436                 return; /* After a valid reset we can send no more */
4437 
4438         /*
4439          *      Write data can still be transmitted/retransmitted in the
4440          *      following states.  If any other state is encountered, return.
4441          */
4442 
4443         if (sk->state != TCP_ESTABLISHED && 
4444             sk->state != TCP_CLOSE_WAIT &&
4445             sk->state != TCP_FIN_WAIT1 && 
4446             sk->state != TCP_LAST_ACK &&
4447             sk->state != TCP_CLOSING
4448         ) 
4449         {
4450                 return;
4451         }
4452 
4453         buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
4454         if (buff == NULL) 
4455                 return;
4456 
4457         buff->len = sizeof(struct tcphdr);
4458         buff->free = 1;
4459         buff->sk = sk;
4460         buff->localroute = sk->localroute;
4461 
4462         t1 = (struct tcphdr *) buff->data;
4463 
4464         /* Put in the IP header and routing stuff. */
4465         tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4466                                 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
4467         if (tmp < 0) 
4468         {
4469                 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4470                 return;
4471         }
4472 
4473         buff->len += tmp;
4474         t1 = (struct tcphdr *)((char *)t1 +tmp);
4475 
4476         memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
4477 
4478         /*
4479          * Use a previous sequence.
4480          * This should cause the other end to send an ack.
4481          */
4482         t1->seq = htonl(sk->sent_seq-1);
4483         t1->ack = 1; 
4484         t1->res1= 0;
4485         t1->res2= 0;
4486         t1->rst = 0;
4487         t1->urg = 0;
4488         t1->psh = 0;
4489         t1->fin = 0;    /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
4490         t1->syn = 0;
4491         t1->ack_seq = ntohl(sk->acked_seq);
4492         t1->window = ntohs(tcp_select_window(sk));
4493         t1->doff = sizeof(*t1)/4;
4494         tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
4495 
4496          /*     Send it and free it.
4497           *     This will prevent the timer from automatically being restarted.
4498           */
4499         sk->prot->queue_xmit(sk, dev, buff, 1);
4500         tcp_statistics.TcpOutSegs++;
4501 }
4502 
4503 /*
4504  *      A window probe timeout has occurred.
4505  */
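     /*
      *      (Added note) Each unanswered probe doubles sk->rto, capped at 120*HZ
      *      (two minutes), before the TIME_PROBE0 timer is rearmed. Purely as an
      *      illustration: starting from an rto of one second, probes would go out
      *      roughly 2, 4, 8, ... seconds apart until the two minute ceiling.
      */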
4506 
4507 void tcp_send_probe0(struct sock *sk)
4508 {
4509         if (sk->zapped)
4510                 return;         /* After a valid reset we can send no more */
4511 
4512         tcp_write_wakeup(sk);
4513 
4514         sk->backoff++;
4515         sk->rto = min(sk->rto << 1, 120*HZ);
4516         reset_timer (sk, TIME_PROBE0, sk->rto);
4517         sk->retransmits++;
4518         sk->prot->retransmits ++;
4519 }
4520 
4521 /*
4522  *      Socket option code for TCP. 
4523  */
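     /*
      *      (Added illustration, not part of the original file.) From user space
      *      these options are reached through setsockopt()/getsockopt(); a rough
      *      sketch of disabling the Nagle algorithm would be
      *
      *              int one = 1;
      *              setsockopt(fd, SOL_TCP, TCP_NODELAY, (char *)&one, sizeof(one));
      *
      *      TCP_MAXSEG is set the same way, subject to the 1..MAX_WINDOW check in
      *      tcp_setsockopt() below, and values above the interface MTU have no effect.
      */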
4524   
4525 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
4526 {
4527         int val,err;
4528 
4529         if(level!=SOL_TCP)
4530                 return ip_setsockopt(sk,level,optname,optval,optlen);
4531 
4532         if (optval == NULL) 
4533                 return(-EINVAL);
4534 
4535         err=verify_area(VERIFY_READ, optval, sizeof(int));
4536         if(err)
4537                 return err;
4538         
4539         val = get_fs_long((unsigned long *)optval);
4540 
4541         switch(optname)
4542         {
4543                 case TCP_MAXSEG:
4544 /*
4545  * Values greater than the interface MTU won't take effect. However, at
4546  * the point when this call is done we typically don't yet know
4547  * which interface is going to be used.
4548  */
4549                         if(val<1||val>MAX_WINDOW)
4550                                 return -EINVAL;
4551                         sk->user_mss=val;
4552                         return 0;
4553                 case TCP_NODELAY:
4554                         sk->nonagle=(val==0)?0:1;
4555                         return 0;
4556                 default:
4557                         return(-ENOPROTOOPT);
4558         }
4559 }
4560 
4561 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
4562 {
4563         int val,err;
4564 
4565         if(level!=SOL_TCP)
4566                 return ip_getsockopt(sk,level,optname,optval,optlen);
4567                         
4568         switch(optname)
4569         {
4570                 case TCP_MAXSEG:
4571                         val=sk->user_mss;
4572                         break;
4573                 case TCP_NODELAY:
4574                         val=sk->nonagle;
4575                         break;
4576                 default:
4577                         return(-ENOPROTOOPT);
4578         }
4579         err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
4580         if(err)
4581                 return err;
4582         put_fs_long(sizeof(int),(unsigned long *) optlen);
4583 
4584         err=verify_area(VERIFY_WRITE, optval, sizeof(int));
4585         if(err)
4586                 return err;
4587         put_fs_long(val,(unsigned long *)optval);
4588 
4589         return(0);
4590 }       
4591 
4592 
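     /*
      *      (Added note) This initialiser is positional: each entry must line up
      *      with the corresponding member of struct proto. The function pointers
      *      are the per-protocol operations used throughout this file; the trailing
      *      entries (128, 0, {NULL,}, "TCP") fill the remaining non-function
      *      members - check them against the struct proto definition rather than
      *      trusting this note.
      */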
4593 struct proto tcp_prot = {
4594         sock_wmalloc,
4595         sock_rmalloc,
4596         sock_wfree,
4597         sock_rfree,
4598         sock_rspace,
4599         sock_wspace,
4600         tcp_close,
4601         tcp_read,
4602         tcp_write,
4603         tcp_sendto,
4604         tcp_recvfrom,
4605         ip_build_header,
4606         tcp_connect,
4607         tcp_accept,
4608         ip_queue_xmit,
4609         tcp_retransmit,
4610         tcp_write_wakeup,
4611         tcp_read_wakeup,
4612         tcp_rcv,
4613         tcp_select,
4614         tcp_ioctl,
4615         NULL,
4616         tcp_shutdown,
4617         tcp_setsockopt,
4618         tcp_getsockopt,
4619         128,
4620         0,
4621         {NULL,},
4622         "TCP"
4623 };
